run out of space on nest, cleaned out

quietlight

Apr 30, 2026, 1:28 AM

KZKLAINJJWZ64T5MUZT34LJVQIKBTKZ6EJGD7C7TTSSDGCHEDPMAC

Dependencies

[2] SJN7IKIV

Change contents

file addition: utils (d--r------)

[2.1]
file addition: xxh64_test.go (----------)

[0.1]

package utils

import (
"os"
"path/filepath"
"testing"
)

func TestComputeXXH64_WAVFile(t *testing.T) {
wavFile := filepath.Join("..", "audio", "N14-2025-02-25-20241116_054500-685-703.wav")

hash, err := ComputeXXH64(wavFile)
if err != nil {
t.Fatalf("ComputeXXH64() error = %v", err)
}

expectedHash := "48dc1684324621de"
if hash != expectedHash {
t.Errorf("ComputeXXH64() = %v, want %v", hash, expectedHash)
}
}

func TestComputeXXH64_Format(t *testing.T) {
wavFile := filepath.Join("..", "audio", "N14-2025-02-25-20241116_054500-685-703.wav")

hash, err := ComputeXXH64(wavFile)
if err != nil {
t.Fatalf("ComputeXXH64() error = %v", err)
}

if len(hash) != 16 {
t.Errorf("hash length = %d, want 16", len(hash))
}

for _, c := range hash {
if (c < '0' || c > '9') && (c < 'a' || c > 'f') {
t.Errorf("invalid hex character '%c' in hash %s", c, hash)
}
}
}

func TestComputeXXH64_FileNotFound(t *testing.T) {
_, err := ComputeXXH64("nonexistent-file.wav")
if err == nil {
t.Error("expected error for nonexistent file, got nil")
}
}

func TestComputeXXH64_EmptyFile(t *testing.T) {
tmpDir := t.TempDir()
emptyFile := filepath.Join(tmpDir, "empty.wav")

if err := createEmptyFile(emptyFile); err != nil {
t.Fatalf("Failed to create empty file: %v", err)
}

hash, err := ComputeXXH64(emptyFile)
if err != nil {
t.Fatalf("ComputeXXH64() error = %v", err)
}

expectedEmpty := "ef46db3751d8e999"
if hash != expectedEmpty {
t.Errorf("ComputeXXH64(empty file) = %v, want %v", hash, expectedEmpty)
}
}

func TestComputeXXH64_Deterministic(t *testing.T) {
wavFile := filepath.Join("..", "audio", "N14-2025-02-25-20241116_054500-685-703.wav")

hash1, err := ComputeXXH64(wavFile)
if err != nil {
t.Fatalf("first call error = %v", err)
}
hash2, err := ComputeXXH64(wavFile)
if err != nil {
t.Fatalf("second call error = %v", err)
}
hash3, err := ComputeXXH64(wavFile)
if err != nil {
t.Fatalf("third call error = %v", err)
}

if hash1 != hash2 || hash2 != hash3 {
t.Errorf("hashes not deterministic: %s, %s, %s", hash1, hash2, hash3)
}
}

func TestComputeXXH64_LeadingZeros(t *testing.T) {
tmpDir := t.TempDir()
smallFile := filepath.Join(tmpDir, "small.dat")

if err := createSmallFile(smallFile); err != nil {
t.Fatalf("Failed to create small file: %v", err)
}

hash, err := ComputeXXH64(smallFile)
if err != nil {
t.Fatalf("ComputeXXH64() error = %v", err)
}

if len(hash) != 16 {
t.Errorf("hash length = %d, want 16 (leading zeros should be preserved)", len(hash))
}
}

func BenchmarkComputeXXH64_Small(b *testing.B) {
f := filepath.Join("..", "audio", "N14-2025-02-25-20241116_054500-685-703.wav") // 547K
b.ResetTimer()
for i := 0; i < b.N; i++ {
ComputeXXH64(f)
}
}

func BenchmarkComputeXXH64_Medium(b *testing.B) {
f := filepath.Join("..", "audio", "20250518_210000.WAV") // 14M
b.ResetTimer()
for i := 0; i < b.N; i++ {
ComputeXXH64(f)
}
}

func BenchmarkComputeXXH64_Large(b *testing.B) {
f := filepath.Join("..", "audio", "E166_BIRD_111211_042726.wav") // 55M
b.ResetTimer()
for i := 0; i < b.N; i++ {
ComputeXXH64(f)
}
}

func createEmptyFile(path string) error {
file, err := os.Create(path)
if err != nil {
return err
}
return file.Close()
}

func createSmallFile(path string) error {
file, err := os.Create(path)
if err != nil {
return err
}
defer file.Close()
_, err = file.Write([]byte{0x42})
return err
}
file addition: xxh64.go (----------)

[0.1]

package utils

import (
"fmt"
"io"
"os"
"sync"

"github.com/cespare/xxhash/v2"
)

var hashBufferPool = sync.Pool{
New: func() any {
buf := make([]byte, 128*1024)
return &buf
},
}

func getHashBuffer() *[]byte {
return hashBufferPool.Get().(*[]byte)
}

func putHashBuffer(buf *[]byte) {
hashBufferPool.Put(buf)
}

// ComputeXXH64 computes the XXH64 hash of a file using streaming I/O.
// Uses a constant ~128KB buffer regardless of file size.
// Returns the hash as a 16-character lowercase hexadecimal string.
func ComputeXXH64(filepath string) (string, error) {
file, err := os.Open(filepath)
if err != nil {
return "", fmt.Errorf("failed to open file: %w", err)
}
defer func() { _ = file.Close() }()

hashBufPtr := getHashBuffer()
defer putHashBuffer(hashBufPtr)

h := xxhash.New()
if _, err := io.CopyBuffer(h, file, *hashBufPtr); err != nil {
return "", fmt.Errorf("failed to read file: %w", err)
}

return fmt.Sprintf("%016x", h.Sum64()), nil
}
file addition: wav_writer.go (----------)

[0.1]

package utils

import (
"bufio"
"encoding/binary"
"fmt"
"os"
)

// WriteWAVFile writes audio samples to a WAV file.
// Samples should be in the range -1.0 to 1.0.
// Output is mono 16-bit PCM.
func WriteWAVFile(filepath string, samples []float64, sampleRate int) error {
if len(samples) == 0 {
return fmt.Errorf("no samples to write")
}

file, err := os.Create(filepath)
if err != nil {
return fmt.Errorf("failed to create file: %w", err)
}
w := bufio.NewWriterSize(file, 64*1024)

// Write WAV and flush; check close to ensure data is persisted.
err = func() error {

// WAV parameters
channels := 1
bitsPerSample := 16
bytesPerSample := bitsPerSample / 8
byteRate := sampleRate * channels * bytesPerSample
blockAlign := channels * bytesPerSample
dataSize := len(samples) * bytesPerSample
totalSize := 36 + dataSize // 36 = header size before data chunk

// Write 44-byte WAV header in one go
header := make([]byte, 44)
copy(header[0:4], "RIFF")
binary.LittleEndian.PutUint32(header[4:8], uint32(totalSize))
copy(header[8:12], "WAVE")
copy(header[12:16], "fmt ")
binary.LittleEndian.PutUint32(header[16:20], 16) // chunk size
binary.LittleEndian.PutUint16(header[20:22], 1) // PCM format
binary.LittleEndian.PutUint16(header[22:24], uint16(channels))
binary.LittleEndian.PutUint32(header[24:28], uint32(sampleRate))
binary.LittleEndian.PutUint32(header[28:32], uint32(byteRate))
binary.LittleEndian.PutUint16(header[32:34], uint16(blockAlign))
binary.LittleEndian.PutUint16(header[34:36], uint16(bitsPerSample))
copy(header[36:40], "data")
binary.LittleEndian.PutUint32(header[40:44], uint32(dataSize))

if _, err := w.Write(header); err != nil {
return err
}

// Convert all float64 samples to 16-bit PCM in a single buffer
buf := make([]byte, dataSize)
for i, sample := range samples {
// Clamp to [-1, 1]
if sample > 1.0 {
sample = 1.0
} else if sample < -1.0 {
sample = -1.0
}
binary.LittleEndian.PutUint16(buf[i*2:], uint16(int16(sample*32767)))
}

if _, err := w.Write(buf); err != nil {
return err
}

return w.Flush()
}()
if err2 := file.Close(); err2 != nil {
if err == nil {
err = fmt.Errorf("failed to close file: %w", err2)
}
}
return err
}
file addition: wav_metadata_test.go (----------)

[0.1]

package utils

import (
"bytes"
"encoding/binary"
"fmt"
"os"
"path/filepath"
"testing"
"time"
)

// createTestWAVFile creates a minimal valid WAV file for testing
func createTestWAVFile(t *testing.T, dir string, filename string, options struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}) string {
t.Helper()

path := filepath.Join(dir, filename)
file, err := os.Create(path)
if err != nil {
t.Fatalf("Failed to create test file: %v", err)
}
defer file.Close()

// Calculate data chunk size based on duration
bytesPerSample := options.bitsPerSample / 8
samplesPerSecond := options.sampleRate * options.channels
dataSize := int(options.duration * float64(samplesPerSecond*bytesPerSample))

// Calculate file size (excluding RIFF header)
fileSize := 4 + 8 + 16 + 8 + dataSize // WAVE + fmt chunk + data chunk header

// Add LIST INFO chunk size if metadata provided
var infoChunk []byte
if options.comment != "" || options.artist != "" {
infoChunk = buildINFOChunk(options.comment, options.artist)
fileSize += 8 + len(infoChunk) // LIST chunk header + content
}

buf := &bytes.Buffer{}

// Write RIFF header
buf.WriteString("RIFF")
binary.Write(buf, binary.LittleEndian, uint32(fileSize))
buf.WriteString("WAVE")

// Write fmt chunk
buf.WriteString("fmt ")
binary.Write(buf, binary.LittleEndian, uint32(16)) // chunk size
binary.Write(buf, binary.LittleEndian, uint16(1)) // audio format (PCM)
binary.Write(buf, binary.LittleEndian, uint16(options.channels))
binary.Write(buf, binary.LittleEndian, uint32(options.sampleRate))
byteRate := options.sampleRate * options.channels * bytesPerSample
binary.Write(buf, binary.LittleEndian, uint32(byteRate))
blockAlign := options.channels * bytesPerSample
binary.Write(buf, binary.LittleEndian, uint16(blockAlign))
binary.Write(buf, binary.LittleEndian, uint16(options.bitsPerSample))

// Write LIST INFO chunk if metadata provided
if len(infoChunk) > 0 {
buf.WriteString("LIST")
binary.Write(buf, binary.LittleEndian, uint32(len(infoChunk)))
buf.Write(infoChunk)
}

// Write data chunk
buf.WriteString("data")
binary.Write(buf, binary.LittleEndian, uint32(dataSize))
// Write silence for data
buf.Write(make([]byte, dataSize))

// Write to file
if _, err := file.Write(buf.Bytes()); err != nil {
t.Fatalf("Failed to write test file: %v", err)
}

return path
}

// buildINFOChunk builds a LIST INFO chunk with optional comment and artist
func buildINFOChunk(comment, artist string) []byte {
buf := &bytes.Buffer{}
buf.WriteString("INFO")

if comment != "" {
buf.WriteString("ICMT")
// Size includes null terminator
size := len(comment) + 1
binary.Write(buf, binary.LittleEndian, uint32(size))
buf.WriteString(comment)
buf.WriteByte(0) // null terminator
// Add padding byte if needed for word alignment
if size%2 != 0 {
buf.WriteByte(0)
}
}

if artist != "" {
buf.WriteString("IART")
size := len(artist) + 1
binary.Write(buf, binary.LittleEndian, uint32(size))
buf.WriteString(artist)
buf.WriteByte(0) // null terminator
if size%2 != 0 {
buf.WriteByte(0)
}
}

return buf.Bytes()
}

func TestParseWAVHeader(t *testing.T) {
// Create temporary directory for test files
tmpDir := t.TempDir()

t.Run("should parse basic WAV metadata", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_basic.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 60.0,
sampleRate: 44100,
channels: 2,
bitsPerSample: 16,
comment: "",
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.SampleRate != 44100 {
t.Errorf("SampleRate incorrect: got %d, want 44100", metadata.SampleRate)
}

if metadata.Channels != 2 {
t.Errorf("Channels incorrect: got %d, want 2", metadata.Channels)
}

if metadata.BitsPerSample != 16 {
t.Errorf("BitsPerSample incorrect: got %d, want 16", metadata.BitsPerSample)
}

// Duration should be approximately 60 seconds (allow small rounding error)
if metadata.Duration < 59.9 || metadata.Duration > 60.1 {
t.Errorf("Duration incorrect: got %f, want ~60.0", metadata.Duration)
}
})

t.Run("should extract comment metadata", func(t *testing.T) {
expectedComment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549"
path := createTestWAVFile(t, tmpDir, "test_comment.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 10.0,
sampleRate: 48000,
channels: 1,
bitsPerSample: 16,
comment: expectedComment,
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.Comment != expectedComment {
t.Errorf("Comment incorrect: got %q, want %q", metadata.Comment, expectedComment)
}
})

t.Run("should extract artist metadata", func(t *testing.T) {
expectedArtist := "AudioMoth"
path := createTestWAVFile(t, tmpDir, "test_artist.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 5.0,
sampleRate: 48000,
channels: 1,
bitsPerSample: 16,
comment: "",
artist: expectedArtist,
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.Artist != expectedArtist {
t.Errorf("Artist incorrect: got %q, want %q", metadata.Artist, expectedArtist)
}
})

t.Run("should extract both comment and artist", func(t *testing.T) {
expectedComment := "Test recording comment"
expectedArtist := "Test Artist"
path := createTestWAVFile(t, tmpDir, "test_both.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 15.0,
sampleRate: 44100,
channels: 2,
bitsPerSample: 16,
comment: expectedComment,
artist: expectedArtist,
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.Comment != expectedComment {
t.Errorf("Comment incorrect: got %q, want %q", metadata.Comment, expectedComment)
}

if metadata.Artist != expectedArtist {
t.Errorf("Artist incorrect: got %q, want %q", metadata.Artist, expectedArtist)
}
})

t.Run("should handle different sample rates", func(t *testing.T) {
testCases := []struct {
sampleRate int
}{
{8000},
{16000},
{22050},
{44100},
{48000},
{96000},
}

for _, tc := range testCases {
t.Run("", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_sr.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 1.0,
sampleRate: tc.sampleRate,
channels: 1,
bitsPerSample: 16,
comment: "",
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.SampleRate != tc.sampleRate {
t.Errorf("SampleRate incorrect: got %d, want %d", metadata.SampleRate, tc.sampleRate)
}
})
}
})

t.Run("should handle different channel counts", func(t *testing.T) {
testCases := []struct {
channels int
}{
{1}, // Mono
{2}, // Stereo
}

for _, tc := range testCases {
t.Run("", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_ch.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 1.0,
sampleRate: 44100,
channels: tc.channels,
bitsPerSample: 16,
comment: "",
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.Channels != tc.channels {
t.Errorf("Channels incorrect: got %d, want %d", metadata.Channels, tc.channels)
}
})
}
})

t.Run("should handle different bit depths", func(t *testing.T) {
testCases := []struct {
bitsPerSample int
}{
{8},
{16},
{24},
{32},
}

for _, tc := range testCases {
t.Run("", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_bits.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 1.0,
sampleRate: 44100,
channels: 1,
bitsPerSample: tc.bitsPerSample,
comment: "",
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.BitsPerSample != tc.bitsPerSample {
t.Errorf("BitsPerSample incorrect: got %d, want %d", metadata.BitsPerSample, tc.bitsPerSample)
}
})
}
})

t.Run("should handle very short durations", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_short.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 0.1, // 100ms
sampleRate: 44100,
channels: 1,
bitsPerSample: 16,
comment: "",
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.Duration < 0.09 || metadata.Duration > 0.11 {
t.Errorf("Duration incorrect: got %f, want ~0.1", metadata.Duration)
}
})

t.Run("should handle long durations", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_long.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 600.0, // 10 minutes
sampleRate: 44100,
channels: 1,
bitsPerSample: 16,
comment: "",
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.Duration < 599.0 || metadata.Duration > 601.0 {
t.Errorf("Duration incorrect: got %f, want ~600.0", metadata.Duration)
}
})

t.Run("should return error for non-existent file", func(t *testing.T) {
_, err := ParseWAVHeader("/nonexistent/file.wav")
if err == nil {
t.Error("Expected error for non-existent file")
}
})

t.Run("should return error for non-WAV file", func(t *testing.T) {
// Create a non-WAV file
path := filepath.Join(tmpDir, "not_a_wav.txt")
if err := os.WriteFile(path, []byte("This is not a WAV file"), 0644); err != nil {
t.Fatalf("Failed to create test file: %v", err)
}

_, err := ParseWAVHeader(path)
if err == nil {
t.Error("Expected error for non-WAV file")
}
})

t.Run("should return error for truncated file", func(t *testing.T) {
// Create a file that's too small to be valid WAV
path := filepath.Join(tmpDir, "truncated.wav")
if err := os.WriteFile(path, []byte("RIFF"), 0644); err != nil {
t.Fatalf("Failed to create test file: %v", err)
}

_, err := ParseWAVHeader(path)
if err == nil {
t.Error("Expected error for truncated file")
}
})

t.Run("should handle empty metadata strings", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_empty.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 10.0,
sampleRate: 44100,
channels: 1,
bitsPerSample: 16,
comment: "",
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.Comment != "" {
t.Errorf("Comment should be empty, got %q", metadata.Comment)
}

if metadata.Artist != "" {
t.Errorf("Artist should be empty, got %q", metadata.Artist)
}
})

t.Run("should handle long comment strings", func(t *testing.T) {
longComment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C. This is a very long comment with additional information about the recording session."

path := createTestWAVFile(t, tmpDir, "test_long_comment.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 10.0,
sampleRate: 44100,
channels: 1,
bitsPerSample: 16,
comment: longComment,
artist: "",
})

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if metadata.Comment != longComment {
t.Errorf("Comment incorrect: got %q, want %q", metadata.Comment, longComment)
}
})

t.Run("should extract file modification time", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_modtime.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 5.0,
sampleRate: 44100,
channels: 1,
bitsPerSample: 16,
comment: "",
artist: "",
})

// Get expected mod time
info, err := os.Stat(path)
if err != nil {
t.Fatalf("Failed to stat file: %v", err)
}
expectedModTime := info.ModTime()

metadata, err := ParseWAVHeader(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

// Allow 1 second tolerance for filesystem granularity
diff := metadata.FileModTime.Sub(expectedModTime)
if diff < -1*time.Second || diff > 1*time.Second {
t.Errorf("FileModTime incorrect: got %v, want %v (diff: %v)",
metadata.FileModTime, expectedModTime, diff)
}

// Ensure FileModTime is not zero
if metadata.FileModTime.IsZero() {
t.Error("FileModTime should not be zero")
}
})
}

func TestExtractNullTerminatedString(t *testing.T) {
testCases := []struct {
name string
input []byte
expected string
}{
{
name: "string with null terminator",
input: []byte{'h', 'e', 'l', 'l', 'o', 0, 'w', 'o', 'r', 'l', 'd'},
expected: "hello",
},
{
name: "string without null terminator",
input: []byte{'h', 'e', 'l', 'l', 'o'},
expected: "hello",
},
{
name: "empty string",
input: []byte{},
expected: "",
},
{
name: "only null terminator",
input: []byte{0},
expected: "",
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
result := extractNullTerminatedString(tc.input)
if result != tc.expected {
t.Errorf("Result incorrect: got %q, want %q", result, tc.expected)
}
})
}
}

func TestParseWAVHeaderMinimal(t *testing.T) {
tmpDir := t.TempDir()

t.Run("should parse basic WAV metadata", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_minimal.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 10.0,
sampleRate: 44100,
channels: 1,
bitsPerSample: 16,
comment: "",
artist: "",
})

sampleRate, duration, err := ParseWAVHeaderMinimal(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if sampleRate != 44100 {
t.Errorf("SampleRate incorrect: got %d, want 44100", sampleRate)
}
if duration < 9.9 || duration > 10.1 {
t.Errorf("Duration incorrect: got %f, want ~10.0", duration)
}
})

t.Run("should handle different sample rates", func(t *testing.T) {
sampleRates := []int{8000, 22050, 44100, 48000, 96000}

for _, sr := range sampleRates {
t.Run(fmt.Sprintf("%dHz", sr), func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, fmt.Sprintf("test_sr_%d.wav", sr), struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 5.0,
sampleRate: sr,
channels: 1,
bitsPerSample: 16,
comment: "",
artist: "",
})

sampleRate, duration, err := ParseWAVHeaderMinimal(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if sampleRate != sr {
t.Errorf("SampleRate incorrect: got %d, want %d", sampleRate, sr)
}
if duration < 4.9 || duration > 5.1 {
t.Errorf("Duration incorrect: got %f, want ~5.0", duration)
}
})
}
})

t.Run("should handle stereo files", func(t *testing.T) {
path := createTestWAVFile(t, tmpDir, "test_stereo.wav", struct {
duration float64
sampleRate int
channels int
bitsPerSample int
comment string
artist string
}{
duration: 3.0,
sampleRate: 44100,
channels: 2,
bitsPerSample: 16,
comment: "",
artist: "",
})

sampleRate, duration, err := ParseWAVHeaderMinimal(path)
if err != nil {
t.Fatalf("Failed to parse WAV header: %v", err)
}

if sampleRate != 44100 {
t.Errorf("SampleRate incorrect: got %d, want 44100", sampleRate)
}
if duration < 2.9 || duration > 3.1 {
t.Errorf("Duration incorrect: got %f, want ~3.0", duration)
}
})

t.Run("should return error for non-existent file", func(t *testing.T) {
_, _, err := ParseWAVHeaderMinimal("/nonexistent/file.wav")
if err == nil {
t.Error("Expected error for non-existent file")
}
})

t.Run("should return error for non-WAV file", func(t *testing.T) {
// Create a text file
path := filepath.Join(tmpDir, "notawav.wav")
if err := os.WriteFile(path, []byte("Not a WAV file"), 0644); err != nil {
t.Fatalf("Failed to create test file: %v", err)
}

_, _, err := ParseWAVHeaderMinimal(path)
if err == nil {
t.Error("Expected error for non-WAV file")
}
})
}
file addition: wav_metadata.go (----------)

[0.1]

package utils

import (
"bytes"
"encoding/binary"
"fmt"
"io"
"os"
"sync"
"time"

"github.com/cespare/xxhash/v2"
)

// Buffer pools for reducing GC pressure during batch imports
var (
// headerBufferPool stores 200KB buffers for WAV header reading (full metadata)
headerBufferPool = sync.Pool{
New: func() any {
buf := make([]byte, 200*1024)
return &buf
},
}

// minimalHeaderBufferPool stores 4KB buffers for minimal WAV header reading
// 4KB is sufficient for fmt + data chunk headers in 99% of WAV files
minimalHeaderBufferPool = sync.Pool{
New: func() any {
buf := make([]byte, 4*1024)
return &buf
},
}
)

// getHeaderBuffer gets a 200KB buffer from the pool
func getHeaderBuffer() *[]byte {
return headerBufferPool.Get().(*[]byte)
}

// putHeaderBuffer returns a 200KB buffer to the pool
func putHeaderBuffer(buf *[]byte) {
headerBufferPool.Put(buf)
}

// getMinimalHeaderBuffer gets a 4KB buffer from the pool
func getMinimalHeaderBuffer() *[]byte {
return minimalHeaderBufferPool.Get().(*[]byte)
}

// putMinimalHeaderBuffer returns a 4KB buffer to the pool
func putMinimalHeaderBuffer(buf *[]byte) {
minimalHeaderBufferPool.Put(buf)
}

// WAVMetadata contains metadata extracted from WAV file headers
type WAVMetadata struct {
Duration float64 // Duration in seconds
SampleRate int // Sample rate in Hz
Comment string // Comment from INFO chunk (may contain AudioMoth data)
Artist string // Artist from INFO chunk
Channels int // Number of audio channels
BitsPerSample int // Bits per sample
FileModTime time.Time // File modification time (fallback timestamp)
FileSize int64 // File size in bytes
}

// ParseWAVHeader efficiently reads only the WAV file header to extract metadata.
// It reads the first 200KB of the file, which should be sufficient for all header chunks.
// ParseWAVHeader extracts metadata from WAV file including duration, sample rate, and INFO chunks
func ParseWAVHeader(filepath string) (*WAVMetadata, error) {
file, err := os.Open(filepath)
if err != nil {
return nil, fmt.Errorf("failed to open file: %w", err)
}
defer func() { _ = file.Close() }()

// Get file info for modification time
fileInfo, err := file.Stat()
if err != nil {
return nil, fmt.Errorf("failed to get file info: %w", err)
}
modTime := fileInfo.ModTime()
fileSize := fileInfo.Size()

// Get header buffer from pool
headerBufPtr := getHeaderBuffer()
defer putHeaderBuffer(headerBufPtr)
headerBuf := (*headerBufPtr)[:cap(*headerBufPtr)]

// Read first 200KB for header parsing (more than enough for metadata)
n, err := file.Read(headerBuf)
if err != nil && err != io.EOF {
return nil, fmt.Errorf("failed to read header: %w", err)
}
headerBuf = headerBuf[:n]

metadata, err := parseWAVFromBytes(headerBuf)
if err != nil {
return nil, err
}

// Set file modification time and size
metadata.FileModTime = modTime
metadata.FileSize = fileSize

return metadata, nil
}

// ParseWAVHeaderMinimal reads only the first 4KB of a WAV file to extract essential metadata.
// This is optimized for batch processing where INFO chunks (comment/artist) are not needed.
// It's ~50x faster than ParseWAVHeader for large files due to reduced I/O.
// Returns (sampleRate, duration, error) - the minimal data needed for .data file generation.
func ParseWAVHeaderMinimal(filepath string) (sampleRate int, duration float64, err error) {
file, err := os.Open(filepath)
if err != nil {
return 0, 0, fmt.Errorf("failed to open file: %w", err)
}
defer func() { _ = file.Close() }()

// Get minimal header buffer from pool (4KB)
headerBufPtr := getMinimalHeaderBuffer()
defer putMinimalHeaderBuffer(headerBufPtr)
headerBuf := (*headerBufPtr)[:cap(*headerBufPtr)]

// Read first 4KB - sufficient for fmt + data chunk headers in 99% of files
n, err := file.Read(headerBuf)
if err != nil && err != io.EOF {
return 0, 0, fmt.Errorf("failed to read header: %w", err)
}
headerBuf = headerBuf[:n]

// Parse minimal metadata
sampleRate, duration, err = parseWAVMinimal(headerBuf)
if err != nil {
return 0, 0, err
}

return sampleRate, duration, nil
}

// parseWAVMinimal parses only essential WAV metadata from a byte buffer.
// Returns (sampleRate, duration, error). Does not parse INFO chunks.
func parseWAVMinimal(data []byte) (sampleRate int, duration float64, err error) {
if len(data) < 44 {
return 0, 0, fmt.Errorf("file too small to be valid WAV")
}

// Verify RIFF header
if string(data[0:4]) != "RIFF" {
return 0, 0, fmt.Errorf("not a valid WAV file (missing RIFF header)")
}

// Verify WAVE format
if string(data[8:12]) != "WAVE" {
return 0, 0, fmt.Errorf("not a valid WAV file (missing WAVE format)")
}

var channels, bitsPerSample int

// Parse chunks - stop after finding data chunk
offset := 12
for offset < len(data)-8 {
chunkID := string(data[offset : offset+4])
chunkSize := int(binary.LittleEndian.Uint32(data[offset+4 : offset+8]))
offset += 8

switch chunkID {
case "fmt ":
// Parse format chunk
if chunkSize >= 16 && offset+16 <= len(data) {
channels = int(binary.LittleEndian.Uint16(data[offset+2 : offset+4]))
sampleRate = int(binary.LittleEndian.Uint32(data[offset+4 : offset+8]))
bitsPerSample = int(binary.LittleEndian.Uint16(data[offset+14 : offset+16]))
}

case "data":
// Found data chunk - calculate duration and return
if sampleRate > 0 && channels > 0 && bitsPerSample > 0 {
bytesPerSample := bitsPerSample / 8
bytesPerSecond := sampleRate * channels * bytesPerSample
if bytesPerSecond > 0 {
duration = float64(chunkSize) / float64(bytesPerSecond)
return sampleRate, duration, nil
}
}
return 0, 0, fmt.Errorf("invalid WAV: fmt chunk missing or corrupt before data chunk")
}

// Move to next chunk (word-aligned)
offset += chunkSize
if chunkSize%2 != 0 {
offset++
}
}

// Data chunk not found within 4KB - file may have large INFO chunks
return 0, 0, fmt.Errorf("data chunk not found in first 4KB (try ParseWAVHeader for full parsing)")
}

// ParseWAVHeaderWithHash reads the WAV file once to extract both metadata and hash.
// This is more efficient than calling ParseWAVHeader and ComputeXXH64 separately,
// as it only opens the file once and reads it in a single pass.
// Returns (metadata, hash, error).
func ParseWAVHeaderWithHash(filepath string) (*WAVMetadata, string, error) {
file, err := os.Open(filepath)
if err != nil {
return nil, "", fmt.Errorf("failed to open file: %w", err)
}
defer func() { _ = file.Close() }()

// Get file info for modification time and size
fileInfo, err := file.Stat()
if err != nil {
return nil, "", fmt.Errorf("failed to get file info: %w", err)
}
modTime := fileInfo.ModTime()
fileSize := fileInfo.Size()

// Get header buffer from pool
headerBufPtr := getHeaderBuffer()
defer putHeaderBuffer(headerBufPtr)
headerBuf := (*headerBufPtr)[:cap(*headerBufPtr)]

// Read first 200KB for header parsing
n, err := file.Read(headerBuf)
if err != nil && err != io.EOF {
return nil, "", fmt.Errorf("failed to read header: %w", err)
}
headerBuf = headerBuf[:n]

// Parse header
metadata, err := parseWAVFromBytes(headerBuf)
if err != nil {
return nil, "", err
}
metadata.FileModTime = modTime
metadata.FileSize = fileSize

// Hash: seek back to start and stream entire file
if _, err := file.Seek(0, 0); err != nil {
return nil, "", fmt.Errorf("failed to seek: %w", err)
}

// Get hash buffer from pool
hashBufPtr := getHashBuffer()
defer putHashBuffer(hashBufPtr)
hashBuf := *hashBufPtr

h := xxhash.New()
if _, err := io.CopyBuffer(h, file, hashBuf); err != nil {
return nil, "", fmt.Errorf("failed to read file for hash: %w", err)
}

hash := fmt.Sprintf("%016x", h.Sum64())
return metadata, hash, nil
}

// parseWAVFromBytes parses WAV metadata from a byte buffer
func parseWAVFromBytes(data []byte) (*WAVMetadata, error) {
if len(data) < 44 {
return nil, fmt.Errorf("file too small to be valid WAV")
}

// Verify RIFF header
if string(data[0:4]) != "RIFF" {
return nil, fmt.Errorf("not a valid WAV file (missing RIFF header)")
}

// Verify WAVE format
if string(data[8:12]) != "WAVE" {
return nil, fmt.Errorf("not a valid WAV file (missing WAVE format)")
}

metadata := &WAVMetadata{}

// Parse chunks
offset := 12
for offset < len(data)-8 {
// Read chunk ID and size
chunkID := string(data[offset : offset+4])
chunkSize := int(binary.LittleEndian.Uint32(data[offset+4 : offset+8]))
offset += 8

switch chunkID {
case "fmt ":
// Parse format chunk - need at least 16 bytes of data
if chunkSize >= 16 && offset+16 <= len(data) {
metadata.Channels = int(binary.LittleEndian.Uint16(data[offset+2 : offset+4]))
metadata.SampleRate = int(binary.LittleEndian.Uint32(data[offset+4 : offset+8]))
metadata.BitsPerSample = int(binary.LittleEndian.Uint16(data[offset+14 : offset+16]))
}

case "data":
// Calculate duration from data chunk size
// We only need the chunkSize from the header, not the actual audio data
if metadata.SampleRate > 0 && metadata.Channels > 0 && metadata.BitsPerSample > 0 {
bytesPerSample := metadata.BitsPerSample / 8
bytesPerSecond := metadata.SampleRate * metadata.Channels * bytesPerSample
if bytesPerSecond > 0 {
metadata.Duration = float64(chunkSize) / float64(bytesPerSecond)
}
}
// Data chunk content is the audio data - we don't need to read it

case "LIST":
// Parse LIST chunk for INFO metadata
if chunkSize >= 4 && offset+chunkSize <= len(data) {
listType := string(data[offset : offset+4])
if listType == "INFO" {
parseINFOChunk(data[offset+4:offset+chunkSize], metadata)
}
}
}

// Move to next chunk (chunks are word-aligned)
offset += chunkSize
if chunkSize%2 != 0 {
offset++ // Skip padding byte
}
}

// Validate that we found essential chunks
if metadata.SampleRate == 0 {
return nil, fmt.Errorf("invalid WAV file: missing or corrupt fmt chunk")
}
if metadata.Duration == 0 {
return nil, fmt.Errorf("invalid WAV file: missing or corrupt data chunk")
}

return metadata, nil
}

// parseINFOChunk parses INFO list chunk for comment and artist metadata
func parseINFOChunk(data []byte, metadata *WAVMetadata) {
offset := 0
for offset < len(data)-8 {
// Read subchunk ID and size
if offset+8 > len(data) {
break
}

subchunkID := string(data[offset : offset+4])
subchunkSize := int(binary.LittleEndian.Uint32(data[offset+4 : offset+8]))
offset += 8

if offset+subchunkSize > len(data) {
break
}

// Extract null-terminated string
value := extractNullTerminatedString(data[offset : offset+subchunkSize])

switch subchunkID {
case "ICMT": // Comment
metadata.Comment = value
case "IART": // Artist
metadata.Artist = value
}

// Move to next subchunk (word-aligned)
offset += subchunkSize
if subchunkSize%2 != 0 {
offset++ // Skip padding byte
}
}
}

// extractNullTerminatedString extracts a null-terminated string from bytes
func extractNullTerminatedString(data []byte) string {
before, _, ok := bytes.Cut(data, []byte{0})
if ok {
return string(before)
}
return string(data)
}

// ReadWAVSamples reads audio samples from a WAV file and returns them as float64.
// Mono files: returns single channel.
// Stereo files: returns left channel only.
// Samples are normalized to the range -1.0 to 1.0.
func ReadWAVSamples(filepath string) ([]float64, int, error) {
file, err := os.Open(filepath)
if err != nil {
return nil, 0, fmt.Errorf("failed to open file: %w", err)
}
defer func() { _ = file.Close() }()

// Read header to get format info
headerBuf := make([]byte, 44)
if _, err := io.ReadFull(file, headerBuf); err != nil {
return nil, 0, fmt.Errorf("failed to read header: %w", err)
}

// Verify RIFF/WAVE header
if string(headerBuf[0:4]) != "RIFF" || string(headerBuf[8:12]) != "WAVE" {
return nil, 0, fmt.Errorf("not a valid WAV file")
}

// Parse chunks to find fmt and data
var sampleRate, channels, bitsPerSample int
var dataOffset, dataSize int64

// Seek to first chunk
if _, err := file.Seek(12, 0); err != nil {
return nil, 0, fmt.Errorf("failed to seek: %w", err)
}

for {
chunkHeader := make([]byte, 8)
if _, err := io.ReadFull(file, chunkHeader); err != nil {
if err == io.EOF {
break
}
return nil, 0, fmt.Errorf("failed to read chunk header: %w", err)
}

chunkID := string(chunkHeader[0:4])
chunkSize := int64(binary.LittleEndian.Uint32(chunkHeader[4:8]))

switch chunkID {
case "fmt ":
fmtData := make([]byte, chunkSize)
if _, err := io.ReadFull(file, fmtData); err != nil {
return nil, 0, fmt.Errorf("failed to read fmt chunk: %w", err)
}
if len(fmtData) >= 16 {
channels = int(binary.LittleEndian.Uint16(fmtData[2:4]))
sampleRate = int(binary.LittleEndian.Uint32(fmtData[4:8]))
bitsPerSample = int(binary.LittleEndian.Uint16(fmtData[14:16]))
}

case "data":
dataOffset, _ = file.Seek(0, 1) // Current position
dataSize = chunkSize
// Done - we found the data chunk
goto foundData

default:
// Skip unknown chunk
if _, err := file.Seek(chunkSize, 1); err != nil {
return nil, 0, fmt.Errorf("failed to skip chunk: %w", err)
}
}

// Word align
if chunkSize%2 != 0 {
if _, err := file.Seek(1, 1); err != nil {
return nil, 0, fmt.Errorf("failed to skip padding: %w", err)
}
}
}

return nil, 0, fmt.Errorf("no data chunk found in WAV file")

foundData:
if sampleRate == 0 || channels == 0 || bitsPerSample == 0 {
return nil, 0, fmt.Errorf("missing or invalid fmt chunk")
}

// Read audio data
if _, err := file.Seek(dataOffset, 0); err != nil {
return nil, 0, fmt.Errorf("failed to seek to data: %w", err)
}

audioData := make([]byte, dataSize)
if _, err := io.ReadFull(file, audioData); err != nil {
return nil, 0, fmt.Errorf("failed to read audio data: %w", err)
}

// Convert to float64 samples
samples := convertToFloat64(audioData, bitsPerSample, channels)

return samples, sampleRate, nil
}

// convertToFloat64 converts raw audio bytes to float64 samples
// Returns mono (left channel only for stereo)
func convertToFloat64(data []byte, bitsPerSample, channels int) []float64 {
bytesPerSample := bitsPerSample / 8
blockAlign := bytesPerSample * channels
numSamples := len(data) / blockAlign

samples := make([]float64, numSamples)

switch bitsPerSample {
case 16:
for i := range numSamples {
// Read first (left) channel only for stereo
offset := i * blockAlign
sample := int16(binary.LittleEndian.Uint16(data[offset : offset+2]))
samples[i] = float64(sample) / 32768.0
}

case 24:
for i := range numSamples {
offset := i * blockAlign
// 24-bit signed, little-endian
b := data[offset : offset+3]
sample := int32(b[0]) | int32(b[1])<<8 | int32(b[2])<<16
// Sign extend
if sample >= 0x800000 {
sample -= 0x1000000
}
samples[i] = float64(sample) / 8388608.0
}

case 32:
for i := range numSamples {
offset := i * blockAlign
sample := int32(binary.LittleEndian.Uint32(data[offset : offset+4]))
samples[i] = float64(sample) / 2147483648.0
}

default:
// Fallback: treat as 16-bit
for i := range numSamples {
offset := i * blockAlign
sample := int16(binary.LittleEndian.Uint16(data[offset : offset+2]))
samples[i] = float64(sample) / 32768.0
}
}

return samples
}
file addition: validation_test.go (----------)

[0.1]

package utils

import (
"testing"
)

func TestValidateShortID(t *testing.T) {
tests := []struct {
name string
id string
fieldName string
wantErr bool
}{
{"valid 12-char ID", "abc123XYZ789", "test_id", false},
{"valid with underscore", "abc_123_XYZ_", "test_id", false},
{"valid with dash", "abc-123-XYZ-", "test_id", false},
{"empty string", "", "test_id", true},
{"too short", "abc123", "test_id", true},
{"too long", "abc123XYZ789toolong", "test_id", true},
{"invalid chars", "abc@123#XYZ$", "test_id", true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidateShortID(tt.id, tt.fieldName)
if (err != nil) != tt.wantErr {
t.Errorf("ValidateShortID() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}

func TestValidateStringLength(t *testing.T) {
tests := []struct {
name string
value string
field string
maxLen int
wantErr bool
}{
{"within limit", "hello", "test", 10, false},
{"at limit", "1234567890", "test", 10, false},
{"empty string", "", "test", 10, false},
{"over limit", "12345678901", "test", 10, true},
{"zero max", "a", "test", 0, true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidateStringLength(tt.value, tt.field, tt.maxLen)
if (err != nil) != tt.wantErr {
t.Errorf("ValidateStringLength() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}

func TestValidateRange(t *testing.T) {
t.Run("int range", func(t *testing.T) {
tests := []struct {
name string
value int
min int
max int
wantErr bool
}{
{"within range", 50, 0, 100, false},
{"at min", 0, 0, 100, false},
{"at max", 100, 0, 100, false},
{"below min", -1, 0, 100, true},
{"above max", 101, 0, 100, true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidateRange(tt.value, "test", tt.min, tt.max)
if (err != nil) != tt.wantErr {
t.Errorf("ValidateRange() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
})

t.Run("float64 range", func(t *testing.T) {
tests := []struct {
name string
value float64
min float64
max float64
wantErr bool
}{
{"within range", 45.5, -90.0, 90.0, false},
{"at min", -90.0, -90.0, 90.0, false},
{"at max", 90.0, -90.0, 90.0, false},
{"below min", -90.1, -90.0, 90.0, true},
{"above max", 90.1, -90.0, 90.0, true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidateRange(tt.value, "test", tt.min, tt.max)
if (err != nil) != tt.wantErr {
t.Errorf("ValidateRange() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
})
}

func TestValidatePositive(t *testing.T) {
tests := []struct {
name string
value int
wantErr bool
}{
{"positive", 1, false},
{"large positive", 1000000, false},
{"zero", 0, true},
{"negative", -1, true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidatePositive(tt.value, "test")
if (err != nil) != tt.wantErr {
t.Errorf("ValidatePositive() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}

func TestValidateSampleRate(t *testing.T) {
tests := []struct {
name string
rate int
wantErr bool
}{
{"valid low", 1000, false},
{"valid typical", 48000, false},
{"valid high", 250000, false},
{"valid max", 500000, false},
{"too low", 999, true},
{"too high", 500001, true},
{"zero", 0, true},
{"negative", -1000, true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidateSampleRate(tt.rate)
if (err != nil) != tt.wantErr {
t.Errorf("ValidateSampleRate() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}

func TestValidateTimezone(t *testing.T) {
tests := []struct {
name string
tz string
wantErr bool
}{
{"valid Auckland", "Pacific/Auckland", false},
{"valid UTC", "UTC", false},
{"valid America/New_York", "America/New_York", false},
{"valid Europe/London", "Europe/London", false},
{"invalid", "Invalid/Timezone", true},
{"garbage", "not-a-timezone", true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidateTimezone(tt.tz)
if (err != nil) != tt.wantErr {
t.Errorf("ValidateTimezone() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}

func TestValidateNonNegative(t *testing.T) {
tests := []struct {
name string
value int
wantErr bool
}{
{"positive", 1, false},
{"zero", 0, false},
{"negative", -1, true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidateNonNegative(tt.value, "test")
if (err != nil) != tt.wantErr {
t.Errorf("ValidateNonNegative() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}
file addition: validation.go (----------)

[0.1]

package utils

import (
"database/sql"
"fmt"
"regexp"
"time"
)

// ID length constants matching nanoid generation
const (
ShortIDLen = 12 // dataset, location, cluster, pattern, species, filter, call_type
)

// Sample rate reasonable bounds for audio recording
const (
MinSampleRate = 1000 // 1 kHz - below this is unlikely to be real audio
MaxSampleRate = 500000 // 500 kHz - well above bat detectors (~250kHz)
)

// Max string lengths from schema
const (
MaxNameLen = 140 // location.name, cluster.name
MaxDatasetNameLen = 255 // dataset.name
MaxDescriptionLen = 255 // all description fields
MaxPathLen = 255 // cluster.path
MaxFileNameLen = 255 // file.file_name
MaxTimezoneLen = 40 // location.timezone_id
)

// ID format regex - alphanumeric characters (nanoid uses A-Za-z0-9_)
var shortIDRegex = regexp.MustCompile(`^[A-Za-z0-9_-]{12}$`)

// ValidateShortID validates 12-character nanoid format
func ValidateShortID(id, fieldName string) error {
if id == "" {
return fmt.Errorf("%s cannot be empty", fieldName)
}
if len(id) != ShortIDLen {
return fmt.Errorf("%s must be exactly %d characters (got %d)", fieldName, ShortIDLen, len(id))
}
if !shortIDRegex.MatchString(id) {
return fmt.Errorf("%s has invalid format (expected alphanumeric nanoid)", fieldName)
}
return nil
}

// ValidateOptionalShortID validates short ID if provided (non-empty)
func ValidateOptionalShortID(id *string, fieldName string) error {
if id == nil || *id == "" {
return nil
}
return ValidateShortID(*id, fieldName)
}

// ValidateStringLength validates string length constraint
func ValidateStringLength(value, fieldName string, maxLen int) error {
if len(value) > maxLen {
return fmt.Errorf("%s must be %d characters or less (got %d)", fieldName, maxLen, len(value))
}
return nil
}

// ValidateOptionalStringLength validates string length if provided
func ValidateOptionalStringLength(value *string, fieldName string, maxLen int) error {
if value == nil || *value == "" {
return nil
}
return ValidateStringLength(*value, fieldName, maxLen)
}

// ValidateRange validates numeric range constraint (inclusive)
func ValidateRange[T int | float64](value T, fieldName string, min, max T) error {
if value < min || value > max {
return fmt.Errorf("%s must be between %v and %v (got %v)", fieldName, min, max, value)
}
return nil
}

// ValidatePositive validates positive number (> 0)
func ValidatePositive[T int | float64](value T, fieldName string) error {
if value <= 0 {
return fmt.Errorf("%s must be positive (got %v)", fieldName, value)
}
return nil
}

// ValidateNonNegative validates non-negative number (>= 0)
func ValidateNonNegative[T int | float64](value T, fieldName string) error {
if value < 0 {
return fmt.Errorf("%s must be non-negative (got %v)", fieldName, value)
}
return nil
}

// ValidateSampleRate validates audio sample rate is in reasonable range
func ValidateSampleRate(rate int) error {
return ValidateRange(rate, "sample_rate", MinSampleRate, MaxSampleRate)
}

// ValidateTimezone validates IANA timezone ID
func ValidateTimezone(tzID string) error {
if _, err := time.LoadLocation(tzID); err != nil {
return fmt.Errorf("invalid timezone_id '%s': %w", tzID, err)
}
return nil
}

// GetDatasetType returns the type of a dataset
// Returns: (type, exists, error)
func GetDatasetType(db *sql.DB, datasetID string) (string, bool, error) {
var datasetType string
err := db.QueryRow("SELECT type FROM dataset WHERE id = ?", datasetID).Scan(&datasetType)
if err == sql.ErrNoRows {
return "", false, nil
}
if err != nil {
return "", false, err
}
return datasetType, true, nil
}

// ValidateDatasetTypeForImport checks that a dataset is 'structured' type for file imports
// Returns error if dataset doesn't exist or is not 'structured'
func ValidateDatasetTypeForImport(db *sql.DB, datasetID string) error {
datasetType, exists, err := GetDatasetType(db, datasetID)
if err != nil {
return fmt.Errorf("failed to query dataset type: %w", err)
}
if !exists {
return fmt.Errorf("dataset not found: %s", datasetID)
}
if datasetType != "structured" {
return fmt.Errorf("dataset '%s' is type '%s' - file imports only support 'structured' datasets", datasetID, datasetType)
}
return nil
}

// ValidateDatasetTypeUnstructured checks that a dataset is 'unstructured' type
// Returns error if dataset doesn't exist or is not 'unstructured'
func ValidateDatasetTypeUnstructured(db *sql.DB, datasetID string) error {
datasetType, exists, err := GetDatasetType(db, datasetID)
if err != nil {
return fmt.Errorf("failed to query dataset type: %w", err)
}
if !exists {
return fmt.Errorf("dataset not found: %s", datasetID)
}
if datasetType != "unstructured" {
return fmt.Errorf("dataset '%s' is type '%s' - this command only supports 'unstructured' datasets", datasetID, datasetType)
}
return nil
}

// ValidateLocationBelongsToDataset checks that a location belongs to a specific dataset
// Returns error if location doesn't exist or belongs to a different dataset
func ValidateLocationBelongsToDataset(db *sql.DB, locationID, datasetID string) error {
var locationDatasetID string
err := db.QueryRow("SELECT dataset_id FROM location WHERE id = ? AND active = true", locationID).Scan(&locationDatasetID)
if err == sql.ErrNoRows {
return fmt.Errorf("location not found or inactive: %s", locationID)
}
if err != nil {
return fmt.Errorf("failed to query location: %w", err)
}
if locationDatasetID != datasetID {
return fmt.Errorf("location %s does not belong to dataset %s", locationID, datasetID)
}
return nil
}
file addition: terminal_image_test.go (----------)

[0.1]

package utils

import (
"image"
"image/color"
"math/rand"
"strings"
"testing"
)

func TestWriteKittyImage_SmallImage(t *testing.T) {
// 2x2 image produces small base64 payload — single chunk, no m= key
img := image.NewGray(image.Rect(0, 0, 2, 2))
img.SetGray(0, 0, color.Gray{Y: 128})

var buf strings.Builder
if err := WriteKittyImage(img, &buf); err != nil {
t.Fatalf("WriteKittyImage: %v", err)
}

out := buf.String()
if !strings.HasPrefix(out, "\x1b_Gf=100,a=T;") {
t.Error("expected single-chunk header with f=100,a=T")
}
if strings.Contains(out, "m=") {
t.Error("small image should not use chunked m= key")
}
if !strings.HasSuffix(out, "\x1b\\") {
t.Error("expected escape sequence terminator")
}
}

func TestWriteKittyImage_LargeImage_Chunked(t *testing.T) {
// 128x128 random noise image is incompressible — produces >4096 bytes of base64 even with proper LZ77
rng := rand.New(rand.NewSource(42))
img := image.NewGray(image.Rect(0, 0, 128, 128))
for y := range 128 {
for x := range 128 {
img.SetGray(x, y, color.Gray{Y: uint8(rng.Intn(256))})
}
}

var buf strings.Builder
if err := WriteKittyImage(img, &buf); err != nil {
t.Fatalf("WriteKittyImage: %v", err)
}

out := buf.String()

// Should have multiple escape sequences
chunks := strings.Split(out, "\x1b\\")
// Last element is empty after final terminator
chunks = chunks[:len(chunks)-1]

if len(chunks) < 2 {
t.Fatalf("expected multiple chunks, got %d", len(chunks))
}

// First chunk should have f=100,a=T,m=1
if !strings.Contains(chunks[0], "f=100,a=T,m=1") {
t.Errorf("first chunk missing f=100,a=T,m=1: %s", chunks[0][:min(80, len(chunks[0]))])
}

// Last chunk should have m=0
last := chunks[len(chunks)-1]
if !strings.Contains(last, "\x1b_Gm=0;") {
t.Errorf("last chunk missing m=0: %s", last[:min(80, len(last))])
}

// Middle chunks should have m=1
for i := 1; i < len(chunks)-1; i++ {
if !strings.Contains(chunks[i], "\x1b_Gm=1;") {
t.Errorf("middle chunk %d missing m=1", i)
}
}
}

func TestClearKittyImages(t *testing.T) {
var buf strings.Builder
ClearKittyImages(&buf)
expected := "\x1b_Ga=d\x1b\\"
if buf.String() != expected {
t.Errorf("got %q, want %q", buf.String(), expected)
}
}

func TestWriteSixelImage(t *testing.T) {
img := image.NewGray(image.Rect(0, 0, 4, 6))
for y := range 6 {
for x := range 4 {
img.SetGray(x, y, color.Gray{Y: uint8((x + y) * 40)})
}
}

var buf strings.Builder
if err := WriteSixelImage(img, &buf); err != nil {
t.Fatalf("WriteSixelImage: %v", err)
}

out := buf.String()

// Sixel DCS introducer
if !strings.HasPrefix(out, "\x1bP") {
t.Error("expected DCS prefix \\x1bP")
}

// String terminator
if !strings.HasSuffix(out, "\x1b\\") {
t.Error("expected ST suffix \\x1b\\\\")
}

// Should contain 'q' after DCS parameters
if !strings.Contains(out, "q") {
t.Error("expected 'q' in DCS sequence")
}
}

func TestClearImages_Kitty(t *testing.T) {
var buf strings.Builder
ClearImages(&buf, ProtocolKitty)
if buf.String() != "\x1b_Ga=d\x1b\\" {
t.Errorf("got %q, want kitty clear sequence", buf.String())
}
}

func TestClearImages_Sixel(t *testing.T) {
var buf strings.Builder
ClearImages(&buf, ProtocolSixel)
if buf.String() != "" {
t.Errorf("expected no output for sixel clear, got %q", buf.String())
}
}

func TestWriteImage_Kitty(t *testing.T) {
img := image.NewGray(image.Rect(0, 0, 2, 2))
var buf strings.Builder
if err := WriteImage(img, &buf, ProtocolKitty); err != nil {
t.Fatalf("WriteImage kitty: %v", err)
}
if !strings.HasPrefix(buf.String(), "\x1b_G") {
t.Error("expected kitty escape prefix")
}
}

func TestWriteImage_Sixel(t *testing.T) {
img := image.NewGray(image.Rect(0, 0, 4, 6))
var buf strings.Builder
if err := WriteImage(img, &buf, ProtocolSixel); err != nil {
t.Fatalf("WriteImage sixel: %v", err)
}
if !strings.HasPrefix(buf.String(), "\x1bP") {
t.Error("expected sixel DCS prefix")
}
}

func TestWriteITermImage(t *testing.T) {
img := image.NewGray(image.Rect(0, 0, 4, 4))
img.SetGray(0, 0, color.Gray{Y: 128})

var buf strings.Builder
if err := WriteITermImage(img, &buf); err != nil {
t.Fatalf("WriteITermImage: %v", err)
}

out := buf.String()
if !strings.HasPrefix(out, "\x1b]1337;File=") {
t.Errorf("expected iTerm2 OSC prefix, got %q", out[:min(30, len(out))])
}
if !strings.Contains(out, "inline=1") {
t.Error("expected inline=1 parameter")
}
if !strings.HasSuffix(out, "\x07") {
t.Error("expected BEL terminator")
}
}

func TestWriteImage_ITerm(t *testing.T) {
img := image.NewGray(image.Rect(0, 0, 4, 4))
var buf strings.Builder
if err := WriteImage(img, &buf, ProtocolITerm); err != nil {
t.Fatalf("WriteImage iterm: %v", err)
}
if !strings.HasPrefix(buf.String(), "\x1b]1337;File=") {
t.Error("expected iTerm2 OSC prefix")
}
}

func TestClearImages_ITerm(t *testing.T) {
var buf strings.Builder
ClearImages(&buf, ProtocolITerm)
if buf.String() != "" {
t.Errorf("expected no output for iTerm2 clear, got %q", buf.String())
}
}
file addition: terminal_image.go (----------)

[0.1]

package utils

import (
"bytes"
"encoding/base64"
"image"
"image/color"
"image/png"
"io"

"github.com/charmbracelet/x/ansi"
"github.com/charmbracelet/x/ansi/iterm2"
"github.com/charmbracelet/x/ansi/kitty"
"github.com/charmbracelet/x/ansi/sixel"
)

// ImageProtocol selects the terminal graphics protocol.
type ImageProtocol int

const (
ProtocolKitty ImageProtocol = iota
ProtocolSixel
ProtocolITerm
)

// SpectrogramDisplaySize is the default pixel dimension for spectrogram images.
// 448px suits Retina/HiDPI screens (224 logical pixels at 2x).
const SpectrogramDisplaySize = 448

// ClampImageSize clamps a dimension to [224, 448].
func ClampImageSize(size int) int {
return max(224, min(896, size))
}

// WriteImage writes an image using the specified terminal graphics protocol.
func WriteImage(img image.Image, w io.Writer, protocol ImageProtocol) error {
switch protocol {
case ProtocolSixel:
return WriteSixelImage(img, w)
case ProtocolITerm:
return WriteITermImage(img, w)
default:
return WriteKittyImage(img, w)
}
}

// ClearImages clears previously displayed images.
// For kitty, deletes all image placements. For sixel/iTerm2, no-op (inline text).
func ClearImages(w io.Writer, protocol ImageProtocol) error {
switch protocol {
case ProtocolKitty:
return ClearKittyImages(w)
default:
return nil
}
}

// ClearKittyImages clears all previously displayed Kitty images
func ClearKittyImages(w io.Writer) error {
_, err := io.WriteString(w, ansi.KittyGraphics(nil, "a=d"))
return err
}

// WriteKittyImage writes an image to the writer using the Kitty graphics protocol.
// The image is encoded as PNG, base64'd, and sent via chunked Kitty escape sequences.
func WriteKittyImage(img image.Image, w io.Writer) error {
return kitty.EncodeGraphics(w, img, &kitty.Options{
Format: kitty.PNG,
Action: kitty.TransmitAndPut,
Transmission: kitty.Direct,
Chunk: true,
})
}

// WriteSixelImage writes an image using the Sixel graphics protocol.
func WriteSixelImage(img image.Image, w io.Writer) error {
var buf bytes.Buffer
enc := &sixel.Encoder{}
if err := enc.Encode(&buf, img); err != nil {
return err
}
_, err := io.WriteString(w, ansi.SixelGraphics(0, 1, 0, buf.Bytes()))
return err
}

// WriteITermImage writes an image using the iTerm2 Inline Image Protocol.
func WriteITermImage(img image.Image, w io.Writer) error {
var buf bytes.Buffer
if err := png.Encode(&buf, img); err != nil {
return err
}
b64 := base64.StdEncoding.EncodeToString(buf.Bytes())
_, err := io.WriteString(w, ansi.ITerm2(iterm2.File{
Inline: true,
Content: []byte(b64),
}))
return err
}

// CreateGrayscaleImage creates an image.Image from a 2D uint8 array.
// The array is organized as [rows][cols] where rows = frequency bins.
func CreateGrayscaleImage(data [][]uint8) image.Image {
if len(data) == 0 || len(data[0]) == 0 {
return nil
}

height := len(data)
width := len(data[0])

img := image.NewGray(image.Rect(0, 0, width, height))

for y := range height {
off := y * img.Stride
row := data[y]
copy(img.Pix[off:off+width], row)
}

return img
}

// CreateRGBImage creates an image.Image from a 2D RGBPixel array.
// The array is organized as [rows][cols] where rows = frequency bins.
func CreateRGBImage(data [][]RGBPixel) image.Image {
if len(data) == 0 || len(data[0]) == 0 {
return nil
}

height := len(data)
width := len(data[0])

img := image.NewRGBA(image.Rect(0, 0, width, height))

for y := range height {
off := y * img.Stride
row := data[y]
for x := range width {
i := off + x*4
img.Pix[i] = row[x].R
img.Pix[i+1] = row[x].G
img.Pix[i+2] = row[x].B
img.Pix[i+3] = 255
}
}

return img
}

// ResizeImage resizes an image using nearest-neighbor interpolation.
// For higher quality, use golang.org/x/image/draw, but this keeps dependencies minimal.
func ResizeImage(img image.Image, newWidth, newHeight int) image.Image {
bounds := img.Bounds()
srcWidth := bounds.Dx()
srcHeight := bounds.Dy()

scaleX := float64(srcWidth) / float64(newWidth)
scaleY := float64(srcHeight) / float64(newHeight)

if srcGray, ok := img.(*image.Gray); ok {
result := image.NewGray(image.Rect(0, 0, newWidth, newHeight))
for y := range newHeight {
srcY := int(float64(y) * scaleY)
if srcY >= srcHeight {
srcY = srcHeight - 1
}
dstOff := y * result.Stride
srcRowOff := srcY * srcGray.Stride
for x := range newWidth {
srcX := int(float64(x) * scaleX)
if srcX >= srcWidth {
srcX = srcWidth - 1
}
result.Pix[dstOff+x] = srcGray.Pix[srcRowOff+srcX]
}
}
return result
}

if srcRGBA, ok := img.(*image.RGBA); ok {
result := image.NewRGBA(image.Rect(0, 0, newWidth, newHeight))
for y := range newHeight {
srcY := int(float64(y) * scaleY)
if srcY >= srcHeight {
srcY = srcHeight - 1
}
dstOff := y * result.Stride
srcRowOff := srcY * srcRGBA.Stride
for x := range newWidth {
srcX := int(float64(x) * scaleX)
if srcX >= srcWidth {
srcX = srcWidth - 1
}
si := srcRowOff + srcX*4
di := dstOff + x*4
result.Pix[di] = srcRGBA.Pix[si]
result.Pix[di+1] = srcRGBA.Pix[si+1]
result.Pix[di+2] = srcRGBA.Pix[si+2]
result.Pix[di+3] = srcRGBA.Pix[si+3]
}
}
return result
}

// Fallback for other image types
result := image.NewRGBA(image.Rect(0, 0, newWidth, newHeight))
for y := range newHeight {
srcY := int(float64(y) * scaleY)
if srcY >= srcHeight {
srcY = srcHeight - 1
}
for x := range newWidth {
srcX := int(float64(x) * scaleX)
if srcX >= srcWidth {
srcX = srcWidth - 1
}
c := img.At(srcX+bounds.Min.X, srcY+bounds.Min.Y)
r, g, b, _ := c.RGBA()
result.SetRGBA(x, y, color.RGBA{
R: uint8(r >> 8),
G: uint8(g >> 8),
B: uint8(b >> 8),
A: 255,
})
}
}
return result
}

// WritePNG writes an image to a writer in PNG format using fast compression.
func WritePNG(img image.Image, w io.Writer) error {
enc := &png.Encoder{CompressionLevel: png.BestSpeed}
return enc.Encode(w, img)
}
file addition: spectrogram.go (----------)

[0.1]

package utils

import (
"image"
"math"
"strings"
"sync"

"github.com/madelynnblue/go-dsp/window"
)

// cached Hann windows by size, computed once
var (
hannCache = map[int][]float64{}
hannCacheMu sync.RWMutex
)

// getCachedHannWindow returns a cached Hann window of the given size.
func getCachedHannWindow(size int) []float64 {
hannCacheMu.RLock()
if w, ok := hannCache[size]; ok {
hannCacheMu.RUnlock()
return w
}
hannCacheMu.RUnlock()

hannCacheMu.Lock()
defer hannCacheMu.Unlock()
// Double-check after acquiring write lock
if w, ok := hannCache[size]; ok {
return w
}
w := window.Hann(size)
hannCache[size] = w
return w
}

// DefaultMaxSampleRate is the maximum sample rate for spectrograms.
// Higher sample rates are downsampled to this rate for better visualization.
const DefaultMaxSampleRate = 16000

// SpectrogramConfig holds STFT parameters
type SpectrogramConfig struct {
WindowSize int // FFT window size (e.g., 400)
HopSize int // Hop between windows (e.g., 200 for 50% overlap)
SampleRate int // Sample rate in Hz
}

// DefaultSpectrogramConfig returns default config matching Julia implementation
func DefaultSpectrogramConfig(sampleRate int) SpectrogramConfig {
return SpectrogramConfig{
WindowSize: 512,
HopSize: 256, // 50% overlap (window/2)
SampleRate: sampleRate,
}
}

// GenerateSpectrogram generates a spectrogram from audio samples.
// Returns a 2D array of uint8 (0-255) where:
// - First dimension is frequency bins (rows)
// - Second dimension is time frames (columns)
func GenerateSpectrogram(samples []float64, cfg SpectrogramConfig) [][]uint8 {
if len(samples) < cfg.WindowSize {
return nil
}

// Get cached Hann window
hannWindow := getCachedHannWindow(cfg.WindowSize)

// Calculate number of frames
numFrames := (len(samples)-cfg.WindowSize)/cfg.HopSize + 1
if numFrames <= 0 {
return nil
}

// Number of frequency bins (half of FFT due to symmetry)
numFreqBins := cfg.WindowSize/2 + 1

// Allocate power spectrum as flat backing slice (single allocation)
powerFlat := make([]float64, numFreqBins*numFrames)

// Pre-allocate scratch buffers (reused across all frames — zero allocs in loop)
frameData := make([]float64, cfg.WindowSize)
scratch := make([]complex128, cfg.WindowSize)
framePower := make([]float64, numFreqBins)

// Perform STFT
for frame := range numFrames {
start := frame * cfg.HopSize

// Extract and window the frame
for i := 0; i < cfg.WindowSize; i++ {
frameData[i] = samples[start+i] * hannWindow[i]
}

// Compute power spectrum via inline FFT (zero allocations)
PowerSpectrumFFT(frameData, framePower, scratch)

// Copy power into flat matrix (freq bins x time frames layout)
for bin := range numFreqBins {
powerFlat[bin*numFrames+frame] = framePower[bin]
}
}

// Fused normalization: replace zeros, convert to dB, find min/max, normalize to uint8
// All in 2 passes instead of 6
return normalizeFlat(powerFlat, numFreqBins, numFrames)
}

// normalizeFlat converts power values to dB, normalizes to 0-255, in 2 passes.
// Operates on a flat slice laid out as [row0_col0, row0_col1, ..., row1_col0, ...].
// Returns [][]uint8 with rows flipped vertically (low frequencies at bottom).
func normalizeFlat(power []float64, rows, cols int) [][]uint8 {
if rows == 0 || cols == 0 {
return nil
}

// Pass 1: find minNonZero, then convert power to dB in-place, tracking min/max dB
minNonZero := math.MaxFloat64
for _, val := range power {
if val > 0 && val < minNonZero {
minNonZero = val
}
}
if minNonZero == math.MaxFloat64 {
minNonZero = 1e-20 // fallback floor
}

minDB := math.MaxFloat64
maxDB := -math.MaxFloat64
for i, val := range power {
if val <= 0 {
val = minNonZero
}
db := 10.0 * math.Log10(val)
power[i] = db
if db < minDB {
minDB = db
}
if db > maxDB {
maxDB = db
}
}

// Pass 2: normalize dB to uint8 and write into result (with vertical flip)
rangeDB := maxDB - minDB
if rangeDB == 0 {
rangeDB = 1
}
scale := 255.0 / rangeDB

// Allocate result with flat backing slice (single allocation)
resultFlat := make([]uint8, rows*cols)
result := make([][]uint8, rows)
for i := range result {
// Flip: row i in result gets data from row (rows-1-i) in power
srcRow := rows - 1 - i
result[i] = resultFlat[i*cols : (i+1)*cols]
srcOff := srcRow * cols
for j := range cols {
result[i][j] = uint8((power[srcOff+j] - minDB) * scale)
}
}

return result
}

// ExtractSegmentSamples extracts samples from a time range
func ExtractSegmentSamples(samples []float64, sampleRate int, startSec, endSec float64) []float64 {
startIdx := int(startSec * float64(sampleRate))
endIdx := int(endSec * float64(sampleRate))

if startIdx < 0 {
startIdx = 0
}
if endIdx > len(samples) {
endIdx = len(samples)
}
if startIdx >= endIdx {
return nil
}

return samples[startIdx:endIdx]
}

// GenerateSegmentSpectrogram generates a spectrogram image for a time segment.
// Handles WAV loading, downsampling, and image creation.
// color=true applies L4 colormap, color=false creates grayscale.
// imgSize specifies the output image dimensions (clamped to [224, 896]).
func GenerateSegmentSpectrogram(dataFilePath string, startTime, endTime float64, color bool, imgSize int) (image.Image, error) {
// Derive WAV file path (strip .data suffix)
wavPath := strings.TrimSuffix(dataFilePath, ".data")

// Read WAV samples
samples, sampleRate, err := ReadWAVSamples(wavPath)
if err != nil {
return nil, err
}

// Extract segment samples
segSamples := ExtractSegmentSamples(samples, sampleRate, startTime, endTime)
if len(segSamples) == 0 {
return nil, nil
}

// For spectrograms, downsample if sample rate exceeds 16kHz
spectSampleRate := sampleRate
if sampleRate > DefaultMaxSampleRate {
segSamples = ResampleRate(segSamples, sampleRate, DefaultMaxSampleRate)
spectSampleRate = DefaultMaxSampleRate
}

// Generate spectrogram
config := DefaultSpectrogramConfig(spectSampleRate)
spectrogram := GenerateSpectrogram(segSamples, config)
if spectrogram == nil {
return nil, nil
}

// Create image (grayscale or color)
var img image.Image
if color {
colorData := ApplyL4Colormap(spectrogram)
img = CreateRGBImage(colorData)
} else {
img = CreateGrayscaleImage(spectrogram)
}
if img == nil {
return nil, nil
}

// Resize
imgSize = ClampImageSize(imgSize)
return ResizeImage(img, imgSize, imgSize), nil
}
file addition: resample_test.go (----------)

[0.1]

package utils

import (
"math"
"testing"
)

func TestResampleRate(t *testing.T) {
t.Run("should return same samples for same rate", func(t *testing.T) {
samples := []float64{0.1, 0.2, 0.3, 0.4, 0.5}
result := ResampleRate(samples, 16000, 16000)
if len(result) != len(samples) {
t.Errorf("length mismatch: got %d, want %d", len(result), len(samples))
}
for i := range samples {
if result[i] != samples[i] {
t.Errorf("sample %d mismatch: got %f, want %f", i, result[i], samples[i])
}
}
})

t.Run("should downsample from 250000 to 16000", func(t *testing.T) {
// 250000 / 16000 = 15.625 ratio
samples := make([]float64, 2500) // 0.01 seconds at 250kHz
for i := range samples {
samples[i] = float64(i) / float64(len(samples))
}
result := ResampleRate(samples, 250000, 16000)
expectedLen := 160 // 0.01 seconds at 16kHz
if len(result) != expectedLen {
t.Errorf("length mismatch: got %d, want %d", len(result), expectedLen)
}
})

t.Run("should downsample from 44100 to 16000", func(t *testing.T) {
// 44100 / 16000 = 2.75625 ratio
samples := make([]float64, 441) // 0.01 seconds at 44.1kHz
for i := range samples {
samples[i] = float64(i) / float64(len(samples))
}
result := ResampleRate(samples, 44100, 16000)
expectedLen := 160 // 0.01 seconds at 16kHz
if len(result) != expectedLen {
t.Errorf("length mismatch: got %d, want %d", len(result), expectedLen)
}
})

t.Run("should preserve signal shape", func(t *testing.T) {
// Create a simple ramp signal
samples := []float64{0.0, 0.25, 0.5, 0.75, 1.0}
result := ResampleRate(samples, 50000, 16000)
// Should still be a roughly increasing signal
for i := 1; i < len(result); i++ {
if result[i] < result[i-1]-0.1 {
t.Errorf("signal not preserved: result[%d]=%f < result[%d]=%f", i, result[i], i-1, result[i-1])
}
}
})

t.Run("should handle empty samples", func(t *testing.T) {
result := ResampleRate([]float64{}, 44100, 16000)
if len(result) != 0 {
t.Errorf("expected empty result, got %d samples", len(result))
}
})
}

func TestResample(t *testing.T) {
t.Run("should return same samples for speed 1.0", func(t *testing.T) {
samples := []float64{0.1, 0.2, 0.3, 0.4, 0.5}
result := Resample(samples, 1.0)
if len(result) != len(samples) {
t.Errorf("length mismatch: got %d, want %d", len(result), len(samples))
}
for i := range samples {
if result[i] != samples[i] {
t.Errorf("sample %d mismatch: got %f, want %f", i, result[i], samples[i])
}
}
})

t.Run("should double samples for half speed", func(t *testing.T) {
samples := []float64{0.0, 1.0, 0.0, -1.0, 0.0}
result := Resample(samples, 0.5)
// Half speed = 2x more samples
expectedLen := len(samples) * 2
if len(result) != expectedLen {
t.Errorf("length mismatch: got %d, want %d", len(result), expectedLen)
}
})

t.Run("should halve samples for double speed", func(t *testing.T) {
samples := []float64{0.0, 0.5, 1.0, 0.5, 0.0, -0.5, -1.0, -0.5, 0.0}
result := Resample(samples, 2.0)
// Double speed = half the samples
expectedLen := len(samples) / 2
if len(result) != expectedLen {
t.Errorf("length mismatch: got %d, want %d", len(result), expectedLen)
}
})

t.Run("should use linear interpolation", func(t *testing.T) {
// With samples [0, 1], half-speed should interpolate to [0, 0.5, 1]
samples := []float64{0.0, 1.0}
result := Resample(samples, 0.5)
// Expected: 4 samples (2 / 0.5 = 4)
if len(result) != 4 {
t.Errorf("length mismatch: got %d, want 4", len(result))
}
// Check interpolation: index 1 should be ~0.5 (midpoint)
expected := 0.5
if math.Abs(result[1]-expected) > 0.01 {
t.Errorf("interpolated value mismatch: got %f, want ~%f", result[1], expected)
}
})

t.Run("should handle empty samples", func(t *testing.T) {
result := Resample([]float64{}, 0.5)
if len(result) != 0 {
t.Errorf("expected empty result, got %d samples", len(result))
}
})

t.Run("should handle single sample", func(t *testing.T) {
samples := []float64{0.5}
result := Resample(samples, 0.5)
// 1 / 0.5 = 2 samples
if len(result) != 2 {
t.Errorf("length mismatch: got %d, want 2", len(result))
}
})
}

func TestResampleQuality(t *testing.T) {
t.Run("should preserve zero crossings", func(t *testing.T) {
// Sine wave: should have zero crossings at multiples of pi
sampleRate := 1000
samples := make([]float64, sampleRate)
for i := range samples {
samples[i] = math.Sin(2 * math.Pi * float64(i) / float64(sampleRate))
}

// Resample to half speed
result := Resample(samples, 0.5)

// First sample should still be ~0 (sine at 0)
if math.Abs(result[0]) > 0.01 {
t.Errorf("first sample not near zero: got %f", result[0])
}

// Peak should still be ~1.0 (sine max)
peakFound := false
for _, s := range result {
if math.Abs(s-1.0) < 0.1 {
peakFound = true
break
}
}
if !peakFound {
t.Error("peak not preserved in resampled signal")
}
})
}
file addition: resample.go (----------)

[0.1]

package utils

// ResampleRate converts samples from one sample rate to another using linear interpolation.
// This is used to downsample high sample rate audio for spectrogram visualization.
// fromRate: original sample rate (e.g., 250000)
// toRate: target sample rate (e.g., 16000)
func ResampleRate(samples []float64, fromRate, toRate int) []float64 {
if fromRate == toRate || len(samples) == 0 {
return samples
}

// Calculate ratio: toRate/fromRate (e.g., 16000/250000 = 0.064)
ratio := float64(toRate) / float64(fromRate)
newLen := int(float64(len(samples)) * ratio)
if newLen <= 0 {
return samples
}

result := make([]float64, newLen)

for i := range newLen {
// Source index in original samples (floating point)
srcIdx := float64(i) / ratio
idx0 := int(srcIdx)
idx1 := idx0 + 1

// Clamp to valid range
if idx0 >= len(samples) {
idx0 = len(samples) - 1
}
if idx1 >= len(samples) {
idx1 = len(samples) - 1
}

// Linear interpolation between adjacent samples
frac := srcIdx - float64(idx0)
result[i] = samples[idx0]*(1-frac) + samples[idx1]*frac
}

return result
}

// Resample changes playback speed using linear interpolation.
// speed > 1.0 = faster (fewer samples), speed < 1.0 = slower (more samples).
// For half-speed playback, use speed=0.5 which doubles the sample count.
func Resample(samples []float64, speed float64) []float64 {
if speed == 1.0 || len(samples) == 0 {
return samples
}

// Calculate new length: slower speed = more samples
newLen := int(float64(len(samples)) / speed)
if newLen <= 0 {
return samples
}

result := make([]float64, newLen)

for i := range newLen {
// Source index in original samples (floating point)
srcIdx := float64(i) * speed
idx0 := int(srcIdx)
idx1 := idx0 + 1

// Clamp to valid range
if idx0 >= len(samples) {
idx0 = len(samples) - 1
}
if idx1 >= len(samples) {
idx1 = len(samples) - 1
}

// Linear interpolation between adjacent samples
frac := srcIdx - float64(idx0)
result[i] = samples[idx0]*(1-frac) + samples[idx1]*frac
}

return result
}
file addition: path_normalization_test.go (----------)

[0.1]

package utils

import (
"testing"
)

func TestStripMountPoint(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
// macOS
{"macOS volume", "/Volumes/ExternalDrive/Audio", "ExternalDrive/Audio"},
{"macOS root volume", "/Volumes/Drive", "Drive"},

// Linux /media/ with username
{"Linux media mount", "/media/david/USB-Drive/Audio", "USB-Drive/Audio"},
{"Linux media different user", "/media/john/Backup/Audio", "Backup/Audio"},
{"Linux media Pomona", "/media/david/Pomona-4/Pomona/A05/2025-11-08", "Pomona-4/Pomona/A05/2025-11-08"},

// Linux /mnt/
{"Linux mnt mount", "/mnt/storage/Audio", "storage/Audio"},

// No mount point
{"Absolute no mount", "/home/user/Audio", "/home/user/Audio"},
{"Relative path", "./relative/path", "relative/path"},

// Edge cases
{"Root", "/", "/"},
{"Empty", "", "."},
{"Volumes only", "/Volumes/", "."},
{"Media with user only", "/media/david/", "."},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := StripMountPoint(tt.input)
if result != tt.expected {
t.Errorf("StripMountPoint(%q) = %q, want %q", tt.input, result, tt.expected)
}
})
}
}

func TestNormalizeFolderPath(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
// Full workflow
{"Linux media path", "/media/david/Pomona-4/Pomona/A05/2025-11-08/", "Pomona-4/Pomona/A05/2025-11-08"},
{"macOS volumes path", "/Volumes/Drive/Audio/Recordings/", "Drive/Audio/Recordings"},
{"Linux mnt path", "/mnt/storage/Audio/Files/", "storage/Audio/Files"},

// Trailing slashes handled
{"With trailing slash", "/media/david/USB/Audio/", "USB/Audio"},
{"Without trailing slash", "/media/david/USB/Audio", "USB/Audio"},

// Multiple levels
{"Deep nested path", "/media/david/Pomona-4/Level1/Level2/Level3/", "Pomona-4/Level1/Level2/Level3"},

// Edge cases
{"File at mount root", "/media/david/", "."},
{"Volumes with drive only", "/Volumes/Drive/", "Drive"},
{"Volumes drive no trailing slash", "/Volumes/Drive", "Drive"},
{"Root", "/", ""},
{"Empty", "", "."},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := NormalizeFolderPath(tt.input)
if result != tt.expected {
t.Errorf("NormalizeFolderPath(%q) = %q, want %q", tt.input, result, tt.expected)
}
})
}
}
file addition: path_normalization.go (----------)

[0.1]

package utils

import (
"path/filepath"
"runtime"
"strings"
)

// StripMountPoint removes OS-specific mount point prefixes from a path
func StripMountPoint(absPath string) string {
// Clean path first
absPath = filepath.Clean(absPath)

// Handle Windows drive letters
if runtime.GOOS == "windows" {
volumeName := filepath.VolumeName(absPath)
if volumeName != "" {
// Remove "C:\" and return rest
return strings.TrimPrefix(absPath, volumeName+string(filepath.Separator))
}
}

// Handle Unix-like mount points
switch {
case absPath == "/Volumes":
// Exact match to mount point root
return "."

case strings.HasPrefix(absPath, "/Volumes/"):
// macOS external volumes: /Volumes/Drive/... → Drive/...
return strings.TrimPrefix(absPath, "/Volumes/")

case strings.HasPrefix(absPath, "/media/"):
// Linux user mounts: /media/username/Drive/... → Drive/...
// Strip /media/ and the username directory
pathAfterMedia := strings.TrimPrefix(absPath, "/media/")
parts := strings.SplitN(pathAfterMedia, string(filepath.Separator), 2)
if len(parts) > 1 {
return parts[1] // Return everything after username
}
// Just username, no subdirectory (e.g., /media/david)
return "."

case strings.HasPrefix(absPath, "/mnt/"):
// Linux system mounts: /mnt/storage/... → storage/...
return strings.TrimPrefix(absPath, "/mnt/")
}

// No known mount point detected, return as-is
return absPath
}

// NormalizeFolderPath strips mount points and cleans up a folder path
// Unlike a file path normalization, this expects a directory path
func NormalizeFolderPath(folderPath string) string {
// Clean the path
folderPath = filepath.Clean(folderPath)

// Strip mount point
relativePath := StripMountPoint(folderPath)

// Clean up leading/trailing slashes
relativePath = strings.Trim(relativePath, string(filepath.Separator))

return relativePath
}
file addition: nanoid_test.go (----------)

[0.1]

package utils

import (
"regexp"
"testing"
)

func TestGenerateShortID(t *testing.T) {
// Test that it generates a 12-character ID
id, err := GenerateShortID()
if err != nil {
t.Fatalf("GenerateShortID() error = %v", err)
}

if len(id) != 12 {
t.Errorf("GenerateShortID() length = %d, want 12", len(id))
}

// Verify it only contains valid alphabet characters
// Default nanoid alphabet uses A-Za-z0-9_- symbols (64 characters)
validPattern := regexp.MustCompile(`^[0-9A-Za-z_-]{12}$`)
if !validPattern.MatchString(id) {
t.Errorf("GenerateShortID() = %q, contains invalid characters", id)
}

// Test uniqueness - generate multiple IDs and check they're different
ids := make(map[string]bool)
for i := range 100 {
id, err := GenerateShortID()
if err != nil {
t.Fatalf("GenerateShortID() iteration %d error = %v", i, err)
}
if ids[id] {
t.Errorf("GenerateShortID() produced duplicate: %q", id)
}
ids[id] = true
}
}

func TestGenerateLongID(t *testing.T) {
// Test that it generates a 21-character ID
id, err := GenerateLongID()
if err != nil {
t.Fatalf("GenerateLongID() error = %v", err)
}

if len(id) != 21 {
t.Errorf("GenerateLongID() length = %d, want 21", len(id))
}

// Verify it only contains valid alphabet characters
// Default nanoid alphabet uses A-Za-z0-9_- symbols (64 characters)
validPattern := regexp.MustCompile(`^[0-9A-Za-z_-]{21}$`)
if !validPattern.MatchString(id) {
t.Errorf("GenerateLongID() = %q, contains invalid characters", id)
}

// Test uniqueness - generate multiple IDs and check they're different
ids := make(map[string]bool)
for i := range 100 {
id, err := GenerateLongID()
if err != nil {
t.Fatalf("GenerateLongID() iteration %d error = %v", i, err)
}
if ids[id] {
t.Errorf("GenerateLongID() produced duplicate: %q", id)
}
ids[id] = true
}
}

func TestIDsAreDifferent(t *testing.T) {
// Verify that short and long IDs are different types
shortID, err := GenerateShortID()
if err != nil {
t.Fatalf("GenerateShortID() error = %v", err)
}

longID, err := GenerateLongID()
if err != nil {
t.Fatalf("GenerateLongID() error = %v", err)
}

if len(shortID) == len(longID) {
t.Error("Short and long IDs should have different lengths")
}

if len(shortID) != 12 {
t.Errorf("Short ID length = %d, want 12", len(shortID))
}

if len(longID) != 21 {
t.Errorf("Long ID length = %d, want 21", len(longID))
}
}
file addition: nanoid.go (----------)

[0.1]

package utils

import (
gonanoid "github.com/matoous/go-nanoid/v2"
)

// GenerateShortID generates a 12-character nanoid using the full alphabet
// Used for: dataset_id, location_id, cluster_id, pattern_id
// Entropy: ~71 bits (62^12 ≈ 3.2×10^21 combinations)
func GenerateShortID() (string, error) {
return gonanoid.New(12)
}

// GenerateLongID generates a 21-character nanoid using the full alphabet
// Used for: file_id, segment_id, label_id
// Entropy: ~125 bits (62^21 ≈ 2.7×10^37 combinations)
func GenerateLongID() (string, error) {
return gonanoid.New(21)
}
file addition: mapping_test.go (----------)

[0.1]

package utils

import (
"os"
"path/filepath"
"testing"
)

func TestLoadMappingFile(t *testing.T) {
t.Run("valid mapping", func(t *testing.T) {
content := `{
"GSK": {"species": "Roroa", "calltypes": {"Male": "Male - Solo"}},
"Don't Know": {"species": "Don't Know"}
}`
path := createTempFile(t, content)
defer os.Remove(path)

mapping, err := LoadMappingFile(path)
if err != nil {
t.Fatalf("expected no error, got: %v", err)
}
if len(mapping) != 2 {
t.Errorf("expected 2 entries, got %d", len(mapping))
}
if mapping["GSK"].Species != "Roroa" {
t.Errorf("expected GSK -> Roroa, got %s", mapping["GSK"].Species)
}
if mapping["GSK"].Calltypes["Male"] != "Male - Solo" {
t.Errorf("expected GSK Male -> Male - Solo, got %s", mapping["GSK"].Calltypes["Male"])
}
})

t.Run("invalid JSON", func(t *testing.T) {
content := `{invalid json}`
path := createTempFile(t, content)
defer os.Remove(path)

_, err := LoadMappingFile(path)
if err == nil {
t.Fatal("expected error for invalid JSON")
}
})

t.Run("empty file", func(t *testing.T) {
content := `{}`
path := createTempFile(t, content)
defer os.Remove(path)

_, err := LoadMappingFile(path)
if err == nil {
t.Fatal("expected error for empty mapping")
}
})

t.Run("missing species field", func(t *testing.T) {
content := `{"GSK": {"calltypes": {"Male": "Male - Solo"}}}`
path := createTempFile(t, content)
defer os.Remove(path)

_, err := LoadMappingFile(path)
if err == nil {
t.Fatal("expected error for missing species field")
}
})

t.Run("empty species field", func(t *testing.T) {
content := `{"GSK": {"species": ""}}`
path := createTempFile(t, content)
defer os.Remove(path)

_, err := LoadMappingFile(path)
if err == nil {
t.Fatal("expected error for empty species field")
}
})

t.Run("nonexistent file", func(t *testing.T) {
_, err := LoadMappingFile("/nonexistent/path/mapping.json")
if err == nil {
t.Fatal("expected error for nonexistent file")
}
})
}

func TestGetDBSpecies(t *testing.T) {
mapping := MappingFile{
"GSK": {Species: "Roroa"},
"K-M": {Species: "Kiwi"},
}

t.Run("found", func(t *testing.T) {
species, ok := mapping.GetDBSpecies("GSK")
if !ok {
t.Fatal("expected to find GSK")
}
if species != "Roroa" {
t.Errorf("expected Roroa, got %s", species)
}
})

t.Run("not found", func(t *testing.T) {
_, ok := mapping.GetDBSpecies("UNKNOWN")
if ok {
t.Fatal("expected not to find UNKNOWN")
}
})
}

func TestGetDBCalltype(t *testing.T) {
mapping := MappingFile{
"GSK": {
Species: "Roroa",
Calltypes: map[string]string{
"Male": "Male - Solo",
"Female": "Female - Solo",
},
},
"K-M": {Species: "Kiwi"}, // no calltype mapping
}

t.Run("with mapping", func(t *testing.T) {
ct := mapping.GetDBCalltype("GSK", "Male")
if ct != "Male - Solo" {
t.Errorf("expected 'Male - Solo', got %s", ct)
}
})

t.Run("without mapping - passthrough", func(t *testing.T) {
ct := mapping.GetDBCalltype("GSK", "Unknown")
if ct != "Unknown" {
t.Errorf("expected passthrough 'Unknown', got %s", ct)
}
})

t.Run("species not in mapping - passthrough", func(t *testing.T) {
ct := mapping.GetDBCalltype("UNKNOWN", "Male")
if ct != "Male" {
t.Errorf("expected passthrough 'Male', got %s", ct)
}
})

t.Run("species without calltypes - passthrough", func(t *testing.T) {
ct := mapping.GetDBCalltype("K-M", "Male")
if ct != "Male" {
t.Errorf("expected passthrough 'Male', got %s", ct)
}
})
}

func TestMappingValidationResult(t *testing.T) {
t.Run("HasErrors - no errors", func(t *testing.T) {
r := MappingValidationResult{}
if r.HasErrors() {
t.Error("expected no errors")
}
})

t.Run("HasErrors - missing species", func(t *testing.T) {
r := MappingValidationResult{MissingSpecies: []string{"UNKNOWN"}}
if !r.HasErrors() {
t.Error("expected errors")
}
})

t.Run("HasErrors - missing DB species", func(t *testing.T) {
r := MappingValidationResult{MissingDBSpecies: []string{"Phantom"}}
if !r.HasErrors() {
t.Error("expected errors")
}
})

t.Run("HasErrors - missing calltypes", func(t *testing.T) {
r := MappingValidationResult{MissingCalltypes: map[string]string{"GSK/Male": "Roroa/Male - Solo"}}
if !r.HasErrors() {
t.Error("expected errors")
}
})

t.Run("Error - all error types", func(t *testing.T) {
r := MappingValidationResult{
MissingSpecies: []string{"UNKNOWN"},
MissingDBSpecies: []string{"Phantom"},
MissingCalltypes: map[string]string{"GSK/Male": "Roroa/Male - Solo"},
}
errStr := r.Error()
if errStr == "" {
t.Error("expected non-empty error string")
}
// Check all parts are present
if !containsSubstring(errStr, "UNKNOWN") {
t.Error("error string should contain MISSING species")
}
if !containsSubstring(errStr, "Phantom") {
t.Error("error string should contain missing DB species")
}
if !containsSubstring(errStr, "GSK/Male") {
t.Error("error string should contain missing calltype")
}
})
}

// Helper functions

func createTempFile(t *testing.T, content string) string {
t.Helper()
tmpDir := t.TempDir()
path := filepath.Join(tmpDir, "mapping.json")
if err := os.WriteFile(path, []byte(content), 0644); err != nil {
t.Fatalf("failed to create temp file: %v", err)
}
return path
}

func containsSubstring(s, substr string) bool {
return len(s) >= len(substr) && (s == substr || len(s) > 0 && containsSubstringHelper(s, substr))
}

func containsSubstringHelper(s, substr string) bool {
for i := 0; i <= len(s)-len(substr); i++ {
if s[i:i+len(substr)] == substr {
return true
}
}
return false
}
file addition: mapping.go (----------)

[0.1]

package utils

import (
"database/sql"
"encoding/json"
"fmt"
"os"
"sort"
"strings"
)

// SpeciesMapping maps .data species/calltype names to DB labels
type SpeciesMapping struct {
Species string `json:"species"`
Calltypes map[string]string `json:"calltypes,omitempty"`
}

// MappingFile represents the complete mapping file structure
// Key is the .data file species name
type MappingFile map[string]SpeciesMapping

// LoadMappingFile loads and parses a mapping JSON file
func LoadMappingFile(path string) (MappingFile, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("failed to read mapping file: %w", err)
}

var mapping MappingFile
if err := json.Unmarshal(data, &mapping); err != nil {
return nil, fmt.Errorf("failed to parse mapping JSON: %w", err)
}

// Validate non-empty
if len(mapping) == 0 {
return nil, fmt.Errorf("mapping file is empty")
}

// Validate each entry has species
for dataSpecies, sm := range mapping {
if sm.Species == "" {
return nil, fmt.Errorf("mapping entry '%s' has empty species field", dataSpecies)
}
}

return mapping, nil
}

// MappingValidationResult contains validation errors for a mapping
type MappingValidationResult struct {
MissingSpecies []string // .data species not in mapping
MissingDBSpecies []string // mapped species not in DB
MissingCalltypes map[string]string // "dataSpecies/dataCalltype" -> "dbSpecies/dbCalltype"
}

// HasErrors returns true if any validation errors exist
func (r MappingValidationResult) HasErrors() bool {
return len(r.MissingSpecies) > 0 ||
len(r.MissingDBSpecies) > 0 ||
len(r.MissingCalltypes) > 0
}

// Error returns a formatted error message
func (r MappingValidationResult) Error() string {
var parts []string

if len(r.MissingSpecies) > 0 {
parts = append(parts, fmt.Sprintf("species in .data but not in mapping: [%s]",
strings.Join(r.MissingSpecies, ", ")))
}

if len(r.MissingDBSpecies) > 0 {
parts = append(parts, fmt.Sprintf("mapped species not found in DB: [%s]",
strings.Join(r.MissingDBSpecies, ", ")))
}

if len(r.MissingCalltypes) > 0 {
var ctErrors []string
for k, v := range r.MissingCalltypes {
ctErrors = append(ctErrors, fmt.Sprintf("%s->%s", k, v))
}
sort.Strings(ctErrors)
parts = append(parts, fmt.Sprintf("calltypes not found in DB: [%s]",
strings.Join(ctErrors, ", ")))
}

return strings.Join(parts, "; ")
}

// ValidateMappingAgainstDB validates that all mapped species and calltypes exist in the database
// Also validates that the mapping covers all species/calltypes found in .data files
func ValidateMappingAgainstDB(
db *sql.DB,
mapping MappingFile,
dataSpeciesSet map[string]bool,
dataCalltypes map[string]map[string]bool, // species -> calltype -> true
) (MappingValidationResult, error) {
result := MappingValidationResult{
MissingSpecies: make([]string, 0),
MissingDBSpecies: make([]string, 0),
MissingCalltypes: make(map[string]string),
}

// Check all .data species are in mapping
for species := range dataSpeciesSet {
if _, exists := mapping[species]; !exists {
result.MissingSpecies = append(result.MissingSpecies, species)
}
}
sort.Strings(result.MissingSpecies)

// Collect all mapped species and calltypes
mappedSpeciesSet := make(map[string]bool)
mappedCalltypes := make(map[string]map[string]string) // dbSpecies -> dbCalltype -> dataCalltype

for _, sm := range mapping {
mappedSpeciesSet[sm.Species] = true

// Track calltype mappings
if len(sm.Calltypes) > 0 {
if mappedCalltypes[sm.Species] == nil {
mappedCalltypes[sm.Species] = make(map[string]string)
}
for dataCT, dbCT := range sm.Calltypes {
mappedCalltypes[sm.Species][dbCT] = dataCT
}
}
}

// Also collect unmapped calltypes (where .data calltype = DB calltype)
for dataSpecies, ctSet := range dataCalltypes {
sm, exists := mapping[dataSpecies]
if !exists {
continue // Already reported as missing species
}
dbSpecies := sm.Species

for dataCT := range ctSet {
// If no explicit mapping, assume dataCT == dbCT
dbCT := dataCT
if sm.Calltypes != nil {
if mapped, ok := sm.Calltypes[dataCT]; ok {
dbCT = mapped
}
}

if mappedCalltypes[dbSpecies] == nil {
mappedCalltypes[dbSpecies] = make(map[string]string)
}
mappedCalltypes[dbSpecies][dbCT] = dataCT
}
}

// Validate species exist in DB
speciesLabels := make([]string, 0, len(mappedSpeciesSet))
for s := range mappedSpeciesSet {
speciesLabels = append(speciesLabels, s)
}
sort.Strings(speciesLabels)

if len(speciesLabels) > 0 {
query := `SELECT label FROM species WHERE label IN (` + Placeholders(len(speciesLabels)) + `) AND active = true`
args := make([]any, len(speciesLabels))
for i, s := range speciesLabels {
args[i] = s
}

rows, err := db.Query(query, args...)
if err != nil {
return result, fmt.Errorf("failed to query species: %w", err)
}
defer rows.Close()

foundSpecies := make(map[string]bool)
for rows.Next() {
var label string
if err := rows.Scan(&label); err == nil {
foundSpecies[label] = true
}
}

for _, s := range speciesLabels {
if !foundSpecies[s] {
result.MissingDBSpecies = append(result.MissingDBSpecies, s)
}
}
}

// Validate calltypes exist in DB
for dbSpecies, ctMap := range mappedCalltypes {
if len(ctMap) == 0 {
continue
}

ctLabels := make([]string, 0, len(ctMap))
for dbCT := range ctMap {
ctLabels = append(ctLabels, dbCT)
}
sort.Strings(ctLabels)

query := `
SELECT ct.label
FROM call_type ct
JOIN species s ON ct.species_id = s.id
WHERE s.label = ? AND ct.label IN (` + Placeholders(len(ctLabels)) + `) AND ct.active = true`
args := make([]any, 1+len(ctLabels))
args[0] = dbSpecies
for i, ct := range ctLabels {
args[1+i] = ct
}

rows, err := db.Query(query, args...)
if err != nil {
return result, fmt.Errorf("failed to query calltypes for species %s: %w", dbSpecies, err)
}
defer rows.Close()

foundCT := make(map[string]bool)
for rows.Next() {
var label string
if err := rows.Scan(&label); err == nil {
foundCT[label] = true
}
}

for dbCT, dataCT := range ctMap {
if !foundCT[dbCT] {
key := fmt.Sprintf("%s/%s", dbSpecies, dataCT)
value := fmt.Sprintf("%s/%s", dbSpecies, dbCT)
result.MissingCalltypes[key] = value
}
}
}

return result, nil
}

// GetDBSpecies returns the DB species label for a .data species
func (m MappingFile) GetDBSpecies(dataSpecies string) (string, bool) {
sm, exists := m[dataSpecies]
if !exists {
return "", false
}
return sm.Species, true
}

// GetDBCalltype returns the DB calltype label for a .data species/calltype
// Returns the dataCalltype unchanged if no mapping exists
func (m MappingFile) GetDBCalltype(dataSpecies, dataCalltype string) string {
sm, exists := m[dataSpecies]
if !exists || sm.Calltypes == nil {
return dataCalltype
}

if dbCT, ok := sm.Calltypes[dataCalltype]; ok {
return dbCT
}
return dataCalltype
}

// Mapping sentinels: special values for the SpeciesMapping.Species field.
//
// MappingNegative marks a .data species as "confirmed empty" (Noise-equivalent):
// segments matching this name are treated as negative evidence — clips overlapping
// them emit an all-zero row when no positive species also overlaps.
//
// MappingIgnore marks a .data species as "ignored entirely": segments matching
// this name neither label clips nor block them.
const (
MappingNegative = "__NEGATIVE__"
MappingIgnore = "__IGNORE__"
)

// MappingKind describes how a .data species should be treated.
type MappingKind int

const (
MappingReal MappingKind = iota
MappingNeg
MappingIgn
)

// Classify returns the canonical class name and kind for a .data species.
// ok is false if dataSpecies is not present in the mapping.
// For MappingNeg and MappingIgn the canonical string is empty.
func (m MappingFile) Classify(dataSpecies string) (canonical string, kind MappingKind, ok bool) {
sm, exists := m[dataSpecies]
if !exists {
return "", MappingReal, false
}
switch sm.Species {
case MappingNegative:
return "", MappingNeg, true
case MappingIgnore:
return "", MappingIgn, true
default:
return sm.Species, MappingReal, true
}
}

// ValidateCoversSpecies returns the sorted list of species in speciesSet that
// are missing from the mapping. Empty result means full coverage.
func (m MappingFile) ValidateCoversSpecies(speciesSet map[string]bool) []string {
missing := make([]string, 0)
for s := range speciesSet {
if _, exists := m[s]; !exists {
missing = append(missing, s)
}
}
sort.Strings(missing)
return missing
}

// Classes returns the sorted unique non-sentinel canonical class names from the mapping.
// Used to build the CSV column header for clip-labels.
func (m MappingFile) Classes() []string {
set := make(map[string]bool)
for _, sm := range m {
switch sm.Species {
case MappingNegative, MappingIgnore, "":
continue
default:
set[sm.Species] = true
}
}
out := make([]string, 0, len(set))
for s := range set {
out = append(out, s)
}
sort.Strings(out)
return out
}

// placeholders generates SQL placeholder string for IN clauses
func Placeholders(n int) string {
if n == 0 {
return ""
}
ph := make([]string, n)
for i := range ph {
ph[i] = "?"
}
return strings.Join(ph, ", ")
}
file addition: filename_parser_test.go (----------)

[0.1]

package utils

import (
"testing"
)

func TestParseFilenameTimestamps(t *testing.T) {
t.Run("should parse YYMMDD format (test case a)", func(t *testing.T) {
filenames := []string{
"201012_123456.wav",
"201014_123456.WAV",
"201217_123456.wav",
"211122_123456.WAV",
}

results, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

if len(results) != 4 {
t.Fatalf("Expected 4 results, got %d", len(results))
}

// Year 20 should be interpreted as 2020 (less variance than days)
if results[0].Timestamp.Year() != 2020 {
t.Errorf("Year incorrect for file 0: got %d, want 2020", results[0].Timestamp.Year())
}
if results[0].Timestamp.Month() != 10 { // October
t.Errorf("Month incorrect for file 0: got %d, want 10", results[0].Timestamp.Month())
}
if results[0].Timestamp.Day() != 12 {
t.Errorf("Day incorrect for file 0: got %d, want 12", results[0].Timestamp.Day())
}
if results[0].Timestamp.Hour() != 12 {
t.Errorf("Hour incorrect for file 0: got %d, want 12", results[0].Timestamp.Hour())
}
if results[0].Timestamp.Minute() != 34 {
t.Errorf("Minute incorrect for file 0: got %d, want 34", results[0].Timestamp.Minute())
}
if results[0].Timestamp.Second() != 56 {
t.Errorf("Second incorrect for file 0: got %d, want 56", results[0].Timestamp.Second())
}

if results[3].Timestamp.Year() != 2021 {
t.Errorf("Year incorrect for file 3: got %d, want 2021", results[3].Timestamp.Year())
}
if results[3].Timestamp.Month() != 11 { // November
t.Errorf("Month incorrect for file 3: got %d, want 11", results[3].Timestamp.Month())
}
if results[3].Timestamp.Day() != 22 {
t.Errorf("Day incorrect for file 3: got %d, want 22", results[3].Timestamp.Day())
}
})

t.Run("should parse DDMMYY format (test case b)", func(t *testing.T) {
filenames := []string{
"121020_123456.WAV",
"141020_123456.wav",
"171220_123456.WAV",
"221121_123456.wav",
}

results, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

if len(results) != 4 {
t.Fatalf("Expected 4 results, got %d", len(results))
}

// More variance in first two digits (12,14,17,22) than last two (20,20,20,21)
// So DDMMYY format: day=first, month=middle, year=last+2000
if results[0].Timestamp.Day() != 12 {
t.Errorf("Day incorrect for file 0: got %d, want 12", results[0].Timestamp.Day())
}
if results[0].Timestamp.Month() != 10 { // October
t.Errorf("Month incorrect for file 0: got %d, want 10", results[0].Timestamp.Month())
}
if results[0].Timestamp.Year() != 2020 {
t.Errorf("Year incorrect for file 0: got %d, want 2020", results[0].Timestamp.Year())
}

if results[2].Timestamp.Day() != 17 {
t.Errorf("Day incorrect for file 2: got %d, want 17", results[2].Timestamp.Day())
}
if results[2].Timestamp.Month() != 12 { // December
t.Errorf("Month incorrect for file 2: got %d, want 12", results[2].Timestamp.Month())
}
if results[2].Timestamp.Year() != 2020 {
t.Errorf("Year incorrect for file 2: got %d, want 2020", results[2].Timestamp.Year())
}
})

t.Run("should parse YYYYMMDD format (test case c)", func(t *testing.T) {
filenames := []string{
"20230609_103000.WAV",
"20241109_201504.wav",
}

results, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

if len(results) != 2 {
t.Fatalf("Expected 2 results, got %d", len(results))
}

if results[0].Timestamp.Year() != 2023 {
t.Errorf("Year incorrect: got %d, want 2023", results[0].Timestamp.Year())
}
if results[0].Timestamp.Month() != 6 { // June
t.Errorf("Month incorrect: got %d, want 6", results[0].Timestamp.Month())
}
if results[0].Timestamp.Day() != 9 {
t.Errorf("Day incorrect: got %d, want 9", results[0].Timestamp.Day())
}
if results[0].Timestamp.Hour() != 10 {
t.Errorf("Hour incorrect: got %d, want 10", results[0].Timestamp.Hour())
}
if results[0].Timestamp.Minute() != 30 {
t.Errorf("Minute incorrect: got %d, want 30", results[0].Timestamp.Minute())
}
if results[0].Timestamp.Second() != 0 {
t.Errorf("Second incorrect: got %d, want 0", results[0].Timestamp.Second())
}

if results[1].Timestamp.Year() != 2024 {
t.Errorf("Year incorrect: got %d, want 2024", results[1].Timestamp.Year())
}
})

t.Run("should parse mixed 6-digit dates with variance detection (test case d)", func(t *testing.T) {
filenames := []string{
"120119_003002.wav",
"180120_231502.wav",
"170122_010005.wav",
"010419_234502.WAV",
"310320_231502.wav",
"220824_231502.WAV",
"240123_231502.wav",
}

results, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

if len(results) != 7 {
t.Fatalf("Expected 7 results, got %d", len(results))
}

// First two digits: 12,18,17,01,31,22,24 (variance = high)
// Last two digits: 19,20,22,19,20,24,23 (variance = lower)
// Should be DDMMYY format
if results[0].Timestamp.Day() != 12 {
t.Errorf("Day incorrect: got %d, want 12", results[0].Timestamp.Day())
}
if results[0].Timestamp.Month() != 1 { // January
t.Errorf("Month incorrect: got %d, want 1", results[0].Timestamp.Month())
}
if results[0].Timestamp.Year() != 2019 {
t.Errorf("Year incorrect: got %d, want 2019", results[0].Timestamp.Year())
}

if results[4].Timestamp.Day() != 31 {
t.Errorf("Day incorrect for file 4: got %d, want 31", results[4].Timestamp.Day())
}
if results[4].Timestamp.Month() != 3 { // March
t.Errorf("Month incorrect for file 4: got %d, want 3", results[4].Timestamp.Month())
}
})

t.Run("should throw error for empty filename array", func(t *testing.T) {
_, err := ParseFilenameTimestamps([]string{})
if err == nil {
t.Error("Expected error for empty filename array")
}
if err != nil && err.Error() != "no filenames provided" {
t.Logf("Error message: %v", err)
}
})

t.Run("should throw error for filenames without date patterns", func(t *testing.T) {
_, err := ParseFilenameTimestamps([]string{"invalid_filename.wav"})
if err == nil {
t.Error("Expected error for filenames without date patterns")
}
})

t.Run("should parse filenames with prefixes (test case e)", func(t *testing.T) {
filenames := []string{
"XYZ123_7689_20230609_103000.WAV",
"string 20241109_201504.wav",
}

results, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

if len(results) != 2 {
t.Fatalf("Expected 2 results, got %d", len(results))
}

if results[0].Timestamp.Year() != 2023 {
t.Errorf("Year incorrect: got %d, want 2023", results[0].Timestamp.Year())
}
if results[0].Timestamp.Month() != 6 { // June
t.Errorf("Month incorrect: got %d, want 6", results[0].Timestamp.Month())
}
if results[0].Timestamp.Day() != 9 {
t.Errorf("Day incorrect: got %d, want 9", results[0].Timestamp.Day())
}
if results[0].Timestamp.Hour() != 10 {
t.Errorf("Hour incorrect: got %d, want 10", results[0].Timestamp.Hour())
}
if results[0].Timestamp.Minute() != 30 {
t.Errorf("Minute incorrect: got %d, want 30", results[0].Timestamp.Minute())
}
if results[0].Timestamp.Second() != 0 {
t.Errorf("Second incorrect: got %d, want 0", results[0].Timestamp.Second())
}

if results[1].Timestamp.Year() != 2024 {
t.Errorf("Year incorrect: got %d, want 2024", results[1].Timestamp.Year())
}
if results[1].Timestamp.Month() != 11 { // November
t.Errorf("Month incorrect: got %d, want 11", results[1].Timestamp.Month())
}
if results[1].Timestamp.Day() != 9 {
t.Errorf("Day incorrect: got %d, want 9", results[1].Timestamp.Day())
}
if results[1].Timestamp.Hour() != 20 {
t.Errorf("Hour incorrect: got %d, want 20", results[1].Timestamp.Hour())
}
if results[1].Timestamp.Minute() != 15 {
t.Errorf("Minute incorrect: got %d, want 15", results[1].Timestamp.Minute())
}
if results[1].Timestamp.Second() != 4 {
t.Errorf("Second incorrect: got %d, want 4", results[1].Timestamp.Second())
}
})

t.Run("should parse filenames with complex prefixes (test case f)", func(t *testing.T) {
filenames := []string{
"abcdefg__1234_180120_231502.wav",
"string 120119_003002.wav",
"ABCD EFG___170122_010005.wav",
"BHD_1234 010419_234502.WAV",
"cill xyz 310320_231502.wav",
"220824_231502.WAV",
"240123_231502.wav",
}

results, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

if len(results) != 7 {
t.Fatalf("Expected 7 results, got %d", len(results))
}

// Same pattern as test case d - should be DDMMYY
if results[0].Timestamp.Day() != 18 {
t.Errorf("Day incorrect: got %d, want 18", results[0].Timestamp.Day())
}
if results[0].Timestamp.Month() != 1 { // January
t.Errorf("Month incorrect: got %d, want 1", results[0].Timestamp.Month())
}
if results[0].Timestamp.Year() != 2020 {
t.Errorf("Year incorrect: got %d, want 2020", results[0].Timestamp.Year())
}
if results[0].Timestamp.Hour() != 23 {
t.Errorf("Hour incorrect: got %d, want 23", results[0].Timestamp.Hour())
}
if results[0].Timestamp.Minute() != 15 {
t.Errorf("Minute incorrect: got %d, want 15", results[0].Timestamp.Minute())
}
if results[0].Timestamp.Second() != 2 {
t.Errorf("Second incorrect: got %d, want 2", results[0].Timestamp.Second())
}

if results[1].Timestamp.Day() != 12 {
t.Errorf("Day incorrect: got %d, want 12", results[1].Timestamp.Day())
}
if results[1].Timestamp.Month() != 1 { // January
t.Errorf("Month incorrect: got %d, want 1", results[1].Timestamp.Month())
}
if results[1].Timestamp.Year() != 2019 {
t.Errorf("Year incorrect: got %d, want 2019", results[1].Timestamp.Year())
}

if results[4].Timestamp.Day() != 31 {
t.Errorf("Day incorrect: got %d, want 31", results[4].Timestamp.Day())
}
if results[4].Timestamp.Month() != 3 { // March
t.Errorf("Month incorrect: got %d, want 3", results[4].Timestamp.Month())
}
if results[4].Timestamp.Year() != 2020 {
t.Errorf("Year incorrect: got %d, want 2020", results[4].Timestamp.Year())
}
})

t.Run("should throw error for mixed date formats", func(t *testing.T) {
mixedFormats := []string{"201012_123456.wav", "20231012_123456.wav"} // 6-digit vs 8-digit
_, err := ParseFilenameTimestamps(mixedFormats)
if err == nil {
t.Error("Expected error for mixed date formats")
}
})

t.Run("should throw error for wrong length patterns", func(t *testing.T) {
wrongLength := []string{"2010_123456.wav"} // 4 digits instead of 6 or 8
_, err := ParseFilenameTimestamps(wrongLength)
if err == nil {
t.Error("Expected error for wrong length patterns")
}
})

t.Run("should throw error when not enough files for 6-digit disambiguation", func(t *testing.T) {
singleFile := []string{"120119_003002.wav"}
_, err := ParseFilenameTimestamps(singleFile)
if err == nil {
t.Error("Expected error when not enough files for 6-digit disambiguation")
}
})
}

func TestApplyTimezoneOffset(t *testing.T) {
t.Run("should apply UTC timezone correctly", func(t *testing.T) {
filenames := []string{
"201012_123456.wav",
"201014_123456.WAV",
}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "UTC")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

if len(results) != 2 {
t.Fatalf("Expected 2 results, got %d", len(results))
}

// Check timezone offset is +00:00
_, offset := results[0].Zone()
if offset != 0 {
t.Errorf("UTC offset should be 0, got %d", offset)
}
})

t.Run("should use fixed offset for entire cluster spanning DST transition", func(t *testing.T) {
// Test files spanning the Auckland DST transition in April 2021
// DST ended on April 4, 2021 (UTC+13 -> UTC+12)
filenames := []string{
"20210401_120000.wav", // April 1st - DST still active (UTC+13)
"20210410_120000.wav", // April 10th - DST ended (would be UTC+12 if DST applied)
"20210420_120000.wav", // April 20th - Standard time (would be UTC+12 if DST applied)
}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "Pacific/Auckland")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

if len(results) != 3 {
t.Fatalf("Expected 3 results, got %d", len(results))
}

// All files should use the same offset (from April 1st - earliest file)
offsets := make([]int, len(results))
for i, r := range results {
_, offset := r.Zone()
offsets[i] = offset
}

// Check all offsets are the same
firstOffset := offsets[0]
for i, offset := range offsets {
if offset != firstOffset {
t.Errorf("File %d has different offset: got %d, want %d", i, offset, firstOffset)
}
}

// The offset should be UTC+13 (from the earliest file: April 1st)
expectedOffsetSeconds := 13 * 3600
if firstOffset != expectedOffsetSeconds {
t.Errorf("Offset incorrect: got %d seconds, want %d seconds (UTC+13)", firstOffset, expectedOffsetSeconds)
}

// Verify UTC conversion uses the fixed offset consistently
// All files at 12:00 local should convert to the same UTC hour (with UTC+13 offset)
// 12:00 Auckland time - 13 hours = 23:00 UTC previous day
for i, utcTime := range results {
utc := utcTime.UTC()
if utc.Hour() != 23 {
t.Errorf("File %d UTC hour incorrect: got %d, want 23", i, utc.Hour())
}
}
})

t.Run("should handle out-of-order filenames correctly", func(t *testing.T) {
// Files not in chronological order - should still use earliest file for offset
filenames := []string{
"20210410_120000.wav", // April 10th (later)
"20210401_120000.wav", // April 1st (earliest - should determine offset)
"20210405_120000.wav", // April 5th (middle)
}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "Pacific/Auckland")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

// All files should use UTC+13 offset (from April 1st, the earliest)
for i, r := range results {
_, offset := r.Zone()
expectedOffset := 13 * 3600
if offset != expectedOffset {
t.Errorf("File %d offset incorrect: got %d, want %d", i, offset, expectedOffset)
}
}

// Results should maintain original filename order
if results[0].Day() != 10 {
t.Errorf("Result 0 should be April 10th, got day %d", results[0].Day())
}
if results[1].Day() != 1 {
t.Errorf("Result 1 should be April 1st, got day %d", results[1].Day())
}
if results[2].Day() != 5 {
t.Errorf("Result 2 should be April 5th, got day %d", results[2].Day())
}
})

t.Run("should apply fixed offset consistently across large time spans", func(t *testing.T) {
// Test files spanning multiple months with different DST periods
filenames := []string{
"20210215_120000.wav", // February 15th (summer, UTC+13)
"20210615_120000.wav", // June 15th (winter, would be UTC+12 if DST applied)
"20210815_120000.wav", // August 15th (winter, would be UTC+12 if DST applied)
}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "Pacific/Auckland")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

// All files should use the same offset from the earliest file (February)
expectedOffset := 13 * 3600
for i, r := range results {
_, offset := r.Zone()
if offset != expectedOffset {
t.Errorf("File %d offset incorrect: got %d, want %d", i, offset, expectedOffset)
}
}

// Verify UTC conversion is consistent with fixed offset
for i, r := range results {
utc := r.UTC()
if utc.Hour() != 23 { // 12 - 13 = -1 hour (23:00 previous day)
t.Errorf("File %d UTC hour incorrect: got %d, want 23", i, utc.Hour())
}
}
})

t.Run("should handle US DST transitions with fixed offset", func(t *testing.T) {
// Test US spring DST transition (March 14, 2021)
filenames := []string{
"20210310_120000.wav", // March 10th - before DST (UTC-5)
"20210320_120000.wav", // March 20th - after DST (would be UTC-4 if DST applied)
}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "America/New_York")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

// All files should use the same offset from earliest file (March 10th)
expectedOffset := -5 * 3600
for i, r := range results {
_, offset := r.Zone()
if offset != expectedOffset {
t.Errorf("File %d offset incorrect: got %d, want %d", i, offset, expectedOffset)
}
}

// Verify UTC conversion uses fixed offset
for i, r := range results {
utc := r.UTC()
if utc.Hour() != 17 { // 12 + 5 = 17
t.Errorf("File %d UTC hour incorrect: got %d, want 17", i, utc.Hour())
}
}
})

t.Run("should handle empty timestamps array", func(t *testing.T) {
_, err := ApplyTimezoneOffset([]FilenameTimestamp{}, "UTC")
if err == nil {
t.Error("Expected error for empty timestamps array")
}
})

t.Run("should handle invalid timezone", func(t *testing.T) {
filenames := []string{"20210401_120000.wav"}
parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

_, err = ApplyTimezoneOffset(parsed, "Invalid/Timezone")
if err == nil {
t.Error("Expected error for invalid timezone")
}
})
}

func TestHasTimestampFilename(t *testing.T) {
testCases := []struct {
filename string
expected bool
}{
{"201012_123456.wav", true},
{"20230609_103000.WAV", true},
{"invalid_filename.wav", false},
{"201012_123456.txt", false},
{"201012.wav", false},
{"_123456.wav", false},
{"", false},
}

for _, tc := range testCases {
t.Run(tc.filename, func(t *testing.T) {
result := HasTimestampFilename(tc.filename)
if result != tc.expected {
t.Errorf("HasTimestampFilename(%q) = %v, want %v", tc.filename, result, tc.expected)
}
})
}
}

func TestFilenameParserEdgeCases(t *testing.T) {
t.Run("should handle case-insensitive file extensions", func(t *testing.T) {
filenames := []string{
"201012_123456.wav",
"201014_123456.WAV",
"201217_123456.Wav",
}

results, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

if len(results) != 3 {
t.Errorf("Expected 3 results, got %d", len(results))
}
})

t.Run("should validate invalid dates", func(t *testing.T) {
// 32nd day doesn't exist - should be caught by validation
filenames := []string{"20240132_120000.wav"}

_, err := ParseFilenameTimestamps(filenames)
if err == nil {
t.Error("Expected error for invalid date (day 32)")
}
})

t.Run("should validate invalid months", func(t *testing.T) {
// 13th month doesn't exist
filenames := []string{"20241301_120000.wav"}

_, err := ParseFilenameTimestamps(filenames)
if err == nil {
t.Error("Expected error for invalid month (13)")
}
})

t.Run("should handle February 29th in leap year", func(t *testing.T) {
filenames := []string{"20240229_120000.wav"} // 2024 is a leap year

results, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse leap year date: %v", err)
}

if results[0].Timestamp.Day() != 29 {
t.Errorf("Expected day 29, got %d", results[0].Timestamp.Day())
}
})

t.Run("should reject February 29th in non-leap year", func(t *testing.T) {
filenames := []string{"20230229_120000.wav"} // 2023 is not a leap year

_, err := ParseFilenameTimestamps(filenames)
if err == nil {
t.Error("Expected error for Feb 29th in non-leap year")
}
})
}

func TestUTCConversionCorrectness(t *testing.T) {
t.Run("should convert Pacific/Auckland night recordings correctly to UTC", func(t *testing.T) {
// Test a night recording: 21:00 (9 PM) Pacific/Auckland
// In May 2021, Pacific/Auckland is UTC+12 (standard time)
// So 21:00 Pacific/Auckland should become 09:00 UTC same day
filenames := []string{"20210505_210000.wav"}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "Pacific/Auckland")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

utcDate := results[0].UTC()
if utcDate.Year() != 2021 {
t.Errorf("Year incorrect: got %d, want 2021", utcDate.Year())
}
if utcDate.Month() != 5 {
t.Errorf("Month incorrect: got %d, want 5", utcDate.Month())
}
if utcDate.Day() != 5 {
t.Errorf("Day incorrect: got %d, want 5 (same day)", utcDate.Day())
}
if utcDate.Hour() != 9 {
t.Errorf("Hour incorrect: got %d, want 9 (21 - 12 = 9)", utcDate.Hour())
}
})

t.Run("should convert day recordings correctly to UTC", func(t *testing.T) {
// Test a day recording: 12:00 (noon) Pacific/Auckland
// Should become 00:00 UTC same day (midnight)
filenames := []string{"20210505_120000.wav"}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "Pacific/Auckland")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

utcDate := results[0].UTC()
if utcDate.Hour() != 0 {
t.Errorf("Hour incorrect: got %d, want 0 (12 - 12 = 0, midnight UTC)", utcDate.Hour())
}
if utcDate.Day() != 5 {
t.Errorf("Day incorrect: got %d, want 5 (same day)", utcDate.Day())
}
})

t.Run("should handle date rollover correctly", func(t *testing.T) {
// Test early morning: 02:00 Pacific/Auckland
// Should become 14:00 UTC previous day
filenames := []string{"20210505_020000.wav"}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "Pacific/Auckland")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

utcDate := results[0].UTC()
if utcDate.Day() != 4 {
t.Errorf("Day incorrect: got %d, want 4 (previous day)", utcDate.Day())
}
if utcDate.Hour() != 14 {
t.Errorf("Hour incorrect: got %d, want 14 (2 - 12 = -10, so previous day 14:00)", utcDate.Hour())
}
})

t.Run("should convert correctly for negative offset timezone", func(t *testing.T) {
// Test 15:00 (3 PM) New York in June (UTC-4 during DST)
// Should become 19:00 UTC same day
filenames := []string{"20210615_150000.wav"}

parsed, err := ParseFilenameTimestamps(filenames)
if err != nil {
t.Fatalf("Failed to parse filenames: %v", err)
}

results, err := ApplyTimezoneOffset(parsed, "America/New_York")
if err != nil {
t.Fatalf("Failed to apply timezone: %v", err)
}

utcDate := results[0].UTC()
if utcDate.Hour() != 19 {
t.Errorf("Hour incorrect: got %d, want 19 (15 + 4 = 19)", utcDate.Hour())
}
if utcDate.Day() != 15 {
t.Errorf("Day incorrect: got %d, want 15 (same day)", utcDate.Day())
}
})
}
file addition: filename_parser.go (----------)

[0.1]

package utils

import (
"fmt"
"path/filepath"
"regexp"
"strconv"
"time"
)

// DateFormat represents the detected filename date format
type DateFormat int

// Date format constants for filename timestamp parsing
const (
Format8Digit DateFormat = iota // YYYYMMDD_HHMMSS (e.g., 20230609_103000.wav)
Format6YYMMDD // YYMMDD_HHMMSS (e.g., 201012_123456.wav) - year first
Format6DDMMYY // DDMMYY_HHMMSS (e.g., 121020_123456.wav) - year last
)

var (
// Pattern to match timestamp filenames
// Supports: YYYYMMDD_HHMMSS, YYMMDD_HHMMSS, DDMMYY_HHMMSS
// Case-insensitive for file extension (.wav, .WAV, .Wav)
// Allows prefixes before the timestamp pattern
// Allows optional suffixes between timestamp and extension (e.g., _16kHz)
timestampPattern = regexp.MustCompile(`(?i)(\d{6,8})_(\d{6})(?:_[^/\\]*)?\.wav$`)
)

// dateParts represents parsed date components for format detection
type dateParts struct {
x1 int // First 2 digits
m int // Middle 2 digits (always month)
x2 int // Last 2 digits
}

// FilenameTimestamp represents a parsed timestamp from a filename
type FilenameTimestamp struct {
Filename string
Timestamp time.Time
Format DateFormat
}

// ParseFilenameTimestamps parses timestamps from a batch of filenames.
// Uses variance-based disambiguation for 6-digit dates (YYMMDD vs DDMMYY).
// Returns timestamps in UTC (timezone must be applied separately).
// ParseFilenameTimestamps extracts timestamps from filenames using variance-based format detection
func ParseFilenameTimestamps(filenames []string) ([]FilenameTimestamp, error) {
if len(filenames) == 0 {
return nil, fmt.Errorf("no filenames provided")
}

// Detect date format by analyzing all filenames
format, err := detectDateFormat(filenames)
if err != nil {
return nil, err
}

// Parse all filenames using detected format
results := make([]FilenameTimestamp, 0, len(filenames))
for _, filename := range filenames {
timestamp, err := parseFilenameWithFormat(filename, format)
if err != nil {
return nil, fmt.Errorf("failed to parse %s: %w", filename, err)
}
results = append(results, FilenameTimestamp{
Filename: filename,
Timestamp: timestamp,
Format: format,
})
}

return results, nil
}

// ApplyTimezoneOffset applies a fixed timezone offset to timestamps
// Uses the EARLIEST (chronologically) timestamp to determine the offset, then applies it to all
// This matches AudioMoth behavior (no DST adjustment during deployment)
// ApplyTimezoneOffset converts local timestamps to location timezone with DST handling
func ApplyTimezoneOffset(timestamps []FilenameTimestamp, timezoneID string) ([]time.Time, error) {
if len(timestamps) == 0 {
return nil, fmt.Errorf("no timestamps provided")
}

// Load timezone location
loc, err := time.LoadLocation(timezoneID)
if err != nil {
return nil, fmt.Errorf("invalid timezone %s: %w", timezoneID, err)
}

// Find chronologically earliest timestamp
earliestUTC := timestamps[0].Timestamp
for _, ts := range timestamps[1:] {
if ts.Timestamp.Before(earliestUTC) {
earliestUTC = ts.Timestamp
}
}

// Calculate offset from earliest timestamp
earliestInZone := time.Date(
earliestUTC.Year(), earliestUTC.Month(), earliestUTC.Day(),
earliestUTC.Hour(), earliestUTC.Minute(), earliestUTC.Second(),
0, loc,
)

// Get fixed offset (doesn't change for DST)
_, offsetSeconds := earliestInZone.Zone()
fixedOffset := time.FixedZone("Fixed", offsetSeconds)

// Apply SAME offset to ALL timestamps (maintaining original order)
results := make([]time.Time, len(timestamps))
for i, ts := range timestamps {
adjusted := time.Date(
ts.Timestamp.Year(), ts.Timestamp.Month(), ts.Timestamp.Day(),
ts.Timestamp.Hour(), ts.Timestamp.Minute(), ts.Timestamp.Second(),
0, fixedOffset,
)
results[i] = adjusted
}

return results, nil
}

// detectDateFormat analyzes filenames to determine the date format
func detectDateFormat(filenames []string) (DateFormat, error) {
// Extract all date parts from filenames
var parts []dateParts
var has8Digit bool

for _, filename := range filenames {
basename := filepath.Base(filename)
matches := timestampPattern.FindStringSubmatch(basename)
if matches == nil {
continue
}

dateStr := matches[1]

// Check for 8-digit format (YYYYMMDD)
if len(dateStr) == 8 {
has8Digit = true
continue
}

// Parse 6-digit format
if len(dateStr) == 6 {
x1, _ := strconv.Atoi(dateStr[0:2])
m, _ := strconv.Atoi(dateStr[2:4])
x2, _ := strconv.Atoi(dateStr[4:6])
parts = append(parts, dateParts{x1: x1, m: m, x2: x2})
}
}

// If all files are 8-digit, that's the format
if has8Digit && len(parts) == 0 {
return Format8Digit, nil
}

// If mixed 8-digit and 6-digit, return error
if has8Digit && len(parts) > 0 {
return 0, fmt.Errorf("mixed date formats detected (8-digit and 6-digit)")
}

// If no 6-digit dates found, cannot determine
if len(parts) == 0 {
return 0, fmt.Errorf("no valid timestamp filenames found")
}

// Need at least 2 files with different dates to disambiguate YYMMDD vs DDMMYY
if len(parts) == 1 {
return 0, fmt.Errorf("need at least 2 files to disambiguate 6-digit date format (YYMMDD vs DDMMYY)")
}

// Use variance-based disambiguation for 6-digit dates
// Compare uniqueness of x1 (first 2 digits) vs x2 (last 2 digits)
// Day values vary more than year values across recordings
uniqueX1 := countUnique(parts, func(p dateParts) int { return p.x1 })
uniqueX2 := countUnique(parts, func(p dateParts) int { return p.x2 })

if uniqueX2 >= uniqueX1 {
// x2 has more variance → likely day values → YYMMDD format
return Format6YYMMDD, nil
} else {
// x1 has more variance → likely day values → DDMMYY format
return Format6DDMMYY, nil
}
}

// parseFilenameWithFormat parses a filename using the specified format
func parseFilenameWithFormat(filename string, format DateFormat) (time.Time, error) {
basename := filepath.Base(filename)
matches := timestampPattern.FindStringSubmatch(basename)
if matches == nil {
return time.Time{}, fmt.Errorf("filename does not match timestamp pattern: %s", basename)
}

dateStr := matches[1]
timeStr := matches[2]

var year, month, day int

switch format {
case Format8Digit:
if len(dateStr) != 8 {
return time.Time{}, fmt.Errorf("expected 8-digit date, got %d digits", len(dateStr))
}
year, _ = strconv.Atoi(dateStr[0:4])
month, _ = strconv.Atoi(dateStr[4:6])
day, _ = strconv.Atoi(dateStr[6:8])

case Format6YYMMDD:
if len(dateStr) != 6 {
return time.Time{}, fmt.Errorf("expected 6-digit date, got %d digits", len(dateStr))
}
yy, _ := strconv.Atoi(dateStr[0:2])
month, _ = strconv.Atoi(dateStr[2:4])
day, _ = strconv.Atoi(dateStr[4:6])
// Convert 2-digit year to 4-digit (assume 2000-2099)
year = 2000 + yy

case Format6DDMMYY:
if len(dateStr) != 6 {
return time.Time{}, fmt.Errorf("expected 6-digit date, got %d digits", len(dateStr))
}
day, _ = strconv.Atoi(dateStr[0:2])
month, _ = strconv.Atoi(dateStr[2:4])
yy, _ := strconv.Atoi(dateStr[4:6])
// Convert 2-digit year to 4-digit (assume 2000-2099)
year = 2000 + yy
}

// Parse time (HHMMSS)
if len(timeStr) != 6 {
return time.Time{}, fmt.Errorf("invalid time format: %s", timeStr)
}
hour, _ := strconv.Atoi(timeStr[0:2])
minute, _ := strconv.Atoi(timeStr[2:4])
second, _ := strconv.Atoi(timeStr[4:6])

// Construct timestamp in UTC (timezone applied separately)
timestamp := time.Date(year, time.Month(month), day, hour, minute, second, 0, time.UTC)

// Validate date
if timestamp.Month() != time.Month(month) || timestamp.Day() != day {
return time.Time{}, fmt.Errorf("invalid date: %04d-%02d-%02d", year, month, day)
}

return timestamp, nil
}

// countUnique counts unique values using an extractor function
func countUnique(parts []dateParts, extractor func(p dateParts) int) int {
seen := make(map[int]bool)
for _, p := range parts {
seen[extractor(p)] = true
}
return len(seen)
}

// HasTimestampFilename checks if a filename matches the timestamp pattern
// HasTimestampFilename checks if filename contains a timestamp pattern
func HasTimestampFilename(filename string) bool {
basename := filepath.Base(filename)
return timestampPattern.MatchString(basename)
}
file addition: file_import_test.go (----------)

[0.1]

package utils

import (
"testing"
"time"
)

func TestGenerateFileID(t *testing.T) {
t.Run("generates 21-character ID", func(t *testing.T) {
id, err := GenerateLongID()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(id) != 21 {
t.Errorf("expected length 21, got %d: %q", len(id), id)
}
})

t.Run("uses only valid alphabet characters", func(t *testing.T) {
id, err := GenerateLongID()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// Default nanoid alphabet includes: 0-9, A-Z, a-z, _, -
for _, c := range id {
if (c < '0' || c > '9') && (c < 'A' || c > 'Z') && (c < 'a' || c > 'z') && c != '_' && c != '-' {
t.Errorf("invalid character %q in ID %q", string(c), id)
}
}
})

t.Run("generates unique IDs", func(t *testing.T) {
seen := make(map[string]bool)
for range 100 {
id, err := GenerateLongID()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if seen[id] {
t.Errorf("duplicate ID generated: %q", id)
}
seen[id] = true
}
})
}

func TestResolveTimestamp(t *testing.T) {
t.Run("resolves AudioMoth timestamp", func(t *testing.T) {
meta := &WAVMetadata{
Comment: "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C.",
Artist: "AudioMoth",
}
result, err := ResolveTimestamp(meta, "20250224_210000.wav", "Pacific/Auckland", false)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !result.IsAudioMoth {
t.Error("expected IsAudioMoth to be true")
}
if result.MothData == nil {
t.Error("expected MothData to be non-nil")
}
// AudioMoth parser returns UTC+13 fixed offset
expectedUTC := time.Date(2025, 2, 24, 8, 0, 0, 0, time.UTC)
if !result.Timestamp.UTC().Equal(expectedUTC) {
t.Errorf("expected UTC timestamp %v, got %v", expectedUTC, result.Timestamp.UTC())
}
})

t.Run("falls back to filename timestamp", func(t *testing.T) {
meta := &WAVMetadata{
Comment: "",
Artist: "",
}
result, err := ResolveTimestamp(meta, "20250224_210000.wav", "Pacific/Auckland", false)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.IsAudioMoth {
t.Error("expected IsAudioMoth to be false")
}
if result.Timestamp.IsZero() {
t.Error("expected non-zero timestamp")
}
})

t.Run("falls back to file mod time when enabled", func(t *testing.T) {
modTime := time.Date(2025, 1, 15, 10, 30, 0, 0, time.UTC)
meta := &WAVMetadata{
Comment: "",
Artist: "",
FileModTime: modTime,
}
result, err := ResolveTimestamp(meta, "nopattern.wav", "Pacific/Auckland", true)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !result.Timestamp.Equal(modTime) {
t.Errorf("expected timestamp %v, got %v", modTime, result.Timestamp)
}
})

t.Run("errors when no timestamp available and file mod time disabled", func(t *testing.T) {
meta := &WAVMetadata{
Comment: "",
Artist: "",
}
_, err := ResolveTimestamp(meta, "nopattern.wav", "Pacific/Auckland", false)
if err == nil {
t.Error("expected error when no timestamp available")
}
})

t.Run("errors when no timestamp available and no file mod time", func(t *testing.T) {
meta := &WAVMetadata{
Comment: "",
Artist: "",
}
_, err := ResolveTimestamp(meta, "nopattern.wav", "Pacific/Auckland", true)
if err == nil {
t.Error("expected error when no timestamp available")
}
})

t.Run("AudioMoth detected but parse fails falls back to filename", func(t *testing.T) {
meta := &WAVMetadata{
Comment: "AudioMoth garbage data",
Artist: "",
}
result, err := ResolveTimestamp(meta, "20250224_210000.wav", "Pacific/Auckland", false)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !result.IsAudioMoth {
t.Error("expected IsAudioMoth to be true (detected even if parse failed)")
}
if result.MothData != nil {
t.Error("expected MothData to be nil since parsing failed")
}
if result.Timestamp.IsZero() {
t.Error("expected non-zero timestamp from filename fallback")
}
})
}
file addition: file_import.go (----------)

[0.1]

package utils

import (
"database/sql"
"fmt"
"path/filepath"
"time"
)

// TimestampResult holds the result of timestamp resolution for a single file
type TimestampResult struct {
Timestamp time.Time
IsAudioMoth bool
MothData *AudioMothData
}

// ResolveTimestamp resolves a file's timestamp using the standard priority chain:
// 1. AudioMoth comment parsing
// 2. Filename timestamp parsing + timezone offset
// 3. File modification time (if useFileModTime is true)
//
// Returns an error if no timestamp could be determined.
func ResolveTimestamp(wavMeta *WAVMetadata, filePath string, timezoneID string, useFileModTime bool) (*TimestampResult, error) {
result := &TimestampResult{}

// Step 1: Try AudioMoth comment
if IsAudioMoth(wavMeta.Comment, wavMeta.Artist) {
result.IsAudioMoth = true
mothData, err := ParseAudioMothComment(wavMeta.Comment)
if err == nil {
result.MothData = mothData
result.Timestamp = mothData.Timestamp
return result, nil
}
// AudioMoth detected but parsing failed — fall through to filename
}

// Step 2: Try filename timestamp
if HasTimestampFilename(filePath) {
filenameTimestamps, err := ParseFilenameTimestamps([]string{filepath.Base(filePath)})
if err == nil {
adjustedTimestamps, err := ApplyTimezoneOffset(filenameTimestamps, timezoneID)
if err == nil && len(adjustedTimestamps) > 0 {
result.Timestamp = adjustedTimestamps[0]
return result, nil
}
}
}

// Step 3: File modification time fallback (optional)
if useFileModTime && !wavMeta.FileModTime.IsZero() {
result.Timestamp = wavMeta.FileModTime
return result, nil
}

return nil, fmt.Errorf("cannot resolve timestamp (no AudioMoth, filename pattern, or file modification time)")
}

// FileProcessingResult holds all extracted metadata for a single file
type FileProcessingResult struct {
FileName string
Hash string
Duration float64
SampleRate int
TimestampLocal time.Time
IsAudioMoth bool
MothData *AudioMothData
AstroData AstronomicalData
}

// ProcessSingleFile runs the full single-file processing pipeline:
// WAV header parsing → XXH64 hash → timestamp resolution → astronomical data
//
// Set useFileModTime to true to allow file modification time as a timestamp fallback.
func ProcessSingleFile(filePath string, latitude, longitude float64, timezoneID string, useFileModTime bool) (*FileProcessingResult, error) {
// Step 1: Parse WAV header
metadata, err := ParseWAVHeader(filePath)
if err != nil {
return nil, fmt.Errorf("WAV header parsing failed: %w", err)
}

// Step 2: Calculate hash
hash, err := ComputeXXH64(filePath)
if err != nil {
return nil, fmt.Errorf("hash calculation failed: %w", err)
}

// Step 3: Resolve timestamp
tsResult, err := ResolveTimestamp(metadata, filePath, timezoneID, useFileModTime)
if err != nil {
return nil, err
}

// Step 4: Calculate astronomical data
astroData := CalculateAstronomicalData(
tsResult.Timestamp.UTC(),
metadata.Duration,
latitude,
longitude,
)

return &FileProcessingResult{
FileName: filepath.Base(filePath),
Hash: hash,
Duration: metadata.Duration,
SampleRate: metadata.SampleRate,
TimestampLocal: tsResult.Timestamp,
IsAudioMoth: tsResult.IsAudioMoth,
MothData: tsResult.MothData,
AstroData: astroData,
}, nil
}

// DBQueryable is an interface satisfied by both *sql.DB and *sql.Tx
// for running duplicate hash checks against either.
type DBQueryable interface {
QueryRow(query string, args ...any) *sql.Row
}

// CheckDuplicateHash checks if a file with the given XXH64 hash already exists.
// Returns the existing file ID if found, or empty string if no duplicate.
// Works with both *sql.DB and *sql.Tx.
func CheckDuplicateHash(q DBQueryable, hash string) (existingID string, isDuplicate bool, err error) {
err = q.QueryRow(
"SELECT id FROM file WHERE xxh64_hash = ? AND active = true",
hash,
).Scan(&existingID)

if err == nil {
return existingID, true, nil
}
if err == sql.ErrNoRows {
return "", false, nil
}
return "", false, fmt.Errorf("duplicate check failed: %w", err)
}
file addition: fft_test.go (----------)

[0.1]

package utils

import (
"math"
"math/rand"
"testing"

"github.com/madelynnblue/go-dsp/fft"
)

// referencepower computes the power spectrum using go-dsp as ground truth.
func referencePower(samples []float64) []float64 {
result := fft.FFTReal(samples)
n := len(samples)
numBins := n/2 + 1
power := make([]float64, numBins)
for k := range numBins {
re := real(result[k])
im := imag(result[k])
power[k] = re*re + im*im
}
return power
}

func TestPowerSpectrumFFT_Sinusoid(t *testing.T) {
// 512-point FFT of a pure 1kHz sine at 16kHz sample rate
// Expected: peak at bin k = 1000 * 512 / 16000 = 32
n := 512
sampleRate := 16000.0
freq := 1000.0

samples := make([]float64, n)
for i := range samples {
samples[i] = math.Sin(2.0 * math.Pi * freq * float64(i) / sampleRate)
}

power := make([]float64, n/2+1)
scratch := make([]complex128, n)
PowerSpectrumFFT(samples, power, scratch)

// Find peak bin
maxBin := 0
maxVal := 0.0
for k, v := range power {
if v > maxVal {
maxVal = v
maxBin = k
}
}

expectedBin := int(freq * float64(n) / sampleRate)
if maxBin != expectedBin {
t.Errorf("peak at bin %d, expected %d", maxBin, expectedBin)
}

// Compare against reference
ref := referencePower(samples)
for k := range power {
if math.Abs(power[k]-ref[k]) > 1e-6*math.Abs(ref[k])+1e-10 {
t.Errorf("bin %d: got %g, ref %g", k, power[k], ref[k])
}
}
}

func TestPowerSpectrumFFT_Random(t *testing.T) {
n := 512
rng := rand.New(rand.NewSource(42))

samples := make([]float64, n)
for i := range samples {
samples[i] = rng.Float64()*2 - 1
}

power := make([]float64, n/2+1)
scratch := make([]complex128, n)
PowerSpectrumFFT(samples, power, scratch)

ref := referencePower(samples)
for k := range power {
relErr := math.Abs(power[k]-ref[k]) / (math.Abs(ref[k]) + 1e-15)
if relErr > 1e-8 {
t.Errorf("bin %d: got %g, ref %g (relErr=%g)", k, power[k], ref[k], relErr)
}
}
}

func TestPowerSpectrumFFT_DC(t *testing.T) {
n := 512
samples := make([]float64, n)
for i := range samples {
samples[i] = 1.0
}

power := make([]float64, n/2+1)
scratch := make([]complex128, n)
PowerSpectrumFFT(samples, power, scratch)

ref := referencePower(samples)
for k := range power {
if math.Abs(power[k]-ref[k]) > 1e-6 {
t.Errorf("bin %d: got %g, ref %g", k, power[k], ref[k])
}
}

// DC bin should have all the energy
if power[0] < power[1]*1000 {
t.Errorf("DC bin should dominate: power[0]=%g, power[1]=%g", power[0], power[1])
}
}

func TestPowerSpectrumFFT_Silence(t *testing.T) {
n := 512
samples := make([]float64, n)

power := make([]float64, n/2+1)
scratch := make([]complex128, n)
PowerSpectrumFFT(samples, power, scratch)

for k, v := range power {
if v != 0 {
t.Errorf("bin %d: expected 0, got %g", k, v)
}
}
}

func TestPowerSpectrumFFT_Impulse(t *testing.T) {
n := 512
samples := make([]float64, n)
samples[0] = 1.0

power := make([]float64, n/2+1)
scratch := make([]complex128, n)
PowerSpectrumFFT(samples, power, scratch)

ref := referencePower(samples)
for k := range power {
if math.Abs(power[k]-ref[k]) > 1e-10 {
t.Errorf("bin %d: got %g, ref %g", k, power[k], ref[k])
}
}

// Impulse: flat power spectrum, all bins should be equal (= 1.0)
for k, v := range power {
if math.Abs(v-1.0) > 1e-10 {
t.Errorf("bin %d: expected ~1.0, got %g", k, v)
}
}
}

func TestPowerSpectrumFFT_DifferentSizes(t *testing.T) {
rng := rand.New(rand.NewSource(99))

for _, n := range []int{2, 4, 8, 16, 64, 256, 1024} {
samples := make([]float64, n)
for i := range samples {
samples[i] = rng.Float64()*2 - 1
}

power := make([]float64, n/2+1)
scratch := make([]complex128, n)
PowerSpectrumFFT(samples, power, scratch)

ref := referencePower(samples)
for k := range power {
relErr := math.Abs(power[k]-ref[k]) / (math.Abs(ref[k]) + 1e-15)
if relErr > 1e-8 {
t.Errorf("n=%d bin %d: got %g, ref %g (relErr=%g)", n, k, power[k], ref[k], relErr)
}
}
}
}

func BenchmarkPowerSpectrumFFT_512(b *testing.B) {
n := 512
rng := rand.New(rand.NewSource(42))
samples := make([]float64, n)
for i := range samples {
samples[i] = rng.Float64()*2 - 1
}
power := make([]float64, n/2+1)
scratch := make([]complex128, n)

b.ResetTimer()
for range b.N {
PowerSpectrumFFT(samples, power, scratch)
}
}

func BenchmarkGodsFFTReal_512(b *testing.B) {
n := 512
rng := rand.New(rand.NewSource(42))
samples := make([]float64, n)
for i := range samples {
samples[i] = rng.Float64()*2 - 1
}

b.ResetTimer()
for range b.N {
fft.FFTReal(samples)
}
}
file addition: fft.go (----------)

[0.1]

package utils

import (
"math"
"sync"
)

// FFT twiddle factors and bit-reversal tables, cached per size.
var (
fftCacheMu sync.RWMutex
fftCache = map[int]*fftPlan{}
)

// fftPlan holds pre-computed data for a given FFT size.
type fftPlan struct {
n int
twiddle []complex128 // twiddle factors: exp(-2*pi*i*k/N) for k=0..N/2-1
bitrev []int // bit-reversal permutation table
}

// getFFFTPlan returns a cached FFT plan for the given size (must be power of 2).
func getFFTPlan(n int) *fftPlan {
fftCacheMu.RLock()
if p, ok := fftCache[n]; ok {
fftCacheMu.RUnlock()
return p
}
fftCacheMu.RUnlock()

fftCacheMu.Lock()
defer fftCacheMu.Unlock()
if p, ok := fftCache[n]; ok {
return p
}

p := &fftPlan{n: n}

// Compute twiddle factors: exp(-2*pi*i*k/N) for k = 0..N/2-1
p.twiddle = make([]complex128, n/2)
for k := range p.twiddle {
angle := -2.0 * math.Pi * float64(k) / float64(n)
sin, cos := math.Sincos(angle)
p.twiddle[k] = complex(cos, sin)
}

// Compute bit-reversal permutation
bits := 0
for v := n; v > 1; v >>= 1 {
bits++
}
p.bitrev = make([]int, n)
for i := range p.bitrev {
p.bitrev[i] = reverseBitsN(i, bits)
}

fftCache[n] = p
return p
}

// reverseBitsN reverses the lowest `bits` bits of v.
func reverseBitsN(v, bits int) int {
var r int
for range bits {
r = (r << 1) | (v & 1)
v >>= 1
}
return r
}

// PowerSpectrumFFT computes the power spectrum of a real-valued signal using radix-2 FFT.
//
// samples: real input of length N (must be power of 2, N >= 2)
// power: output buffer of length >= N/2+1; receives |X[k]|^2 for k=0..N/2
// scratch: working buffer of length >= N; contents are overwritten
//
// All buffers are caller-provided to enable zero-allocation across repeated calls.
func PowerSpectrumFFT(samples []float64, power []float64, scratch []complex128) {
n := len(samples)
plan := getFFTPlan(n)

// Bit-reversal copy: load real samples into scratch in bit-reversed order
for i, j := range plan.bitrev {
scratch[j] = complex(samples[i], 0)
}

// Iterative Cooley-Tukey butterfly (decimation-in-time)
for size := 2; size <= n; size <<= 1 {
half := size >> 1
step := n / size // twiddle index step

for start := 0; start < n; start += size {
tw := 0
for j := range half {
u := scratch[start+j]
v := scratch[start+j+half] * plan.twiddle[tw]
scratch[start+j] = u + v
scratch[start+j+half] = u - v
tw += step
}
}
}

// Extract power spectrum: |X[k]|^2 = re^2 + im^2 for k = 0..N/2
numBins := n/2 + 1
for k := range numBins {
re := real(scratch[k])
im := imag(scratch[k])
power[k] = re*re + im*im
}
}
file addition: data_file_test.go (----------)

[0.1]

package utils

import (
"os"
"testing"
)

func TestDataFileParse(t *testing.T) {
// Create a test .data file
content := `[
{"Operator": "Auto", "Reviewer": null, "Duration": 60.0},
[10.0, 20.0, 0, 0, [{"species": "Kiwi", "certainty": 70, "filter": "test-filter"}]],
[30.0, 40.0, 1000, 5000, [{"species": "Morepork", "certainty": 80, "filter": "M"}]]
]`

tmpfile, err := os.CreateTemp("", "test*.data")
if err != nil {
t.Fatal(err)
}
defer os.Remove(tmpfile.Name())

if _, err := tmpfile.Write([]byte(content)); err != nil {
t.Fatal(err)
}
tmpfile.Close()

// Parse
df, err := ParseDataFile(tmpfile.Name())
if err != nil {
t.Fatal(err)
}

// Check metadata
if df.Meta.Operator != "Auto" {
t.Errorf("expected Operator=Auto, got %s", df.Meta.Operator)
}
if df.Meta.Duration != 60.0 {
t.Errorf("expected Duration=60.0, got %f", df.Meta.Duration)
}

// Check segments
if len(df.Segments) != 2 {
t.Errorf("expected 2 segments, got %d", len(df.Segments))
}

// Check first segment (sorted by start time)
if df.Segments[0].StartTime != 10.0 {
t.Errorf("expected StartTime=10.0, got %f", df.Segments[0].StartTime)
}
if df.Segments[0].EndTime != 20.0 {
t.Errorf("expected EndTime=20.0, got %f", df.Segments[0].EndTime)
}

// Check labels
if len(df.Segments[0].Labels) != 1 {
t.Errorf("expected 1 label, got %d", len(df.Segments[0].Labels))
}
if df.Segments[0].Labels[0].Species != "Kiwi" {
t.Errorf("expected Species=Kiwi, got %s", df.Segments[0].Labels[0].Species)
}
if df.Segments[0].Labels[0].Certainty != 70 {
t.Errorf("expected Certainty=70, got %d", df.Segments[0].Labels[0].Certainty)
}
}

func TestDataFileWrite(t *testing.T) {
df := &DataFile{
FilePath: "",
Meta: &DataMeta{
Operator: "Test",
Reviewer: "David",
Duration: 120.0,
},
Segments: []*Segment{
{
StartTime: 5.0,
EndTime: 15.0,
FreqLow: 0,
FreqHigh: 0,
Labels: []*Label{
{Species: "Kiwi", Certainty: 100, Filter: "test"},
},
},
},
}

tmpfile, err := os.CreateTemp("", "test*.data")
if err != nil {
t.Fatal(err)
}
tmpfile.Close()
defer os.Remove(tmpfile.Name())

// Write
if err := df.Write(tmpfile.Name()); err != nil {
t.Fatal(err)
}

// Re-parse and verify
df2, err := ParseDataFile(tmpfile.Name())
if err != nil {
t.Fatal(err)
}

if df2.Meta.Reviewer != "David" {
t.Errorf("expected Reviewer=David, got %s", df2.Meta.Reviewer)
}
if len(df2.Segments) != 1 {
t.Errorf("expected 1 segment, got %d", len(df2.Segments))
}
if df2.Segments[0].Labels[0].Species != "Kiwi" {
t.Errorf("expected Species=Kiwi, got %s", df2.Segments[0].Labels[0].Species)
}
}

func TestHasFilterLabel(t *testing.T) {
seg := &Segment{
Labels: []*Label{
{Species: "Kiwi", Filter: "test-filter"},
{Species: "Morepork", Filter: "M"},
},
}

if !seg.HasFilterLabel("test-filter") {
t.Error("expected HasFilterLabel(test-filter)=true")
}
if !seg.HasFilterLabel("M") {
t.Error("expected HasFilterLabel(M)=true")
}
if seg.HasFilterLabel("other") {
t.Error("expected HasFilterLabel(other)=false")
}
if !seg.HasFilterLabel("") {
t.Error("expected HasFilterLabel('')=true (no filter)")
}
}

func TestGetFilterLabels(t *testing.T) {
seg := &Segment{
Labels: []*Label{
{Species: "Kiwi", Filter: "test-filter", Certainty: 70},
{Species: "Morepork", Filter: "M", Certainty: 80},
{Species: "Don't Know", Filter: "test-filter", Certainty: 0},
},
}

labels := seg.GetFilterLabels("test-filter")
if len(labels) != 2 {
t.Errorf("expected 2 labels, got %d", len(labels))
}

labels = seg.GetFilterLabels("")
if len(labels) != 3 {
t.Errorf("expected 3 labels (no filter), got %d", len(labels))
}
}

func TestLabelComment(t *testing.T) {
// Test parsing comment from .data file
content := `[
{"Operator": "Test", "Duration": 60.0},
[10.0, 20.0, 0, 0, [{"species": "Kiwi", "certainty": 100, "filter": "M", "comment": "Good call"}]]
]`

tmpfile, err := os.CreateTemp("", "test*.data")
if err != nil {
t.Fatal(err)
}
defer os.Remove(tmpfile.Name())

if _, err := tmpfile.Write([]byte(content)); err != nil {
t.Fatal(err)
}
tmpfile.Close()

df, err := ParseDataFile(tmpfile.Name())
if err != nil {
t.Fatal(err)
}

if df.Segments[0].Labels[0].Comment != "Good call" {
t.Errorf("expected Comment='Good call', got '%s'", df.Segments[0].Labels[0].Comment)
}

// Test writing comment
df.Segments[0].Labels[0].Comment = "Updated comment"

tmpfile2, err := os.CreateTemp("", "test2*.data")
if err != nil {
t.Fatal(err)
}
tmpfile2.Close()
defer os.Remove(tmpfile2.Name())

if err := df.Write(tmpfile2.Name()); err != nil {
t.Fatal(err)
}

// Re-parse and verify
df2, err := ParseDataFile(tmpfile2.Name())
if err != nil {
t.Fatal(err)
}

if df2.Segments[0].Labels[0].Comment != "Updated comment" {
t.Errorf("expected Comment='Updated comment', got '%s'", df2.Segments[0].Labels[0].Comment)
}
}

func TestSkraakHashRoundTrip(t *testing.T) {
// Test that skraak_hash in metadata is preserved through parse/write cycle
df := &DataFile{
Meta: &DataMeta{
Operator: "Test",
Duration: 60.0,
Extra: map[string]any{
"skraak_hash": "abc123def456",
},
},
Segments: []*Segment{
{
StartTime: 10.0,
EndTime: 20.0,
Labels: []*Label{
{Species: "Kiwi", Certainty: 100, Filter: "M"},
},
},
},
}

tmpfile, err := os.CreateTemp("", "test*.data")
if err != nil {
t.Fatal(err)
}
tmpfile.Close()
defer os.Remove(tmpfile.Name())

// Write
if err := df.Write(tmpfile.Name()); err != nil {
t.Fatal(err)
}

// Re-parse
df2, err := ParseDataFile(tmpfile.Name())
if err != nil {
t.Fatal(err)
}

// Verify skraak_hash preserved
if df2.Meta.Extra == nil {
t.Fatal("expected Extra to be non-nil")
}
hash, ok := df2.Meta.Extra["skraak_hash"].(string)
if !ok {
t.Fatal("expected skraak_hash to be string")
}
if hash != "abc123def456" {
t.Errorf("expected skraak_hash=abc123def456, got %s", hash)
}
}

func TestSkraakLabelIDRoundTrip(t *testing.T) {
// Test that skraak_label_id in labels is preserved through parse/write cycle
df := &DataFile{
Meta: &DataMeta{
Operator: "Test",
Duration: 60.0,
},
Segments: []*Segment{
{
StartTime: 10.0,
EndTime: 20.0,
Labels: []*Label{
{
Species: "Kiwi",
Certainty: 100,
Filter: "M",
Extra: map[string]any{
"skraak_label_id": "label_abc123",
},
},
},
},
},
}

tmpfile, err := os.CreateTemp("", "test*.data")
if err != nil {
t.Fatal(err)
}
tmpfile.Close()
defer os.Remove(tmpfile.Name())

// Write
if err := df.Write(tmpfile.Name()); err != nil {
t.Fatal(err)
}

// Re-parse
df2, err := ParseDataFile(tmpfile.Name())
if err != nil {
t.Fatal(err)
}

// Verify skraak_label_id preserved
if len(df2.Segments) != 1 {
t.Fatalf("expected 1 segment, got %d", len(df2.Segments))
}
if len(df2.Segments[0].Labels) != 1 {
t.Fatalf("expected 1 label, got %d", len(df2.Segments[0].Labels))
}

label := df2.Segments[0].Labels[0]
if label.Extra == nil {
t.Fatal("expected label Extra to be non-nil")
}
labelID, ok := label.Extra["skraak_label_id"].(string)
if !ok {
t.Fatal("expected skraak_label_id to be string")
}
if labelID != "label_abc123" {
t.Errorf("expected skraak_label_id=label_abc123, got %s", labelID)
}
}

func TestSkraakFieldsBothPresent(t *testing.T) {
// Test both skraak_hash and skraak_label_id together
df := &DataFile{
Meta: &DataMeta{
Operator: "Test",
Duration: 60.0,
Extra: map[string]any{
"skraak_hash": "file_hash_xyz",
},
},
Segments: []*Segment{
{
StartTime: 10.0,
EndTime: 20.0,
Labels: []*Label{
{
Species: "Kiwi",
Certainty: 100,
Filter: "M",
Extra: map[string]any{
"skraak_label_id": "label_id_1",
},
},
{
Species: "Roroa",
Certainty: 90,
Filter: "M",
Extra: map[string]any{
"skraak_label_id": "label_id_2",
},
},
},
},
},
}

tmpfile, err := os.CreateTemp("", "test*.data")
if err != nil {
t.Fatal(err)
}
tmpfile.Close()
defer os.Remove(tmpfile.Name())

// Write
if err := df.Write(tmpfile.Name()); err != nil {
t.Fatal(err)
}

// Re-parse
df2, err := ParseDataFile(tmpfile.Name())
if err != nil {
t.Fatal(err)
}

// Verify skraak_hash
if df2.Meta.Extra["skraak_hash"] != "file_hash_xyz" {
t.Errorf("expected skraak_hash=file_hash_xyz, got %v", df2.Meta.Extra["skraak_hash"])
}

// Verify both label IDs
if len(df2.Segments[0].Labels) != 2 {
t.Fatalf("expected 2 labels, got %d", len(df2.Segments[0].Labels))
}

labelIDs := []string{"label_id_1", "label_id_2"}
for i, label := range df2.Segments[0].Labels {
if label.Extra["skraak_label_id"] != labelIDs[i] {
t.Errorf("label %d: expected skraak_label_id=%s, got %v", i, labelIDs[i], label.Extra["skraak_label_id"])
}
}
}

func TestSegmentMatchesFilters(t *testing.T) {
// Create test segments with various labels
seg := &Segment{
Labels: []*Label{
{Species: "Kiwi", Filter: "model-1.0", CallType: "Duet", Certainty: 70},
{Species: "Morepork", Filter: "model-2.0", CallType: "", Certainty: 100},
},
}

tests := []struct {
name string
filter string
species string
callType string
certainty int
want bool
}{
{"no filters", "", "", "", -1, true},
{"filter only match", "model-1.0", "", "", -1, true},
{"filter only no match", "model-3.0", "", "", -1, false},
{"species only match", "", "Kiwi", "", -1, true},
{"species only no match", "", "Tomtit", "", -1, false},
{"calltype only match", "", "", "Duet", -1, true},
{"calltype only no match", "", "", "Male", -1, false},
{"certainty match", "", "", "", 70, true},
{"certainty no match", "", "", "", 80, false},
{"certainty 100 match", "", "", "", 100, true},
{"filter+species match", "model-1.0", "Kiwi", "", -1, true},
{"filter+species+calltype match", "model-1.0", "Kiwi", "Duet", -1, true},
{"filter+species+calltype+certainty match", "model-1.0", "Kiwi", "Duet", 70, true},
{"filter+species+calltype certainty miss", "model-1.0", "Kiwi", "Duet", 100, false},
{"filter match species miss", "model-1.0", "Morepork", "", -1, false},
{"all miss", "model-3.0", "Tomtit", "Male", -1, false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := seg.SegmentMatchesFilters(tt.filter, tt.species, tt.callType, tt.certainty)
if got != tt.want {
t.Errorf("SegmentMatchesFilters(%q, %q, %q, %d) = %v, want %v",
tt.filter, tt.species, tt.callType, tt.certainty, got, tt.want)
}
})
}
}

func TestParseSpeciesCallType(t *testing.T) {
tests := []struct {
input string
species string
callType string
}{
{"", "", ""},
{"Kiwi", "Kiwi", ""},
{"Kiwi+Duet", "Kiwi", "Duet"},
{"GSK+Female", "GSK", "Female"},
{"Species+With+Multiple+Plus", "Species", "With+Multiple+Plus"},
}

for _, tt := range tests {
t.Run(tt.input, func(t *testing.T) {
species, callType := ParseSpeciesCallType(tt.input)
if species != tt.species || callType != tt.callType {
t.Errorf("ParseSpeciesCallType(%q) = (%q, %q), want (%q, %q)",
tt.input, species, callType, tt.species, tt.callType)
}
})
}
}
file addition: data_file.go (----------)

[0.1]

package utils

import (
"encoding/json"
"fmt"
"maps"
"os"
"sort"
"strings"
)

// DataFile represents an AviaNZ .data file
type DataFile struct {
Meta *DataMeta
Segments []*Segment
FilePath string
}

// DataMeta contains metadata for a .data file
type DataMeta struct {
Operator string
Reviewer string
Duration float64
Extra map[string]any // preserve unknown fields
}

// Segment represents a detection segment
type Segment struct {
StartTime float64
EndTime float64
FreqLow float64
FreqHigh float64
Labels []*Label
}

// Label represents a species label within a segment
type Label struct {
Species string
Certainty int
Filter string
CallType string
Comment string // user comment (max 140 chars, ASCII only)
Bookmark bool // user bookmark for navigation
Extra map[string]any // preserve unknown fields
}

// ParseDataFile reads and parses a .data file
func ParseDataFile(path string) (*DataFile, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, err
}

var raw []json.RawMessage
if err := json.Unmarshal(data, &raw); err != nil {
return nil, fmt.Errorf("parse JSON: %w", err)
}

if len(raw) == 0 {
return nil, fmt.Errorf("empty .data file")
}

df := &DataFile{
FilePath: path,
Segments: make([]*Segment, 0, len(raw)-1),
}

// Parse metadata (first element)
df.Meta = parseMeta(raw[0])

// Parse segments
for i := 1; i < len(raw); i++ {
seg, err := parseSegment(raw[i])
if err != nil {
continue // skip invalid segments
}
df.Segments = append(df.Segments, seg)
}

// Sort segments by start time
sort.Slice(df.Segments, func(i, j int) bool {
return df.Segments[i].StartTime < df.Segments[j].StartTime
})

return df, nil
}

// parseMeta parses the metadata object
func parseMeta(raw json.RawMessage) *DataMeta {
var obj map[string]any
if err := json.Unmarshal(raw, &obj); err != nil {
return &DataMeta{}
}

meta := &DataMeta{Extra: make(map[string]any)}

if v, ok := obj["Operator"].(string); ok {
meta.Operator = v
delete(obj, "Operator")
}
if v, ok := obj["Reviewer"].(string); ok {
meta.Reviewer = v
delete(obj, "Reviewer")
}
if v, ok := obj["Duration"].(float64); ok {
meta.Duration = v
delete(obj, "Duration")
}

// Store remaining fields
maps.Copy(meta.Extra, obj)

return meta
}

// parseSegment parses a segment array
func parseSegment(raw json.RawMessage) (*Segment, error) {
var arr []json.RawMessage
if err := json.Unmarshal(raw, &arr); err != nil {
return nil, err
}

if len(arr) < 5 {
return nil, fmt.Errorf("segment too short")
}

seg := &Segment{}

// Parse time and frequency
if v, err := parseFloat(arr[0]); err == nil {
seg.StartTime = v
}
if v, err := parseFloat(arr[1]); err == nil {
seg.EndTime = v
}
if v, err := parseFloat(arr[2]); err == nil {
seg.FreqLow = v
}
if v, err := parseFloat(arr[3]); err == nil {
seg.FreqHigh = v
}

// Parse labels
var labelArr []json.RawMessage
if err := json.Unmarshal(arr[4], &labelArr); err == nil {
for _, labelRaw := range labelArr {
if label := parseLabel(labelRaw); label != nil {
seg.Labels = append(seg.Labels, label)
}
}
}

// Sort labels alphabetically by species
sort.Slice(seg.Labels, func(i, j int) bool {
return seg.Labels[i].Species < seg.Labels[j].Species
})

return seg, nil
}

// parseLabel parses a label object
func parseLabel(raw json.RawMessage) *Label {
var obj map[string]any
if err := json.Unmarshal(raw, &obj); err != nil {
return nil
}

label := &Label{Extra: make(map[string]any)}

if v, ok := obj["species"].(string); ok {
label.Species = v
delete(obj, "species")
}
if v, ok := obj["certainty"].(float64); ok {
label.Certainty = int(v)
delete(obj, "certainty")
}
if v, ok := obj["filter"].(string); ok {
label.Filter = v
delete(obj, "filter")
}
if v, ok := obj["calltype"].(string); ok {
label.CallType = v
delete(obj, "calltype")
}
if v, ok := obj["comment"].(string); ok {
label.Comment = v
delete(obj, "comment")
}
if v, ok := obj["bookmark"].(bool); ok {
label.Bookmark = v
delete(obj, "bookmark")
}

// Store remaining fields
maps.Copy(label.Extra, obj)

return label
}

// parseFloat extracts a float from JSON
func parseFloat(raw json.RawMessage) (float64, error) {
var v float64
err := json.Unmarshal(raw, &v)
return v, err
}

// WriteDataFile writes a DataFile back to disk
func (df *DataFile) Write(path string) error {
var raw []any

// Build metadata
meta := make(map[string]any)
if df.Meta.Operator != "" {
meta["Operator"] = df.Meta.Operator
}
if df.Meta.Reviewer != "" {
meta["Reviewer"] = df.Meta.Reviewer
}
if df.Meta.Duration > 0 {
meta["Duration"] = df.Meta.Duration
}
maps.Copy(meta, df.Meta.Extra)
raw = append(raw, meta)

// Build segments
for _, seg := range df.Segments {
labels := make([]any, 0, len(seg.Labels))
for _, label := range seg.Labels {
l := make(map[string]any)
l["species"] = label.Species
l["certainty"] = label.Certainty
if label.Filter != "" {
l["filter"] = label.Filter
}
if label.CallType != "" {
l["calltype"] = label.CallType
}
if label.Comment != "" {
l["comment"] = label.Comment
}
if label.Bookmark {
l["bookmark"] = true
}
maps.Copy(l, label.Extra)
labels = append(labels, l)
}

segArr := []any{
seg.StartTime,
seg.EndTime,
seg.FreqLow,
seg.FreqHigh,
labels,
}
raw = append(raw, segArr)
}

data, err := json.MarshalIndent(raw, "", " ")
if err != nil {
return err
}

return os.WriteFile(path, data, 0644)
}

// HasFilterLabel returns true if segment has a label matching the filter
func (s *Segment) HasFilterLabel(filter string) bool {
if filter == "" {
return true
}
for _, label := range s.Labels {
if label.Filter == filter {
return true
}
}
return false
}

// GetFilterLabels returns labels matching the filter
func (s *Segment) GetFilterLabels(filter string) []*Label {
var result []*Label
for _, label := range s.Labels {
if filter == "" || label.Filter == filter {
result = append(result, label)
}
}
return result
}

// SegmentMatchesFilters returns true if the segment has any label matching all filter criteria.
// All non-empty/non-negative parameters must match for a label to be considered a match.
// Use certainty=-1 to indicate no certainty filtering (since 0 is a valid certainty value).
func (s *Segment) SegmentMatchesFilters(filter, species, callType string, certainty int) bool {
if filter == "" && species == "" && callType == "" && certainty < 0 {
return true // No filters, match all
}

for _, label := range s.Labels {
if filter != "" && label.Filter != filter {
continue
}
if species != "" && label.Species != species {
continue
}
if callType != "" && label.CallType != callType {
continue
}
if certainty >= 0 && label.Certainty != certainty {
continue
}
return true
}
return false
}

// ParseSpeciesCallType parses a species string with optional calltype into separate values.
// Format: "Species" or "Species+CallType" (e.g., "Kiwi" or "Kiwi+Duet").
func ParseSpeciesCallType(s string) (species, callType string) {
if s == "" {
return "", ""
}
if before, after, ok := strings.Cut(s, "+"); ok {
return before, after
}
return s, ""
}

// FindDataFiles finds all .data files in a folder, ignoring hidden files (starting with ".")
func FindDataFiles(folder string) ([]string, error) {
var files []string

entries, err := os.ReadDir(folder)
if err != nil {
return nil, err
}

for _, entry := range entries {
name := entry.Name()
// Skip hidden files (starting with ".")
if strings.HasPrefix(name, ".") {
continue
}
if strings.HasSuffix(name, ".data") {
files = append(files, folder+"/"+name)
}
}

return files, nil
}
file addition: config.go (----------)

[0.1]

package utils

import (
"encoding/json"
"fmt"
"os"
"path/filepath"
)

// ~/.skraak/config.json schema (reference):
//
// {
// "classify": {
// "reviewer": "string, required. Name stamped into .data file meta on any edit.",
// "color": "bool, optional. Colored spectrograms in the TUI. Default false.",
// "sixel": "bool, optional. Use sixel image protocol. Default false (Kitty).",
// "iterm": "bool, optional. Use iTerm inline-image protocol. Default false.",
// "img_dims": "int, optional. Spectrogram display size in pixels. 0 = default.",
//
// "bindings": {
// "<key>": "Species" // e.g. "c": "comcha"
// "<key>": "Species+CallType" // e.g. "1": "Kiwi+Duet"
// // <key> is a single character. Reserved: ",", ".", "0", " " (space).
// // Pressing <key> labels the current segment (certainty 100, or 0 for
// // "Don't Know"), saves, and advances.
// },
//
// "secondary_bindings": {
// "<primary-key>": {
// "<key>": "CallType" // e.g. "a": "alarm"
// // <key> is a single character, same reserved-key rules as bindings.
// // Outer <primary-key> must also exist in "bindings".
// }
// // Optional. Invoked via Shift+<primary-key>: labels the species with
// // an empty calltype, does NOT advance, and waits for one follow-up
// // key looked up in this inner map. Match -> set calltype, save,
// // advance. Esc -> exit wait mode without advancing. Any other key ->
// // exit wait mode and handle the key normally.
// // Shift+<primary-key> on a primary without a secondary_bindings entry
// // falls back to normal primary behavior.
// }
// }
// }
//
// Example:
//
// {
// "classify": {
// "reviewer": "David",
// "color": true,
// "bindings": {
// "c": "comcha",
// "k": "kea1",
// "x": "Noise",
// "z": "Don't Know",
// "1": "Kiwi+Duet",
// "4": "Kiwi"
// },
// "secondary_bindings": {
// "c": { "a": "alarm", "s": "song", "n": "contact" }
// }
// }
// }
//
// Config holds user-level defaults loaded from ~/.skraak/config.json.
// Per-subcommand sections live as named fields.
type Config struct {
Classify ClassifyFileConfig `json:"classify"`
}

// ClassifyFileConfig holds defaults for `skraak calls classify`.
// Bindings maps a single-character key to "Species" or "Species+CallType".
type ClassifyFileConfig struct {
Reviewer string `json:"reviewer"`
Color bool `json:"color"`
Sixel bool `json:"sixel"`
ITerm bool `json:"iterm"`
ImgDims int `json:"img_dims"`
Bindings map[string]string `json:"bindings"`
// SecondaryBindings extends a primary binding with per-species calltype
// choices. Outer key is the primary binding key; inner map is
// single-char key -> calltype string. Invoked via Shift+primary-key.
SecondaryBindings map[string]map[string]string `json:"secondary_bindings,omitempty"`
}

// ConfigPath returns the absolute path to ~/.skraak/config.json.
func ConfigPath() (string, error) {
home, err := os.UserHomeDir()
if err != nil {
return "", fmt.Errorf("resolving home directory: %w", err)
}
return filepath.Join(home, ".skraak", "config.json"), nil
}

// LoadConfig reads ~/.skraak/config.json and returns the parsed config and the
// resolved path (useful for error messages).
func LoadConfig() (Config, string, error) {
var cfg Config
path, err := ConfigPath()
if err != nil {
return cfg, "", err
}
data, err := os.ReadFile(path)
if err != nil {
return cfg, path, fmt.Errorf("reading %s: %w", path, err)
}
if err := json.Unmarshal(data, &cfg); err != nil {
return cfg, path, fmt.Errorf("parsing %s: %w", path, err)
}
return cfg, path, nil
}
file addition: colormap.go (----------)

[0.1]

package utils

// RGBPixel represents an RGB color value
type RGBPixel struct {
R, G, B uint8
}

// L4Colormap is the Black-Red-Yellow heat colormap from PerceptualColourMaps.jl
// Control points:
//
// Index 0: Black (0.0, 0.0, 0.0)
// Index 85: Dark Red (0.85, 0.0, 0.0)
// Index 170: Orange-Red (1.0, 0.15, 0.0)
// Index 255: Yellow (1.0, 1.0, 0.0)
var L4Colormap [256]RGBPixel

func init() {
// Generate L4 colormap using piecewise linear interpolation
// This avoids overshoot issues with cubic splines
controlPoints := []struct {
idx int
r float64
g float64
b float64
}{
{0, 0.0, 0.0, 0.0},
{85, 0.85, 0.0, 0.0},
{170, 1.0, 0.15, 0.0},
{255, 1.0, 1.0, 0.0},
}

for i := range 256 {
// Find the segment we're in
var seg int
for seg = 0; seg < len(controlPoints)-1; seg++ {
if i <= controlPoints[seg+1].idx {
break
}
}
if seg >= len(controlPoints)-1 {
seg = len(controlPoints) - 2
}

// Linear interpolation within segment
p0 := controlPoints[seg]
p1 := controlPoints[seg+1]

t := 0.0
if p1.idx != p0.idx {
t = float64(i-p0.idx) / float64(p1.idx-p0.idx)
}

L4Colormap[i] = RGBPixel{
R: uint8((p0.r + t*(p1.r-p0.r)) * 255.0),
G: uint8((p0.g + t*(p1.g-p0.g)) * 255.0),
B: uint8((p0.b + t*(p1.b-p0.b)) * 255.0),
}
}
}

// ApplyL4Colormap converts a grayscale image to RGB using the L4 colormap
func ApplyL4Colormap(grayscale [][]uint8) [][]RGBPixel {
if len(grayscale) == 0 || len(grayscale[0]) == 0 {
return nil
}

rows := len(grayscale)
cols := len(grayscale[0])

result := make([][]RGBPixel, rows)
for i := range result {
result[i] = make([]RGBPixel, cols)
}

for y := range rows {
for x := range cols {
result[y][x] = L4Colormap[grayscale[y][x]]
}
}

return result
}
file addition: cluster_import.go (----------)

[0.1]

package utils

import (
"context"
"database/sql"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"

"skraak/db"
)

// FileImportError records errors encountered during file processing
type FileImportError struct {
FileName string `json:"file_name"`
Error string `json:"error"`
Stage string `json:"stage"` // "scan", "hash", "parse", "validate", "insert"
}

// ClusterImportInput defines parameters for importing one cluster
type ClusterImportInput struct {
FolderPath string // Absolute path to folder with WAV files
DatasetID string // 12-char dataset ID
LocationID string // 12-char location ID
ClusterID string // 12-char cluster ID
Recursive bool // Scan subfolders?
}

// ClusterImportOutput provides results and statistics
type ClusterImportOutput struct {
TotalFiles int
ImportedFiles int
SkippedFiles int // Duplicates
FailedFiles int
AudioMothFiles int
TotalDuration float64
ProcessingTime string
Errors []FileImportError
}

// LocationData holds location information needed for processing
type LocationData struct {
Latitude float64
Longitude float64
TimezoneID string
}

// fileData holds all data for a single file to be imported
type fileData struct {
FileName string
Hash string
Duration float64
SampleRate int
TimestampLocal time.Time
IsAudioMoth bool
MothData *AudioMothData
AstroData AstronomicalData
}

// ImportCluster imports all WAV files from a folder into a cluster
//
// This is the canonical cluster import logic used by both:
// - import_files.go (single cluster)
// - bulk_file_import.go (multiple clusters)
//
// Steps:
// 1. Validate folder exists
// 2. Get location metadata (lat/lon/timezone) from database
// 3. Scan folder for WAV files (recursive or not)
// 4. Batch process all files:
// - Parse WAV headers (includes file mod time)
// - Batch parse filename timestamps (variance-based)
// - Resolve timestamps (AudioMoth → filename → file mod time)
// - Calculate hashes
// - Calculate astronomical data
// 5. Batch insert in single transaction:
// - Check duplicates
// - INSERT INTO file
// - INSERT INTO file_dataset (ALWAYS)
// - INSERT INTO moth_metadata (if AudioMoth)
// - All-or-nothing commit
// 6. Return summary statistics
//
// Transaction behavior: ALL files succeed or ALL rollback
// This preserves cluster integrity (cluster = complete recording session)
func ImportCluster(
database *sql.DB,
input ClusterImportInput,
) (*ClusterImportOutput, error) {
startTime := time.Now()

// Validate folder exists
info, err := os.Stat(input.FolderPath)
if err != nil {
return nil, fmt.Errorf("folder not accessible: %w", err)
}
if !info.IsDir() {
return nil, fmt.Errorf("path is not a directory: %s", input.FolderPath)
}

// Get location data for astronomical calculations
locationData, err := GetLocationData(database, input.LocationID)
if err != nil {
return nil, fmt.Errorf("failed to get location data: %w", err)
}

// Scan folder for WAV files
wavFiles, err := scanClusterFiles(input.FolderPath, input.Recursive)
if err != nil {
return nil, fmt.Errorf("failed to scan folder: %w", err)
}

// If no files, return early
if len(wavFiles) == 0 {
return &ClusterImportOutput{
TotalFiles: 0,
ProcessingTime: time.Since(startTime).String(),
Errors: []FileImportError{},
}, nil
}

// Batch process all files
filesData, processErrors := batchProcessFiles(wavFiles, locationData)

// Batch insert into database
imported, skipped, insertErrors, err := insertClusterFiles(
database,
filesData,
input.DatasetID,
input.ClusterID,
input.LocationID,
)
if err != nil {
return nil, fmt.Errorf("database insertion failed: %w", err)
}

// Combine all errors
allErrors := append(processErrors, insertErrors...)

// Calculate summary statistics
audiomothCount := 0
totalDuration := 0.0
for _, fd := range filesData {
if fd.IsAudioMoth {
audiomothCount++
}
totalDuration += fd.Duration
}

return &ClusterImportOutput{
TotalFiles: len(wavFiles),
ImportedFiles: imported,
SkippedFiles: skipped,
FailedFiles: len(allErrors),
AudioMothFiles: audiomothCount,
TotalDuration: totalDuration,
ProcessingTime: time.Since(startTime).String(),
Errors: allErrors,
}, nil
}

// GetLocationData retrieves location coordinates and timezone
func GetLocationData(database *sql.DB, locationID string) (*LocationData, error) {
var loc LocationData
err := database.QueryRow(
"SELECT latitude, longitude, timezone_id FROM location WHERE id = ?",
locationID,
).Scan(&loc.Latitude, &loc.Longitude, &loc.TimezoneID)

if err != nil {
return nil, fmt.Errorf("failed to query location data: %w", err)
}

return &loc, nil
}

// EnsureClusterPath sets the cluster's path field if it's currently empty
func EnsureClusterPath(database *sql.DB, clusterID, folderPath string) error {
// Check if cluster already has a path
var currentPath sql.NullString
err := database.QueryRow("SELECT path FROM cluster WHERE id = ?", clusterID).Scan(&currentPath)
if err != nil {
return fmt.Errorf("failed to query cluster: %w", err)
}

// If path is already set, skip
if currentPath.Valid && currentPath.String != "" {
return nil
}

// Normalize folder path
normalizedPath := NormalizeFolderPath(folderPath)

// Update cluster with normalized path
_, err = database.Exec(
"UPDATE cluster SET path = ?, last_modified = now() WHERE id = ?",
normalizedPath,
clusterID,
)
if err != nil {
return fmt.Errorf("failed to update cluster path: %w", err)
}

return nil
}

// scanClusterFiles recursively scans a folder for WAV files, excluding Clips_* subfolders
func scanClusterFiles(rootPath string, recursive bool) ([]string, error) {
var wavFiles []string

if recursive {
err := filepath.Walk(rootPath, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}

// Skip "Clips_*" directories
if info.IsDir() && strings.HasPrefix(info.Name(), "Clips_") {
return filepath.SkipDir
}

// Check for WAV files
if !info.IsDir() {
ext := strings.ToLower(filepath.Ext(path))
if ext == ".wav" && info.Size() > 0 {
wavFiles = append(wavFiles, path)
}
}

return nil
})

if err != nil {
return nil, err
}
} else {
// Non-recursive: scan only top level
entries, err := os.ReadDir(rootPath)
if err != nil {
return nil, err
}

for _, entry := range entries {
if !entry.IsDir() {
name := entry.Name()
ext := strings.ToLower(filepath.Ext(name))
if ext == ".wav" {
path := filepath.Join(rootPath, name)
if info, err := os.Stat(path); err == nil && info.Size() > 0 {
wavFiles = append(wavFiles, path)
}
}
}
}
}

// Sort for consistent processing order
sort.Strings(wavFiles)

return wavFiles, nil
}

// batchProcessFiles extracts metadata and calculates hashes for all files
func batchProcessFiles(wavFiles []string, location *LocationData) ([]*fileData, []FileImportError) {
var filesData []*fileData
var errors []FileImportError

// Step 1: Extract WAV metadata and hash in single pass
type wavInfo struct {
path string
metadata *WAVMetadata
hash string
err error
}

wavInfos := make([]wavInfo, len(wavFiles))
for i, path := range wavFiles {
metadata, hash, err := ParseWAVHeaderWithHash(path)
wavInfos[i] = wavInfo{path: path, metadata: metadata, hash: hash, err: err}
}

// Step 2: Collect filenames for batch timestamp parsing
var filenamesForParsing []string
var filenameIndices []int

for i, info := range wavInfos {
if info.err != nil {
errors = append(errors, FileImportError{
FileName: filepath.Base(info.path),
Error: info.err.Error(),
Stage: "parse",
})
continue
}

// Check if file has timestamp filename format
if HasTimestampFilename(info.path) {
filenamesForParsing = append(filenamesForParsing, filepath.Base(info.path))
filenameIndices = append(filenameIndices, i)
}
}

// Step 3: Parse filename timestamps in batch (if any)
filenameTimestampMap := make(map[int]time.Time) // Maps file index to timestamp

if len(filenamesForParsing) > 0 {
filenameTimestamps, err := ParseFilenameTimestamps(filenamesForParsing)
if err != nil {
// If batch parsing fails, record error for all files
for _, idx := range filenameIndices {
errors = append(errors, FileImportError{
FileName: filepath.Base(wavInfos[idx].path),
Error: fmt.Sprintf("filename timestamp parsing failed: %v", err),
Stage: "parse",
})
}
} else {
// Apply timezone offset
adjustedTimestamps, err := ApplyTimezoneOffset(filenameTimestamps, location.TimezoneID)
if err != nil {
for _, idx := range filenameIndices {
errors = append(errors, FileImportError{
FileName: filepath.Base(wavInfos[idx].path),
Error: fmt.Sprintf("timezone offset failed: %v", err),
Stage: "parse",
})
}
} else {
// Build map from file index to timestamp
for j, idx := range filenameIndices {
filenameTimestampMap[idx] = adjustedTimestamps[j]
}
}
}
}

// Step 4: Process each file
for i, info := range wavInfos {
if info.err != nil {
continue // Already recorded error
}

// Determine timestamp
var timestampLocal time.Time
var isAudioMoth bool
var mothData *AudioMothData

// Try AudioMoth comment first
if IsAudioMoth(info.metadata.Comment, info.metadata.Artist) {
isAudioMoth = true
var parseErr error
mothData, parseErr = ParseAudioMothComment(info.metadata.Comment)
if parseErr == nil {
timestampLocal = mothData.Timestamp
} else {
// AudioMoth detected but parsing failed - try filename
errors = append(errors, FileImportError{
FileName: filepath.Base(info.path),
Error: fmt.Sprintf("AudioMoth comment parsing failed: %v", parseErr),
Stage: "parse",
})
}
}

// If no AudioMoth timestamp, use filename timestamp
if timestampLocal.IsZero() {
if ts, ok := filenameTimestampMap[i]; ok {
timestampLocal = ts
}
}

// If still no timestamp, use file modification time as fallback
if timestampLocal.IsZero() {
if !info.metadata.FileModTime.IsZero() {
// Assume FileModTime is already in location timezone
// (recorder was at the location when it recorded)
timestampLocal = info.metadata.FileModTime
}
}

// If still no timestamp, skip file
if timestampLocal.IsZero() {
errors = append(errors, FileImportError{
FileName: filepath.Base(info.path),
Error: "no timestamp available (not AudioMoth, filename not parseable, and file mod time missing)",
Stage: "parse",
})
continue
}

// Calculate astronomical data
astroData := CalculateAstronomicalData(
timestampLocal.UTC(),
info.metadata.Duration,
location.Latitude,
location.Longitude,
)

// Add to results
filesData = append(filesData, &fileData{
FileName: filepath.Base(info.path),
Hash: info.hash,
Duration: info.metadata.Duration,
SampleRate: info.metadata.SampleRate,
TimestampLocal: timestampLocal,
IsAudioMoth: isAudioMoth,
MothData: mothData,
AstroData: astroData,
})
}

return filesData, errors
}

// insertClusterFiles inserts all file data into database in a single transaction
func insertClusterFiles(
database *sql.DB,
filesData []*fileData,
datasetID, clusterID, locationID string,
) (imported, skipped int, errors []FileImportError, err error) {
// Begin logged transaction
ctx := context.Background()
tx, err := db.BeginLoggedTx(ctx, database, "import_audio_files")
if err != nil {
return 0, 0, nil, fmt.Errorf("failed to begin transaction: %w", err)
}
defer tx.Rollback() // Rollback if not committed

// Prepare statements
fileStmt, err := tx.PrepareContext(ctx, `
INSERT INTO file (
id, file_name, xxh64_hash, location_id, timestamp_local,
cluster_id, duration, sample_rate, maybe_solar_night, maybe_civil_night,
moon_phase, created_at, last_modified, active
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, now(), now(), true)
`)
if err != nil {
return 0, 0, nil, fmt.Errorf("failed to prepare file statement: %w", err)
}
defer fileStmt.Close()

datasetStmt, err := tx.PrepareContext(ctx, `
INSERT INTO file_dataset (file_id, dataset_id, created_at, last_modified)
VALUES (?, ?, now(), now())
`)
if err != nil {
return 0, 0, nil, fmt.Errorf("failed to prepare dataset statement: %w", err)
}
defer datasetStmt.Close()

mothStmt, err := tx.PrepareContext(ctx, `
INSERT INTO moth_metadata (
file_id, timestamp, recorder_id, gain, battery_v, temp_c,
created_at, last_modified, active
) VALUES (?, ?, ?, ?, ?, ?, now(), now(), true)
`)
if err != nil {
return 0, 0, nil, fmt.Errorf("failed to prepare moth statement: %w", err)
}
defer mothStmt.Close()

// Insert each file
for _, fd := range filesData {
// Check for duplicate hash
var exists bool
err = tx.QueryRowContext(ctx,
"SELECT EXISTS(SELECT 1 FROM file WHERE xxh64_hash = ?)",
fd.Hash,
).Scan(&exists)

if err != nil {
errors = append(errors, FileImportError{
FileName: fd.FileName,
Error: fmt.Sprintf("duplicate check failed: %v", err),
Stage: "insert",
})
continue
}

if exists {
skipped++
continue
}

// Generate file ID
fileID, err := GenerateLongID()
if err != nil {
errors = append(errors, FileImportError{
FileName: fd.FileName,
Error: fmt.Sprintf("ID generation failed: %v", err),
Stage: "insert",
})
continue
}

// Insert file record
_, err = fileStmt.ExecContext(ctx,
fileID, fd.FileName, fd.Hash, locationID,
fd.TimestampLocal, clusterID, fd.Duration, fd.SampleRate,
fd.AstroData.SolarNight, fd.AstroData.CivilNight, fd.AstroData.MoonPhase,
)
if err != nil {
errors = append(errors, FileImportError{
FileName: fd.FileName,
Error: fmt.Sprintf("file insert failed: %v", err),
Stage: "insert",
})
continue
}

// Insert file_dataset junction (ALWAYS)
_, err = datasetStmt.ExecContext(ctx, fileID, datasetID)
if err != nil {
errors = append(errors, FileImportError{
FileName: fd.FileName,
Error: fmt.Sprintf("file_dataset insert failed: %v", err),
Stage: "insert",
})
continue
}

// If AudioMoth, insert moth_metadata
if fd.IsAudioMoth && fd.MothData != nil {
_, err = mothStmt.ExecContext(ctx,
fileID,
fd.MothData.Timestamp,
&fd.MothData.RecorderID,
&fd.MothData.Gain,
&fd.MothData.BatteryV,
&fd.MothData.TempC,
)
if err != nil {
errors = append(errors, FileImportError{
FileName: fd.FileName,
Error: fmt.Sprintf("moth_metadata insert failed: %v", err),
Stage: "insert",
})
continue
}
}

imported++
}

// Commit transaction
err = tx.Commit()
if err != nil {
return 0, 0, errors, fmt.Errorf("transaction commit failed: %w", err)
}

return imported, skipped, errors, nil
}
file addition: clip_times_test.go (----------)

[0.1]

package utils

import (
"math"
"testing"
)

// Reference values verified against opensoundscape.utils.generate_clip_times_df
// at https://github.com/kitzeslab/opensoundscape/blob/master/opensoundscape/utils.py

func TestGenerateClipTimes_FullModeBasic(t *testing.T) {
// full_duration=10, clip_duration=4, overlap=0.5, final="full"
// increment = 3.5
// raw starts: 0, 3.5, 7 (next would be 10.5 ≥ 10)
// raw ends: 4, 7.5, 11
// "full": last clip start shifts back by (11-10)=1 → start=6, end=10
// → [(0,4), (3.5,7.5), (6,10)]
got, err := GenerateClipTimes(10, 4, 0.5, FinalClipFull, 10)
if err != nil {
t.Fatal(err)
}
want := []ClipWindow{{0, 4}, {3.5, 7.5}, {6, 10}}
assertClips(t, got, want)
}

func TestGenerateClipTimes_NoneMode(t *testing.T) {
// final="none": drop any clip whose end > full_duration.
// full=10, dur=4, overlap=0: starts 0,4,8; ends 4,8,12 → keep (0,4),(4,8)
got, err := GenerateClipTimes(10, 4, 0, FinalClipNone, 10)
if err != nil {
t.Fatal(err)
}
assertClips(t, got, []ClipWindow{{0, 4}, {4, 8}})
}

func TestGenerateClipTimes_RemainderMode(t *testing.T) {
// full=10, dur=4, overlap=0: starts 0,4,8; ends 4,8,12
// remainder: trim 12 → 10. → (0,4),(4,8),(8,10)
got, err := GenerateClipTimes(10, 4, 0, FinalClipRemainder, 10)
if err != nil {
t.Fatal(err)
}
assertClips(t, got, []ClipWindow{{0, 4}, {4, 8}, {8, 10}})
}

func TestGenerateClipTimes_ExtendMode(t *testing.T) {
got, err := GenerateClipTimes(10, 4, 0, FinalClipExtend, 10)
if err != nil {
t.Fatal(err)
}
assertClips(t, got, []ClipWindow{{0, 4}, {4, 8}, {8, 12}})
}

func TestGenerateClipTimes_AudioShorterThanClip(t *testing.T) {
// full=2, dur=4, overlap=0, final="full":
// raw start=0, end=4; end > full=2 → start shifts to 0-(4-2)=-2 → clamped to 0;
// end=2 → single clip (0,2)
got, err := GenerateClipTimes(2, 4, 0, FinalClipFull, 10)
if err != nil {
t.Fatal(err)
}
assertClips(t, got, []ClipWindow{{0, 2}})
}

func TestGenerateClipTimes_DedupAfterFullShift(t *testing.T) {
// full=8, dur=4, overlap=0:
// raw starts 0,4; ends 4,8 — no shift needed; output (0,4),(4,8).
// (Tests the no-duplicate path.)
got, err := GenerateClipTimes(8, 4, 0, FinalClipFull, 10)
if err != nil {
t.Fatal(err)
}
assertClips(t, got, []ClipWindow{{0, 4}, {4, 8}})
}

func TestGenerateClipTimes_InvalidArgs(t *testing.T) {
_, err := GenerateClipTimes(10, 0, 0, FinalClipFull, 10)
if err == nil {
t.Error("expected error for clip_duration=0")
}
_, err = GenerateClipTimes(10, 4, 4, FinalClipFull, 10)
if err == nil {
t.Error("expected error for clip_overlap >= clip_duration")
}
_, err = GenerateClipTimes(0, 4, 0, FinalClipFull, 10)
if err == nil {
t.Error("expected error for full_duration=0")
}
}

func assertClips(t *testing.T, got, want []ClipWindow) {
t.Helper()
if len(got) != len(want) {
t.Fatalf("len(got)=%d, len(want)=%d\ngot=%v\nwant=%v", len(got), len(want), got, want)
}
for i := range got {
if math.Abs(got[i].Start-want[i].Start) > 1e-9 || math.Abs(got[i].End-want[i].End) > 1e-9 {
t.Errorf("clip %d: got (%v,%v), want (%v,%v)", i, got[i].Start, got[i].End, want[i].Start, want[i].End)
}
}
}
file addition: clip_times.go (----------)

[0.1]

package utils

import (
"fmt"
"math"
)

// ClipWindow is a fixed-duration time window for one audio file.
type ClipWindow struct {
Start float64
End float64
}

// FinalClipMode controls how the trailing partial clip is handled.
// Mirrors opensoundscape.utils.generate_clip_times_df:
// - FinalClipNone: discard any clip whose end exceeds full_duration
// - FinalClipRemainder: trim the final clip's end to full_duration (shorter clip)
// - FinalClipFull: shift the final clip's start back so its end equals full_duration
// - FinalClipExtend: keep the final clip extending beyond full_duration
type FinalClipMode int

const (
FinalClipNone FinalClipMode = iota
FinalClipRemainder
FinalClipFull
FinalClipExtend
)

// ParseFinalClipMode parses a CLI flag value.
func ParseFinalClipMode(s string) (FinalClipMode, error) {
switch s {
case "none", "":
return FinalClipNone, nil
case "remainder":
return FinalClipRemainder, nil
case "full":
return FinalClipFull, nil
case "extend":
return FinalClipExtend, nil
default:
return 0, fmt.Errorf("invalid final-clip mode %q (want one of: none, remainder, full, extend)", s)
}
}

// roundTo rounds x to `precision` decimal places. Mirrors numpy.round behaviour.
// Pass precision < 0 to skip rounding.
func roundTo(x float64, precision int) float64 {
if precision < 0 {
return x
}
scale := math.Pow(10, float64(precision))
return math.Round(x*scale) / scale
}

// GenerateClipTimes ports opensoundscape.utils.generate_clip_times_df.
//
// Args mirror the Python signature: clipDuration > 0, clipOverlap in [0, clipDuration),
// fullDuration > 0. roundingPrecision defaults to 10 in OPSO; pass -1 to skip rounding.
//
// Result is the list of (start, end) windows for one audio file, with duplicates
// removed (which can happen under FinalClipFull when the shifted final clip
// coincides with the previous one).
func GenerateClipTimes(fullDuration, clipDuration, clipOverlap float64, finalClip FinalClipMode, roundingPrecision int) ([]ClipWindow, error) {
if clipDuration <= 0 {
return nil, fmt.Errorf("clipDuration must be > 0, got %v", clipDuration)
}
if clipOverlap < 0 || clipOverlap >= clipDuration {
return nil, fmt.Errorf("clipOverlap must be in [0, clipDuration), got %v with clipDuration=%v", clipOverlap, clipDuration)
}
if fullDuration <= 0 {
return nil, fmt.Errorf("fullDuration must be > 0, got %v", fullDuration)
}

increment := clipDuration - clipOverlap

// numpy.arange(0, fullDuration, increment): half-open interval
// stop when start >= fullDuration
var starts []float64
for s := 0.0; s < fullDuration; s += increment {
starts = append(starts, roundTo(s, roundingPrecision))
}
if len(starts) == 0 {
// Defensive — shouldn't happen since fullDuration > 0 and increment > 0
starts = []float64{0}
}
ends := make([]float64, len(starts))
for i, s := range starts {
ends[i] = s + clipDuration
}

switch finalClip {
case FinalClipNone:
// Drop any window whose end exceeds fullDuration.
kept := make([]ClipWindow, 0, len(starts))
for i := range starts {
if ends[i] <= fullDuration {
kept = append(kept, ClipWindow{Start: starts[i], End: ends[i]})
}
}
return dedupClips(kept), nil

case FinalClipRemainder:
// Trim ends > fullDuration down to fullDuration.
out := make([]ClipWindow, 0, len(starts))
for i := range starts {
e := ends[i]
if e > fullDuration {
e = fullDuration
}
out = append(out, ClipWindow{Start: starts[i], End: e})
}
return dedupClips(out), nil

case FinalClipFull:
// Shift any window whose end exceeds fullDuration back so its end == fullDuration.
// Keep clip length == clipDuration. Clamp start to >= 0 (audio shorter than clip_duration).
out := make([]ClipWindow, 0, len(starts))
for i := range starts {
s := starts[i]
e := ends[i]
if e > fullDuration {
delta := e - fullDuration
s -= delta
e = fullDuration
if s < 0 {
s = 0
}
}
out = append(out, ClipWindow{Start: s, End: e})
}
return dedupClips(out), nil

case FinalClipExtend:
// Keep ends as-is, even past fullDuration.
out := make([]ClipWindow, 0, len(starts))
for i := range starts {
out = append(out, ClipWindow{Start: starts[i], End: ends[i]})
}
return dedupClips(out), nil

default:
return nil, fmt.Errorf("invalid FinalClipMode %d", finalClip)
}
}

// dedupClips removes consecutive duplicates while preserving order.
// Matches pandas.DataFrame.drop_duplicates() at the end of OPSO's
// generate_clip_times_df.
func dedupClips(in []ClipWindow) []ClipWindow {
if len(in) <= 1 {
return in
}
seen := make(map[ClipWindow]bool, len(in))
out := make([]ClipWindow, 0, len(in))
for _, c := range in {
if !seen[c] {
seen[c] = true
out = append(out, c)
}
}
return out
}
file addition: audiomoth_parser_test.go (----------)

[0.1]

package utils

import (
"skraak/db"
"strings"
"testing"
"time"
)

func TestIsAudioMoth(t *testing.T) {
t.Run("should identify AudioMoth files by artist field", func(t *testing.T) {
if !IsAudioMoth("", "AudioMoth") {
t.Error("Should identify AudioMoth by artist field")
}
if !IsAudioMoth("", "AudioMoth 123456") {
t.Error("Should identify AudioMoth with ID in artist field")
}
if IsAudioMoth("", "Other Artist") {
t.Error("Should not identify non-AudioMoth artist")
}
})

t.Run("should identify AudioMoth files by comment field", func(t *testing.T) {
if !IsAudioMoth("Recorded by AudioMoth...", "") {
t.Error("Should identify AudioMoth by comment field")
}
if IsAudioMoth("Regular recording comment", "") {
t.Error("Should not identify non-AudioMoth comment")
}
})

t.Run("should handle missing metadata", func(t *testing.T) {
if IsAudioMoth("", "") {
t.Error("Should not identify empty strings as AudioMoth")
}
})

t.Run("should be case insensitive", func(t *testing.T) {
if !IsAudioMoth("", "audiomoth") {
t.Error("Should be case insensitive")
}
if !IsAudioMoth("", "AUDIOMOTH") {
t.Error("Should be case insensitive")
}
})
}

func TestParseAudioMothComment(t *testing.T) {
t.Run("should parse a valid structured AudioMoth comment", func(t *testing.T) {
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C."

result, err := ParseAudioMothComment(comment)
if err != nil {
t.Fatalf("Failed to parse comment: %v", err)
}

// Check timestamp (should be in UTC+13)
expected := time.Date(2025, 2, 24, 21, 0, 0, 0, time.FixedZone("UTC+13", 13*3600))
if !result.Timestamp.Equal(expected) {
t.Errorf("Timestamp incorrect: got %v, want %v", result.Timestamp, expected)
}

// Convert to UTC and verify
utc := result.Timestamp.UTC()
expectedUTC := time.Date(2025, 2, 24, 8, 0, 0, 0, time.UTC)
if !utc.Equal(expectedUTC) {
t.Errorf("UTC timestamp incorrect: got %v, want %v", utc, expectedUTC)
}

if result.RecorderID != "248AB50153AB0549" {
t.Errorf("RecorderID incorrect: got %s, want 248AB50153AB0549", result.RecorderID)
}

if result.Gain != db.GainMedium {
t.Errorf("Gain incorrect: got %s, want %s", result.Gain, db.GainMedium)
}

if result.BatteryV != 4.3 {
t.Errorf("BatteryV incorrect: got %f, want 4.3", result.BatteryV)
}

if result.TempC != 15.8 {
t.Errorf("TempC incorrect: got %f, want 15.8", result.TempC)
}
})

t.Run("should return error for invalid comments", func(t *testing.T) {
invalidComments := []string{
"Not an AudioMoth comment",
"Recorded at invalid time format",
"Short comment",
"",
"AudioMoth without proper format",
}

for _, comment := range invalidComments {
_, err := ParseAudioMothComment(comment)
if err == nil {
t.Errorf("Expected error for invalid comment: %s", comment)
}
}
})

t.Run("should handle different timezone formats", func(t *testing.T) {
commentUTCMinus := "Recorded at 10:30:45 15/06/2024 (UTC-5) by AudioMoth 123456789ABCDEF0 at high gain while battery was 3.9V and temperature was 22.1C."

result, err := ParseAudioMothComment(commentUTCMinus)
if err != nil {
t.Fatalf("Failed to parse comment: %v", err)
}

// Check timestamp is in UTC-5
expected := time.Date(2024, 6, 15, 10, 30, 45, 0, time.FixedZone("UTC-5", -5*3600))
if !result.Timestamp.Equal(expected) {
t.Errorf("Timestamp incorrect: got %v, want %v", result.Timestamp, expected)
}

if result.Gain != db.GainHigh {
t.Errorf("Gain incorrect: got %s, want %s", result.Gain, db.GainHigh)
}

if result.BatteryV != 3.9 {
t.Errorf("BatteryV incorrect: got %f, want 3.9", result.BatteryV)
}

if result.TempC != 22.1 {
t.Errorf("TempC incorrect: got %f, want 22.1", result.TempC)
}
})

t.Run("should parse all gain levels", func(t *testing.T) {
testCases := []struct {
gainStr string
expected db.GainLevel
}{
{"low", db.GainLow},
{"low-medium", db.GainLowMedium},
{"medium", db.GainMedium},
{"medium-high", db.GainMediumHigh},
{"high", db.GainHigh},
}

for _, tc := range testCases {
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at " + tc.gainStr + " gain while battery was 4.3V and temperature was 15.8C."
result, err := ParseAudioMothComment(comment)
if err != nil {
t.Errorf("Failed to parse comment with gain %s: %v", tc.gainStr, err)
continue
}

if result.Gain != tc.expected {
t.Errorf("Gain incorrect for %s: got %s, want %s", tc.gainStr, result.Gain, tc.expected)
}
}
})

t.Run("should handle negative temperatures", func(t *testing.T) {
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was -5.2C."

result, err := ParseAudioMothComment(comment)
if err != nil {
t.Fatalf("Failed to parse comment: %v", err)
}

if result.TempC != -5.2 {
t.Errorf("TempC incorrect: got %f, want -5.2", result.TempC)
}
})

t.Run("should fallback to legacy parsing", func(t *testing.T) {
// Legacy format might not match structured regex but should be parseable
// Test with a legacy-style comment
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C"

// Note: The legacy parser expects the exact structure, so this might fail
// if the comment doesn't match. Adjust test as needed based on actual legacy format.
result, err := ParseAudioMothComment(comment)

// Either succeeds or fails gracefully
if err == nil {
// If it succeeds, verify basic fields
if result.RecorderID == "" {
t.Error("RecorderID should not be empty")
}
}
})
}

func TestParseGainLevel(t *testing.T) {
testCases := []struct {
input string
expected db.GainLevel
wantErr bool
}{
{"low", db.GainLow, false},
{"LOW", db.GainLow, false},
{" low ", db.GainLow, false},
{"low-medium", db.GainLowMedium, false},
{"medium", db.GainMedium, false},
{"medium-high", db.GainMediumHigh, false},
{"high", db.GainHigh, false},
{"invalid", "", true},
{"", "", true},
{"ultra", "", true},
}

for _, tc := range testCases {
t.Run(tc.input, func(t *testing.T) {
result, err := parseGainLevel(tc.input)

if tc.wantErr {
if err == nil {
t.Errorf("Expected error for input %q, got nil", tc.input)
}
} else {
if err != nil {
t.Errorf("Unexpected error for input %q: %v", tc.input, err)
}
if result != tc.expected {
t.Errorf("Result incorrect for %q: got %s, want %s", tc.input, result, tc.expected)
}
}
})
}
}

func TestParseAudioMothTimestamp(t *testing.T) {
t.Run("should parse standard timestamp format", func(t *testing.T) {
result, err := parseAudioMothTimestamp("21:00:00", "24/02/2025", "UTC+13")
if err != nil {
t.Fatalf("Failed to parse timestamp: %v", err)
}

expected := time.Date(2025, 2, 24, 21, 0, 0, 0, time.FixedZone("UTC+13", 13*3600))
if !result.Equal(expected) {
t.Errorf("Timestamp incorrect: got %v, want %v", result, expected)
}
})

t.Run("should parse timestamp with +HH format", func(t *testing.T) {
result, err := parseAudioMothTimestamp("10:30:45", "15/06/2024", "+13")
if err != nil {
t.Fatalf("Failed to parse timestamp: %v", err)
}

expected := time.Date(2024, 6, 15, 10, 30, 45, 0, time.FixedZone("UTC+13", 13*3600))
if !result.Equal(expected) {
t.Errorf("Timestamp incorrect: got %v, want %v", result, expected)
}
})

t.Run("should parse negative timezone offset", func(t *testing.T) {
result, err := parseAudioMothTimestamp("10:30:45", "15/06/2024", "UTC-5")
if err != nil {
t.Fatalf("Failed to parse timestamp: %v", err)
}

expected := time.Date(2024, 6, 15, 10, 30, 45, 0, time.FixedZone("UTC-5", -5*3600))
if !result.Equal(expected) {
t.Errorf("Timestamp incorrect: got %v, want %v", result, expected)
}
})

t.Run("should handle invalid time format", func(t *testing.T) {
_, err := parseAudioMothTimestamp("25:00:00", "15/06/2024", "UTC+13")
// Note: Go's time.Date will normalize invalid times, so this might not error
// The error would be caught if the format doesn't match
_ = err
})

t.Run("should handle invalid date format", func(t *testing.T) {
_, err := parseAudioMothTimestamp("10:30:45", "32/13/2024", "UTC+13")
// Note: Go's time.Date will normalize invalid dates
_ = err
})
}

func TestStructuredVsLegacyParsing(t *testing.T) {
t.Run("should prefer structured parsing", func(t *testing.T) {
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C."

result, err := ParseAudioMothComment(comment)
if err != nil {
t.Fatalf("Failed to parse comment: %v", err)
}

// Verify it parsed correctly
if result.RecorderID != "248AB50153AB0549" {
t.Errorf("RecorderID incorrect: got %s, want 248AB50153AB0549", result.RecorderID)
}
})

t.Run("should handle legacy format", func(t *testing.T) {
// Create a comment that matches legacy space-separated format
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C."

// The structured parser should handle this
result, err := ParseAudioMothComment(comment)
if err != nil {
// If structured fails, legacy should catch it
// (though for this format, structured should work)
t.Logf("Note: Structured parsing failed, expected legacy to handle: %v", err)
} else {
if result.RecorderID == "" {
t.Error("RecorderID should not be empty")
}
}
})
}

func TestAudioMothCommentEdgeCases(t *testing.T) {
t.Run("should handle extra whitespace", func(t *testing.T) {
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C."

// Depending on implementation, this might or might not parse
_, err := ParseAudioMothComment(comment)
if err != nil {
// Expected - structured regex is strict
t.Logf("Extra whitespace causes parsing to fail (expected): %v", err)
}
})

t.Run("should handle different case in gain", func(t *testing.T) {
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at MEDIUM gain while battery was 4.3V and temperature was 15.8C."

result, err := ParseAudioMothComment(comment)
if err == nil {
if result.Gain != db.GainMedium {
t.Errorf("Gain should be normalized: got %s, want %s", result.Gain, db.GainMedium)
}
}
})

t.Run("should handle non-hex recorder ID via legacy parser", func(t *testing.T) {
// Structured regex expects [A-F0-9]+ hex format and will not match
// Legacy parser will catch this and parse it (more lenient)
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth GGGGGGGGGGGGGGGG at medium gain while battery was 4.3V and temperature was 15.8C."

result, err := ParseAudioMothComment(comment)
// Legacy parser is lenient and accepts any recorder ID
if err != nil {
t.Fatalf("Legacy parser should handle non-hex recorder ID: %v", err)
}

// Verify it parsed the recorder ID (even though it's not valid hex)
if result.RecorderID != "GGGGGGGGGGGGGGGG" {
t.Errorf("RecorderID incorrect: got %s, want GGGGGGGGGGGGGGGG", result.RecorderID)
}
})

t.Run("should handle recorder ID of different lengths", func(t *testing.T) {
// Short ID
comment := "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth ABCD at medium gain while battery was 4.3V and temperature was 15.8C."

result, err := ParseAudioMothComment(comment)
if err != nil {
t.Fatalf("Failed to parse comment with short ID: %v", err)
}

if !strings.Contains(result.RecorderID, "ABCD") {
t.Errorf("RecorderID should contain ABCD, got %s", result.RecorderID)
}
})
}
file addition: audiomoth_parser.go (----------)

[0.1]

package utils

import (
"fmt"
"regexp"
"strconv"
"strings"
"time"

"skraak/db"
)

// AudioMothData contains parsed data from AudioMoth comment field
type AudioMothData struct {
Timestamp time.Time
RecorderID string
Gain db.GainLevel
BatteryV float64
TempC float64
}

// AudioMoth comment example:
// "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C."

var (
// Pattern to detect AudioMoth comments
audiomothPattern = regexp.MustCompile(`(?i)AudioMoth`)

// Pattern to extract structured data
// Matches: "Recorded at HH:MM:SS DD/MM/YYYY (UTC±HH) by AudioMoth HEXID at GAIN gain while battery was X.XV and temperature was Y.YC."
structuredPattern = regexp.MustCompile(
`Recorded at (\d{2}:\d{2}:\d{2}) (\d{2}/\d{2}/\d{4}) $UTC([+-]\d+)$ by AudioMoth ([A-F0-9]+) at ([\w-]+) gain while battery was ([\d.]+)V and temperature was ([-\d.]+)C`,
)
)

// IsAudioMoth checks if the comment or artist field indicates an AudioMoth recording
// IsAudioMoth detects if WAV file is from AudioMoth recorder
func IsAudioMoth(comment, artist string) bool {
return audiomothPattern.MatchString(comment) || audiomothPattern.MatchString(artist)
}

// ParseAudioMothComment parses structured AudioMoth comment field
// Returns parsed data or error if parsing fails
// ParseAudioMothComment extracts timestamp, gain, battery, and temperature from AudioMoth comment
func ParseAudioMothComment(comment string) (*AudioMothData, error) {
// Try structured parsing first (newer format)
if data, err := parseStructuredComment(comment); err == nil {
return data, nil
}

// Fallback to legacy space-separated parsing
return parseLegacyComment(comment)
}

// parseStructuredComment parses newer AudioMoth comment format using regex
func parseStructuredComment(comment string) (*AudioMothData, error) {
matches := structuredPattern.FindStringSubmatch(comment)
if matches == nil {
return nil, fmt.Errorf("comment does not match structured AudioMoth format")
}

// Extract matched groups
timeStr := matches[1] // HH:MM:SS
dateStr := matches[2] // DD/MM/YYYY
timezoneStr := matches[3] // ±HH
recorderID := matches[4] // Hex ID
gainStr := matches[5] // gain level
batteryStr := matches[6] // battery voltage
tempStr := matches[7] // temperature

// Parse timestamp
timestamp, err := parseAudioMothTimestamp(timeStr, dateStr, timezoneStr)
if err != nil {
return nil, fmt.Errorf("failed to parse timestamp: %w", err)
}

// Parse gain
gain, err := parseGainLevel(gainStr)
if err != nil {
return nil, fmt.Errorf("failed to parse gain: %w", err)
}

// Parse battery voltage
batteryV, err := strconv.ParseFloat(batteryStr, 64)
if err != nil {
return nil, fmt.Errorf("failed to parse battery voltage: %w", err)
}

// Parse temperature
tempC, err := strconv.ParseFloat(tempStr, 64)
if err != nil {
return nil, fmt.Errorf("failed to parse temperature: %w", err)
}

return &AudioMothData{
Timestamp: timestamp,
RecorderID: recorderID,
Gain: gain,
BatteryV: batteryV,
TempC: tempC,
}, nil
}

// parseLegacyComment parses older AudioMoth comment format (space-separated)
// Example: "Recorded at 21:00:00 24/02/2025 (UTC+13) by AudioMoth 248AB50153AB0549 at medium gain while battery was 4.3V and temperature was 15.8C."
func parseLegacyComment(comment string) (*AudioMothData, error) {
parts := strings.Fields(comment)

if len(parts) < 10 {
return nil, fmt.Errorf("comment has insufficient parts (got %d, need at least 10)", len(parts))
}

// 0-based indices after split by space:
// parts[2] = "21:00:00" (time HH:MM:SS)
// parts[3] = "24/02/2025" (date DD/MM/YYYY)
// parts[4] = "(UTC+13)" (timezone offset)
// parts[7] = "248AB50153AB0549" (moth ID)
// parts[9] = "medium" (gain)
// parts[len-5] = "4.3V" (battery voltage)
// parts[len-1] = "15.8C." (temperature)

timeStr := parts[2]
dateStr := parts[3]
timezoneStr := strings.Trim(parts[4], "()")
recorderID := parts[7]
gainStr := parts[9]

// Parse timestamp
timestamp, err := parseAudioMothTimestamp(timeStr, dateStr, timezoneStr)
if err != nil {
return nil, fmt.Errorf("failed to parse timestamp: %w", err)
}

// Parse gain
gain, err := parseGainLevel(gainStr)
if err != nil {
return nil, fmt.Errorf("failed to parse gain: %w", err)
}

// Parse battery voltage (e.g., "4.3V")
batteryStr := parts[len(parts)-5]
batteryStr = strings.TrimSuffix(batteryStr, "V")
batteryV, err := strconv.ParseFloat(batteryStr, 64)
if err != nil {
return nil, fmt.Errorf("failed to parse battery voltage: %w", err)
}

// Parse temperature (e.g., "15.8C." or "15.8C")
tempStr := parts[len(parts)-1]
tempStr = strings.TrimSuffix(tempStr, ".")
tempStr = strings.TrimSuffix(tempStr, "C")
tempC, err := strconv.ParseFloat(tempStr, 64)
if err != nil {
return nil, fmt.Errorf("failed to parse temperature: %w", err)
}

return &AudioMothData{
Timestamp: timestamp,
RecorderID: recorderID,
Gain: gain,
BatteryV: batteryV,
TempC: tempC,
}, nil
}

// parseAudioMothTimestamp parses AudioMoth timestamp from time, date, and timezone strings
// timeStr: "HH:MM:SS"
// dateStr: "DD/MM/YYYY"
// timezoneStr: "UTC+13" or "+13"
func parseAudioMothTimestamp(timeStr, dateStr, timezoneStr string) (time.Time, error) {
// Parse time components
timeParts := strings.Split(timeStr, ":")
if len(timeParts) != 3 {
return time.Time{}, fmt.Errorf("invalid time format: %s", timeStr)
}
hour, _ := strconv.Atoi(timeParts[0])
minute, _ := strconv.Atoi(timeParts[1])
second, _ := strconv.Atoi(timeParts[2])

// Parse date components
dateParts := strings.Split(dateStr, "/")
if len(dateParts) != 3 {
return time.Time{}, fmt.Errorf("invalid date format: %s", dateStr)
}
day, _ := strconv.Atoi(dateParts[0])
month, _ := strconv.Atoi(dateParts[1])
year, _ := strconv.Atoi(dateParts[2])

// Parse timezone offset
timezoneStr = strings.TrimPrefix(timezoneStr, "UTC")
offsetHours, err := strconv.Atoi(timezoneStr)
if err != nil {
return time.Time{}, fmt.Errorf("invalid timezone offset: %s", timezoneStr)
}

// Create fixed timezone location
offsetSeconds := offsetHours * 3600
loc := time.FixedZone(fmt.Sprintf("UTC%+d", offsetHours), offsetSeconds)

// Construct timestamp
timestamp := time.Date(year, time.Month(month), day, hour, minute, second, 0, loc)

return timestamp, nil
}

// parseGainLevel converts string gain level to GainLevel enum
func parseGainLevel(gainStr string) (db.GainLevel, error) {
gainStr = strings.ToLower(strings.TrimSpace(gainStr))

switch gainStr {
case "low":
return db.GainLow, nil
case "low-medium":
return db.GainLowMedium, nil
case "medium":
return db.GainMedium, nil
case "medium-high":
return db.GainMediumHigh, nil
case "high":
return db.GainHigh, nil
default:
return "", fmt.Errorf("unknown gain level: %s", gainStr)
}
}
file addition: audio_player.go (----------)

[0.1]

package utils

import (
"bytes"
"encoding/binary"
"math"
"sync"

"github.com/ebitengine/oto/v3"
)

// AudioPlayer wraps oto for simple audio playback.
// The oto context is created once and reused across plays.
type AudioPlayer struct {
ctx *oto.Context
mu sync.Mutex
player *oto.Player
}

// NewAudioPlayer creates a new audio player with the given sample rate.
// Only one AudioPlayer should exist per process (oto allows one context).
func NewAudioPlayer(sampleRate int) (*AudioPlayer, error) {
op := &oto.NewContextOptions{
SampleRate: sampleRate,
ChannelCount: 1,
Format: oto.FormatSignedInt16LE,
}
ctx, readyChan, err := oto.NewContext(op)
if err != nil {
return nil, err
}
<-readyChan

return &AudioPlayer{ctx: ctx}, nil
}

// Play stops any current playback and starts playing the given samples.
// Samples are float64 in the range -1.0 to 1.0.
// Playback is non-blocking — audio plays in the background.
func (ap *AudioPlayer) Play(samples []float64, sampleRate int) {
ap.PlayAtSpeed(samples, sampleRate, 1.0)
}

// PlayAtSpeed plays samples at the given speed (1.0 = normal, 0.5 = half speed).
// Speed change is achieved by resampling the audio.
// Playback is non-blocking — audio plays in the background.
func (ap *AudioPlayer) PlayAtSpeed(samples []float64, sampleRate int, speed float64) {
ap.mu.Lock()
defer ap.mu.Unlock()

// Stop previous playback
if ap.player != nil {
ap.player.Pause()
ap.player = nil
}

// Resample if speed is not normal
if speed != 1.0 {
samples = Resample(samples, speed)
}

// Convert float64 samples to signed int16 LE bytes
buf := make([]byte, len(samples)*2)
for i, s := range samples {
// Clamp to [-1.0, 1.0]
if s > 1.0 {
s = 1.0
} else if s < -1.0 {
s = -1.0
}
v := int16(math.Round(s * 32767.0))
binary.LittleEndian.PutUint16(buf[i*2:], uint16(v))
}

ap.player = ap.ctx.NewPlayer(bytes.NewReader(buf))
ap.player.Play()
}

// IsPlaying returns true if audio is currently playing.
func (ap *AudioPlayer) IsPlaying() bool {
ap.mu.Lock()
defer ap.mu.Unlock()
return ap.player != nil && ap.player.IsPlaying()
}

// Stop stops any current playback.
func (ap *AudioPlayer) Stop() {
ap.mu.Lock()
defer ap.mu.Unlock()
if ap.player != nil {
ap.player.Pause()
ap.player = nil
}
}

// Close stops playback and releases the oto context.
func (ap *AudioPlayer) Close() {
ap.Stop()
}
file addition: astronomical_test.go (----------)

[0.1]

package utils

import (
"testing"
"time"
)

// Test location: Auckland, New Zealand (approx coordinates)
var testLocationAuckland = struct {
lat float64
lon float64
}{
lat: -36.8485,
lon: 174.7633,
}

// Test location: London, UK
var testLocationLondon = struct {
lat float64
lon float64
}{
lat: 51.5074,
lon: -0.1278,
}

func TestCalculateAstronomicalData(t *testing.T) {
t.Run("should return valid types for all fields", func(t *testing.T) {
// Winter midnight in Auckland (should be solar night)
winterMidnight := parseTime(t, "2024-06-15T12:00:00Z") // UTC midnight = noon in Auckland (winter)
duration := 60.0 // 1 minute

result := CalculateAstronomicalData(winterMidnight, duration, testLocationAuckland.lat, testLocationAuckland.lon)

// Check types exist
if result.MoonPhase < 0 || result.MoonPhase > 1 {
t.Errorf("MoonPhase out of range: got %f, want 0-1", result.MoonPhase)
}
})

t.Run("should return false for solar night during daytime hours", func(t *testing.T) {
// Summer midday in Auckland (should NOT be solar night)
summerMidday := parseTime(t, "2024-12-15T00:00:00Z") // UTC midnight = noon in Auckland (summer)
duration := 60.0 // 1 minute

result := CalculateAstronomicalData(summerMidday, duration, testLocationAuckland.lat, testLocationAuckland.lon)

// During summer midday, should NOT be solar night
if result.SolarNight {
t.Error("Expected SolarNight to be false during daytime")
}
if result.CivilNight {
t.Error("Expected CivilNight to be false during daytime")
}
})

t.Run("should handle different durations correctly", func(t *testing.T) {
timestamp := parseTime(t, "2024-06-15T10:00:00Z")
shortDuration := 30.0 // 30 seconds
longDuration := 3600.0 // 1 hour

shortResult := CalculateAstronomicalData(timestamp, shortDuration, testLocationAuckland.lat, testLocationAuckland.lon)
longResult := CalculateAstronomicalData(timestamp, longDuration, testLocationAuckland.lat, testLocationAuckland.lon)

// Both should have valid results
if shortResult.MoonPhase < 0 || shortResult.MoonPhase > 1 {
t.Errorf("Short duration moon phase out of range: %f", shortResult.MoonPhase)
}
if longResult.MoonPhase < 0 || longResult.MoonPhase > 1 {
t.Errorf("Long duration moon phase out of range: %f", longResult.MoonPhase)
}
})

t.Run("should calculate midpoint time correctly", func(t *testing.T) {
// Test that the calculation uses the midpoint, not the start time
startTime := parseTime(t, "2024-06-15T10:00:00Z")
duration := 7200.0 // 2 hours (midpoint would be 1 hour later)

result := CalculateAstronomicalData(startTime, duration, testLocationAuckland.lat, testLocationAuckland.lon)

// Should calculate based on 11:00 UTC, not 10:00 UTC
// Just verify we get valid boolean results
_ = result.SolarNight
_ = result.CivilNight
})

t.Run("should handle different geographical locations", func(t *testing.T) {
timestamp := parseTime(t, "2024-06-15T12:00:00Z") // UTC noon
duration := 60.0

aucklandResult := CalculateAstronomicalData(timestamp, duration, testLocationAuckland.lat, testLocationAuckland.lon)
londonResult := CalculateAstronomicalData(timestamp, duration, testLocationLondon.lat, testLocationLondon.lon)

// Both should have valid boolean results (don't compare values, just that they're boolean)
_ = aucklandResult.SolarNight
_ = londonResult.SolarNight

// Results might differ due to different timezones and seasons
// Auckland: UTC noon = midnight local (winter) = likely night
// London: UTC noon = 1pm local (summer) = likely day
})

t.Run("should return valid moon phase values", func(t *testing.T) {
timestamp := parseTime(t, "2024-06-15T12:00:00Z")
duration := 60.0

result := CalculateAstronomicalData(timestamp, duration, testLocationAuckland.lat, testLocationAuckland.lon)

if result.MoonPhase < 0 || result.MoonPhase > 1 {
t.Errorf("MoonPhase out of range: got %f, want 0-1", result.MoonPhase)
}
})

t.Run("should handle edge cases with very short durations", func(t *testing.T) {
timestamp := parseTime(t, "2024-06-15T12:00:00Z")
duration := 0.1 // 0.1 seconds

result := CalculateAstronomicalData(timestamp, duration, testLocationAuckland.lat, testLocationAuckland.lon)

if result.MoonPhase < 0 || result.MoonPhase > 1 {
t.Errorf("MoonPhase out of range: got %f, want 0-1", result.MoonPhase)
}
})

t.Run("should handle edge cases with very long durations", func(t *testing.T) {
timestamp := parseTime(t, "2024-06-15T12:00:00Z")
duration := 86400.0 // 24 hours

result := CalculateAstronomicalData(timestamp, duration, testLocationAuckland.lat, testLocationAuckland.lon)

if result.MoonPhase < 0 || result.MoonPhase > 1 {
t.Errorf("MoonPhase out of range: got %f, want 0-1", result.MoonPhase)
}
})
}

func TestBooleanLogicValidation(t *testing.T) {
t.Run("should never return invalid values for valid inputs", func(t *testing.T) {
testCases := []string{
"2024-06-15T06:00:00Z", // Dawn/dusk time
"2024-06-15T12:00:00Z", // Midday/midnight
"2024-06-15T18:00:00Z", // Evening/morning
"2024-12-15T06:00:00Z", // Summer dawn/dusk
"2024-12-15T12:00:00Z", // Summer midday/midnight
"2024-12-15T18:00:00Z", // Summer evening/morning
}

for _, timestamp := range testCases {
t.Run(timestamp, func(t *testing.T) {
ts := parseTime(t, timestamp)
result := CalculateAstronomicalData(ts, 60, testLocationAuckland.lat, testLocationAuckland.lon)

// These should be proper boolean types
_ = result.SolarNight
_ = result.CivilNight

// MoonPhase should be in valid range
if result.MoonPhase < 0 || result.MoonPhase > 1 {
t.Errorf("MoonPhase out of range: got %f, want 0-1", result.MoonPhase)
}
})
}
})

t.Run("should return false for daytime recordings", func(t *testing.T) {
// Test a known daytime period in Auckland (summer midday UTC)
summerMidday := parseTime(t, "2024-12-15T00:30:00Z") // Should be daytime in Auckland
duration := 60.0

result := CalculateAstronomicalData(summerMidday, duration, testLocationAuckland.lat, testLocationAuckland.lon)

// The key test: false values should remain false
if result.SolarNight && result.CivilNight {
// This would be unexpected during midday
t.Logf("Note: Both SolarNight and CivilNight are true (may be valid depending on season)")
}
})

t.Run("should return true for nighttime recordings", func(t *testing.T) {
// Test a known nighttime period in Auckland (winter midnight UTC)
winterMidnight := parseTime(t, "2024-06-15T12:30:00Z") // Should be nighttime in Auckland
duration := 60.0

result := CalculateAstronomicalData(winterMidnight, duration, testLocationAuckland.lat, testLocationAuckland.lon)

// The key test: true values should remain true
_ = result.SolarNight
_ = result.CivilNight
})
}

func TestCalculateMidpointTime(t *testing.T) {
t.Run("should calculate midpoint correctly", func(t *testing.T) {
startTime := parseTime(t, "2024-06-15T10:00:00Z")
duration := 3600.0 // 1 hour

midpoint := CalculateMidpointTime(startTime, duration)
expected := parseTime(t, "2024-06-15T10:30:00Z")

if !midpoint.Equal(expected) {
t.Errorf("Midpoint incorrect: got %v, want %v", midpoint, expected)
}
})

t.Run("should handle short durations", func(t *testing.T) {
startTime := parseTime(t, "2024-06-15T10:00:00Z")
duration := 10.0 // 10 seconds

midpoint := CalculateMidpointTime(startTime, duration)
expected := parseTime(t, "2024-06-15T10:00:05Z")

if !midpoint.Equal(expected) {
t.Errorf("Midpoint incorrect: got %v, want %v", midpoint, expected)
}
})
}

// Helper function to parse time strings
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
parsed, err := time.Parse(time.RFC3339, s)
if err != nil {
t.Fatalf("Failed to parse time %s: %v", s, err)
}
return parsed
}
file addition: astronomical.go (----------)

[0.1]

package utils

import (
"time"

"github.com/sixdouglas/suncalc"
)

// AstronomicalData contains calculated astronomical data for a recording
type AstronomicalData struct {
SolarNight bool // True if recording midpoint is between sunset and sunrise
CivilNight bool // True if recording midpoint is between dusk and dawn (6° below horizon)
MoonPhase float64 // 0.00=New Moon, 0.25=First Quarter, 0.50=Full Moon, 0.75=Last Quarter
}

// CalculateAstronomicalData calculates astronomical data for a recording.
// Uses the recording MIDPOINT time (not start time) for calculations.
//
// Parameters:
// - timestampUTC: Recording start time in UTC
// - durationSec: Recording duration in seconds
// - lat, lon: Location coordinates in decimal degrees
//
// Returns:
// - solarNight: true if recording midpoint is between sunset and sunrise
// - civilNight: true if recording midpoint is between dusk and dawn
// - moonPhase: 0.00-1.00 representing moon phase (0=New, 0.5=Full)
func CalculateAstronomicalData(
timestampUTC time.Time,
durationSec float64,
lat, lon float64,
) AstronomicalData {
// Calculate recording MIDPOINT (not start time)
midpoint := timestampUTC.Add(time.Duration(durationSec/2) * time.Second)

// Get solar times for midpoint date
times := suncalc.GetTimes(midpoint, lat, lon)

// Solar night: between sunset and sunrise
// Note: Handle day/night transitions properly
sunrise := times[suncalc.Sunrise].Value
sunset := times[suncalc.Sunset].Value
solarNight := isBetweenSunTimes(midpoint, sunset, sunrise)

// Civil night: between dusk and dawn (6° below horizon)
dawn := times[suncalc.Dawn].Value
dusk := times[suncalc.Dusk].Value
civilNight := isBetweenSunTimes(midpoint, dusk, dawn)

// Moon phase: 0.00=New Moon, 0.25=First Quarter, 0.50=Full Moon, 0.75=Last Quarter
moonIllum := suncalc.GetMoonIllumination(midpoint)
moonPhase := moonIllum.Phase

return AstronomicalData{
SolarNight: solarNight,
CivilNight: civilNight,
MoonPhase: moonPhase,
}
}

// isBetweenSunTimes determines if a time is between sunset/dusk and sunrise/dawn
// Handles the case where the night period crosses midnight
func isBetweenSunTimes(t, evening, morning time.Time) bool {
// If evening time is before morning time (normal case: both on same day)
// Then we're NOT in night period (daytime)
if evening.Before(morning) {
return false
}

// Otherwise, night period crosses midnight
// Night is: after evening OR before morning
return t.After(evening) || t.Before(morning)
}

// CalculateMidpointTime calculates the midpoint time of a recording
func CalculateMidpointTime(startTime time.Time, durationSec float64) time.Time {
return startTime.Add(time.Duration(durationSec/2) * time.Second)
}
file addition: tui (d--r------)

[2.1]
file addition: classify.go (----------)

[0.227139]

package tui

import (
"fmt"
"image"
"os"
"path/filepath"
"sort"
"strings"
"time"

tea "charm.land/bubbletea/v2"
"charm.land/lipgloss/v2"

"skraak/tools"
"skraak/utils"
)

// playbackTickMsg is sent every 50ms while audio is playing
type playbackTickMsg struct{}

// Styles
var (
titleStyle = lipgloss.NewStyle().
Bold(true).
Foreground(lipgloss.Color("15")).
Background(lipgloss.Color("62")).
Padding(0, 1)

labelStyle = lipgloss.NewStyle().
Foreground(lipgloss.Color("86"))

errorStyle = lipgloss.NewStyle().
Foreground(lipgloss.Color("196"))

helpStyle = lipgloss.NewStyle().
Foreground(lipgloss.Color("241"))

helpDarkStyle = lipgloss.NewStyle().
Foreground(lipgloss.Color("86"))

commentBoxStyle = lipgloss.NewStyle().
Border(lipgloss.RoundedBorder()).
BorderForeground(lipgloss.Color("62")).
Padding(0, 1)
)

// wrapText wraps text at word boundaries to fit within maxWidth.
// Returns multiple lines joined with newlines.
func wrapText(text string, maxWidth int) string {
if len(text) <= maxWidth {
return text
}

lines := strings.Split(text, "\n")
var result []string

for _, line := range lines {
if len(line) <= maxWidth {
result = append(result, line)
continue
}

// Wrap at word boundaries
words := strings.Fields(line)
var currentLine string
for _, word := range words {
if len(currentLine)+len(word)+1 <= maxWidth {
if currentLine == "" {
currentLine = word
} else {
currentLine += " " + word
}
} else {
if currentLine != "" {
result = append(result, currentLine)
}
// If single word is longer than maxWidth, force break it
if len(word) > maxWidth {
result = append(result, word[:maxWidth])
word = word[maxWidth:]
}
currentLine = word
}
}
if currentLine != "" {
result = append(result, currentLine)
}
}

return strings.Join(result, "\n")
}

// Model holds TUI state
type Model struct {
state *tools.ClassifyState
err string
quitting bool
bindingsHelp string // pre-computed bindings text

// Comment dialog state
commentMode bool // true when comment dialog is open
commentText string // current input text
commentCursor int // cursor position in comment text

// Clip dialog state
clipMode bool // true when clip dialog is open
clipInput string // current prefix input

// Shift+primary wait mode: when non-empty, the next keypress is looked up
// in Config.SecondaryBindings[awaitingSecondaryFor] as a calltype key.
awaitingSecondaryFor string

// Image generation counter - incremented on each segment change,
// used to discard stale inline images (sixel/iTerm).
// Pointer so it survives BubbleTea's value-copy update cycle.
imageGen *uint64
}

// New creates a new TUI model
func New(state *tools.ClassifyState) Model {
// Pre-compute bindings help text, sorted letters a-z then digits 0-9
// (other single-char keys sorted after).
sorted := make([]tools.KeyBinding, len(state.Config.Bindings))
copy(sorted, state.Config.Bindings)
keyRank := func(k string) int {
if len(k) == 0 {
return 3
}
c := k[0]
switch {
case c >= 'a' && c <= 'z':
return 0
case c >= 'A' && c <= 'Z':
return 1
case c >= '0' && c <= '9':
return 2
default:
return 3
}
}
sort.SliceStable(sorted, func(i, j int) bool {
ri, rj := keyRank(sorted[i].Key), keyRank(sorted[j].Key)
if ri != rj {
return ri < rj
}
return sorted[i].Key < sorted[j].Key
})
var bindings []string
for _, b := range sorted {
if b.CallType != "" {
bindings = append(bindings, fmt.Sprintf("%s=%s/%s", b.Key, b.Species, b.CallType))
} else {
bindings = append(bindings, fmt.Sprintf("%s=%s", b.Key, b.Species))
}
}
bindingsHelp := strings.Join(bindings, " ")

gen := uint64(0)
return Model{
state: state,
bindingsHelp: bindingsHelp,
imageGen: &gen,
}
}

func (m Model) protocol() utils.ImageProtocol {
if m.state.Config.ITerm {
return utils.ProtocolITerm
}
if m.state.Config.Sixel {
return utils.ProtocolSixel
}
return utils.ProtocolKitty
}

// Init initializes the model
func (m Model) Init() tea.Cmd {
return inlineImageCmd(m.state, m.protocol(), *m.imageGen, m.imageGen)
}

// Update handles messages
func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
switch msg := msg.(type) {
case tea.KeyPressMsg:
return m.handleKey(msg)
case playbackTickMsg:
if m.state.Player == nil || !m.state.Player.IsPlaying() {
return m, nil // done, triggers re-render to clear "Playing..." text
}
return m, playbackTick()
}

return m, nil
}

// segmentChangeCmd returns the appropriate command after a segment change.
// Clears screen then generates and writes the spectrogram image asynchronously.
func (m Model) segmentChangeCmd() tea.Cmd {
(*m.imageGen)++
gen := *m.imageGen
return tea.Sequence(tea.ClearScreen, inlineImageCmd(m.state, m.protocol(), gen, m.imageGen))
}

func (m Model) handleKey(msg tea.KeyPressMsg) (tea.Model, tea.Cmd) {
// If in comment mode, route to comment handler
if m.commentMode {
return m.handleCommentKey(msg)
}

// If in clip mode, route to clip handler
if m.clipMode {
return m.handleClipKey(msg)
}

m.err = ""

key := msg.Key()

// Secondary-wait mode: next keypress is interpreted as a calltype key
// for the species we just labeled via Shift+primary.
if m.awaitingSecondaryFor != "" {
primary := m.awaitingSecondaryFor
m.awaitingSecondaryFor = ""

// Esc cancels wait mode; species stays labeled without calltype,
// segment does not advance.
if key.Code == tea.KeyEscape || key.Code == tea.KeyEsc {
return m, nil
}

s := msg.String()
if len(s) == 1 {
if callType, ok := m.state.Config.SecondaryBindings[primary][s]; ok {
if m.state.Player != nil {
m.state.Player.Stop()
}
m.state.ApplyCallTypeOnly(callType)
if err := m.state.Save(); err != nil {
m.err = err.Error()
}
if !m.state.NextSegment() {
m.quitting = true
return m, tea.Quit
}
return m, m.segmentChangeCmd()
}
}
// Unknown key — fall through to normal handling of this keypress.
}

// Handle Enter key (main or numpad, check code to catch modifiers)
if key.Code == tea.KeyEnter || key.Code == tea.KeyKpEnter {
speed := 1.0
if key.Mod&tea.ModShift != 0 {
speed = 0.5
}
if errMsg := playCurrentSegmentAtSpeed(m.state, speed); errMsg != "" {
m.err = errMsg
}
return m, playbackTick()
}

// Check for Escape key for quit
if key.Code == tea.KeyEscape || key.Code == tea.KeyEsc {
if m.state.Player != nil {
m.state.Player.Stop()
}
m.quitting = true
return m, tea.Quit
}

// Check for Space key (open comment dialog)
if key.Code == tea.KeySpace {
m.commentText = m.state.GetCurrentComment()
m.commentCursor = len(m.commentText) // start at end
m.commentMode = true
return m, nil
}

// Check for Ctrl+S (save clip dialog)
if msg.String() == "ctrl+s" {
m.clipInput = ""
m.clipMode = true
return m, nil
}

switch msg.String() {
case "ctrl+c":
if m.state.Player != nil {
m.state.Player.Stop()
}
m.quitting = true
return m, tea.Quit

case ",", "left":
// Previous segment
if m.state.Player != nil {
m.state.Player.Stop()
}
m.state.PrevSegment()
return m, m.segmentChangeCmd()

case ".", "right":
// Next segment (no edit)
if m.state.Player != nil {
m.state.Player.Stop()
}
if !m.state.NextSegment() {
m.quitting = true
return m, tea.Quit
}
return m, m.segmentChangeCmd()

case "ctrl+d":
// Toggle bookmark
m.state.ToggleBookmark()
if err := m.state.Save(); err != nil {
m.err = err.Error()
}
return m, nil

case "ctrl+,":
// Previous bookmark
if m.state.Player != nil {
m.state.Player.Stop()
}
if m.state.PrevBookmark() {
return m, m.segmentChangeCmd()
}
m.err = "No bookmarks found"
return m, nil

case "ctrl+.":
// Next bookmark
if m.state.Player != nil {
m.state.Player.Stop()
}
if m.state.NextBookmark() {
return m, m.segmentChangeCmd()
}
m.err = "No bookmarks found"
return m, nil

case "0":
// Confirm existing label (upgrade certainty to 100)
if m.state.Player != nil {
m.state.Player.Stop()
}
if m.state.ConfirmLabel() {
if err := m.state.Save(); err != nil {
m.err = err.Error()
return m, nil
}
}
if !m.state.NextSegment() {
m.quitting = true
return m, tea.Quit
}
return m, m.segmentChangeCmd()

default:
// Check for binding
s := msg.String()
if len(s) == 1 {
k := s

// Shift+letter: if the lowercase primary has secondary bindings,
// label species-only and enter wait mode. Otherwise map to the
// lowercase equivalent and dispatch as a normal primary keypress.
if key.Mod&tea.ModShift != 0 {
lower := strings.ToLower(s)
if lower != s {
if m.state.HasSecondary(lower) {
if result := m.state.ParseKeyBuffer(lower); result != nil {
if m.state.Player != nil {
m.state.Player.Stop()
}
m.state.ApplyBinding(&tools.BindingResult{Species: result.Species})
if err := m.state.Save(); err != nil {
m.err = err.Error()
}
m.awaitingSecondaryFor = lower
return m, nil
}
}
k = lower
}
}

if result := m.state.ParseKeyBuffer(k); result != nil {
if m.state.Player != nil {
m.state.Player.Stop()
}
m.state.ApplyBinding(result)
if err := m.state.Save(); err != nil {
m.err = err.Error()
}
if !m.state.NextSegment() {
m.quitting = true
return m, tea.Quit
}
return m, m.segmentChangeCmd()
}
}
return m, nil
}
}

// handleCommentKey handles key presses in comment mode
func (m Model) handleCommentKey(msg tea.KeyPressMsg) (tea.Model, tea.Cmd) {
key := msg.Key()

// Enter: save comment
if key.Code == tea.KeyEnter {
m.state.SetComment(m.commentText)
if err := m.state.Save(); err != nil {
m.err = err.Error()
}
m.commentMode = false
return m, nil
}

// Escape: cancel
if key.Code == tea.KeyEscape || key.Code == tea.KeyEsc {
m.commentMode = false
return m, nil
}

// Navigation and editing keys (check by code, not string)
switch key.Code {
case tea.KeyLeft:
if m.commentCursor > 0 {
m.commentCursor--
}
return m, nil
case tea.KeyRight:
if m.commentCursor < len(m.commentText) {
m.commentCursor++
}
return m, nil
case tea.KeySpace:
if len(m.commentText) < 140 {
m.commentText = m.commentText[:m.commentCursor] + " " + m.commentText[m.commentCursor:]
m.commentCursor++
}
return m, nil
case tea.KeyBackspace:
if m.commentCursor > 0 {
m.commentText = m.commentText[:m.commentCursor-1] + m.commentText[m.commentCursor:]
m.commentCursor--
}
return m, nil
case tea.KeyDelete:
if m.commentCursor < len(m.commentText) {
m.commentText = m.commentText[:m.commentCursor] + m.commentText[m.commentCursor+1:]
}
return m, nil
}

// Handle via string representation for ctrl combos
switch msg.String() {
case "ctrl+u":
m.commentText = ""
m.commentCursor = 0
return m, nil
case "ctrl+a":
m.commentCursor = 0
return m, nil
case "ctrl+e":
m.commentCursor = len(m.commentText)
return m, nil
}

// Printable ASCII character (space handled above via KeySpace)
s := msg.String()
if len(s) == 1 && s[0] >= 33 && s[0] <= 126 { // 33='!', 126='~' (space=32 handled above)
if len(m.commentText) < 140 {
m.commentText = m.commentText[:m.commentCursor] + s + m.commentText[m.commentCursor:]
m.commentCursor++
}
return m, nil
}

return m, nil
}

// handleClipKey handles key presses in clip mode
func (m Model) handleClipKey(msg tea.KeyPressMsg) (tea.Model, tea.Cmd) {
key := msg.Key()

// Enter: save clip
if key.Code == tea.KeyEnter {
if m.clipInput == "" {
m.clipMode = false
return m, nil
}
// Save the clip
err := saveClip(m.state, m.clipInput)
if err != nil {
m.err = err.Error()
} else {
m.err = "Clip saved: " + m.clipInput
}
m.clipMode = false
return m, nil
}

// Escape: cancel
if key.Code == tea.KeyEscape || key.Code == tea.KeyEsc {
m.clipMode = false
return m, nil
}

// Backspace: remove last character
if key.Code == tea.KeyBackspace {
if len(m.clipInput) > 0 {
m.clipInput = m.clipInput[:len(m.clipInput)-1]
}
return m, nil
}

// Printable characters: append to input
s := msg.String()
if len(s) == 1 && s[0] >= 32 && s[0] <= 126 { // printable ASCII
if len(m.clipInput) < 64 {
m.clipInput += s
}
return m, nil
}

return m, nil
}

// saveClip saves a clip of the current segment to the current working directory
func saveClip(state *tools.ClassifyState, prefix string) error {
df := state.CurrentFile()
seg := state.CurrentSegment()
if df == nil || seg == nil {
return fmt.Errorf("no segment selected")
}

// Get WAV path
wavPath := strings.TrimSuffix(df.FilePath, ".data")

// Get basename without path and extension
basename := wavPath[strings.LastIndex(wavPath, "/")+1:]
basename = strings.TrimSuffix(basename, ".wav")

// Calculate integer times for filename
startInt := int(seg.StartTime)
endInt := int(seg.EndTime)
if seg.EndTime > float64(endInt) {
endInt++ // ceil
}

// Build output paths (current working directory)
cwd, err := os.Getwd()
if err != nil {
return fmt.Errorf("failed to get working directory: %w", err)
}

baseName := fmt.Sprintf("%s_%s_%d_%d", prefix, basename, startInt, endInt)
pngPath := filepath.Join(cwd, baseName+".png")
wavOutPath := filepath.Join(cwd, baseName+".wav")

// Check if files already exist
if _, err := os.Stat(pngPath); err == nil {
return fmt.Errorf("file already exists: %s", pngPath)
}
if _, err := os.Stat(wavOutPath); err == nil {
return fmt.Errorf("file already exists: %s", wavOutPath)
}

// Read WAV samples
samples, sampleRate, err := utils.ReadWAVSamples(wavPath)
if err != nil {
return fmt.Errorf("failed to read WAV: %w", err)
}

// Extract segment samples
segSamples := utils.ExtractSegmentSamples(samples, sampleRate, seg.StartTime, seg.EndTime)
if len(segSamples) == 0 {
return fmt.Errorf("no samples in segment")
}

// Determine output sample rate (downsample if > 16kHz)
outputSampleRate := sampleRate
if sampleRate > utils.DefaultMaxSampleRate {
segSamples = utils.ResampleRate(segSamples, sampleRate, utils.DefaultMaxSampleRate)
outputSampleRate = utils.DefaultMaxSampleRate
}

// Generate spectrogram (224px, color)
config := utils.DefaultSpectrogramConfig(outputSampleRate)
spectrogram := utils.GenerateSpectrogram(segSamples, config)
if spectrogram == nil {
return fmt.Errorf("failed to generate spectrogram")
}

colorData := utils.ApplyL4Colormap(spectrogram)
img := utils.CreateRGBImage(colorData)
if img == nil {
return fmt.Errorf("failed to create image")
}

resized := utils.ResizeImage(img, 224, 224)

// Write PNG
pngFile, err := os.Create(pngPath)
if err != nil {
return fmt.Errorf("failed to create PNG: %w", err)
}
if err := utils.WritePNG(resized, pngFile); err != nil {
_ = pngFile.Close()
return fmt.Errorf("failed to write PNG: %w", err)
}
if err := pngFile.Close(); err != nil {
return fmt.Errorf("failed to close PNG: %w", err)
}

// Write WAV
if err := utils.WriteWAVFile(wavOutPath, segSamples, outputSampleRate); err != nil {
return fmt.Errorf("failed to write WAV: %w", err)
}

return nil
}

// playCurrentSegmentAtSpeed loads and plays the current segment's audio at the given speed.
// speed=1.0 is normal, speed=0.5 is half speed.
// Returns an error message string, or empty string on success.
func playCurrentSegmentAtSpeed(state *tools.ClassifyState, speed float64) string {
df := state.CurrentFile()
seg := state.CurrentSegment()
if df == nil || seg == nil {
return ""
}

wavPath := strings.TrimSuffix(df.FilePath, ".data")
samples, sampleRate, err := utils.ReadWAVSamples(wavPath)
if err != nil {
return fmt.Sprintf("audio: %v", err)
}

// Initialize player lazily on first play
if state.Player == nil {
player, err := utils.NewAudioPlayer(sampleRate)
if err != nil {
return fmt.Sprintf("audio init: %v", err)
}
state.Player = player
}

segSamples := utils.ExtractSegmentSamples(samples, sampleRate, seg.StartTime, seg.EndTime)
if len(segSamples) > 0 {
state.PlaybackSpeed = speed
state.Player.PlayAtSpeed(segSamples, sampleRate, speed)
}
return ""
}

// playbackTick returns a command that sends a playbackTickMsg after 50ms.
func playbackTick() tea.Cmd {
return tea.Tick(50*time.Millisecond, func(t time.Time) tea.Msg {
return playbackTickMsg{}
})
}

// View renders the TUI
func (m Model) View() tea.View {
if m.quitting {
var b strings.Builder
_ = utils.ClearImages(&b, m.protocol())
b.WriteString("\nDone!\n")
return tea.NewView(b.String())
}

var b strings.Builder

// Header: file info
df := m.state.CurrentFile()
seg := m.state.CurrentSegment()
total := m.state.TotalSegments()
current := m.state.CurrentSegmentNumber()

if df == nil || seg == nil {
return tea.NewView("\nNo segments to review.\n")
}

// Bindings help (wrap at 80 chars)
const wrapWidth = 80
b.WriteString(helpStyle.Render(wrapText(m.bindingsHelp, wrapWidth)))
b.WriteString("\n")
b.WriteString(helpDarkStyle.Render(wrapText("[esc]quit [,]prev [.]next [0]confirm [space]comment [ctrl+s]clip [ctrl+d]bookmark [ctrl+,]prev-bk [ctrl+.]next-bk [enter]play [shift+enter]½speed", wrapWidth)))
b.WriteString("\n\n")

// Progress bar
progress := float64(current) / float64(total)
barWidth := 30
filled := int(progress * float64(barWidth))
bar := strings.Repeat("█", filled) + strings.Repeat("░", barWidth-filled)

// Title line
wavFile := strings.TrimSuffix(df.FilePath, ".data")
wavFile = wavFile[strings.LastIndex(wavFile, "/")+1:]
b.WriteString(titleStyle.Render(fmt.Sprintf(" %s [%s] %d/%d Segments ", wavFile, bar, current, total)))
b.WriteString("\n\n")

// Segment info
segInfo := fmt.Sprintf("Segment: %.1fs - %.1fs (%.1fs)", seg.StartTime, seg.EndTime, seg.EndTime-seg.StartTime)
if m.state.HasBookmark() {
segInfo += " [BOOKMARKED]"
}
if m.awaitingSecondaryFor != "" {
segInfo += " Waiting..."
}
if m.state.Player != nil && m.state.Player.IsPlaying() {
if m.state.PlaybackSpeed == 0.5 {
segInfo += " ▶ Playing 0.5x..."
} else {
segInfo += " ▶ Playing..."
}
}
b.WriteString(segInfo)
b.WriteString("\n\n")

// Labels
filterLabels := seg.GetFilterLabels(m.state.Config.Filter)
if len(filterLabels) > 0 {
b.WriteString(labelStyle.Render("Labels:"))
b.WriteString("\n")
for _, l := range filterLabels {
fmt.Fprintf(&b, " • %s\n", tools.FormatLabels([]*utils.Label{l}, m.state.Config.Filter))
}
}
b.WriteString("\n")

// Clip dialog (when active)
if m.clipMode {
m.renderClipDialog(&b)
return tea.NewView(b.String())
}

// Comment dialog (when active)
if m.commentMode {
m.renderCommentDialog(&b)
return tea.NewView(b.String())
}

// Error
if m.err != "" {
b.WriteString(errorStyle.Render(m.err))
}

v := tea.NewView(b.String())
v.AltScreen = true
return v
}

// renderCommentDialog renders the comment input dialog
func (m Model) renderCommentDialog(b *strings.Builder) {
// Build input line with cursor at correct position
before := m.commentText[:m.commentCursor]
after := m.commentText[m.commentCursor:]
inputLine := before + "█" + after

charCount := fmt.Sprintf("%d/140", len(m.commentText))
helpLine := "[enter]save [esc]cancel [←→]move [ctrl+u]clear [ctrl+a]start [ctrl+e]end"

// Render box
content := fmt.Sprintf("Comment:\n%s\n%s\n%s", inputLine, charCount, helpLine)
b.WriteString(commentBoxStyle.Render(content))
}

// renderClipDialog renders the clip prefix input dialog
func (m Model) renderClipDialog(b *strings.Builder) {
inputLine := m.clipInput + "█"
helpLine := "[enter]save [esc]cancel"

// Render box
content := fmt.Sprintf("Clip prefix:\n%s\n%s", inputLine, helpLine)
b.WriteString(commentBoxStyle.Render(content))
}

// generateSpectrogramImage creates a resized spectrogram image from a segment.
func generateSpectrogramImage(state *tools.ClassifyState, dataPath string, seg *utils.Segment) image.Image {
imgSize := state.Config.ImageSize
if imgSize == 0 {
imgSize = utils.SpectrogramDisplaySize
}
img, err := utils.GenerateSegmentSpectrogram(dataPath, seg.StartTime, seg.EndTime, state.Config.Color, imgSize)
if err != nil {
return nil
}
return img
}

// inlineImageCmd returns a tea.Cmd that generates and writes an inline image
// directly to the terminal, bypassing BubbleTea's renderer.
// gen is the generation at dispatch time; currentGen points to the live counter.
// If they differ when the image is ready, a newer segment change has occurred
// and this image is stale — discard it instead of writing.
func inlineImageCmd(state *tools.ClassifyState, protocol utils.ImageProtocol, gen uint64, currentGen *uint64) tea.Cmd {
return func() tea.Msg {
df := state.CurrentFile()
seg := state.CurrentSegment()
if df == nil || seg == nil {
return nil
}

img := generateSpectrogramImage(state, df.FilePath, seg)
if img == nil {
return nil
}

// Discard if a newer segment change has superseded this one
if *currentGen != gen {
return nil
}

// Clear previous kitty images before writing new one.
// Terminal write errors during render are non-recoverable; ignore.
_ = utils.ClearImages(os.Stdout, protocol)
_, _ = fmt.Fprint(os.Stdout, "\r\n\r\n")
_ = utils.WriteImage(img, os.Stdout, protocol)
return nil
}
}
file addition: tools (d--r------)

[2.1]
file addition: update_test.go (----------)

[0.248737]

package tools

import (
"context"
"os"
"testing"

"skraak/db"
)

// setupTestDB creates a temporary database with schema for testing
func setupTestDB(t *testing.T) (string, func()) {
t.Helper()

// Create temp file path (but don't create the file - DuckDB will create it)
tmpFile, err := os.CreateTemp("", "skraak_update_test_*.duckdb")
if err != nil {
t.Fatalf("Failed to create temp file: %v", err)
}
tmpPath := tmpFile.Name()
tmpFile.Close()
os.Remove(tmpPath) // Remove the empty file so DuckDB can create it fresh

// Open database and run schema
database, err := db.OpenWriteableDB(tmpPath)
if err != nil {
t.Fatalf("Failed to open database: %v", err)
}

// Read and execute schema
schema, err := db.ReadSchemaSQL()
if err != nil {
database.Close()
os.Remove(tmpPath)
t.Fatalf("Failed to read schema: %v", err)
}

statements := db.ExtractDDLStatements(schema)
for _, stmt := range statements {
// Skip CREATE TABLE AS (ebird_taxonomy_v2024 was removed)
if stmt.Type == "CREATE_TABLE_AS" {
continue
}
_, err := database.Exec(stmt.SQL)
if err != nil {
database.Close()
os.Remove(tmpPath)
t.Fatalf("Failed to execute DDL: %v\nSQL: %s", err, stmt.SQL)
}
}

database.Close()

cleanup := func() {
os.Remove(tmpPath)
}

return tmpPath, cleanup
}

// TestDatasetUpdatePreservesUnsetFields tests that update only modifies provided fields
func TestDatasetUpdatePreservesUnsetFields(t *testing.T) {
dbPath, cleanup := setupTestDB(t)
defer cleanup()

SetDBPath(dbPath)

// Create a dataset with all fields
name := "Test Dataset"
dsType := "train"
description := "Original description"
createInput := DatasetInput{
Name: &name,
Type: &dsType,
Description: &description,
}

ctx := context.Background()
created, err := CreateOrUpdateDataset(ctx, createInput)
if err != nil {
t.Fatalf("Failed to create dataset: %v", err)
}

// Verify initial values
if created.Dataset.Name != "Test Dataset" {
t.Errorf("Expected name 'Test Dataset', got '%s'", created.Dataset.Name)
}
if created.Dataset.Type != "train" {
t.Errorf("Expected type 'train', got '%s'", created.Dataset.Type)
}
if created.Dataset.Description == nil || *created.Dataset.Description != "Original description" {
t.Errorf("Expected description 'Original description', got '%v'", created.Dataset.Description)
}

// Update only the description (nil for other fields)
newDesc := "Updated description only"
updateInput := DatasetInput{
ID: &created.Dataset.ID,
Description: &newDesc,
// Name and Type are nil - should be preserved
}

updated, err := CreateOrUpdateDataset(ctx, updateInput)
if err != nil {
t.Fatalf("Failed to update dataset: %v", err)
}

// Verify only description changed
if updated.Dataset.Name != "Test Dataset" {
t.Errorf("Name should be preserved, got '%s'", updated.Dataset.Name)
}
if updated.Dataset.Type != "train" {
t.Errorf("Type should be preserved, got '%s'", updated.Dataset.Type)
}
if updated.Dataset.Description == nil || *updated.Dataset.Description != "Updated description only" {
t.Errorf("Description should be updated, got '%v'", updated.Dataset.Description)
}
}

// TestLocationUpdatePreservesUnsetFields tests that update only modifies provided fields
func TestLocationUpdatePreservesUnsetFields(t *testing.T) {
dbPath, cleanup := setupTestDB(t)
defer cleanup()

SetDBPath(dbPath)

// Create a dataset first
dsName := "Test Dataset"
dsCreated, err := CreateOrUpdateDataset(context.Background(), DatasetInput{Name: &dsName})
if err != nil {
t.Fatalf("Failed to create dataset: %v", err)
}

// Create a location with all fields
name := "Test Location"
lat := -36.85
lon := 174.76
tz := "Pacific/Auckland"
description := "Original description"
createInput := LocationInput{
DatasetID: &dsCreated.Dataset.ID,
Name: &name,
Latitude: &lat,
Longitude: &lon,
TimezoneID: &tz,
Description: &description,
}

ctx := context.Background()
created, err := CreateOrUpdateLocation(ctx, createInput)
if err != nil {
t.Fatalf("Failed to create location: %v", err)
}

// Verify initial values
if created.Location.Name != "Test Location" {
t.Errorf("Expected name 'Test Location', got '%s'", created.Location.Name)
}
if created.Location.TimezoneID != "Pacific/Auckland" {
t.Errorf("Expected timezone 'Pacific/Auckland', got '%s'", created.Location.TimezoneID)
}

// Update only the description (nil for other fields)
newDesc := "Updated description only"
updateInput := LocationInput{
ID: &created.Location.ID,
Description: &newDesc,
// Name, Latitude, Longitude, TimezoneID are nil - should be preserved
}

updated, err := CreateOrUpdateLocation(ctx, updateInput)
if err != nil {
t.Fatalf("Failed to update location: %v", err)
}

// Verify only description changed
if updated.Location.Name != "Test Location" {
t.Errorf("Name should be preserved, got '%s'", updated.Location.Name)
}
if updated.Location.Latitude != -36.85 {
t.Errorf("Latitude should be preserved, got %f", updated.Location.Latitude)
}
if updated.Location.Longitude != 174.76 {
t.Errorf("Longitude should be preserved, got %f", updated.Location.Longitude)
}
if updated.Location.TimezoneID != "Pacific/Auckland" {
t.Errorf("TimezoneID should be preserved, got '%s'", updated.Location.TimezoneID)
}
if updated.Location.Description == nil || *updated.Location.Description != "Updated description only" {
t.Errorf("Description should be updated, got '%v'", updated.Location.Description)
}
}

// TestClusterUpdatePreservesUnsetFields tests that update only modifies provided fields
func TestClusterUpdatePreservesUnsetFields(t *testing.T) {
dbPath, cleanup := setupTestDB(t)
defer cleanup()

SetDBPath(dbPath)

// Create dataset and location
dsName := "Test Dataset"
dsCreated, err := CreateOrUpdateDataset(context.Background(), DatasetInput{Name: &dsName})
if err != nil {
t.Fatalf("Failed to create dataset: %v", err)
}

locName := "Test Location"
lat, lon := -36.85, 174.76
tz := "Pacific/Auckland"
locCreated, err := CreateOrUpdateLocation(context.Background(), LocationInput{
DatasetID: &dsCreated.Dataset.ID,
Name: &locName,
Latitude: &lat,
Longitude: &lon,
TimezoneID: &tz,
})
if err != nil {
t.Fatalf("Failed to create location: %v", err)
}

// Create a cluster with all fields
name := "Test Cluster"
sampleRate := 250000
description := "Original description"
createInput := ClusterInput{
DatasetID: &dsCreated.Dataset.ID,
LocationID: &locCreated.Location.ID,
Name: &name,
SampleRate: &sampleRate,
Description: &description,
}

ctx := context.Background()
created, err := CreateOrUpdateCluster(ctx, createInput)
if err != nil {
t.Fatalf("Failed to create cluster: %v", err)
}

// Update only the description (nil for other fields)
newDesc := "Updated description only"
updateInput := ClusterInput{
ID: &created.Cluster.ID,
Description: &newDesc,
// Name, SampleRate are nil - should be preserved
}

updated, err := CreateOrUpdateCluster(ctx, updateInput)
if err != nil {
t.Fatalf("Failed to update cluster: %v", err)
}

// Verify only description changed
if updated.Cluster.Name != "Test Cluster" {
t.Errorf("Name should be preserved, got '%s'", updated.Cluster.Name)
}
if updated.Cluster.SampleRate != 250000 {
t.Errorf("SampleRate should be preserved, got %d", updated.Cluster.SampleRate)
}
if updated.Cluster.Description == nil || *updated.Cluster.Description != "Updated description only" {
t.Errorf("Description should be updated, got '%v'", updated.Cluster.Description)
}
}

// TestPatternUpdatePreservesUnsetFields tests that update only modifies provided fields
func TestPatternUpdatePreservesUnsetFields(t *testing.T) {
dbPath, cleanup := setupTestDB(t)
defer cleanup()

SetDBPath(dbPath)

// Create a pattern
recordSeconds := 60
sleepSeconds := 1740
createInput := PatternInput{
RecordSeconds: &recordSeconds,
SleepSeconds: &sleepSeconds,
}

ctx := context.Background()
created, err := CreateOrUpdatePattern(ctx, createInput)
if err != nil {
t.Fatalf("Failed to create pattern: %v", err)
}

// Verify initial values
if created.Pattern.RecordS != 60 {
t.Errorf("Expected record_s 60, got %d", created.Pattern.RecordS)
}
if created.Pattern.SleepS != 1740 {
t.Errorf("Expected sleep_s 1740, got %d", created.Pattern.SleepS)
}

// Update only the record seconds
newRecord := 30
updateInput := PatternInput{
ID: &created.Pattern.ID,
RecordSeconds: &newRecord,
// SleepSeconds is nil - should be preserved
}

updated, err := CreateOrUpdatePattern(ctx, updateInput)
if err != nil {
t.Fatalf("Failed to update pattern: %v", err)
}

// Verify only record changed
if updated.Pattern.RecordS != 30 {
t.Errorf("RecordS should be updated to 30, got %d", updated.Pattern.RecordS)
}
if updated.Pattern.SleepS != 1740 {
t.Errorf("SleepS should be preserved at 1740, got %d", updated.Pattern.SleepS)
}
}

// TestDatasetUpdateNoFieldsError tests that update with no fields returns error
func TestDatasetUpdateNoFieldsError(t *testing.T) {
dbPath, cleanup := setupTestDB(t)
defer cleanup()

SetDBPath(dbPath)

// Create a dataset
name := "Test Dataset"
created, err := CreateOrUpdateDataset(context.Background(), DatasetInput{Name: &name})
if err != nil {
t.Fatalf("Failed to create dataset: %v", err)
}

// Update with no fields should error
updateInput := DatasetInput{
ID: &created.Dataset.ID,
// All other fields are nil
}

_, err = CreateOrUpdateDataset(context.Background(), updateInput)
if err == nil {
t.Error("Expected error when no fields provided to update")
}
}
file addition: time.go (----------)

[0.248737]

package tools

import (
"context"
"time"
)

// GetCurrentTimeInput defines the input parameters for the get_current_time tool
type GetCurrentTimeInput struct {
// No input parameters needed for basic time query
}

// GetCurrentTimeOutput defines the output structure for the get_current_time tool
type GetCurrentTimeOutput struct {
Time string `json:"time"`
Timezone string `json:"timezone"`
Unix int64 `json:"unix"`
}

// GetCurrentTime returns current system time with timezone and Unix timestamp
func GetCurrentTime(ctx context.Context, input GetCurrentTimeInput) (GetCurrentTimeOutput, error) {
now := time.Now()

output := GetCurrentTimeOutput{
Time: now.Format(time.RFC3339),
Timezone: now.Location().String(),
Unix: now.Unix(),
}

return output, nil
}
file addition: sql.go (----------)

[0.248737]

package tools

import (
"context"
"database/sql"
"encoding/base64"
"fmt"
"regexp"
"strings"
"time"

"skraak/db"
)

// Package-level variable to store database path
var dbPath string

// SetDBPath sets the database path for the tools package
// Called from main.go during initialization
func SetDBPath(path string) {
dbPath = path
}

// ExecuteSQLInput defines the input parameters for the execute_sql tool
type ExecuteSQLInput struct {
Query string `json:"query"`
Parameters []any `json:"parameters,omitempty"`
Limit *int `json:"limit,omitempty"`
}

// ColumnInfo contains metadata about a result column
type ColumnInfo struct {
Name string `json:"name"`
DatabaseType string `json:"database_type"`
}

// ExecuteSQLOutput defines the output structure for the execute_sql tool
type ExecuteSQLOutput struct {
Rows []map[string]any `json:"rows"`
RowCount int `json:"row_count"`
Columns []ColumnInfo `json:"columns"`
Limited bool `json:"limited"`
Query string `json:"query_executed"`
}

// Validation patterns
var (
// Must start with SELECT or WITH (case-insensitive, allows leading whitespace)
selectPattern = regexp.MustCompile(`(?i)^\s*(SELECT|WITH)\s+`)

// Check for forbidden keywords that might indicate write operations
forbiddenPattern = regexp.MustCompile(`(?i)\b(INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|GRANT|REVOKE)\b`)

// Check for existing LIMIT clause (case-insensitive)
limitPattern = regexp.MustCompile(`(?i)\bLIMIT\s+\d+`)
)

const (
defaultLimit = 1000
maxLimit = 10000
)

// ExecuteSQL executes arbitrary SQL SELECT queries with safety validation
// ExecuteSQL executes arbitrary SQL SELECT queries with safety validation and row limiting
func ExecuteSQL(
ctx context.Context,
input ExecuteSQLInput,
) (ExecuteSQLOutput, error) {
// Validate query is not empty
if strings.TrimSpace(input.Query) == "" {
return ExecuteSQLOutput{}, fmt.Errorf("query cannot be empty")
}

// Validate query starts with SELECT or WITH
if !selectPattern.MatchString(input.Query) {
return ExecuteSQLOutput{}, fmt.Errorf("only SELECT and WITH queries are allowed")
}

// Check for forbidden keywords (defense in depth - database is already read-only)
if forbiddenPattern.MatchString(input.Query) {
return ExecuteSQLOutput{}, fmt.Errorf("query contains forbidden keywords (INSERT/UPDATE/DELETE/DROP/CREATE/ALTER)")
}

// Determine row limit
limit := defaultLimit
if input.Limit != nil {
if *input.Limit < 1 || *input.Limit > maxLimit {
return ExecuteSQLOutput{}, fmt.Errorf("limit must be between 1 and %d", maxLimit)
}
limit = *input.Limit
}

// Add LIMIT clause if not present
// Query for limit+1 rows to detect truncation
query := input.Query
autoAddedLimit := false
if !limitPattern.MatchString(query) {
query = fmt.Sprintf("%s LIMIT %d", strings.TrimSpace(query), limit+1)
autoAddedLimit = true
}

// Get database connection (read-only for security)
database, err := db.OpenReadOnlyDB(dbPath)
if err != nil {
return ExecuteSQLOutput{}, fmt.Errorf("database connection failed: %w", err)
}
defer database.Close() // Always close when done

// Execute query with parameters
var rows *sql.Rows
if len(input.Parameters) > 0 {
rows, err = database.QueryContext(ctx, query, input.Parameters...)
} else {
rows, err = database.QueryContext(ctx, query)
}
if err != nil {
return ExecuteSQLOutput{}, fmt.Errorf("query execution failed: %w", err)
}
defer rows.Close()

// Get column metadata
columns, err := rows.Columns()
if err != nil {
return ExecuteSQLOutput{}, fmt.Errorf("failed to get columns: %w", err)
}

columnTypes, err := rows.ColumnTypes()
if err != nil {
return ExecuteSQLOutput{}, fmt.Errorf("failed to get column types: %w", err)
}

// Build column info
columnInfo := make([]ColumnInfo, len(columns))
for i, col := range columns {
columnInfo[i] = ColumnInfo{
Name: col,
DatabaseType: columnTypes[i].DatabaseTypeName(),
}
}

// Process rows
var results []map[string]any

for rows.Next() {
// Create slice to hold column values
values := make([]any, len(columns))
valuePtrs := make([]any, len(columns))
for i := range values {
valuePtrs[i] = &values[i]
}

// Scan row
if err := rows.Scan(valuePtrs...); err != nil {
return ExecuteSQLOutput{}, fmt.Errorf("row scan failed: %w", err)
}

// Convert to map with type conversion
rowMap := make(map[string]any)
for i, col := range columns {
rowMap[col] = convertValue(values[i])
}

results = append(results, rowMap)
}

// Check for errors during iteration
if err = rows.Err(); err != nil {
return ExecuteSQLOutput{}, fmt.Errorf("row iteration failed: %w", err)
}

// Handle empty results (return empty array, not error)
if results == nil {
results = []map[string]any{}
}

// Detect truncation: if we auto-added limit+1 and got more than limit rows
limited := false
if autoAddedLimit && len(results) > limit {
limited = true
results = results[:limit]
}

// Build the query string to report (show effective limit, not internal limit+1)
queryReported := query
if autoAddedLimit {
queryReported = fmt.Sprintf("%s LIMIT %d", strings.TrimSpace(input.Query), limit)
}

// Create output structure
output := ExecuteSQLOutput{
Rows: results,
RowCount: len(results),
Columns: columnInfo,
Limited: limited,
Query: queryReported,
}

return output, nil
}

// convertValue converts database values to JSON-friendly types
func convertValue(val any) any {
if val == nil {
return nil
}

switch v := val.(type) {
case time.Time:
// Format timestamps as RFC3339 strings (consistent with existing code)
return v.Format(time.RFC3339)
case []byte:
// Convert binary data to base64
return base64.StdEncoding.EncodeToString(v)
case int64, float64, string, bool:
// Pass through primitive types
return v
default:
// For unknown types, convert to string
return fmt.Sprintf("%v", v)
}
}
file addition: prepend_test.go (----------)

[0.248737]

package tools

import (
"os"
"path/filepath"
"testing"
)

func TestShouldPrependFile(t *testing.T) {
tests := []struct {
name string
filename string
prefix string
wantRename bool
wantReason string
}{
// WAV files with datestring
{"wav with datestring", "20250920_011509.wav", "LOC", true, ""},
{"WAV with datestring", "20250920_011509.WAV", "LOC", true, ""},
{"wav.data with datestring", "20250920_011509.wav.data", "LOC", true, ""},
{"WAV.data with datestring", "20250920_011509.WAV.data", "LOC", true, ""},

// Already prefixed
{"already prefixed wav", "LOC_20250920_011509.wav", "LOC", false, "already prefixed"},
{"already prefixed log.txt", "LOC_log.txt", "LOC", false, "already prefixed"},

// No datestring
{"no datestring wav", "mok_nearcamp2_20250920.wav", "LOC", false, "no datestring prefix"},
{"no datestring WAV", "recording.WAV", "LOC", false, "no datestring prefix"},

// log.txt
{"log.txt", "log.txt", "LOC", true, ""},

// Non-target files (silently ignored)
{"readme", "README.txt", "LOC", false, ""},
{"random file", "something.mp3", "LOC", false, ""},
{"LOG.TXT uppercase", "LOG.TXT", "LOC", false, ""}, // Only lowercase log.txt matches
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gotRename, gotReason := shouldPrependFile(tt.filename, tt.prefix)
if gotRename != tt.wantRename {
t.Errorf("shouldPrependFile() gotRename = %v, want %v", gotRename, tt.wantRename)
}
if gotReason != tt.wantReason {
t.Errorf("shouldPrependFile() gotReason = %v, want %v", gotReason, tt.wantReason)
}
})
}
}

func TestPrepend(t *testing.T) {
// Create temp folder
tmpDir, err := os.MkdirTemp("", "prepend_test")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)

// Create test files
testFiles := []string{
"20250920_011509.wav",
"20250920_011509.wav.data",
"log.txt",
"mok_nearcamp2_20250920.wav",
"README.txt",
}
for _, f := range testFiles {
if err := os.WriteFile(filepath.Join(tmpDir, f), []byte{}, 0644); err != nil {
t.Fatalf("Failed to create test file: %v", err)
}
}

// Run prepend
output, err := Prepend(PrependInput{
Folder: tmpDir,
Prefix: "TEST",
Recursive: false,
DryRun: false,
})
if err != nil {
t.Fatalf("Prepend() error = %v", err)
}

// Verify renamed files
if len(output.Renamed) != 3 {
t.Errorf("Expected 3 renamed files, got %d", len(output.Renamed))
}

// Verify skipped files
if len(output.Skipped) != 1 {
t.Errorf("Expected 1 skipped file, got %d", len(output.Skipped))
}

// Verify files were actually renamed
if _, err := os.Stat(filepath.Join(tmpDir, "TEST_20250920_011509.wav")); os.IsNotExist(err) {
t.Error("Expected TEST_20250920_011509.wav to exist")
}
if _, err := os.Stat(filepath.Join(tmpDir, "TEST_log.txt")); os.IsNotExist(err) {
t.Error("Expected TEST_log.txt to exist")
}
if _, err := os.Stat(filepath.Join(tmpDir, "mok_nearcamp2_20250920.wav")); os.IsNotExist(err) {
t.Error("Expected mok_nearcamp2_20250920.wav to still exist (skipped)")
}
}

func TestPrependRecursive(t *testing.T) {
// Create temp folder with subfolder
tmpDir, err := os.MkdirTemp("", "prepend_test")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)

subDir := filepath.Join(tmpDir, "subfolder")
if err := os.Mkdir(subDir, 0755); err != nil {
t.Fatalf("Failed to create subfolder: %v", err)
}

// Create test files
files := map[string]string{
filepath.Join(tmpDir, "20250920_011509.wav"): "",
filepath.Join(subDir, "20250921_120000.wav"): "",
filepath.Join(subDir, "log.txt"): "",
}
for f := range files {
if err := os.WriteFile(f, []byte{}, 0644); err != nil {
t.Fatalf("Failed to create test file: %v", err)
}
}

// Run prepend with recursive
output, err := Prepend(PrependInput{
Folder: tmpDir,
Prefix: "TEST",
Recursive: true,
DryRun: false,
})
if err != nil {
t.Fatalf("Prepend() error = %v", err)
}

// Should rename files in both folders
if len(output.Renamed) != 3 {
t.Errorf("Expected 3 renamed files (recursive), got %d", len(output.Renamed))
}

// Verify subfolder file was renamed
if _, err := os.Stat(filepath.Join(subDir, "TEST_20250921_120000.wav")); os.IsNotExist(err) {
t.Error("Expected TEST_20250921_120000.wav in subfolder to exist")
}
}

func TestPrependDryRun(t *testing.T) {
tmpDir, err := os.MkdirTemp("", "prepend_test")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)

// Create test file
testFile := filepath.Join(tmpDir, "20250920_011509.wav")
if err := os.WriteFile(testFile, []byte{}, 0644); err != nil {
t.Fatalf("Failed to create test file: %v", err)
}

// Run prepend with dry-run
output, err := Prepend(PrependInput{
Folder: tmpDir,
Prefix: "TEST",
Recursive: false,
DryRun: true,
})
if err != nil {
t.Fatalf("Prepend() error = %v", err)
}

// Should report renamed files
if len(output.Renamed) != 1 {
t.Errorf("Expected 1 renamed file in dry-run output, got %d", len(output.Renamed))
}

// But file should NOT be renamed
if _, err := os.Stat(filepath.Join(tmpDir, "TEST_20250920_011509.wav")); !os.IsNotExist(err) {
t.Error("Expected file NOT to be renamed in dry-run mode")
}
}

func TestPrependIdempotent(t *testing.T) {
tmpDir, err := os.MkdirTemp("", "prepend_test")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)

// Create test file
if err := os.WriteFile(filepath.Join(tmpDir, "20250920_011509.wav"), []byte{}, 0644); err != nil {
t.Fatalf("Failed to create test file: %v", err)
}

// Run prepend twice
for i := range 2 {
output, err := Prepend(PrependInput{
Folder: tmpDir,
Prefix: "TEST",
Recursive: false,
DryRun: false,
})
if err != nil {
t.Fatalf("Prepend() iteration %d error = %v", i, err)
}

if i == 0 {
// First run should rename
if len(output.Renamed) != 1 {
t.Errorf("First run: expected 1 renamed file, got %d", len(output.Renamed))
}
} else {
// Second run should skip (already prefixed)
if len(output.Renamed) != 0 {
t.Errorf("Second run: expected 0 renamed files, got %d", len(output.Renamed))
}
if len(output.Skipped) != 1 {
t.Errorf("Second run: expected 1 skipped file, got %d", len(output.Skipped))
}
}
}
}
file addition: prepend.go (----------)

[0.248737]

package tools

import (
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
)

// PrependInput contains the parameters for the prepend operation.
type PrependInput struct {
Folder string
Prefix string
Recursive bool
DryRun bool
}

// PrependResult contains the result of a single file rename operation.
type PrependResult struct {
Old string `json:"old"`
New string `json:"new"`
}

// PrependSkipped contains info about a skipped file.
type PrependSkipped struct {
File string `json:"file"`
Reason string `json:"reason"`
}

// PrependError contains info about a failed rename.
type PrependError struct {
File string `json:"file"`
Error string `json:"error"`
}

// PrependOutput contains the complete result of the prepend operation.
type PrependOutput struct {
Folder string `json:"folder"`
Prefix string `json:"prefix"`
Recursive bool `json:"recursive"`
DryRun bool `json:"dry_run"`
Renamed []PrependResult `json:"renamed"`
Skipped []PrependSkipped `json:"skipped"`
Errors []PrependError `json:"errors"`
}

// datestringRegex matches filenames starting with YYYYMMDD_HHMMSS.
var datestringRegex = regexp.MustCompile(`^\d{8}_\d{6}\.`)

// Prepend renames files in a folder by prepending a prefix.
// WAV files (.wav, .WAV) and their .data files are only renamed if they start with a datestring.
// log.txt is always renamed if present.
func Prepend(input PrependInput) (*PrependOutput, error) {
output := &PrependOutput{
Folder: input.Folder,
Prefix: input.Prefix,
Recursive: input.Recursive,
DryRun: input.DryRun,
Renamed: []PrependResult{},
Skipped: []PrependSkipped{},
Errors: []PrependError{},
}

// Collect folders to process
folders := []string{input.Folder}
if input.Recursive {
entries, err := os.ReadDir(input.Folder)
if err != nil {
return nil, fmt.Errorf("failed to read folder: %w", err)
}
for _, entry := range entries {
if entry.IsDir() {
folders = append(folders, filepath.Join(input.Folder, entry.Name()))
}
}
}

// Process each folder
for _, folder := range folders {
entries, err := os.ReadDir(folder)
if err != nil {
return nil, fmt.Errorf("failed to read folder %s: %w", folder, err)
}

for _, entry := range entries {
if entry.IsDir() {
continue
}

filename := entry.Name()
oldPath := filepath.Join(folder, filename)
shouldRename, skipReason := shouldPrependFile(filename, input.Prefix)

if !shouldRename {
if skipReason != "" {
output.Skipped = append(output.Skipped, PrependSkipped{
File: oldPath,
Reason: skipReason,
})
}
continue
}

newFilename := input.Prefix + "_" + filename
newPath := filepath.Join(folder, newFilename)

if input.DryRun {
output.Renamed = append(output.Renamed, PrependResult{
Old: oldPath,
New: newPath,
})
continue
}

// Perform the rename
if err := os.Rename(oldPath, newPath); err != nil {
output.Errors = append(output.Errors, PrependError{
File: oldPath,
Error: err.Error(),
})
continue
}

output.Renamed = append(output.Renamed, PrependResult{
Old: oldPath,
New: newPath,
})
}
}

return output, nil
}

// shouldPrependFile determines if a file should be prepended.
// Returns (shouldRename, skipReason). If shouldRename is false and skipReason is empty,
// the file is not a target type (silently ignored).
func shouldPrependFile(filename, prefix string) (bool, string) {
lowerName := strings.ToLower(filename)

// Check if already prefixed (applies to all target files)
if strings.HasPrefix(filename, prefix+"_") {
// Only report as "already prefixed" if it's a target file type
if filename == prefix+"_log.txt" || isWavOrData(lowerName) {
return false, "already prefixed"
}
return false, ""
}

// Check for log.txt (exact match, case-sensitive as per spec)
if filename == "log.txt" {
return true, ""
}

// Check for WAV files and their .data files
if !isWavOrData(lowerName) {
return false, "" // Not a target file type, silently ignore
}

// Check for datestring prefix (YYYYMMDD_HHMMSS.)
if !datestringRegex.MatchString(filename) {
return false, "no datestring prefix"
}

return true, ""
}

// isWavOrData checks if the lowercase filename is a .wav or .wav.data file
func isWavOrData(lowerName string) bool {
return strings.HasSuffix(lowerName, ".wav") || strings.HasSuffix(lowerName, ".wav.data")
}
file addition: pattern_test.go (----------)

[0.248737]

package tools

import (
"context"
"os"
"path/filepath"
"testing"
)

func TestCreateOrUpdatePattern_CreateDuplicate(t *testing.T) {
// Setup: Use test database
testDB := filepath.Join("..", "db", "test.duckdb")
if _, err := os.Stat(testDB); os.IsNotExist(err) {
t.Skipf("Test database not found at %s", testDB)
}
SetDBPath(testDB)

ctx := context.Background()

// Test 1: Try to create duplicate of existing pattern (60s/1740s)
// Should return existing pattern IBv_KxDGsNQs
t.Run("CreateDuplicatePattern", func(t *testing.T) {
record := 60
sleep := 1740
input := PatternInput{
RecordSeconds: &record,
SleepSeconds: &sleep,
}

output, err := CreateOrUpdatePattern(ctx, input)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}

// Should return existing pattern
if output.Pattern.ID != "IBv_KxDGsNQs" {
t.Errorf("Expected existing pattern ID 'IBv_KxDGsNQs', got '%s'", output.Pattern.ID)
}

if output.Pattern.RecordS != 60 {
t.Errorf("Expected record_s 60, got %d", output.Pattern.RecordS)
}
if output.Pattern.SleepS != 1740 {
t.Errorf("Expected sleep_s 1740, got %d", output.Pattern.SleepS)
}

// Check message indicates existing pattern
if output.Message == "" {
t.Error("Expected non-empty message")
}
t.Logf("Message: %s", output.Message)
})

// Test 2: Create new unique pattern
t.Run("CreateUniquePattern", func(t *testing.T) {
record := 999
sleep := 888
input := PatternInput{
RecordSeconds: &record,
SleepSeconds: &sleep,
}

output, err := CreateOrUpdatePattern(ctx, input)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}

// Should create new pattern
firstID := output.Pattern.ID
if firstID == "" {
t.Fatal("Expected non-empty ID")
}
if output.Pattern.RecordS != 999 {
t.Errorf("Expected record_s 999, got %d", output.Pattern.RecordS)
}
if output.Pattern.SleepS != 888 {
t.Errorf("Expected sleep_s 888, got %d", output.Pattern.SleepS)
}
t.Logf("Created pattern ID: %s", firstID)

// Test 3: Try to create duplicate of the pattern we just created (idempotent)
output2, err2 := CreateOrUpdatePattern(ctx, input)
if err2 != nil {
t.Fatalf("Expected no error on duplicate, got: %v", err2)
}

// Should return same pattern
if output2.Pattern.ID != firstID {
t.Errorf("Expected same pattern ID '%s', got '%s'", firstID, output2.Pattern.ID)
}
t.Logf("Idempotent test passed - returned same ID: %s", output2.Pattern.ID)
})
}

func TestCreateOrUpdatePattern_Validation(t *testing.T) {
testDB := filepath.Join("..", "db", "test.duckdb")
if _, err := os.Stat(testDB); os.IsNotExist(err) {
t.Skipf("Test database not found at %s", testDB)
}
SetDBPath(testDB)

ctx := context.Background()

// Test invalid inputs for create (no ID = create mode)
tests := []struct {
name string
recordSeconds int
sleepSeconds int
wantError bool
}{
{"ZeroRecordSeconds", 0, 100, true},
{"NegativeRecordSeconds", -10, 100, true},
{"ZeroSleepSeconds", 100, 0, true},
{"NegativeSleepSeconds", 100, -10, true},
{"ValidInputs", 10, 20, false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
input := PatternInput{
RecordSeconds: &tt.recordSeconds,
SleepSeconds: &tt.sleepSeconds,
}

_, err := CreateOrUpdatePattern(ctx, input)
if (err != nil) != tt.wantError {
t.Errorf("Expected error=%v, got error=%v", tt.wantError, err != nil)
}
})
}
}

func TestCreateOrUpdatePattern_Update(t *testing.T) {
testDB := filepath.Join("..", "db", "test.duckdb")
if _, err := os.Stat(testDB); os.IsNotExist(err) {
t.Skipf("Test database not found at %s", testDB)
}
SetDBPath(testDB)

ctx := context.Background()

t.Run("UpdateNonExistentPattern", func(t *testing.T) {
id := "NONEXISTENT1"
record := 100
input := PatternInput{
ID: &id,
RecordSeconds: &record,
}

_, err := CreateOrUpdatePattern(ctx, input)
if err == nil {
t.Error("Expected error for non-existent pattern")
}
})

t.Run("UpdateNoFields", func(t *testing.T) {
id := "IBv_KxDGsNQs"
input := PatternInput{
ID: &id,
}

_, err := CreateOrUpdatePattern(ctx, input)
if err == nil {
t.Error("Expected error when no fields provided")
}
})
}
file addition: pattern.go (----------)

[0.248737]

package tools

import (
"context"
"database/sql"
"fmt"
"skraak/db"
"skraak/utils"
"strings"
)

// PatternInput defines the input parameters for the create_or_update_pattern tool
type PatternInput struct {
ID *string `json:"id,omitempty"`
RecordSeconds *int `json:"record_seconds,omitempty"`
SleepSeconds *int `json:"sleep_seconds,omitempty"`
}

// PatternOutput defines the output structure
type PatternOutput struct {
Pattern db.CyclicRecordingPattern `json:"pattern"`
Message string `json:"message"`
}

// CreateOrUpdatePattern creates a new recording pattern or updates an existing one
func CreateOrUpdatePattern(
ctx context.Context,
input PatternInput,
) (PatternOutput, error) {
if input.ID != nil && strings.TrimSpace(*input.ID) != "" {
return updatePattern(ctx, input)
}
return createPattern(ctx, input)
}

func createPattern(ctx context.Context, input PatternInput) (PatternOutput, error) {
var output PatternOutput

// Validate required fields for create
if input.RecordSeconds == nil {
return output, fmt.Errorf("record_seconds is required when creating a pattern")
}
if input.SleepSeconds == nil {
return output, fmt.Errorf("sleep_seconds is required when creating a pattern")
}
if err := utils.ValidatePositive(*input.RecordSeconds, "record_seconds"); err != nil {
return output, err
}
if err := utils.ValidatePositive(*input.SleepSeconds, "sleep_seconds"); err != nil {
return output, err
}

// Open writable database connection
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("database connection failed: %w", err)
}
defer database.Close()

// Begin logged transaction
tx, err := db.BeginLoggedTx(ctx, database, "create_or_update_pattern")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

// Check if pattern with same record_s/sleep_s already exists
var existingID string
err = tx.QueryRowContext(ctx,
"SELECT id FROM cyclic_recording_pattern WHERE record_s = ? AND sleep_s = ? AND active = true",
*input.RecordSeconds, *input.SleepSeconds,
).Scan(&existingID)

if err == nil {
// Pattern already exists, return it instead of creating duplicate
var pattern db.CyclicRecordingPattern
err = tx.QueryRowContext(ctx,
"SELECT id, record_s, sleep_s, created_at, last_modified, active FROM cyclic_recording_pattern WHERE id = ?",
existingID,
).Scan(&pattern.ID, &pattern.RecordS, &pattern.SleepS, &pattern.CreatedAt, &pattern.LastModified, &pattern.Active)
if err != nil {
return output, fmt.Errorf("failed to fetch existing pattern: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Pattern = pattern
output.Message = fmt.Sprintf("Pattern already exists with ID %s (record %ds, sleep %ds) - returning existing pattern",
pattern.ID, pattern.RecordS, pattern.SleepS)

return output, nil
} else if err != sql.ErrNoRows {
return output, fmt.Errorf("failed to check for existing pattern: %w", err)
}

// Generate ID
id, err := utils.GenerateShortID()
if err != nil {
return output, fmt.Errorf("failed to generate ID: %w", err)
}

// Insert pattern
_, err = tx.ExecContext(ctx,
"INSERT INTO cyclic_recording_pattern (id, record_s, sleep_s, created_at, last_modified, active) VALUES (?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, TRUE)",
id, *input.RecordSeconds, *input.SleepSeconds,
)
if err != nil {
return output, fmt.Errorf("failed to create pattern: %w", err)
}

// Fetch the created pattern
var pattern db.CyclicRecordingPattern
err = tx.QueryRowContext(ctx,
"SELECT id, record_s, sleep_s, created_at, last_modified, active FROM cyclic_recording_pattern WHERE id = ?",
id,
).Scan(&pattern.ID, &pattern.RecordS, &pattern.SleepS, &pattern.CreatedAt, &pattern.LastModified, &pattern.Active)
if err != nil {
return output, fmt.Errorf("failed to fetch created pattern: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Pattern = pattern
output.Message = fmt.Sprintf("Successfully created cyclic recording pattern with ID %s (record %ds, sleep %ds)",
pattern.ID, pattern.RecordS, pattern.SleepS)

return output, nil
}

func updatePattern(ctx context.Context, input PatternInput) (PatternOutput, error) {
var output PatternOutput
patternID := *input.ID

// Validate ID format
if err := utils.ValidateShortID(patternID, "pattern_id"); err != nil {
return output, err
}

// Validate fields if provided
if input.RecordSeconds != nil {
if err := utils.ValidatePositive(*input.RecordSeconds, "record_seconds"); err != nil {
return output, err
}
}
if input.SleepSeconds != nil {
if err := utils.ValidateNonNegative(*input.SleepSeconds, "sleep_seconds"); err != nil {
return output, err
}
}

// Open writable database
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Verify pattern exists and check active status
var exists, active bool
err = database.QueryRow(
"SELECT EXISTS(SELECT 1 FROM cyclic_recording_pattern WHERE id = ?), COALESCE((SELECT active FROM cyclic_recording_pattern WHERE id = ?), false)",
patternID, patternID,
).Scan(&exists, &active)
if err != nil {
return output, fmt.Errorf("failed to query pattern: %w", err)
}
if !exists {
return output, fmt.Errorf("pattern not found: %s", patternID)
}
if !active {
return output, fmt.Errorf("pattern '%s' is not active (cannot update inactive patterns)", patternID)
}

// Build dynamic UPDATE query
updates := []string{}
args := []any{}

if input.RecordSeconds != nil {
updates = append(updates, "record_s = ?")
args = append(args, *input.RecordSeconds)
}
if input.SleepSeconds != nil {
updates = append(updates, "sleep_s = ?")
args = append(args, *input.SleepSeconds)
}

if len(updates) == 0 {
return output, fmt.Errorf("no fields provided to update")
}

// Always update last_modified
updates = append(updates, "last_modified = now()")
args = append(args, patternID)

query := fmt.Sprintf("UPDATE cyclic_recording_pattern SET %s WHERE id = ?", strings.Join(updates, ", "))

// Begin logged transaction for update
tx, err := db.BeginLoggedTx(ctx, database, "create_or_update_pattern")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

_, err = tx.Exec(query, args...)
if err != nil {
return output, fmt.Errorf("failed to update pattern: %w", err)
}

// Fetch the updated pattern
var pattern db.CyclicRecordingPattern
err = tx.QueryRow(
"SELECT id, record_s, sleep_s, created_at, last_modified, active FROM cyclic_recording_pattern WHERE id = ?",
patternID,
).Scan(&pattern.ID, &pattern.RecordS, &pattern.SleepS, &pattern.CreatedAt, &pattern.LastModified, &pattern.Active)
if err != nil {
return output, fmt.Errorf("failed to fetch updated pattern: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Pattern = pattern
output.Message = fmt.Sprintf("Successfully updated pattern (ID: %s, record %ds, sleep %ds)",
pattern.ID, pattern.RecordS, pattern.SleepS)

return output, nil
}
file addition: location.go (----------)

[0.248737]

package tools

import (
"context"
"fmt"
"skraak/db"
"skraak/utils"
"strings"
)

// LocationInput defines the input parameters for the create_or_update_location tool
type LocationInput struct {
ID *string `json:"id,omitempty"`
DatasetID *string `json:"dataset_id,omitempty"`
Name *string `json:"name,omitempty"`
Latitude *float64 `json:"latitude,omitempty"`
Longitude *float64 `json:"longitude,omitempty"`
TimezoneID *string `json:"timezone_id,omitempty"`
Description *string `json:"description,omitempty"`
}

// LocationOutput defines the output structure
type LocationOutput struct {
Location db.Location `json:"location"`
Message string `json:"message"`
}

// CreateOrUpdateLocation creates a new location or updates an existing one with GPS coordinates
func CreateOrUpdateLocation(
ctx context.Context,
input LocationInput,
) (LocationOutput, error) {
if input.ID != nil && strings.TrimSpace(*input.ID) != "" {
return updateLocation(ctx, input)
}
return createLocation(ctx, input)
}

// validateLocationFields validates fields common to both create and update
func validateLocationFields(input LocationInput) error {
if err := utils.ValidateOptionalStringLength(input.Name, "name", utils.MaxNameLen); err != nil {
return err
}
if err := utils.ValidateOptionalStringLength(input.Description, "description", utils.MaxDescriptionLen); err != nil {
return err
}
if input.Latitude != nil {
if err := utils.ValidateRange(*input.Latitude, "latitude", -90.0, 90.0); err != nil {
return err
}
}
if input.Longitude != nil {
if err := utils.ValidateRange(*input.Longitude, "longitude", -180.0, 180.0); err != nil {
return err
}
}
if input.TimezoneID != nil {
if err := utils.ValidateStringLength(*input.TimezoneID, "timezone_id", utils.MaxTimezoneLen); err != nil {
return err
}
if err := utils.ValidateTimezone(*input.TimezoneID); err != nil {
return err
}
}
return nil
}

func createLocation(ctx context.Context, input LocationInput) (LocationOutput, error) {
var output LocationOutput

// Validate required fields for create
if input.DatasetID == nil || strings.TrimSpace(*input.DatasetID) == "" {
return output, fmt.Errorf("dataset_id is required when creating a location")
}
if input.Name == nil || strings.TrimSpace(*input.Name) == "" {
return output, fmt.Errorf("name is required when creating a location")
}
if input.Latitude == nil {
return output, fmt.Errorf("latitude is required when creating a location")
}
if input.Longitude == nil {
return output, fmt.Errorf("longitude is required when creating a location")
}
if input.TimezoneID == nil || strings.TrimSpace(*input.TimezoneID) == "" {
return output, fmt.Errorf("timezone_id is required when creating a location")
}

// Validate ID format for dataset_id
if err := utils.ValidateShortID(*input.DatasetID, "dataset_id"); err != nil {
return output, err
}

if err := validateLocationFields(input); err != nil {
return output, err
}

// Open writable database connection
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("database connection failed: %w", err)
}
defer database.Close()

// Begin logged transaction
tx, err := db.BeginLoggedTx(ctx, database, "create_or_update_location")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

// Verify dataset exists and is active
var datasetExists, datasetActive bool
err = tx.QueryRowContext(ctx,
"SELECT EXISTS(SELECT 1 FROM dataset WHERE id = ?), COALESCE((SELECT active FROM dataset WHERE id = ?), false)",
*input.DatasetID, *input.DatasetID,
).Scan(&datasetExists, &datasetActive)
if err != nil {
return output, fmt.Errorf("failed to verify dataset: %w", err)
}
if !datasetExists {
return output, fmt.Errorf("dataset with ID '%s' does not exist", *input.DatasetID)
}
if !datasetActive {
return output, fmt.Errorf("dataset (ID: %s) is not active", *input.DatasetID)
}

// Check for existing location with same name in dataset (UNIQUE constraint)
var existingID string
err = tx.QueryRowContext(ctx,
"SELECT id FROM location WHERE dataset_id = ? AND name = ? AND active = true",
*input.DatasetID, *input.Name,
).Scan(&existingID)

if err == nil {
// Location with this name already exists in dataset - return existing (consistent duplicate handling)
var location db.Location
err = tx.QueryRowContext(ctx,
"SELECT id, dataset_id, name, latitude, longitude, description, created_at, last_modified, active, timezone_id FROM location WHERE id = ?",
existingID,
).Scan(&location.ID, &location.DatasetID, &location.Name, &location.Latitude, &location.Longitude,
&location.Description, &location.CreatedAt, &location.LastModified, &location.Active, &location.TimezoneID)
if err != nil {
return output, fmt.Errorf("failed to fetch existing location: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Location = location
output.Message = fmt.Sprintf("Location '%s' already exists in dataset (ID: %s) - returning existing location", location.Name, location.ID)
return output, nil
}

// Generate ID
id, err := utils.GenerateShortID()
if err != nil {
return output, fmt.Errorf("failed to generate ID: %w", err)
}

// Insert location
_, err = tx.ExecContext(ctx,
"INSERT INTO location (id, dataset_id, name, latitude, longitude, timezone_id, description, created_at, last_modified, active) VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, TRUE)",
id, *input.DatasetID, *input.Name, *input.Latitude, *input.Longitude, *input.TimezoneID, input.Description,
)
if err != nil {
return output, fmt.Errorf("failed to create location: %w", err)
}

// Fetch the created location
var location db.Location
err = tx.QueryRowContext(ctx,
"SELECT id, dataset_id, name, latitude, longitude, description, created_at, last_modified, active, timezone_id FROM location WHERE id = ?",
id,
).Scan(&location.ID, &location.DatasetID, &location.Name, &location.Latitude, &location.Longitude,
&location.Description, &location.CreatedAt, &location.LastModified, &location.Active, &location.TimezoneID)
if err != nil {
return output, fmt.Errorf("failed to fetch created location: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Location = location
output.Message = fmt.Sprintf("Successfully created location '%s' with ID %s (%.6f, %.6f, %s)",
location.Name, location.ID, location.Latitude, location.Longitude, location.TimezoneID)

return output, nil
}

func updateLocation(ctx context.Context, input LocationInput) (LocationOutput, error) {
var output LocationOutput
locationID := *input.ID

// Validate ID format
if err := utils.ValidateShortID(locationID, "location_id"); err != nil {
return output, err
}

if err := validateLocationFields(input); err != nil {
return output, err
}

// Validate dataset_id format if provided
if err := utils.ValidateOptionalShortID(input.DatasetID, "dataset_id"); err != nil {
return output, err
}

// Open writable database
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Verify location exists and check active status
var exists, active bool
var currentDatasetID string
err = database.QueryRow(
"SELECT EXISTS(SELECT 1 FROM location WHERE id = ?), COALESCE((SELECT active FROM location WHERE id = ?), false), COALESCE((SELECT dataset_id FROM location WHERE id = ?), '')",
locationID, locationID, locationID,
).Scan(&exists, &active, &currentDatasetID)
if err != nil {
return output, fmt.Errorf("failed to query location: %w", err)
}
if !exists {
return output, fmt.Errorf("location not found: %s", locationID)
}
if !active {
return output, fmt.Errorf("location '%s' is not active (cannot update inactive locations)", locationID)
}

// Verify dataset exists if DatasetID provided (relationship consistency)
if input.DatasetID != nil {
var datasetExists, datasetActive bool
err = database.QueryRow(
"SELECT EXISTS(SELECT 1 FROM dataset WHERE id = ?), COALESCE((SELECT active FROM dataset WHERE id = ?), false)",
*input.DatasetID, *input.DatasetID,
).Scan(&datasetExists, &datasetActive)
if err != nil {
return output, fmt.Errorf("failed to query dataset: %w", err)
}
if !datasetExists {
return output, fmt.Errorf("dataset not found: %s", *input.DatasetID)
}
if !datasetActive {
return output, fmt.Errorf("dataset '%s' is not active", *input.DatasetID)
}
}

// Build dynamic UPDATE query
updates := []string{}
args := []any{}

if input.DatasetID != nil {
updates = append(updates, "dataset_id = ?")
args = append(args, *input.DatasetID)
}
if input.Name != nil {
updates = append(updates, "name = ?")
args = append(args, *input.Name)
}
if input.Latitude != nil {
updates = append(updates, "latitude = ?")
args = append(args, *input.Latitude)
}
if input.Longitude != nil {
updates = append(updates, "longitude = ?")
args = append(args, *input.Longitude)
}
if input.Description != nil {
updates = append(updates, "description = ?")
args = append(args, *input.Description)
}
if input.TimezoneID != nil {
updates = append(updates, "timezone_id = ?")
args = append(args, *input.TimezoneID)
}

if len(updates) == 0 {
return output, fmt.Errorf("no fields provided to update")
}

// Always update last_modified
updates = append(updates, "last_modified = now()")
args = append(args, locationID)

query := fmt.Sprintf("UPDATE location SET %s WHERE id = ?", strings.Join(updates, ", "))

// Begin logged transaction for update
tx, err := db.BeginLoggedTx(ctx, database, "create_or_update_location")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

_, err = tx.ExecContext(ctx, query, args...)
if err != nil {
return output, fmt.Errorf("failed to update location: %w", err)
}

// Fetch the updated location
var location db.Location
err = tx.QueryRow(
"SELECT id, dataset_id, name, latitude, longitude, description, created_at, last_modified, active, timezone_id FROM location WHERE id = ?",
locationID,
).Scan(&location.ID, &location.DatasetID, &location.Name, &location.Latitude, &location.Longitude,
&location.Description, &location.CreatedAt, &location.LastModified, &location.Active, &location.TimezoneID)
if err != nil {
return output, fmt.Errorf("failed to fetch updated location: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Location = location
output.Message = fmt.Sprintf("Successfully updated location '%s' (ID: %s)", location.Name, location.ID)

return output, nil
}
file addition: isnight.go (----------)

[0.248737]

package tools

import (
"fmt"
"strings"
"time"

"github.com/sixdouglas/suncalc"

"skraak/utils"
)

// IsNightInput defines the input parameters for the isnight tool
type IsNightInput struct {
FilePath string `json:"file_path"`
Lat float64 `json:"lat"`
Lng float64 `json:"lng"`
Timezone string `json:"timezone,omitempty"`
}

// IsNightOutput defines the output structure for the isnight tool
type IsNightOutput struct {
FilePath string `json:"file_path"`
TimestampUTC string `json:"timestamp_utc"`
SolarNight bool `json:"solar_night"`
CivilNight bool `json:"civil_night"`
DiurnalActive bool `json:"diurnal_active"`
MoonPhase float64 `json:"moon_phase"`
DurationSec float64 `json:"duration_seconds"`
TimestampSrc string `json:"timestamp_source"`
MidpointUTC string `json:"midpoint_utc"`
SunriseUTC string `json:"sunrise_utc,omitempty"`
SunsetUTC string `json:"sunset_utc,omitempty"`
DawnUTC string `json:"dawn_utc,omitempty"`
DuskUTC string `json:"dusk_utc,omitempty"`
}

// IsNight determines if a WAV file was recorded at night based on its
// metadata timestamp and the given GPS coordinates.
//
// Timestamp resolution order:
// 1. AudioMoth comment (timezone embedded)
// 2. Filename timestamp + timezone offset (requires --timezone)
// 3. File modification time (system local time)
func IsNight(input IsNightInput) (IsNightOutput, error) {
var output IsNightOutput

// Step 1: Parse WAV header
metadata, err := utils.ParseWAVHeader(input.FilePath)
if err != nil {
return output, fmt.Errorf("WAV header parsing failed: %w", err)
}

output.DurationSec = metadata.Duration

// Step 2: Resolve timestamp (use file mod time as fallback)
tsResult, err := utils.ResolveTimestamp(metadata, input.FilePath, input.Timezone, true)
if err != nil {
return output, fmt.Errorf("cannot determine recording timestamp: %w", err)
}

// Determine timestamp source label
tsSource := "file_mod_time"
if tsResult.IsAudioMoth {
tsSource = "audiomoth_comment"
} else if utils.HasTimestampFilename(input.FilePath) {
tsSource = "filename"
}

// Step 3: Calculate astronomical data using recording midpoint
astroData := utils.CalculateAstronomicalData(
tsResult.Timestamp.UTC(),
metadata.Duration,
input.Lat,
input.Lng,
)

// Step 4: Get sun event times for informational output
midpoint := utils.CalculateMidpointTime(tsResult.Timestamp.UTC(), metadata.Duration)
sunTimes := suncalc.GetTimes(midpoint, input.Lat, input.Lng)

output.FilePath = input.FilePath
output.TimestampUTC = tsResult.Timestamp.UTC().Format(time.RFC3339)
output.SolarNight = astroData.SolarNight
output.CivilNight = astroData.CivilNight
output.MoonPhase = astroData.MoonPhase
output.TimestampSrc = tsSource
output.MidpointUTC = midpoint.Format(time.RFC3339)

if dawn, ok := sunTimes[suncalc.Dawn]; ok && !dawn.Value.IsZero() {
if sunset, ok := sunTimes[suncalc.Sunset]; ok && !sunset.Value.IsZero() {
output.DiurnalActive = !midpoint.Before(dawn.Value) && !midpoint.After(sunset.Value)
}
}

if sr, ok := sunTimes[suncalc.Sunrise]; ok && !sr.Value.IsZero() {
output.SunriseUTC = sr.Value.UTC().Format(time.RFC3339)
}
if ss, ok := sunTimes[suncalc.Sunset]; ok && !ss.Value.IsZero() {
output.SunsetUTC = ss.Value.UTC().Format(time.RFC3339)
}
if d, ok := sunTimes[suncalc.Dawn]; ok && !d.Value.IsZero() {
output.DawnUTC = d.Value.UTC().Format(time.RFC3339)
}
if dk, ok := sunTimes[suncalc.Dusk]; ok && !dk.Value.IsZero() {
output.DuskUTC = dk.Value.UTC().Format(time.RFC3339)
}

return output, nil
}

// String returns a human-readable summary of the isnight result
func (o IsNightOutput) String() string {
var sb strings.Builder
fmt.Fprintf(&sb, "File: %s\n", o.FilePath)
fmt.Fprintf(&sb, "Timestamp (UTC): %s\n", o.TimestampUTC)
fmt.Fprintf(&sb, "Midpoint (UTC): %s\n", o.MidpointUTC)
fmt.Fprintf(&sb, "Duration: %.1f seconds\n", o.DurationSec)
fmt.Fprintf(&sb, "Source: %s\n", o.TimestampSrc)
fmt.Fprintf(&sb, "Solar night: %v\n", o.SolarNight)
fmt.Fprintf(&sb, "Civil night: %v\n", o.CivilNight)
fmt.Fprintf(&sb, "Moon phase: %.2f\n", o.MoonPhase)
if o.SunriseUTC != "" {
fmt.Fprintf(&sb, "Sunrise (UTC): %s\n", o.SunriseUTC)
}
if o.SunsetUTC != "" {
fmt.Fprintf(&sb, "Sunset (UTC): %s\n", o.SunsetUTC)
}
if o.DawnUTC != "" {
fmt.Fprintf(&sb, "Dawn (UTC): %s\n", o.DawnUTC)
}
if o.DuskUTC != "" {
fmt.Fprintf(&sb, "Dusk (UTC): %s\n", o.DuskUTC)
}
return sb.String()
}
file addition: integration_test.go (----------)

[0.248737]

package tools

import (
"context"
"os"
"path/filepath"
"testing"
)

func TestPatternIntegration_CreateClusterWithExistingPattern(t *testing.T) {
// Setup: Use test database
testDB := filepath.Join("..", "db", "test.duckdb")
if _, err := os.Stat(testDB); os.IsNotExist(err) {
t.Skipf("Test database not found at %s", testDB)
}
SetDBPath(testDB)

ctx := context.Background()

// First, verify we can query existing patterns
t.Run("QueryExistingPatterns", func(t *testing.T) {
input := ExecuteSQLInput{
Query: "SELECT id, record_s, sleep_s FROM cyclic_recording_pattern WHERE active = true ORDER BY record_s, sleep_s",
}

output, err := ExecuteSQL(ctx, input)
if err != nil {
t.Fatalf("Failed to query patterns: %v", err)
}

if len(output.Rows) == 0 {
t.Fatal("Expected at least one pattern")
}

t.Logf("Found %d patterns", len(output.Rows))
for i, row := range output.Rows {
t.Logf("Pattern %d: ID=%v, record_s=%v, sleep_s=%v", i+1, row["id"], row["record_s"], row["sleep_s"])
}
})

// Create a cluster using an existing pattern
t.Run("CreateClusterWithExistingPattern", func(t *testing.T) {
// First, find a valid dataset and location
datasetSQL := ExecuteSQLInput{
Query: "SELECT id FROM dataset WHERE active = true LIMIT 1",
}
datasetOutput, err := ExecuteSQL(ctx, datasetSQL)
if err != nil || len(datasetOutput.Rows) == 0 {
t.Skip("No active datasets found in test database")
}
datasetID := datasetOutput.Rows[0]["id"].(string)

locationSQL := ExecuteSQLInput{
Query: "SELECT id FROM location WHERE dataset_id = ? AND active = true LIMIT 1",
Parameters: []any{datasetID},
}
locationOutput, err := ExecuteSQL(ctx, locationSQL)
if err != nil || len(locationOutput.Rows) == 0 {
t.Skip("No active locations found in test database")
}
locationID := locationOutput.Rows[0]["id"].(string)

t.Logf("Using dataset: %s, location: %s", datasetID, locationID)

sampleRate := 16000
input := ClusterInput{
DatasetID: &datasetID,
LocationID: &locationID,
Name: new("Integration Test Cluster"),
SampleRate: &sampleRate,
CyclicRecordingPatternID: new("IBv_KxDGsNQs"), // 60s/1740s pattern
}

output, err := CreateOrUpdateCluster(ctx, input)
if err != nil {
t.Fatalf("Failed to create cluster: %v", err)
}

clusterID := output.Cluster.ID
t.Logf("Created cluster: %s with pattern reference", clusterID)

// Verify the cluster has the pattern reference
sqlInput := ExecuteSQLInput{
Query: "SELECT c.name, c.cyclic_recording_pattern_id, p.record_s, p.sleep_s FROM cluster c LEFT JOIN cyclic_recording_pattern p ON c.cyclic_recording_pattern_id = p.id WHERE c.id = ?",
Parameters: []any{clusterID},
}

sqlOutput, err := ExecuteSQL(ctx, sqlInput)
if err != nil {
t.Fatalf("Failed to verify cluster: %v", err)
}

if len(sqlOutput.Rows) != 1 {
t.Fatalf("Expected 1 row, got %d", len(sqlOutput.Rows))
}

row := sqlOutput.Rows[0]

t.Logf("Row data: %+v", row)

// Check the pattern ID
patternIDStr := row["cyclic_recording_pattern_id"]
if patternIDStr != "IBv_KxDGsNQs" {
t.Errorf("Expected pattern ID 'IBv_KxDGsNQs', got '%v'", patternIDStr)
}

// Check record_s and sleep_s
recordSVal := row["record_s"]
sleepSVal := row["sleep_s"]

t.Logf("✓ Verified cluster has correct pattern reference: ID=%v, record=%v, sleep=%v",
patternIDStr, recordSVal, sleepSVal)

if patternIDStr == nil || patternIDStr == "" {
t.Error("Pattern ID is empty")
}
if recordSVal == nil {
t.Error("record_s is nil")
}
if sleepSVal == nil {
t.Error("sleep_s is nil")
}
})
}
file addition: import_unstructured.go (----------)

[0.248737]

package tools

import (
"context"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"time"

"skraak/db"
"skraak/utils"
)

// ImportUnstructuredInput defines the input parameters for importing files into an unstructured dataset
type ImportUnstructuredInput struct {
DatasetID string `json:"dataset_id"`
FolderPath string `json:"folder_path"`
Recursive *bool `json:"recursive,omitempty"`
}

// ImportUnstructuredOutput defines the output structure
type ImportUnstructuredOutput struct {
TotalFiles int `json:"total_files"`
ImportedFiles int `json:"imported_files"`
SkippedFiles int `json:"skipped_files"` // Duplicates
FailedFiles int `json:"failed_files"`
TotalDuration float64 `json:"total_duration_seconds"`
ProcessingTime string `json:"processing_time"`
Errors []utils.FileImportError `json:"errors,omitempty"`
}

// ImportUnstructured imports WAV files into an unstructured dataset
// Files are stored with minimal metadata: hash, duration, sample_rate, file_mod_time as timestamp
// No location/cluster hierarchy, no astronomical data, no AudioMoth parsing
func ImportUnstructured(
ctx context.Context,
input ImportUnstructuredInput,
) (ImportUnstructuredOutput, error) {
startTime := time.Now()
var output ImportUnstructuredOutput

// Default recursive to true
recursive := true
if input.Recursive != nil {
recursive = *input.Recursive
}

// Validate input
if err := validateUnstructuredInput(input); err != nil {
return output, fmt.Errorf("validation failed: %w", err)
}

// Open database
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Scan for WAV files
files, scanErrors := scanWavFiles(input.FolderPath, recursive)
output.Errors = append(output.Errors, scanErrors...)
output.TotalFiles = len(files)

if len(files) == 0 {
output.ProcessingTime = time.Since(startTime).String()
return output, nil
}

// Begin logged transaction
tx, err := db.BeginLoggedTx(ctx, database, "import_unstructured")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

// Process each file
for _, filePath := range files {
fileResult, procErr := processUnstructuredFile(tx, filePath, input.DatasetID)

if procErr != nil {
output.FailedFiles++
output.Errors = append(output.Errors, utils.FileImportError{
FileName: filepath.Base(filePath),
Error: procErr.Error(),
Stage: "process",
})
continue
}

if fileResult.Skipped {
output.SkippedFiles++
} else {
output.ImportedFiles++
output.TotalDuration += fileResult.Duration
}
}

// Commit transaction
if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.ProcessingTime = time.Since(startTime).String()
return output, nil
}

// unstructuredFileResult holds the result of processing a single file
type unstructuredFileResult struct {
Skipped bool // True if duplicate
Duration float64 // Duration in seconds
}

// processUnstructuredFile processes a single WAV file for unstructured import
func processUnstructuredFile(tx *db.LoggedTx, filePath, datasetID string) (*unstructuredFileResult, error) {
result := &unstructuredFileResult{}

// Step 1: Parse WAV header
metadata, err := utils.ParseWAVHeader(filePath)
if err != nil {
return nil, fmt.Errorf("WAV header parsing failed: %w", err)
}

// Step 2: Calculate hash
hash, err := utils.ComputeXXH64(filePath)
if err != nil {
return nil, fmt.Errorf("hash calculation failed: %w", err)
}

// Step 3: Check for duplicate - if exists, skip entirely (do not link to dataset)
_, isDuplicate, err := utils.CheckDuplicateHash(tx, hash)
if err != nil {
return nil, fmt.Errorf("duplicate check failed: %w", err)
}
if isDuplicate {
// File already exists in database - skip completely, do not link to dataset
result.Skipped = true
result.Duration = metadata.Duration
return result, nil
}

// Step 4: Generate file ID
fileID, err := utils.GenerateLongID()
if err != nil {
return nil, fmt.Errorf("ID generation failed: %w", err)
}

// Step 5: Use file modification time as timestamp (no timezone conversion)
timestamp := metadata.FileModTime

// Step 6: Insert into file table
_, err = tx.Exec(`
INSERT INTO file (
id, file_name, xxh64_hash, location_id, cluster_id,
timestamp_local, duration, sample_rate,
maybe_solar_night, maybe_civil_night, moon_phase,
active
) VALUES (?, ?, ?, NULL, NULL, ?, ?, ?, NULL, NULL, NULL, TRUE)
`,
fileID,
filepath.Base(filePath),
hash,
timestamp,
metadata.Duration,
metadata.SampleRate,
)
if err != nil {
return nil, fmt.Errorf("file insert failed: %w", err)
}

// Step 7: Insert into file_dataset table
_, err = tx.Exec(
"INSERT INTO file_dataset (file_id, dataset_id) VALUES (?, ?)",
fileID, datasetID,
)
if err != nil {
return nil, fmt.Errorf("file_dataset insert failed: %w", err)
}

result.Duration = metadata.Duration
return result, nil
}

// validateUnstructuredInput validates the input parameters
func validateUnstructuredInput(input ImportUnstructuredInput) error {
// Validate dataset ID format
if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
return err
}

// Verify folder exists
info, err := os.Stat(input.FolderPath)
if err != nil {
return fmt.Errorf("folder not accessible: %w", err)
}
if !info.IsDir() {
return fmt.Errorf("path is not a directory: %s", input.FolderPath)
}

// Open database for validation
database, err := db.OpenReadOnlyDB(dbPath)
if err != nil {
return fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Verify dataset exists and is active
var datasetExists bool
err = database.QueryRow(
"SELECT EXISTS(SELECT 1 FROM dataset WHERE id = ? AND active = true)",
input.DatasetID,
).Scan(&datasetExists)
if err != nil {
return fmt.Errorf("failed to query dataset: %w", err)
}
if !datasetExists {
return fmt.Errorf("dataset not found or inactive: %s", input.DatasetID)
}

// Verify dataset is 'unstructured' type
if err := utils.ValidateDatasetTypeUnstructured(database, input.DatasetID); err != nil {
return err
}

return nil
}

// scanWavFiles scans a folder for WAV files
func scanWavFiles(folderPath string, recursive bool) ([]string, []utils.FileImportError) {
var files []string
var errors []utils.FileImportError

walkFunc := func(path string, d fs.DirEntry, err error) error {
if err != nil {
errors = append(errors, utils.FileImportError{
FileName: path,
Error: err.Error(),
Stage: "scan",
})
return nil
}

// Skip directories if not recursive
if d.IsDir() {
if !recursive && path != folderPath {
return fs.SkipDir
}
return nil
}

// Check for .wav extension (case-insensitive)
if strings.HasSuffix(strings.ToLower(d.Name()), ".wav") {
files = append(files, path)
}

return nil
}

if recursive {
if err := filepath.WalkDir(folderPath, walkFunc); err != nil {
errors = append(errors, utils.FileImportError{
FileName: folderPath,
Error: err.Error(),
Stage: "scan",
})
}
} else {
// Non-recursive: only scan top-level
entries, err := os.ReadDir(folderPath)
if err != nil {
errors = append(errors, utils.FileImportError{
FileName: folderPath,
Error: err.Error(),
Stage: "scan",
})
return nil, errors
}

for _, entry := range entries {
if !entry.IsDir() && strings.HasSuffix(strings.ToLower(entry.Name()), ".wav") {
files = append(files, filepath.Join(folderPath, entry.Name()))
}
}
}

return files, errors
}
file addition: import_segments_test.go (----------)

[0.248737]

package tools

import (
"testing"

"skraak/utils"
)

func TestValidateSegmentImportInput(t *testing.T) {
t.Run("invalid dataset ID - too short", func(t *testing.T) {
input := ImportSegmentsInput{
DatasetID: "abc",
}
err := validateSegmentImportInput(input)
if err == nil {
t.Fatal("expected error for short dataset ID")
}
})

t.Run("invalid dataset ID - too long", func(t *testing.T) {
input := ImportSegmentsInput{
DatasetID: "abc123def456ghi789",
}
err := validateSegmentImportInput(input)
if err == nil {
t.Fatal("expected error for long dataset ID")
}
})

t.Run("invalid dataset ID - invalid characters", func(t *testing.T) {
input := ImportSegmentsInput{
DatasetID: "abc123!!!456",
}
err := validateSegmentImportInput(input)
if err == nil {
t.Fatal("expected error for invalid characters in dataset ID")
}
})

t.Run("invalid location ID", func(t *testing.T) {
input := ImportSegmentsInput{
DatasetID: "abc123def456",
LocationID: "invalid",
}
err := validateSegmentImportInput(input)
if err == nil {
t.Fatal("expected error for invalid location ID")
}
})

t.Run("invalid cluster ID", func(t *testing.T) {
input := ImportSegmentsInput{
DatasetID: "abc123def456",
LocationID: "xyz789uvw012",
ClusterID: "invalid",
}
err := validateSegmentImportInput(input)
if err == nil {
t.Fatal("expected error for invalid cluster ID")
}
})
}

func TestCountTotalSegments(t *testing.T) {
t.Run("empty", func(t *testing.T) {
count := countTotalSegments(map[string]scannedDataFile{})
if count != 0 {
t.Errorf("expected 0, got %d", count)
}
})

t.Run("single file - no segments", func(t *testing.T) {
files := map[string]scannedDataFile{
"file1": {Segments: []*utils.Segment{}},
}
count := countTotalSegments(files)
if count != 0 {
t.Errorf("expected 0, got %d", count)
}
})

t.Run("single file - multiple segments", func(t *testing.T) {
files := map[string]scannedDataFile{
"file1": {Segments: []*utils.Segment{{}, {}, {}}},
}
count := countTotalSegments(files)
if count != 3 {
t.Errorf("expected 3, got %d", count)
}
})

t.Run("multiple files", func(t *testing.T) {
files := map[string]scannedDataFile{
"file1": {Segments: []*utils.Segment{{}, {}}},
"file2": {Segments: []*utils.Segment{{}}},
"file3": {Segments: []*utils.Segment{{}, {}, {}, {}}},
}
count := countTotalSegments(files)
if count != 7 {
t.Errorf("expected 7, got %d", count)
}
})
}
file addition: import_segments.go (----------)

[0.248737]

package tools

import (
"context"
"database/sql"
"fmt"
"os"
"path/filepath"
"strings"
"time"

"skraak/db"
"skraak/utils"
)

// ImportSegmentsInput defines the input parameters for the import_segments tool
type ImportSegmentsInput struct {
Folder string `json:"folder"`
Mapping string `json:"mapping"`
DatasetID string `json:"dataset_id"`
LocationID string `json:"location_id"`
ClusterID string `json:"cluster_id"`
ProgressHandler func(processed, total int, message string)
}

// ImportSegmentsOutput defines the output structure for the import_segments tool
type ImportSegmentsOutput struct {
Summary ImportSegmentsSummary `json:"summary"`
Segments []SegmentImport `json:"segments"`
Errors []ImportSegmentError `json:"errors,omitempty"`
}

// ImportSegmentsSummary provides summary statistics for the import operation
type ImportSegmentsSummary struct {
DataFilesFound int `json:"data_files_found"`
DataFilesProcessed int `json:"data_files_processed"`
TotalSegments int `json:"total_segments"`
ImportedSegments int `json:"imported_segments"`
ImportedLabels int `json:"imported_labels"`
ImportedSubtypes int `json:"imported_subtypes"`
ProcessingTimeMs int64 `json:"processing_time_ms"`
}

// SegmentImport represents an imported segment in the output
type SegmentImport struct {
SegmentID string `json:"segment_id"`
FileName string `json:"file_name"`
StartTime float64 `json:"start_time"`
EndTime float64 `json:"end_time"`
FreqLow float64 `json:"freq_low"`
FreqHigh float64 `json:"freq_high"`
Labels []LabelImport `json:"labels"`
}

// LabelImport represents an imported label in the output
type LabelImport struct {
LabelID string `json:"label_id"`
Species string `json:"species"`
CallType string `json:"calltype,omitempty"`
Filter string `json:"filter"`
Certainty int `json:"certainty"`
Comment string `json:"comment,omitempty"`
}

// ImportSegmentError records errors encountered during segment import
type ImportSegmentError struct {
File string `json:"file,omitempty"`
Stage string `json:"stage"` // "validation", "hash", "import"
Message string `json:"message"`
}

// scannedDataFile holds parsed data for a .data file
type scannedDataFile struct {
DataPath string
WavPath string
WavHash string
FileID string
Duration float64
Segments []*utils.Segment
}

// ImportSegments imports segments from AviaNZ .data files into the database
func ImportSegments(ctx context.Context, input ImportSegmentsInput) (ImportSegmentsOutput, error) {
startTime := time.Now()
var output ImportSegmentsOutput
output.Segments = make([]SegmentImport, 0)
output.Errors = make([]ImportSegmentError, 0)

// Phase A: Input Validation
if err := validateSegmentImportInput(input); err != nil {
return output, err
}

// Load mapping file
mapping, err := utils.LoadMappingFile(input.Mapping)
if err != nil {
return output, fmt.Errorf("failed to load mapping file: %w", err)
}

// Find .data files
dataFiles, err := utils.FindDataFiles(input.Folder)
if err != nil {
return output, fmt.Errorf("failed to find .data files: %w", err)
}
output.Summary.DataFilesFound = len(dataFiles)

if len(dataFiles) == 0 {
return output, fmt.Errorf("no .data files found in folder: %s", input.Folder)
}

// Phase B: Parse all .data files and collect unique values
scannedFiles, parseErrors, uniqueFilters, uniqueSpecies, uniqueCalltypes := scanAllDataFiles(dataFiles, input.Folder)
output.Errors = append(output.Errors, parseErrors...)

if len(scannedFiles) == 0 {
output.Summary.ProcessingTimeMs = time.Since(startTime).Milliseconds()
return output, nil
}

// Phase C: Pre-Import Validation
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Validate dataset/location/cluster hierarchy
if err := validateSegmentHierarchy(database, input.DatasetID, input.LocationID, input.ClusterID); err != nil {
return output, err
}

// Validate all filters exist
filterIDMap, err := validateFiltersExist(database, uniqueFilters)
if err != nil {
return output, fmt.Errorf("filter validation failed: %w", err)
}

// Validate mapping covers all species/calltypes and they exist in DB
validationResult, err := utils.ValidateMappingAgainstDB(database, mapping, uniqueSpecies, uniqueCalltypes)
if err != nil {
return output, fmt.Errorf("mapping validation failed: %w", err)
}
if validationResult.HasErrors() {
return output, fmt.Errorf("mapping validation failed: %s", validationResult.Error())
}

// Load species and calltype ID maps
speciesIDMap, calltypeIDMap, err := loadSpeciesCalltypeIDs(database, mapping, uniqueSpecies, uniqueCalltypes)
if err != nil {
return output, fmt.Errorf("failed to load species/calltype IDs: %w", err)
}

// Validate files: hash exists, linked to dataset, no existing labels
fileIDMap, hashErrors := validateAndMapFiles(database, scannedFiles, input.ClusterID, input.DatasetID)
output.Errors = append(output.Errors, hashErrors...)

if len(fileIDMap) == 0 && len(scannedFiles) > 0 {
output.Summary.ProcessingTimeMs = time.Since(startTime).Milliseconds()
return output, nil
}

// Phase D: Transactional Import
importedSegments, importedLabels, importedSubtypes, fileUpdates, importErrors := importSegmentsIntoDB(
ctx, database, fileIDMap, scannedFiles, mapping, filterIDMap, speciesIDMap, calltypeIDMap, input.DatasetID, input.ProgressHandler,
)
output.Errors = append(output.Errors, importErrors...)

// Build output segments
output.Segments = append(output.Segments, importedSegments...)

// Phase E: Write IDs back to .data files
if len(fileUpdates) > 0 {
writeErrors := writeIDsToDataFiles(fileUpdates)
output.Errors = append(output.Errors, writeErrors...)
}

output.Summary.DataFilesProcessed = len(fileIDMap)
output.Summary.TotalSegments = countTotalSegments(fileIDMap)
output.Summary.ImportedSegments = len(importedSegments)
output.Summary.ImportedLabels = importedLabels
output.Summary.ImportedSubtypes = importedSubtypes
output.Summary.ProcessingTimeMs = time.Since(startTime).Milliseconds()

return output, nil
}

// validateSegmentImportInput validates input parameters
func validateSegmentImportInput(input ImportSegmentsInput) error {
// Validate folder exists
if info, err := os.Stat(input.Folder); err != nil {
return fmt.Errorf("folder does not exist: %s", input.Folder)
} else if !info.IsDir() {
return fmt.Errorf("path is not a folder: %s", input.Folder)
}

// Validate mapping file exists
if _, err := os.Stat(input.Mapping); err != nil {
return fmt.Errorf("mapping file does not exist: %s", input.Mapping)
}

// Validate IDs
if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
return err
}
if err := utils.ValidateShortID(input.LocationID, "location_id"); err != nil {
return err
}
if err := utils.ValidateShortID(input.ClusterID, "cluster_id"); err != nil {
return err
}

return nil
}

// validateSegmentHierarchy validates dataset/location/cluster relationships
func validateSegmentHierarchy(dbConn *sql.DB, datasetID, locationID, clusterID string) error {
// Validate dataset exists and is structured
var datasetType string
err := dbConn.QueryRow(`SELECT type FROM dataset WHERE id = ? AND active = true`, datasetID).Scan(&datasetType)
if err == sql.ErrNoRows {
return fmt.Errorf("dataset not found: %s", datasetID)
}
if err != nil {
return fmt.Errorf("failed to query dataset: %w", err)
}
if datasetType != "structured" {
return fmt.Errorf("dataset must be 'structured' type, got: %s", datasetType)
}

// Validate location belongs to dataset
var locationExists bool
err = dbConn.QueryRow(`
SELECT EXISTS(SELECT 1 FROM location WHERE id = ? AND dataset_id = ? AND active = true)
`, locationID, datasetID).Scan(&locationExists)
if err != nil {
return fmt.Errorf("failed to query location: %w", err)
}
if !locationExists {
return fmt.Errorf("location not found or not linked to dataset: %s", locationID)
}

// Validate cluster belongs to location
var clusterExists bool
err = dbConn.QueryRow(`
SELECT EXISTS(SELECT 1 FROM cluster WHERE id = ? AND location_id = ? AND active = true)
`, clusterID, locationID).Scan(&clusterExists)
if err != nil {
return fmt.Errorf("failed to query cluster: %w", err)
}
if !clusterExists {
return fmt.Errorf("cluster not found or not linked to location: %s", clusterID)
}

return nil
}

// scanAllDataFiles parses all .data files and collects unique values
func scanAllDataFiles(dataFiles []string, folder string) (
[]scannedDataFile,
[]ImportSegmentError,
map[string]bool,
map[string]bool,
map[string]map[string]bool,
) {
var scanned []scannedDataFile
var errors []ImportSegmentError
uniqueFilters := make(map[string]bool)
uniqueSpecies := make(map[string]bool)
uniqueCalltypes := make(map[string]map[string]bool) // species -> calltype -> true

for _, dataPath := range dataFiles {
// Find corresponding WAV file
wavPath := strings.TrimSuffix(dataPath, ".data")
if _, err := os.Stat(wavPath); err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(dataPath),
Stage: "validation",
Message: fmt.Sprintf("corresponding WAV file not found: %s", filepath.Base(wavPath)),
})
continue
}

// Parse .data file
df, err := utils.ParseDataFile(dataPath)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(dataPath),
Stage: "validation",
Message: fmt.Sprintf("failed to parse .data file: %v", err),
})
continue
}

// Collect unique filters, species, calltypes
for _, seg := range df.Segments {
for _, label := range seg.Labels {
uniqueFilters[label.Filter] = true
uniqueSpecies[label.Species] = true
if label.CallType != "" {
if uniqueCalltypes[label.Species] == nil {
uniqueCalltypes[label.Species] = make(map[string]bool)
}
uniqueCalltypes[label.Species][label.CallType] = true
}
}
}

scanned = append(scanned, scannedDataFile{
DataPath: dataPath,
WavPath: wavPath,
Duration: df.Meta.Duration,
Segments: df.Segments,
})
}

return scanned, errors, uniqueFilters, uniqueSpecies, uniqueCalltypes
}

// validateFiltersExist checks all filters exist in DB and returns ID map
func validateFiltersExist(dbConn *sql.DB, filterNames map[string]bool) (map[string]string, error) {
filterIDMap := make(map[string]string)

if len(filterNames) == 0 {
return filterIDMap, nil
}

names := make([]string, 0, len(filterNames))
for name := range filterNames {
names = append(names, name)
}

query := `SELECT id, name FROM filter WHERE name IN (` + utils.Placeholders(len(names)) + `) AND active = true`
args := make([]any, len(names))
for i, name := range names {
args[i] = name
}

rows, err := dbConn.Query(query, args...)
if err != nil {
return nil, fmt.Errorf("failed to query filters: %w", err)
}
defer rows.Close()

for rows.Next() {
var id, name string
if err := rows.Scan(&id, &name); err == nil {
filterIDMap[name] = id
}
}

// Check for missing filters
var missing []string
for name := range filterNames {
if _, exists := filterIDMap[name]; !exists {
missing = append(missing, name)
}
}

if len(missing) > 0 {
return nil, fmt.Errorf("filters not found in database: [%s]", strings.Join(missing, ", "))
}

return filterIDMap, nil
}

// loadSpeciesCalltypeIDs loads species and calltype ID maps
func loadSpeciesCalltypeIDs(
dbConn *sql.DB,
mapping utils.MappingFile,
uniqueSpecies map[string]bool,
uniqueCalltypes map[string]map[string]bool,
) (map[string]string, map[string]map[string]string, error) {
speciesIDMap := make(map[string]string)
calltypeIDMap := make(map[string]map[string]string) // (dbSpecies, dbCalltype) -> calltype_id

// Collect all DB species labels from mapping
dbSpeciesSet := make(map[string]bool)
for dataSpecies := range uniqueSpecies {
if dbSpecies, ok := mapping.GetDBSpecies(dataSpecies); ok {
dbSpeciesSet[dbSpecies] = true
}
}

// Load species IDs
if len(dbSpeciesSet) > 0 {
dbSpeciesList := make([]string, 0, len(dbSpeciesSet))
for s := range dbSpeciesSet {
dbSpeciesList = append(dbSpeciesList, s)
}

query := `SELECT id, label FROM species WHERE label IN (` + utils.Placeholders(len(dbSpeciesList)) + `) AND active = true`
args := make([]any, len(dbSpeciesList))
for i, s := range dbSpeciesList {
args[i] = s
}

rows, err := dbConn.Query(query, args...)
if err != nil {
return nil, nil, fmt.Errorf("failed to query species: %w", err)
}
defer rows.Close()

for rows.Next() {
var id, label string
if err := rows.Scan(&id, &label); err == nil {
speciesIDMap[label] = id
}
}
}

// Load calltype IDs
for dataSpecies, ctSet := range uniqueCalltypes {
dbSpecies, ok := mapping.GetDBSpecies(dataSpecies)
if !ok {
continue
}

if calltypeIDMap[dbSpecies] == nil {
calltypeIDMap[dbSpecies] = make(map[string]string)
}

for dataCalltype := range ctSet {
dbCalltype := mapping.GetDBCalltype(dataSpecies, dataCalltype)

// Query calltype ID
var calltypeID string
err := dbConn.QueryRow(`
SELECT ct.id
FROM call_type ct
JOIN species s ON ct.species_id = s.id
WHERE s.label = ? AND ct.label = ? AND ct.active = true
`, dbSpecies, dbCalltype).Scan(&calltypeID)

if err == nil {
calltypeIDMap[dbSpecies][dbCalltype] = calltypeID
}
}
}

return speciesIDMap, calltypeIDMap, nil
}

// validateAndMapFiles validates files exist by hash, are linked to dataset, and have no existing labels
func validateAndMapFiles(
dbConn *sql.DB,
scannedFiles []scannedDataFile,
clusterID string,
datasetID string,
) (map[string]scannedDataFile, []ImportSegmentError) {
fileIDMap := make(map[string]scannedDataFile)
var errors []ImportSegmentError

for _, sf := range scannedFiles {
// Compute hash
hash, err := utils.ComputeXXH64(sf.WavPath)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.WavPath),
Stage: "hash",
Message: fmt.Sprintf("failed to compute hash: %v", err),
})
continue
}
sf.WavHash = hash

// Find file by hash in cluster
var fileID string
var duration float64
err = dbConn.QueryRow(`
SELECT id, duration FROM file WHERE xxh64_hash = ? AND cluster_id = ? AND active = true
`, hash, clusterID).Scan(&fileID, &duration)

if err == sql.ErrNoRows {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.WavPath),
Stage: "validation",
Message: fmt.Sprintf("file hash not found in database for cluster (hash: %s)", hash),
})
continue
}
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.WavPath),
Stage: "validation",
Message: fmt.Sprintf("failed to query file: %v", err),
})
continue
}

sf.FileID = fileID
sf.Duration = duration

// Verify file is linked to dataset via file_dataset junction table (composite FK)
var fileLinkedToDataset bool
err = dbConn.QueryRow(`
SELECT EXISTS(SELECT 1 FROM file_dataset WHERE file_id = ? AND dataset_id = ?)
`, fileID, datasetID).Scan(&fileLinkedToDataset)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.WavPath),
Stage: "validation",
Message: fmt.Sprintf("failed to verify file-dataset link: %v", err),
})
continue
}
if !fileLinkedToDataset {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.WavPath),
Stage: "validation",
Message: fmt.Sprintf("file exists in cluster but is not linked to dataset %s", datasetID),
})
continue
}

// Check no existing labels for this file
var labelCount int
err = dbConn.QueryRow(`
SELECT COUNT(*) FROM label l
JOIN segment s ON l.segment_id = s.id
WHERE s.file_id = ? AND l.active = true
`, fileID).Scan(&labelCount)

if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.WavPath),
Stage: "validation",
Message: fmt.Sprintf("failed to check existing labels: %v", err),
})
continue
}

if labelCount > 0 {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.WavPath),
Stage: "validation",
Message: fmt.Sprintf("file already has %d label(s) - fresh imports only", labelCount),
})
continue
}

fileIDMap[fileID] = sf
}

return fileIDMap, errors
}

// dataFileUpdate holds data to write back to .data file after import
type dataFileUpdate struct {
DataPath string
WavHash string
LabelIDs map[int]map[int]string // segmentIndex -> labelIndex -> labelID
}

// importSegmentsIntoDB performs the transactional import
func importSegmentsIntoDB(
ctx context.Context,
database *sql.DB,
fileIDMap map[string]scannedDataFile,
scannedFiles []scannedDataFile,
mapping utils.MappingFile,
filterIDMap map[string]string,
speciesIDMap map[string]string,
calltypeIDMap map[string]map[string]string,
datasetID string,
progressHandler func(processed, total int, message string),
) ([]SegmentImport, int, int, []dataFileUpdate, []ImportSegmentError) {
var importedSegments []SegmentImport
var errors []ImportSegmentError
importedLabels := 0
importedSubtypes := 0
var fileUpdates []dataFileUpdate

// Begin transaction
tx, err := db.BeginLoggedTx(ctx, database, "import_segments")
if err != nil {
errors = append(errors, ImportSegmentError{
Stage: "import",
Message: fmt.Sprintf("failed to begin transaction: %v", err),
})
return nil, 0, 0, nil, errors
}
defer tx.Rollback()

// Process each validated file
totalFiles := len(fileIDMap)
processedFiles := 0

for _, sf := range fileIDMap {
if sf.FileID == "" {
continue // Was filtered out during validation
}

processedFiles++
if progressHandler != nil {
progressHandler(processedFiles, totalFiles, filepath.Base(sf.DataPath))
}

// Track label IDs for writing back to .data file
fileUpdate := dataFileUpdate{
DataPath: sf.DataPath,
WavHash: sf.WavHash,
LabelIDs: make(map[int]map[int]string),
}

// Process segments
for segIdx, seg := range sf.Segments {
// Validate segment bounds
if seg.StartTime >= seg.EndTime {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("invalid segment bounds: start=%.2f >= end=%.2f", seg.StartTime, seg.EndTime),
})
continue
}

if seg.EndTime > sf.Duration {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("segment end time (%.2f) exceeds file duration (%.2f)", seg.EndTime, sf.Duration),
})
continue
}

// Insert segment
segmentID, err := utils.GenerateLongID()
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to generate segment ID: %v", err),
})
continue
}

_, err = tx.ExecContext(ctx, `
INSERT INTO segment (id, file_id, dataset_id, start_time, end_time, freq_low, freq_high, created_at, last_modified, active)
VALUES (?, ?, ?, ?, ?, ?, ?, now(), now(), true)
`, segmentID, sf.FileID, datasetID, seg.StartTime, seg.EndTime, seg.FreqLow, seg.FreqHigh)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to insert segment: %v", err),
})
continue
}

// Process labels
var segmentImport SegmentImport
segmentImport.SegmentID = segmentID
segmentImport.FileName = filepath.Base(sf.WavPath)
segmentImport.StartTime = seg.StartTime
segmentImport.EndTime = seg.EndTime
segmentImport.FreqLow = seg.FreqLow
segmentImport.FreqHigh = seg.FreqHigh
segmentImport.Labels = make([]LabelImport, 0)

fileUpdate.LabelIDs[segIdx] = make(map[int]string)

for labelIdx, label := range seg.Labels {
// Get DB species and calltype
dbSpecies, ok := mapping.GetDBSpecies(label.Species)
if !ok {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("species not found in mapping: %s", label.Species),
})
continue
}

speciesID, ok := speciesIDMap[dbSpecies]
if !ok {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("species ID not found: %s", dbSpecies),
})
continue
}

filterID, ok := filterIDMap[label.Filter]
if !ok {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("filter ID not found: %s", label.Filter),
})
continue
}

// Insert label
labelID, err := utils.GenerateLongID()
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to generate label ID: %v", err),
})
continue
}

_, err = tx.ExecContext(ctx, `
INSERT INTO label (id, segment_id, species_id, filter_id, certainty, created_at, last_modified, active)
VALUES (?, ?, ?, ?, ?, now(), now(), true)
`, labelID, segmentID, speciesID, filterID, label.Certainty)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to insert label: %v", err),
})
continue
}
importedLabels++

// Track label ID for .data file update
fileUpdate.LabelIDs[segIdx][labelIdx] = labelID

// Insert label_metadata if comment exists
if label.Comment != "" {
escapedComment := strings.ReplaceAll(label.Comment, `"`, `\"`)
metadataJSON := fmt.Sprintf(`{"comment": "%s"}`, escapedComment)

_, err = tx.ExecContext(ctx, `
INSERT INTO label_metadata (label_id, json, created_at, last_modified, active)
VALUES (?, ?, now(), now(), true)
`, labelID, metadataJSON)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to insert label_metadata: %v", err),
})
continue
}
}

// Build label import for output
labelImport := LabelImport{
LabelID: labelID,
Species: dbSpecies,
Filter: label.Filter,
Certainty: label.Certainty,
}
if label.Comment != "" {
labelImport.Comment = label.Comment
}

// Insert label_subtype if calltype exists
if label.CallType != "" {
dbCalltype := mapping.GetDBCalltype(label.Species, label.CallType)

calltypeID := ""
if calltypeIDMap[dbSpecies] != nil {
calltypeID = calltypeIDMap[dbSpecies][dbCalltype]
}

if calltypeID == "" {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("calltype ID not found: %s/%s", dbSpecies, dbCalltype),
})
continue
}

subtypeID, err := utils.GenerateLongID()
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to generate label_subtype ID: %v", err),
})
continue
}

_, err = tx.ExecContext(ctx, `
INSERT INTO label_subtype (id, label_id, calltype_id, filter_id, certainty, created_at, last_modified, active)
VALUES (?, ?, ?, ?, ?, now(), now(), true)
`, subtypeID, labelID, calltypeID, filterID, label.Certainty)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to insert label_subtype: %v", err),
})
continue
}
importedSubtypes++

labelImport.CallType = dbCalltype
}

segmentImport.Labels = append(segmentImport.Labels, labelImport)
}

// If no labels succeeded, delete the orphaned segment
if len(segmentImport.Labels) == 0 {
_, err = tx.ExecContext(ctx, `DELETE FROM segment WHERE id = ?`, segmentID)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(sf.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to delete orphaned segment: %v", err),
})
}
// Remove from fileUpdate since no labels were imported
delete(fileUpdate.LabelIDs, segIdx)
} else {
importedSegments = append(importedSegments, segmentImport)
}
}

fileUpdates = append(fileUpdates, fileUpdate)
}

// Commit transaction
if err := tx.Commit(); err != nil {
errors = append(errors, ImportSegmentError{
Stage: "import",
Message: fmt.Sprintf("failed to commit transaction: %v", err),
})
return nil, 0, 0, nil, errors
}

return importedSegments, importedLabels, importedSubtypes, fileUpdates, errors
}

// countTotalSegments counts total segments from validated files
func countTotalSegments(fileIDMap map[string]scannedDataFile) int {
count := 0
for _, sf := range fileIDMap {
count += len(sf.Segments)
}
return count
}

// writeIDsToDataFiles writes skraak_hash and skraak_label_ids back to .data files
func writeIDsToDataFiles(fileUpdates []dataFileUpdate) []ImportSegmentError {
var errors []ImportSegmentError

for _, fu := range fileUpdates {
// Parse the .data file
df, err := utils.ParseDataFile(fu.DataPath)
if err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(fu.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to re-parse .data file for writing: %v", err),
})
continue
}

// Write skraak_hash to metadata
if df.Meta.Extra == nil {
df.Meta.Extra = make(map[string]any)
}
df.Meta.Extra["skraak_hash"] = fu.WavHash

// Write skraak_label_id to each label
for segIdx, labelIDs := range fu.LabelIDs {
if segIdx >= len(df.Segments) {
continue
}
seg := df.Segments[segIdx]
for labelIdx, labelID := range labelIDs {
if labelIdx >= len(seg.Labels) {
continue
}
label := seg.Labels[labelIdx]
if label.Extra == nil {
label.Extra = make(map[string]any)
}
label.Extra["skraak_label_id"] = labelID
}
}

// Write the updated .data file
if err := df.Write(fu.DataPath); err != nil {
errors = append(errors, ImportSegmentError{
File: filepath.Base(fu.DataPath),
Stage: "import",
Message: fmt.Sprintf("failed to write updated .data file: %v", err),
})
continue
}
}

return errors
}
file addition: import_files.go (----------)

[0.248737]

package tools

import (
"context"
"database/sql"
"fmt"
"os"
"time"

"skraak/db"
"skraak/utils"
)

// ImportAudioFilesInput defines the input parameters for the import_audio_files tool
type ImportAudioFilesInput struct {
FolderPath string `json:"folder_path"`
DatasetID string `json:"dataset_id"`
LocationID string `json:"location_id"`
ClusterID string `json:"cluster_id"`
Recursive *bool `json:"recursive,omitempty"` // *bool because default is true; plain bool would make "not provided" indistinguishable from "false"
}

// ImportAudioFilesOutput defines the output structure for the import_audio_files tool
type ImportAudioFilesOutput struct {
Summary ImportSummary `json:"summary"`
FileIDs []string `json:"file_ids"`
Errors []utils.FileImportError `json:"errors,omitempty"`
}

// ImportSummary provides summary statistics for the import operation
type ImportSummary struct {
TotalFiles int `json:"total_files"`
ImportedFiles int `json:"imported_files"`
SkippedFiles int `json:"skipped_files"` // Duplicates
FailedFiles int `json:"failed_files"`
AudioMothFiles int `json:"audiomoth_files"`
TotalDuration float64 `json:"total_duration_seconds"`
ProcessingTime string `json:"processing_time"`
}

// ImportAudioFiles batch imports WAV files from a folder with hash-based duplicate detection
func ImportAudioFiles(
ctx context.Context,
input ImportAudioFilesInput,
) (ImportAudioFilesOutput, error) {
startTime := time.Now()
var output ImportAudioFilesOutput

// Default recursive to true
recursive := true
if input.Recursive != nil {
recursive = *input.Recursive
}

// Validate database hierarchy (dataset → location → cluster)
if err := validateImportInput(input, dbPath); err != nil {
return output, fmt.Errorf("validation failed: %w", err)
}

// Open database
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Set cluster path if empty
err = utils.EnsureClusterPath(database, input.ClusterID, input.FolderPath)
if err != nil {
return output, fmt.Errorf("failed to set cluster path: %w", err)
}

// Import the cluster (ALL THE LOGIC IS HERE)
clusterOutput, err := utils.ImportCluster(database, utils.ClusterImportInput{
FolderPath: input.FolderPath,
DatasetID: input.DatasetID,
LocationID: input.LocationID,
ClusterID: input.ClusterID,
Recursive: recursive,
})
if err != nil {
return output, fmt.Errorf("cluster import failed: %w", err)
}

// Map to output format
output = ImportAudioFilesOutput{
Summary: ImportSummary{
TotalFiles: clusterOutput.TotalFiles,
ImportedFiles: clusterOutput.ImportedFiles,
SkippedFiles: clusterOutput.SkippedFiles,
FailedFiles: clusterOutput.FailedFiles,
AudioMothFiles: clusterOutput.AudioMothFiles,
TotalDuration: clusterOutput.TotalDuration,
ProcessingTime: time.Since(startTime).String(),
},
FileIDs: []string{}, // File IDs not tracked currently
Errors: clusterOutput.Errors,
}

return output, nil
}

// validateImportInput validates all input parameters and database relationships
func validateImportInput(input ImportAudioFilesInput, dbPath string) error {
// Verify folder exists
info, err := os.Stat(input.FolderPath)
if err != nil {
return fmt.Errorf("folder not accessible: %w", err)
}
if !info.IsDir() {
return fmt.Errorf("path is not a directory: %s", input.FolderPath)
}

return validateHierarchyIDs(input.DatasetID, input.LocationID, input.ClusterID, dbPath)
}

// validateHierarchyIDs validates dataset/location/cluster ID formats and database relationships
func validateHierarchyIDs(datasetID, locationID, clusterID, dbPath string) error {
// Validate ID formats first (fast fail before DB queries)
if err := utils.ValidateShortID(datasetID, "dataset_id"); err != nil {
return err
}
if err := utils.ValidateShortID(locationID, "location_id"); err != nil {
return err
}
if err := utils.ValidateShortID(clusterID, "cluster_id"); err != nil {
return err
}

// Open database for validation queries
database, err := db.OpenReadOnlyDB(dbPath)
if err != nil {
return fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Verify dataset exists and is active
var datasetExists bool
err = database.QueryRow("SELECT EXISTS(SELECT 1 FROM dataset WHERE id = ? AND active = true)", datasetID).Scan(&datasetExists)
if err != nil {
return fmt.Errorf("failed to query dataset: %w", err)
}
if !datasetExists {
return fmt.Errorf("dataset not found or inactive: %s", datasetID)
}

// Verify dataset is 'structured' type (file imports only support structured datasets)
if err := utils.ValidateDatasetTypeForImport(database, datasetID); err != nil {
return err
}

// Verify location exists and belongs to dataset
var locationDatasetID string
err = database.QueryRow("SELECT dataset_id FROM location WHERE id = ? AND active = true", locationID).Scan(&locationDatasetID)
if err == sql.ErrNoRows {
return fmt.Errorf("location not found or inactive: %s", locationID)
}
if err != nil {
return fmt.Errorf("failed to query location: %w", err)
}
if locationDatasetID != datasetID {
return fmt.Errorf("location %s does not belong to dataset %s", locationID, datasetID)
}

// Verify cluster exists and belongs to location
var clusterLocationID string
err = database.QueryRow("SELECT location_id FROM cluster WHERE id = ? AND active = true", clusterID).Scan(&clusterLocationID)
if err == sql.ErrNoRows {
return fmt.Errorf("cluster not found or inactive: %s", clusterID)
}
if err != nil {
return fmt.Errorf("failed to query cluster: %w", err)
}
if clusterLocationID != locationID {
return fmt.Errorf("cluster %s does not belong to location %s", clusterID, locationID)
}

return nil
}
file addition: import_file.go (----------)

[0.248737]

package tools

import (
"context"
"database/sql"
"fmt"
"os"
"path/filepath"
"strings"
"time"

"skraak/db"
"skraak/utils"
)

// ImportFileInput defines the input parameters for the import_file tool
type ImportFileInput struct {
FilePath string `json:"file_path"`
DatasetID string `json:"dataset_id"`
LocationID string `json:"location_id"`
ClusterID string `json:"cluster_id"`
}

// ImportFileOutput defines the output structure for the import_file tool
type ImportFileOutput struct {
FileID string `json:"file_id"`
FileName string `json:"file_name"`
Hash string `json:"hash"`
Duration float64 `json:"duration_seconds"`
SampleRate int `json:"sample_rate"`
TimestampLocal time.Time `json:"timestamp_local"`
IsAudioMoth bool `json:"is_audiomoth"`
IsDuplicate bool `json:"is_duplicate"`
ProcessingTime string `json:"processing_time"`
Error *string `json:"error,omitempty"`
}

// ImportFile imports a single WAV file into the database with duplicate detection
func ImportFile(
ctx context.Context,
input ImportFileInput,
) (ImportFileOutput, error) {
startTime := time.Now()
var output ImportFileOutput

// Phase 1: Validate file path
_, err := validateFilePath(input.FilePath)
if err != nil {
return output, fmt.Errorf("file validation failed: %w", err)
}
output.FileName = filepath.Base(input.FilePath)

// Phase 2: Validate database hierarchy
if err := validateHierarchyIDs(input.DatasetID, input.LocationID, input.ClusterID, dbPath); err != nil {
return output, fmt.Errorf("hierarchy validation failed: %w", err)
}

// Phase 3: Open database connection (single connection for all DB operations)
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("database connection failed: %w", err)
}
defer database.Close()

// Phase 4: Get location data for astronomical calculations
locData, err := utils.GetLocationData(database, input.LocationID)
if err != nil {
return output, fmt.Errorf("failed to get location data: %w", err)
}

// Phase 5: Process file metadata
result, err := utils.ProcessSingleFile(input.FilePath, locData.Latitude, locData.Longitude, locData.TimezoneID, true)
if err != nil {
errMsg := err.Error()
output.Error = &errMsg
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("file processing failed: %w", err)
}

// Populate output with extracted metadata
output.FileName = result.FileName
output.Hash = result.Hash
output.Duration = result.Duration
output.SampleRate = result.SampleRate
output.TimestampLocal = result.TimestampLocal
output.IsAudioMoth = result.IsAudioMoth

// Phase 6: Ensure cluster path is set
if err := utils.EnsureClusterPath(database, input.ClusterID, filepath.Dir(input.FilePath)); err != nil {
return output, fmt.Errorf("failed to set cluster path: %w", err)
}

// Phase 7: Insert into database
fileID, isDuplicate, err := insertFileIntoDB(ctx, database, result, input.DatasetID, input.ClusterID, input.LocationID)
if err != nil {
errMsg := err.Error()
output.Error = &errMsg
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("database insertion failed: %w", err)
}

output.FileID = fileID
output.IsDuplicate = isDuplicate
output.ProcessingTime = time.Since(startTime).String()

return output, nil
}

// validateFilePath validates the file exists, is a regular file, is a WAV file, and is not empty
func validateFilePath(filePath string) (os.FileInfo, error) {
// Check file exists
info, err := os.Stat(filePath)
if err != nil {
if os.IsNotExist(err) {
return nil, fmt.Errorf("file does not exist: %s", filePath)
}
return nil, fmt.Errorf("cannot access file: %w", err)
}

// Check it's a regular file
if !info.Mode().IsRegular() {
return nil, fmt.Errorf("path is not a regular file: %s", filePath)
}

// Check extension is .wav (case-insensitive)
ext := strings.ToLower(filepath.Ext(filePath))
if ext != ".wav" {
return nil, fmt.Errorf("file must be a WAV file (got extension: %s)", ext)
}

// Check file is not empty
if info.Size() == 0 {
return nil, fmt.Errorf("file is empty: %s", filePath)
}

return info, nil
}

// insertFileIntoDB inserts a single file into the database
// Returns (fileID, isDuplicate, error)
func insertFileIntoDB(
ctx context.Context,
database *sql.DB,
result *utils.FileProcessingResult,
datasetID, clusterID, locationID string,
) (string, bool, error) {
// Begin logged transaction
tx, err := db.BeginLoggedTx(ctx, database, "import_audio_file")
if err != nil {
return "", false, fmt.Errorf("failed to begin transaction: %w", err)
}
defer tx.Rollback() // Rollback if not committed

// Check for duplicate hash
existingID, isDup, err := utils.CheckDuplicateHash(tx, result.Hash)
if err != nil {
return "", false, err
}
if isDup {
return existingID, true, nil
}

// Generate file ID
fileID, err := utils.GenerateLongID()
if err != nil {
return "", false, fmt.Errorf("ID generation failed: %w", err)
}

// Insert file record
_, err = tx.ExecContext(ctx, `
INSERT INTO file (
id, file_name, xxh64_hash, location_id, timestamp_local,
cluster_id, duration, sample_rate, maybe_solar_night, maybe_civil_night,
moon_phase, created_at, last_modified, active
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, now(), now(), true)
`,
fileID, result.FileName, result.Hash, locationID,
result.TimestampLocal, clusterID, result.Duration, result.SampleRate,
result.AstroData.SolarNight, result.AstroData.CivilNight, result.AstroData.MoonPhase,
)
if err != nil {
return "", false, fmt.Errorf("file insert failed: %w", err)
}

// Insert file_dataset junction
_, err = tx.ExecContext(ctx, `
INSERT INTO file_dataset (file_id, dataset_id, created_at, last_modified)
VALUES (?, ?, now(), now())
`, fileID, datasetID)
if err != nil {
return "", false, fmt.Errorf("file_dataset insert failed: %w", err)
}

// If AudioMoth, insert moth_metadata
if result.IsAudioMoth && result.MothData != nil {
_, err = tx.ExecContext(ctx, `
INSERT INTO moth_metadata (
file_id, timestamp, recorder_id, gain, battery_v, temp_c,
created_at, last_modified, active
) VALUES (?, ?, ?, ?, ?, ?, now(), now(), true)
`,
fileID,
result.MothData.Timestamp,
&result.MothData.RecorderID,
&result.MothData.Gain,
&result.MothData.BatteryV,
&result.MothData.TempC,
)
if err != nil {
return "", false, fmt.Errorf("moth_metadata insert failed: %w", err)
}
}

// Commit transaction
if err = tx.Commit(); err != nil {
return "", false, fmt.Errorf("transaction commit failed: %w", err)
}

return fileID, false, nil
}
file addition: export.go (----------)

[0.248737]

package tools

import (
"context"
"database/sql"
"fmt"
"os"
"path/filepath"
"sort"
"strings"

"skraak/db"
)

// ExportDatasetInput defines the input parameters for the export dataset tool
type ExportDatasetInput struct {
DatasetID string `json:"dataset_id"`
Output string `json:"output"`
DryRun bool `json:"dry_run"`
Force bool `json:"force"`
}

// ExportDatasetOutput defines the output structure
type ExportDatasetOutput struct {
DatasetID string `json:"dataset_id"`
DatasetName string `json:"dataset_name"`
OutputPath string `json:"output_path"`
RowCounts map[string]int64 `json:"row_counts"`
FileSizeMB float64 `json:"file_size_mb,omitempty"`
DryRun bool `json:"dry_run"`
Message string `json:"message"`
}

// TableRelationship defines how a table relates to a dataset
type TableRelationship struct {
Table string // table name
Relation string // "owned" | "owned-via" | "copy"
FilterCol string // column to filter on
ViaTable string // for owned-via: table to join through
}

// Dataset tables manifest - defines how each table relates to a dataset
var datasetTables = []TableRelationship{
// Owned directly
{Table: "dataset", Relation: "owned", FilterCol: "id"},
{Table: "location", Relation: "owned", FilterCol: "dataset_id"},
{Table: "cluster", Relation: "owned", FilterCol: "dataset_id"},
{Table: "segment", Relation: "owned", FilterCol: "dataset_id"},
{Table: "file_dataset", Relation: "owned", FilterCol: "dataset_id"},

// Owned via FK chain
{Table: "file", Relation: "owned-via", FilterCol: "cluster_id", ViaTable: "cluster"},
{Table: "moth_metadata", Relation: "owned-via", FilterCol: "file_id", ViaTable: "file"},
{Table: "file_metadata", Relation: "owned-via", FilterCol: "file_id", ViaTable: "file"},
{Table: "label_metadata", Relation: "owned-via", FilterCol: "label_id", ViaTable: "label"},
{Table: "label", Relation: "owned-via", FilterCol: "segment_id", ViaTable: "segment"},
{Table: "label_subtype", Relation: "owned-via", FilterCol: "label_id", ViaTable: "label"},

// Referenced (subset extraction) - none remaining

// Copied as-is (no filtering)
{Table: "ebird_taxonomy", Relation: "copy"},
{Table: "species", Relation: "copy"},
{Table: "call_type", Relation: "copy"},
{Table: "cyclic_recording_pattern", Relation: "copy"},
{Table: "filter", Relation: "copy"},
}

// ExportDataset exports a single dataset with all related data to a new database
// Note: this fails if exporting from a db with FK constraints removed (sometimes
// I remove them as duckdb is a pain when editing records due to indexes and FK's,
// it removes then reinserts therefore violating constraints)
func ExportDataset(
ctx context.Context,
input ExportDatasetInput,
) (ExportDatasetOutput, error) {
var output ExportDatasetOutput
output.DatasetID = input.DatasetID
output.OutputPath = input.Output
output.DryRun = input.DryRun
output.RowCounts = make(map[string]int64)

// Open source database (read-only for safety)
sourceDB, err := db.OpenReadOnlyDB(dbPath)
if err != nil {
return output, fmt.Errorf("failed to open source database: %w", err)
}

// Verify dataset exists and get name/type
var datasetName, datasetType string
err = sourceDB.QueryRowContext(ctx,
"SELECT name, type FROM dataset WHERE id = ? AND active = true",
input.DatasetID,
).Scan(&datasetName, &datasetType)
if err != nil {
sourceDB.Close()
return output, fmt.Errorf("dataset not found: %s", input.DatasetID)
}
output.DatasetName = datasetName

// Only structured datasets can be exported
if datasetType != "structured" {
sourceDB.Close()
return output, fmt.Errorf("cannot export dataset of type '%s': only structured datasets are supported", datasetType)
}

// Check if output file exists
if !input.DryRun {
if _, err := os.Stat(input.Output); err == nil && !input.Force {
sourceDB.Close()
return output, fmt.Errorf("output file exists: %s (use --force to overwrite)", input.Output)
}
}

// Get FK order for tables
fkOrder, err := db.GetFKOrder(sourceDB)
if err != nil {
sourceDB.Close()
return output, fmt.Errorf("failed to compute table order: %w", err)
}

// Sort our manifest by FK order
orderedTables := orderByFKDependency(datasetTables, fkOrder)

// Calculate row counts for each table
for _, tr := range orderedTables {
count, err := countTableRows(ctx, sourceDB, tr, input.DatasetID)
if err != nil {
sourceDB.Close()
return output, fmt.Errorf("failed to count rows in %s: %w", tr.Table, err)
}
if count > 0 {
output.RowCounts[tr.Table] = count
}
}

// If dry-run, return now
if input.DryRun {
sourceDB.Close()
output.Message = fmt.Sprintf("Would export dataset '%s' (%s)", datasetName, input.DatasetID)
return output, nil
}

// Close source DB before creating output (DuckDB can't attach same file twice)
sourceDB.Close()

// Create output directory if needed
outputDir := filepath.Dir(input.Output)
if outputDir != "" && outputDir != "." {
if err := os.MkdirAll(outputDir, 0755); err != nil {
return output, fmt.Errorf("failed to create output directory: %w", err)
}
}

// Create output database
outputDB, err := createOutputDatabase(input.Output)
if err != nil {
return output, fmt.Errorf("failed to create output database: %w", err)
}
defer outputDB.Close()

// Attach source database
_, err = outputDB.ExecContext(ctx, fmt.Sprintf("ATTACH '%s' AS source", dbPath))
if err != nil {
return output, fmt.Errorf("failed to attach source database: %w", err)
}

// Copy data in FK order
for _, tr := range orderedTables {
if tr.Relation == "copy" {
// Copy entire table as-is
err = copyTableAsIs(ctx, outputDB, tr.Table)
} else {
// Owned or owned-via: filter by dataset
err = copyTableData(ctx, outputDB, tr, input.DatasetID)
}
if err != nil {
return output, fmt.Errorf("failed to copy %s: %w", tr.Table, err)
}
}

// Detach source
_, err = outputDB.ExecContext(ctx, "DETACH source")
if err != nil {
return output, fmt.Errorf("failed to detach source database: %w", err)
}

// Close output DB before getting file size
outputDB.Close()
outputDB = nil

// Get file size
if info, err := os.Stat(input.Output); err == nil {
output.FileSizeMB = float64(info.Size()) / 1024 / 1024
}

// Create empty event log file
eventLogPath := input.Output + ".events.jsonl"
eventFile, err := os.Create(eventLogPath)
if err != nil {
return output, fmt.Errorf("failed to create event log file: %w", err)
}
if err := eventFile.Close(); err != nil {
return output, fmt.Errorf("failed to close event log file: %w", err)
}

output.Message = fmt.Sprintf("Successfully exported dataset '%s' (%s) to %s",
datasetName, input.DatasetID, input.Output)

return output, nil
}

// createOutputDatabase creates a new database with the schema
func createOutputDatabase(outputPath string) (*sql.DB, error) {
// Remove existing file if any
os.Remove(outputPath)

// Open new database connection
connStr := outputPath + "?access_mode=read_write"
database, err := sql.Open("duckdb", connStr)
if err != nil {
return nil, fmt.Errorf("failed to create output database: %w", err)
}

// Read and execute schema
schemaSQL, err := db.ReadSchemaSQL()
if err != nil {
database.Close()
return nil, fmt.Errorf("failed to read schema: %w", err)
}

statements := db.ExtractDDLStatements(schemaSQL)
for _, stmt := range statements {
// Skip CREATE TABLE AS SELECT statements - they don't work on empty database
if stmt.Type == "CREATE_TABLE_AS" {
continue
}
if _, err := database.Exec(stmt.SQL); err != nil {
// Ignore "already exists" errors for types
if !strings.Contains(err.Error(), "already exists") {
database.Close()
return nil, fmt.Errorf("failed to execute DDL for %s: %w", stmt.TableName, err)
}
}
}

return database, nil
}

// copyTableAsIs copies an entire table without filtering.
// Table names are interpolated via Sprintf because SQL parameterization doesn't support
// identifiers (table/column names) — only values. This is safe because all table names
// come from the hardcoded datasetTables manifest, never from user input.
func copyTableAsIs(ctx context.Context, outputDB *sql.DB, table string) error {
query := fmt.Sprintf("INSERT INTO %s SELECT * FROM source.%s", table, table)
_, err := outputDB.ExecContext(ctx, query)
return err
}

// copyTableData copies data from source to output database
func copyTableData(ctx context.Context, outputDB *sql.DB, tr TableRelationship, datasetID string) error {
var query string

switch tr.Relation {
case "owned":
// Direct filter on dataset_id (or id for dataset table)
if tr.Table == "dataset" {
query = fmt.Sprintf("INSERT INTO %s SELECT * FROM source.%s WHERE id = ?", tr.Table, tr.Table)
} else {
query = fmt.Sprintf("INSERT INTO %s SELECT * FROM source.%s WHERE dataset_id = ?", tr.Table, tr.Table)
}

case "owned-via":
// Filter via FK chain
query = buildOwnedViaQuery(tr, datasetID)

default:
return fmt.Errorf("unknown relation type: %s", tr.Relation)
}

_, err := outputDB.ExecContext(ctx, query, datasetID)
return err
}

// buildOwnedViaQuery builds a query for owned-via tables
func buildOwnedViaQuery(tr TableRelationship, datasetID string) string {
switch tr.ViaTable {
case "cluster":
return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%s
WHERE %s IN (SELECT id FROM source.cluster WHERE dataset_id = ?)`,
tr.Table, tr.Table, tr.FilterCol)
case "file":
return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%s
WHERE %s IN (SELECT id FROM source.file WHERE cluster_id IN
(SELECT id FROM source.cluster WHERE dataset_id = ?))`,
tr.Table, tr.Table, tr.FilterCol)
case "segment":
return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%s
WHERE %s IN (SELECT id FROM source.segment WHERE dataset_id = ?)`,
tr.Table, tr.Table, tr.FilterCol)
case "label":
return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%s
WHERE %s IN (SELECT id FROM source.label WHERE segment_id IN
(SELECT id FROM source.segment WHERE dataset_id = ?))`,
tr.Table, tr.Table, tr.FilterCol)
default:
// Generic fallback
return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%s WHERE %s IN
(SELECT id FROM source.%s WHERE dataset_id = ?)`,
tr.Table, tr.Table, tr.FilterCol, tr.ViaTable)
}
}

// countTableRows counts rows for a table relationship
func countTableRows(ctx context.Context, db *sql.DB, tr TableRelationship, datasetID string) (int64, error) {
var query string

switch tr.Relation {
case "copy":
// Count all rows in table
query = "SELECT COUNT(*) FROM " + tr.Table
case "owned":
if tr.Table == "dataset" {
query = "SELECT COUNT(*) FROM " + tr.Table + " WHERE id = ?"
} else {
query = "SELECT COUNT(*) FROM " + tr.Table + " WHERE dataset_id = ?"
}
case "owned-via":
query = buildCountOwnedViaQuery(tr)
default:
return 0, nil
}

var count int64
err := db.QueryRowContext(ctx, query, datasetID).Scan(&count)
return count, err
}

// buildCountOwnedViaQuery builds a count query for owned-via tables
func buildCountOwnedViaQuery(tr TableRelationship) string {
switch tr.ViaTable {
case "cluster":
return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN
(SELECT id FROM cluster WHERE dataset_id = ?)`, tr.Table, tr.FilterCol)
case "file":
return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN
(SELECT id FROM file WHERE cluster_id IN
(SELECT id FROM cluster WHERE dataset_id = ?))`, tr.Table, tr.FilterCol)
case "segment":
return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN
(SELECT id FROM segment WHERE dataset_id = ?)`, tr.Table, tr.FilterCol)
case "label":
return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN
(SELECT id FROM label WHERE segment_id IN
(SELECT id FROM segment WHERE dataset_id = ?))`, tr.Table, tr.FilterCol)
default:
return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN
(SELECT id FROM %s WHERE dataset_id = ?)`, tr.Table, tr.FilterCol, tr.ViaTable)
}
}

// orderByFKDependency sorts tables by FK dependency order
func orderByFKDependency(tables []TableRelationship, fkOrder []string) []TableRelationship {
// Create a map for quick order lookup
orderMap := make(map[string]int)
for i, table := range fkOrder {
orderMap[table] = i
}

// Sort by FK order
sorted := make([]TableRelationship, len(tables))
copy(sorted, tables)

sort.Slice(sorted, func(i, j int) bool {
ti, tj := sorted[i], sorted[j]
oi := orderMap[ti.Table]
oj := orderMap[tj.Table]
return oi < oj
})

return sorted
}
file addition: dataset.go (----------)

[0.248737]

package tools

import (
"context"
"fmt"
"skraak/db"
"skraak/utils"
"strings"
)

// DatasetInput defines the input parameters for the create_or_update_dataset tool
type DatasetInput struct {
ID *string `json:"id,omitempty"`
Name *string `json:"name,omitempty"`
Description *string `json:"description,omitempty"`
Type *string `json:"type,omitempty"`
}

// DatasetOutput defines the output structure
type DatasetOutput struct {
Dataset db.Dataset `json:"dataset"`
Message string `json:"message"`
}

// CreateOrUpdateDataset creates a new dataset or updates an existing one
func CreateOrUpdateDataset(
ctx context.Context,
input DatasetInput,
) (DatasetOutput, error) {
if input.ID != nil && strings.TrimSpace(*input.ID) != "" {
return updateDataset(ctx, input)
}
return createDataset(ctx, input)
}

func createDataset(ctx context.Context, input DatasetInput) (DatasetOutput, error) {
var output DatasetOutput

// Validate name (required for create)
if input.Name == nil || strings.TrimSpace(*input.Name) == "" {
return output, fmt.Errorf("name is required when creating a dataset")
}
if err := utils.ValidateStringLength(*input.Name, "name", utils.MaxDatasetNameLen); err != nil {
return output, err
}

// Validate description length if provided
if err := utils.ValidateOptionalStringLength(input.Description, "description", utils.MaxDescriptionLen); err != nil {
return output, err
}

// Validate and set type
datasetType := db.DatasetTypeStructured // Default
if input.Type != nil {
typeStr := strings.ToLower(strings.TrimSpace(*input.Type))
switch typeStr {
case "structured":
datasetType = db.DatasetTypeStructured
case "unstructured":
datasetType = db.DatasetTypeUnstructured
case "test":
datasetType = db.DatasetTypeTest
case "train":
datasetType = db.DatasetTypeTrain
default:
return output, fmt.Errorf("invalid type '%s': must be 'structured', 'unstructured', 'test', or 'train'", *input.Type)
}
}

// Open writable database connection
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("database connection failed: %w", err)
}
defer database.Close()

// Begin logged transaction
tx, err := db.BeginLoggedTx(ctx, database, "create_or_update_dataset")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

// Check for existing dataset with same name (UNIQUE constraint)
var existingID string
err = tx.QueryRowContext(ctx,
"SELECT id FROM dataset WHERE name = ? AND active = true",
*input.Name,
).Scan(&existingID)

if err == nil {
// Dataset with this name already exists - return existing (consistent duplicate handling)
var dataset db.Dataset
err = tx.QueryRowContext(ctx,
"SELECT id, name, description, created_at, last_modified, active, type FROM dataset WHERE id = ?",
existingID,
).Scan(&dataset.ID, &dataset.Name, &dataset.Description, &dataset.CreatedAt, &dataset.LastModified, &dataset.Active, &dataset.Type)
if err != nil {
return output, fmt.Errorf("failed to fetch existing dataset: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Dataset = dataset
output.Message = fmt.Sprintf("Dataset with name '%s' already exists (ID: %s) - returning existing dataset", dataset.Name, dataset.ID)
return output, nil
}

// Generate ID
id, err := utils.GenerateShortID()
if err != nil {
return output, fmt.Errorf("failed to generate ID: %w", err)
}

// Insert dataset
_, err = tx.ExecContext(ctx,
"INSERT INTO dataset (id, name, description, type, created_at, last_modified, active) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, TRUE)",
id, *input.Name, input.Description, string(datasetType),
)
if err != nil {
return output, fmt.Errorf("failed to create dataset: %w", err)
}

// Fetch the created dataset
var dataset db.Dataset
err = tx.QueryRowContext(ctx,
"SELECT id, name, description, created_at, last_modified, active, type FROM dataset WHERE id = ?",
id,
).Scan(&dataset.ID, &dataset.Name, &dataset.Description, &dataset.CreatedAt, &dataset.LastModified, &dataset.Active, &dataset.Type)
if err != nil {
return output, fmt.Errorf("failed to fetch created dataset: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Dataset = dataset
output.Message = fmt.Sprintf("Successfully created dataset '%s' with ID %s (type: %s)",
dataset.Name, dataset.ID, dataset.Type)

return output, nil
}

func updateDataset(ctx context.Context, input DatasetInput) (DatasetOutput, error) {
var output DatasetOutput
datasetID := *input.ID

// Validate ID format
if err := utils.ValidateShortID(datasetID, "dataset_id"); err != nil {
return output, err
}

// Validate fields if provided
if err := utils.ValidateOptionalStringLength(input.Name, "name", utils.MaxDatasetNameLen); err != nil {
return output, err
}
if err := utils.ValidateOptionalStringLength(input.Description, "description", utils.MaxDescriptionLen); err != nil {
return output, err
}
if input.Type != nil {
typeValue := strings.ToLower(*input.Type)
if typeValue != "structured" && typeValue != "unstructured" && typeValue != "test" && typeValue != "train" {
return output, fmt.Errorf("invalid dataset type: %s (must be 'structured', 'unstructured', 'test', or 'train')", *input.Type)
}
}

// Open writable database
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Verify dataset exists and check active status
var exists, active bool
err = database.QueryRow("SELECT EXISTS(SELECT 1 FROM dataset WHERE id = ?), COALESCE((SELECT active FROM dataset WHERE id = ?), false)", datasetID, datasetID).Scan(&exists, &active)
if err != nil {
return output, fmt.Errorf("failed to query dataset: %w", err)
}
if !exists {
return output, fmt.Errorf("dataset not found: %s", datasetID)
}
if !active {
return output, fmt.Errorf("dataset '%s' is not active (cannot update inactive datasets)", datasetID)
}

// Build dynamic UPDATE query
updates := []string{}
args := []any{}

if input.Name != nil {
updates = append(updates, "name = ?")
args = append(args, *input.Name)
}
if input.Description != nil {
updates = append(updates, "description = ?")
args = append(args, *input.Description)
}
if input.Type != nil {
updates = append(updates, "type = ?")
args = append(args, strings.ToLower(*input.Type))
}

if len(updates) == 0 {
return output, fmt.Errorf("no fields provided to update")
}

// Always update last_modified
updates = append(updates, "last_modified = now()")
args = append(args, datasetID)

query := fmt.Sprintf("UPDATE dataset SET %s WHERE id = ?", strings.Join(updates, ", "))

// Begin logged transaction for update
tx, err := db.BeginLoggedTx(ctx, database, "create_or_update_dataset")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

_, err = tx.Exec(query, args...)
if err != nil {
return output, fmt.Errorf("failed to update dataset: %w", err)
}

// Fetch the updated dataset
var dataset db.Dataset
err = tx.QueryRow(
"SELECT id, name, description, created_at, last_modified, active, type FROM dataset WHERE id = ?",
datasetID,
).Scan(&dataset.ID, &dataset.Name, &dataset.Description, &dataset.CreatedAt, &dataset.LastModified, &dataset.Active, &dataset.Type)
if err != nil {
return output, fmt.Errorf("failed to fetch updated dataset: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Dataset = dataset
output.Message = fmt.Sprintf("Successfully updated dataset '%s' (ID: %s)", dataset.Name, dataset.ID)

return output, nil
}
file addition: cluster.go (----------)

[0.248737]

package tools

import (
"context"
"fmt"
"skraak/db"
"skraak/utils"
"strings"
)

// ClusterInput defines the input parameters for the create_or_update_cluster tool
type ClusterInput struct {
ID *string `json:"id,omitempty"`
DatasetID *string `json:"dataset_id,omitempty"`
LocationID *string `json:"location_id,omitempty"`
Name *string `json:"name,omitempty"`
SampleRate *int `json:"sample_rate,omitempty"`
Path *string `json:"path,omitempty"`
CyclicRecordingPatternID *string `json:"cyclic_recording_pattern_id,omitempty"`
Description *string `json:"description,omitempty"`
}

// ClusterOutput defines the output structure
type ClusterOutput struct {
Cluster db.Cluster `json:"cluster"`
Message string `json:"message"`
}

// CreateOrUpdateCluster creates a new cluster or updates an existing one within a location
func CreateOrUpdateCluster(
ctx context.Context,
input ClusterInput,
) (ClusterOutput, error) {
if input.ID != nil && strings.TrimSpace(*input.ID) != "" {
return updateCluster(ctx, input)
}
return createCluster(ctx, input)
}

// validateClusterFields validates fields common to both create and update
func validateClusterFields(input ClusterInput) error {
if err := utils.ValidateOptionalStringLength(input.Name, "name", utils.MaxNameLen); err != nil {
return err
}
if err := utils.ValidateOptionalStringLength(input.Description, "description", utils.MaxDescriptionLen); err != nil {
return err
}
if err := utils.ValidateOptionalStringLength(input.Path, "path", utils.MaxPathLen); err != nil {
return err
}
if input.SampleRate != nil {
if err := utils.ValidatePositive(*input.SampleRate, "sample_rate"); err != nil {
return err
}
// Also check reasonable bounds
if err := utils.ValidateSampleRate(*input.SampleRate); err != nil {
return err
}
}
return nil
}

func createCluster(ctx context.Context, input ClusterInput) (ClusterOutput, error) {
var output ClusterOutput

// Validate required fields for create
if input.DatasetID == nil || strings.TrimSpace(*input.DatasetID) == "" {
return output, fmt.Errorf("dataset_id is required when creating a cluster")
}
if input.LocationID == nil || strings.TrimSpace(*input.LocationID) == "" {
return output, fmt.Errorf("location_id is required when creating a cluster")
}
if input.Name == nil || strings.TrimSpace(*input.Name) == "" {
return output, fmt.Errorf("name is required when creating a cluster")
}
if input.SampleRate == nil {
return output, fmt.Errorf("sample_rate is required when creating a cluster")
}

// Validate ID formats
if err := utils.ValidateShortID(*input.DatasetID, "dataset_id"); err != nil {
return output, err
}
if err := utils.ValidateShortID(*input.LocationID, "location_id"); err != nil {
return output, err
}

if err := validateClusterFields(input); err != nil {
return output, err
}

// Validate optional pattern ID format
if err := utils.ValidateOptionalShortID(input.CyclicRecordingPatternID, "cyclic_recording_pattern_id"); err != nil {
return output, err
}

// Open writable database connection
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("database connection failed: %w", err)
}
defer database.Close()

// Begin logged transaction
tx, err := db.BeginLoggedTx(ctx, database, "create_or_update_cluster")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

// Verify dataset exists and is active
var datasetExists, datasetActive bool
var datasetName string
err = tx.QueryRowContext(ctx,
"SELECT EXISTS(SELECT 1 FROM dataset WHERE id = ?), COALESCE((SELECT active FROM dataset WHERE id = ?), false), COALESCE((SELECT name FROM dataset WHERE id = ?), '')",
*input.DatasetID, *input.DatasetID, *input.DatasetID,
).Scan(&datasetExists, &datasetActive, &datasetName)
if err != nil {
return output, fmt.Errorf("failed to verify dataset: %w", err)
}
if !datasetExists {
return output, fmt.Errorf("dataset with ID '%s' does not exist", *input.DatasetID)
}
if !datasetActive {
return output, fmt.Errorf("dataset '%s' (ID: %s) is not active", datasetName, *input.DatasetID)
}

// Verify location exists, is active, and belongs to the specified dataset
var locationExists, locationActive bool
var locationName string
var locationDatasetID string
err = tx.QueryRowContext(ctx,
"SELECT EXISTS(SELECT 1 FROM location WHERE id = ?), COALESCE((SELECT active FROM location WHERE id = ?), false), COALESCE((SELECT name FROM location WHERE id = ?), ''), COALESCE((SELECT dataset_id FROM location WHERE id = ?), '')",
*input.LocationID, *input.LocationID, *input.LocationID, *input.LocationID,
).Scan(&locationExists, &locationActive, &locationName, &locationDatasetID)
if err != nil {
return output, fmt.Errorf("failed to verify location: %w", err)
}
if !locationExists {
return output, fmt.Errorf("location with ID '%s' does not exist", *input.LocationID)
}
if !locationActive {
return output, fmt.Errorf("location '%s' (ID: %s) is not active", locationName, *input.LocationID)
}
if locationDatasetID != *input.DatasetID {
return output, fmt.Errorf("location '%s' (ID: %s) does not belong to dataset '%s' (ID: %s) - it belongs to dataset ID '%s'",
locationName, *input.LocationID, datasetName, *input.DatasetID, locationDatasetID)
}

// Verify cyclic recording pattern if provided
if input.CyclicRecordingPatternID != nil && strings.TrimSpace(*input.CyclicRecordingPatternID) != "" {
var patternExists, patternActive bool
err = tx.QueryRowContext(ctx,
"SELECT EXISTS(SELECT 1 FROM cyclic_recording_pattern WHERE id = ?), COALESCE((SELECT active FROM cyclic_recording_pattern WHERE id = ?), false)",
*input.CyclicRecordingPatternID, *input.CyclicRecordingPatternID,
).Scan(&patternExists, &patternActive)
if err != nil {
return output, fmt.Errorf("failed to verify cyclic recording pattern: %w", err)
}
if !patternExists {
return output, fmt.Errorf("cyclic recording pattern with ID '%s' does not exist", *input.CyclicRecordingPatternID)
}
if !patternActive {
return output, fmt.Errorf("cyclic recording pattern with ID '%s' is not active", *input.CyclicRecordingPatternID)
}
}

// Check for existing cluster with same name in location (UNIQUE constraint)
var existingID string
err = tx.QueryRowContext(ctx,
"SELECT id FROM cluster WHERE location_id = ? AND name = ? AND active = true",
*input.LocationID, *input.Name,
).Scan(&existingID)

if err == nil {
// Cluster with this name already exists in location - return existing (consistent duplicate handling)
var cluster db.Cluster
err = tx.QueryRowContext(ctx,
"SELECT id, dataset_id, location_id, name, description, created_at, last_modified, active, cyclic_recording_pattern_id, sample_rate FROM cluster WHERE id = ?",
existingID,
).Scan(&cluster.ID, &cluster.DatasetID, &cluster.LocationID, &cluster.Name, &cluster.Description,
&cluster.CreatedAt, &cluster.LastModified, &cluster.Active, &cluster.CyclicRecordingPatternID, &cluster.SampleRate)
if err != nil {
return output, fmt.Errorf("failed to fetch existing cluster: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Cluster = cluster
output.Message = fmt.Sprintf("Cluster '%s' already exists in location '%s' (ID: %s) - returning existing cluster", cluster.Name, locationName, cluster.ID)
return output, nil
}

// Generate ID
id, err := utils.GenerateShortID()
if err != nil {
return output, fmt.Errorf("failed to generate ID: %w", err)
}

// Insert cluster
_, err = tx.ExecContext(ctx,
"INSERT INTO cluster (id, dataset_id, location_id, name, sample_rate, cyclic_recording_pattern_id, description, created_at, last_modified, active) VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, TRUE)",
id, *input.DatasetID, *input.LocationID, *input.Name, *input.SampleRate, input.CyclicRecordingPatternID, input.Description,
)
if err != nil {
return output, fmt.Errorf("failed to create cluster: %w", err)
}

// Fetch the created cluster
var cluster db.Cluster
err = tx.QueryRowContext(ctx,
"SELECT id, dataset_id, location_id, name, description, created_at, last_modified, active, cyclic_recording_pattern_id, sample_rate FROM cluster WHERE id = ?",
id,
).Scan(&cluster.ID, &cluster.DatasetID, &cluster.LocationID, &cluster.Name, &cluster.Description,
&cluster.CreatedAt, &cluster.LastModified, &cluster.Active, &cluster.CyclicRecordingPatternID, &cluster.SampleRate)
if err != nil {
return output, fmt.Errorf("failed to fetch created cluster: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Cluster = cluster
output.Message = fmt.Sprintf("Successfully created cluster '%s' with ID %s in location '%s' at dataset '%s' (sample rate: %d Hz)",
cluster.Name, cluster.ID, locationName, datasetName, cluster.SampleRate)

return output, nil
}

func updateCluster(ctx context.Context, input ClusterInput) (ClusterOutput, error) {
var output ClusterOutput
clusterID := *input.ID

// Validate ID format
if err := utils.ValidateShortID(clusterID, "cluster_id"); err != nil {
return output, err
}

if err := validateClusterFields(input); err != nil {
return output, err
}

// Validate optional pattern ID format
if input.CyclicRecordingPatternID != nil && strings.TrimSpace(*input.CyclicRecordingPatternID) != "" {
if err := utils.ValidateShortID(*input.CyclicRecordingPatternID, "cyclic_recording_pattern_id"); err != nil {
return output, err
}
}

// Open writable database
database, err := db.OpenWriteableDB(dbPath)
if err != nil {
return output, fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Verify cluster exists and check active status
var exists, active bool
err = database.QueryRow(
"SELECT EXISTS(SELECT 1 FROM cluster WHERE id = ?), COALESCE((SELECT active FROM cluster WHERE id = ?), false)",
clusterID, clusterID,
).Scan(&exists, &active)
if err != nil {
return output, fmt.Errorf("failed to query cluster: %w", err)
}
if !exists {
return output, fmt.Errorf("cluster not found: %s", clusterID)
}
if !active {
return output, fmt.Errorf("cluster '%s' is not active (cannot update inactive clusters)", clusterID)
}

// Validate cyclic_recording_pattern_id if provided
if input.CyclicRecordingPatternID != nil {
trimmedPatternID := strings.TrimSpace(*input.CyclicRecordingPatternID)
if trimmedPatternID != "" {
var patternExists, patternActive bool
err = database.QueryRow(
"SELECT EXISTS(SELECT 1 FROM cyclic_recording_pattern WHERE id = ?), COALESCE((SELECT active FROM cyclic_recording_pattern WHERE id = ?), false)",
trimmedPatternID, trimmedPatternID,
).Scan(&patternExists, &patternActive)
if err != nil {
return output, fmt.Errorf("failed to verify cyclic recording pattern: %w", err)
}
if !patternExists {
return output, fmt.Errorf("cyclic recording pattern not found: %s", trimmedPatternID)
}
if !patternActive {
return output, fmt.Errorf("cyclic recording pattern '%s' is not active", trimmedPatternID)
}
}
}

// Build dynamic UPDATE query
updates := []string{}
args := []any{}

if input.Name != nil {
updates = append(updates, "name = ?")
args = append(args, *input.Name)
}
if input.Path != nil {
updates = append(updates, "path = ?")
args = append(args, *input.Path)
}
if input.SampleRate != nil {
updates = append(updates, "sample_rate = ?")
args = append(args, *input.SampleRate)
}
if input.Description != nil {
updates = append(updates, "description = ?")
args = append(args, *input.Description)
}
if input.CyclicRecordingPatternID != nil {
trimmedPatternID := strings.TrimSpace(*input.CyclicRecordingPatternID)
if trimmedPatternID == "" {
updates = append(updates, "cyclic_recording_pattern_id = NULL")
} else {
updates = append(updates, "cyclic_recording_pattern_id = ?")
args = append(args, trimmedPatternID)
}
}

if len(updates) == 0 {
return output, fmt.Errorf("no fields provided to update")
}

// Always update last_modified
updates = append(updates, "last_modified = now()")
args = append(args, clusterID)

query := fmt.Sprintf("UPDATE cluster SET %s WHERE id = ?", strings.Join(updates, ", "))

// Begin logged transaction for update
tx, err := db.BeginLoggedTx(ctx, database, "create_or_update_cluster")
if err != nil {
return output, fmt.Errorf("failed to begin transaction: %w", err)
}
defer func() {
if err != nil {
tx.Rollback()
}
}()

_, err = tx.Exec(query, args...)
if err != nil {
return output, fmt.Errorf("failed to update cluster: %w", err)
}

// Fetch the updated cluster
var cluster db.Cluster
err = tx.QueryRow(
"SELECT id, dataset_id, location_id, name, description, created_at, last_modified, active, cyclic_recording_pattern_id, sample_rate FROM cluster WHERE id = ?",
clusterID,
).Scan(&cluster.ID, &cluster.DatasetID, &cluster.LocationID, &cluster.Name, &cluster.Description,
&cluster.CreatedAt, &cluster.LastModified, &cluster.Active, &cluster.CyclicRecordingPatternID, &cluster.SampleRate)
if err != nil {
return output, fmt.Errorf("failed to fetch updated cluster: %w", err)
}

if err = tx.Commit(); err != nil {
return output, fmt.Errorf("failed to commit transaction: %w", err)
}

output.Cluster = cluster
output.Message = fmt.Sprintf("Successfully updated cluster '%s' (ID: %s)", cluster.Name, cluster.ID)

return output, nil
}
file addition: calls_summarise.go (----------)

[0.248737]

package tools

import (
"sort"
"strings"

"skraak/utils"
)

// CallsSummariseInput defines the input for the calls-summarise tool
type CallsSummariseInput struct {
Folder string `json:"folder"`
Brief bool `json:"brief"`
Filter string `json:"filter,omitempty"`
}

// CallsSummariseOutput defines the output for the calls-summarise tool
type CallsSummariseOutput struct {
Segments []SegmentSummary `json:"segments"`
Folder string `json:"folder"`
DataFilesRead int `json:"data_files_read"`
DataFilesSkipped []string `json:"data_files_skipped"`
TotalSegments int `json:"total_segments"`
Filters map[string]FilterStats `json:"filters"`
ReviewStatus ReviewStatus `json:"review_status"`
Operators []string `json:"operators"`
Reviewers []string `json:"reviewers"`
Error *string `json:"error,omitempty"`
}

// SegmentSummary represents a single segment in the output
type SegmentSummary struct {
File string `json:"file"`
StartTime float64 `json:"start_time"`
EndTime float64 `json:"end_time"`
Labels []LabelSummary `json:"labels"`
}

// LabelSummary represents a label in the output (omits empty fields)
type LabelSummary struct {
Filter string `json:"filter"`
Certainty int `json:"certainty"`
Species string `json:"species"`
CallType string `json:"calltype,omitempty"`
Comment string `json:"comment,omitempty"`
Bookmark bool `json:"bookmark,omitempty"`
}

// FilterStats contains per-filter statistics
type FilterStats struct {
Segments int `json:"segments"`
Species map[string]int `json:"species"`
Calltypes map[string]map[string]int `json:"calltypes,omitempty"` // species -> calltype -> count
}

// ReviewStatus contains review progress statistics
type ReviewStatus struct {
Unreviewed int `json:"unreviewed"` // certainty < 100
Confirmed int `json:"confirmed"` // certainty = 100
DontKnow int `json:"dont_know"` // certainty = 0
WithCallType int `json:"with_calltype"`
WithComments int `json:"with_comments"`
Bookmarked int `json:"bookmarked"`
}

// CallsSummarise reads all .data files in a folder and produces a summary
func CallsSummarise(input CallsSummariseInput) (CallsSummariseOutput, error) {
var output CallsSummariseOutput

// Find all .data files
filePaths, err := utils.FindDataFiles(input.Folder)
if err != nil {
errMsg := err.Error()
output.Error = &errMsg
return output, err
}

// Initialize empty slices/maps (avoid null in JSON)
output.Segments = make([]SegmentSummary, 0)
output.Folder = input.Folder
output.Filters = make(map[string]FilterStats)
output.Operators = make([]string, 0)
output.Reviewers = make([]string, 0)
output.DataFilesSkipped = make([]string, 0)

if len(filePaths) == 0 {
return output, nil
}

// Track unique operators and reviewers
operatorSet := make(map[string]bool)
reviewerSet := make(map[string]bool)

// Process each file
for _, path := range filePaths {
df, err := utils.ParseDataFile(path)
if err != nil {
// Extract just the filename for skipped list
output.DataFilesSkipped = append(output.DataFilesSkipped, path)
continue
}

output.DataFilesRead++

// Track operator and reviewer
if df.Meta != nil {
if df.Meta.Operator != "" {
operatorSet[df.Meta.Operator] = true
}
if df.Meta.Reviewer != "" {
reviewerSet[df.Meta.Reviewer] = true
}
}

// Extract relative filename for segments (only needed if not brief)
var relPath string
if !input.Brief {
relPath = extractRelativePath(input.Folder, path)
}

// Process segments
for _, seg := range df.Segments {
// Filter labels if --filter is specified
var filteredLabels []*utils.Label
for _, l := range seg.Labels {
if input.Filter == "" || l.Filter == input.Filter {
filteredLabels = append(filteredLabels, l)
}
}

// Skip segments with no matching labels when filter is active
if input.Filter != "" && len(filteredLabels) == 0 {
continue
}

// Build label summaries (only if not brief)
var labels []LabelSummary
if !input.Brief {
for _, l := range filteredLabels {
labelSummary := LabelSummary{
Filter: l.Filter,
Certainty: l.Certainty,
Species: l.Species,
}
if l.CallType != "" {
labelSummary.CallType = l.CallType
}
if l.Comment != "" {
labelSummary.Comment = l.Comment
}
if l.Bookmark {
labelSummary.Bookmark = true
}
labels = append(labels, labelSummary)
}
}

// Update filter stats and review status (using filtered labels)
for _, l := range filteredLabels {
// Update filter stats
fs, exists := output.Filters[l.Filter]
if !exists {
fs = FilterStats{
Segments: 0,
Species: make(map[string]int),
Calltypes: make(map[string]map[string]int),
}
}
fs.Segments++
fs.Species[l.Species]++

// Track calltypes if present
if l.CallType != "" {
if fs.Calltypes[l.Species] == nil {
fs.Calltypes[l.Species] = make(map[string]int)
}
fs.Calltypes[l.Species][l.CallType]++
}
output.Filters[l.Filter] = fs

// Update review status
switch l.Certainty {
case 100:
output.ReviewStatus.Confirmed++
case 0:
output.ReviewStatus.DontKnow++
default:
output.ReviewStatus.Unreviewed++
}

if l.CallType != "" {
output.ReviewStatus.WithCallType++
}
if l.Comment != "" {
output.ReviewStatus.WithComments++
}
if l.Bookmark {
output.ReviewStatus.Bookmarked++
}
}

// Create segment summary only if not brief
if !input.Brief {
segSummary := SegmentSummary{
File: relPath,
StartTime: seg.StartTime,
EndTime: seg.EndTime,
Labels: labels,
}
output.Segments = append(output.Segments, segSummary)
}
}
}

// Count segments for total
if input.Brief {
// Recount from filter stats since we didn't track segments
for _, fs := range output.Filters {
output.TotalSegments += fs.Segments
}
} else {
output.TotalSegments = len(output.Segments)
}

// Clean up empty calltypes maps (omitempty doesn't work on non-nil empty maps)
for filter, fs := range output.Filters {
if len(fs.Calltypes) == 0 {
fs.Calltypes = nil
output.Filters[filter] = fs
}
}

// Convert sets to sorted slices
for op := range operatorSet {
output.Operators = append(output.Operators, op)
}
for r := range reviewerSet {
output.Reviewers = append(output.Reviewers, r)
}
sort.Strings(output.Operators)
sort.Strings(output.Reviewers)

// Sort segments by file, then start time (only if not brief)
if !input.Brief {
sort.Slice(output.Segments, func(i, j int) bool {
if output.Segments[i].File != output.Segments[j].File {
return output.Segments[i].File < output.Segments[j].File
}
return output.Segments[i].StartTime < output.Segments[j].StartTime
})
}

return output, nil
}

// extractRelativePath extracts the audio filename from a .data file path
// e.g., "/folder/tx51_LISTENING_20260221_203004.WAV.data" -> "tx51_LISTENING_20260221_203004.WAV"
// Preserves the original case of the extension as-is.
func extractRelativePath(folder, dataPath string) string {
// Get the filename
filename := dataPath
if idx := strings.LastIndex(dataPath, "/"); idx >= 0 {
filename = dataPath[idx+1:]
}

// Remove .data extension, preserve everything else
return strings.TrimSuffix(filename, ".data")
}
file addition: calls_show_images.go (----------)

[0.248737]

package tools

import (
"fmt"
"os"
"strings"

"skraak/utils"
)

// CallsShowImagesInput defines the input for the show-images tool
type CallsShowImagesInput struct {
DataFilePath string `json:"data_file_path"`
Color bool `json:"color"`
ImageSize int `json:"image_size"`
Sixel bool `json:"sixel"`
ITerm bool `json:"iterm"`
}

// CallsShowImagesOutput defines the output for the show-images tool
type CallsShowImagesOutput struct {
SegmentsShown int `json:"segments_shown"`
WavFile string `json:"wav_file"`
Error string `json:"error,omitempty"`
}

// CallsShowImages reads a .data file and displays spectrogram images for each segment
func CallsShowImages(input CallsShowImagesInput) (CallsShowImagesOutput, error) {
var output CallsShowImagesOutput

// Validate file exists
if _, err := os.Stat(input.DataFilePath); os.IsNotExist(err) {
output.Error = fmt.Sprintf("File not found: %s", input.DataFilePath)
return output, fmt.Errorf("%s", output.Error)
}

// Derive WAV file path (strip .data suffix)
wavPath := strings.TrimSuffix(input.DataFilePath, ".data")
output.WavFile = wavPath

// Check WAV file exists
if _, err := os.Stat(wavPath); os.IsNotExist(err) {
output.Error = fmt.Sprintf("WAV file not found: %s", wavPath)
return output, fmt.Errorf("%s", output.Error)
}

// Parse .data file (includes labels for future filtering)
dataFile, err := utils.ParseDataFile(input.DataFilePath)
if err != nil {
output.Error = err.Error()
return output, fmt.Errorf("%s", output.Error)
}

if len(dataFile.Segments) == 0 {
output.Error = "No segments found in .data file"
return output, fmt.Errorf("%s", output.Error)
}

// Resolve image size
imgSize := input.ImageSize
if imgSize == 0 {
imgSize = utils.SpectrogramDisplaySize
}

// Select graphics protocol
protocol := utils.ProtocolKitty
if input.ITerm {
protocol = utils.ProtocolITerm
} else if input.Sixel {
protocol = utils.ProtocolSixel
}

// Generate spectrogram for each segment and output
for i, seg := range dataFile.Segments {
// Generate spectrogram image
img, err := utils.GenerateSegmentSpectrogram(input.DataFilePath, seg.StartTime, seg.EndTime, input.Color, imgSize)
if err != nil || img == nil {
continue
}

// Print segment info
labelInfo := formatSegmentLabels(seg.Labels)
fmt.Fprintf(os.Stderr, "Segment %d: %.1fs - %.1fs (%.1fs)%s\n",
i+1, seg.StartTime, seg.EndTime, seg.EndTime-seg.StartTime, labelInfo)

// Write to stdout via terminal graphics protocol
if err := utils.WriteImage(img, os.Stdout, protocol); err != nil {
output.Error = fmt.Sprintf("Failed to write image: %v", err)
return output, fmt.Errorf("%s", output.Error)
}
fmt.Println() // Newline after image
}

output.SegmentsShown = len(dataFile.Segments)
return output, nil
}

// formatSegmentLabels formats labels for display in segment info
func formatSegmentLabels(labels []*utils.Label) string {
if len(labels) == 0 {
return ""
}
var parts []string
for _, l := range labels {
part := l.Species
if l.CallType != "" {
part += "/" + l.CallType
}
if l.Filter != "" {
part += " [" + l.Filter + "]"
}
parts = append(parts, part)
}
return " " + strings.Join(parts, ", ")
}
file addition: calls_push_certainty_test.go (----------)

[0.248737]

package tools

import (
"encoding/json"
"os"
"path/filepath"
"testing"

"skraak/utils"
)

func TestPushCertaintyPromotesMatchingLabels(t *testing.T) {
tempDir := t.TempDir()

// File with two Kiwi segments: certainty=90 and certainty=70
file1 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]], [10, 20, 100, 1000, [{"species": "Kiwi", "certainty": 70}]]]`
file1Path := filepath.Join(tempDir, "file1.data")
if err := os.WriteFile(file1Path, []byte(file1), 0644); err != nil {
t.Fatal(err)
}

// File with one Tomtit at certainty=90 (must not be promoted when species=Kiwi)
file2 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Tomtit", "certainty": 90}]]]`
file2Path := filepath.Join(tempDir, "file2.data")
if err := os.WriteFile(file2Path, []byte(file2), 0644); err != nil {
t.Fatal(err)
}

result, err := PushCertainty(PushCertaintyConfig{
Folder: tempDir,
Species: "Kiwi",
Reviewer: "TestReviewer",
})
if err != nil {
t.Fatal(err)
}

if result.SegmentsUpdated != 1 {
t.Errorf("expected 1 segment updated, got %d", result.SegmentsUpdated)
}
if result.FilesUpdated != 1 {
t.Errorf("expected 1 file updated, got %d", result.FilesUpdated)
}

// Verify file1: certainty=90 Kiwi → 100, certainty=70 Kiwi → unchanged
df, err := utils.ParseDataFile(file1Path)
if err != nil {
t.Fatal(err)
}
if df.Segments[0].Labels[0].Certainty != 100 {
t.Errorf("expected certainty=100, got %d", df.Segments[0].Labels[0].Certainty)
}
if df.Segments[1].Labels[0].Certainty != 70 {
t.Errorf("expected certainty=70 unchanged, got %d", df.Segments[1].Labels[0].Certainty)
}
if df.Meta.Reviewer != "TestReviewer" {
t.Errorf("expected reviewer=TestReviewer, got %q", df.Meta.Reviewer)
}

// Verify Tomtit file was not modified
df2, err := utils.ParseDataFile(file2Path)
if err != nil {
t.Fatal(err)
}
if df2.Segments[0].Labels[0].Certainty != 90 {
t.Errorf("Tomtit certainty should be unchanged at 90, got %d", df2.Segments[0].Labels[0].Certainty)
}
}

func TestPushCertaintyFilterScope(t *testing.T) {
tempDir := t.TempDir()

// Segment has two labels from different filters, both Kiwi certainty=90
data := []any{
map[string]any{"Operator": "test"},
[]any{0.0, 10.0, 100.0, 1000.0, []any{
map[string]any{"species": "Kiwi", "certainty": 90, "filter": "model-a"},
map[string]any{"species": "Kiwi", "certainty": 90, "filter": "model-b"},
}},
}
raw, _ := json.Marshal(data)
filePath := filepath.Join(tempDir, "file1.data")
if err := os.WriteFile(filePath, raw, 0644); err != nil {
t.Fatal(err)
}

// Push only model-a
result, err := PushCertainty(PushCertaintyConfig{
Folder: tempDir,
Filter: "model-a",
Species: "Kiwi",
Reviewer: "TestReviewer",
})
if err != nil {
t.Fatal(err)
}
if result.SegmentsUpdated != 1 {
t.Errorf("expected 1 segment updated, got %d", result.SegmentsUpdated)
}

// Verify only model-a label was promoted; model-b stays at 90
df, err := utils.ParseDataFile(filePath)
if err != nil {
t.Fatal(err)
}
for _, label := range df.Segments[0].Labels {
if label.Filter == "model-a" && label.Certainty != 100 {
t.Errorf("model-a label should be 100, got %d", label.Certainty)
}
if label.Filter == "model-b" && label.Certainty != 90 {
t.Errorf("model-b label should be unchanged at 90, got %d", label.Certainty)
}
}
}
file addition: calls_push_certainty.go (----------)

[0.248737]

package tools

import (
"fmt"

"skraak/utils"
)

// PushCertaintyConfig holds the configuration for push-certainty
type PushCertaintyConfig struct {
Folder string
File string
Filter string
Species string
CallType string
Night bool
Day bool
Lat float64
Lng float64
Timezone string
Reviewer string
}

// PushCertaintyResult holds the result of push-certainty
type PushCertaintyResult struct {
SegmentsUpdated int `json:"segments_updated"`
FilesUpdated int `json:"files_updated"`
TimeFilteredCount int `json:"time_filtered_count"`
}

// PushCertainty promotes all certainty=90 segments matching the filter scope to certainty=100.
// Uses identical filtering logic to LoadDataFiles so the scope matches calls classify exactly.
func PushCertainty(config PushCertaintyConfig) (*PushCertaintyResult, error) {
state, err := LoadDataFiles(ClassifyConfig{
Folder: config.Folder,
File: config.File,
Filter: config.Filter,
Species: config.Species,
CallType: config.CallType,
Certainty: 90,
Sample: -1,
Night: config.Night,
Day: config.Day,
Lat: config.Lat,
Lng: config.Lng,
Timezone: config.Timezone,
})
if err != nil {
return nil, err
}

var segsUpdated, filesUpdated int
for i, df := range state.DataFiles {
changed := false
for _, seg := range state.FilteredSegs()[i] {
for _, label := range seg.Labels {
if labelMatchesPush(label, config.Filter, config.Species, config.CallType) {
label.Certainty = 100
changed = true
segsUpdated++
}
}
}
if changed {
df.Meta.Reviewer = config.Reviewer
if err := df.Write(df.FilePath); err != nil {
return nil, fmt.Errorf("write %s: %w", df.FilePath, err)
}
filesUpdated++
}
}

return &PushCertaintyResult{
SegmentsUpdated: segsUpdated,
FilesUpdated: filesUpdated,
TimeFilteredCount: state.TimeFilteredCount,
}, nil
}

// labelMatchesPush returns true if the label matches the push scope and has certainty=90.
// Certainty is already guaranteed by LoadDataFiles, but we re-check to target only the
// specific label that matched (a segment may carry labels from multiple filters).
func labelMatchesPush(label *utils.Label, filter, species, callType string) bool {
if filter != "" && label.Filter != filter {
return false
}
if species != "" && label.Species != species {
return false
}
if callType != "" && label.CallType != callType {
return false
}
return label.Certainty == 90
}
file addition: calls_propagate_test.go (----------)

[0.248737]

package tools

import (
"path/filepath"
"testing"

"skraak/utils"
)

// helpers

func seg(start, end float64, labels ...*utils.Label) *utils.Segment {
return &utils.Segment{
StartTime: start,
EndTime: end,
FreqLow: 100,
FreqHigh: 8000,
Labels: labels,
}
}

func lbl(filter, species, calltype string, certainty int) *utils.Label {
return &utils.Label{
Filter: filter,
Species: species,
CallType: calltype,
Certainty: certainty,
}
}

func writeFile(t *testing.T, segs ...*utils.Segment) string {
t.Helper()
dir := t.TempDir()
path := filepath.Join(dir, "test.data")
df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "ML", Reviewer: "David", Duration: 3600},
Segments: segs,
}
if err := df.Write(path); err != nil {
t.Fatalf("write fixture: %v", err)
}
return path
}

func readFile(t *testing.T, path string) *utils.DataFile {
t.Helper()
df, err := utils.ParseDataFile(path)
if err != nil {
t.Fatalf("parse %s: %v", path, err)
}
return df
}

// findLabel returns the label with matching filter and time on the parsed file, or nil.
func findLabel(df *utils.DataFile, filter string, start, end float64) *utils.Label {
for _, s := range df.Segments {
if s.StartTime != start || s.EndTime != end {
continue
}
for _, l := range s.Labels {
if l.Filter == filter {
return l
}
}
}
return nil
}

const (
fFrom = "opensoundscape-kiwi-1.2"
fTo = "opensoundscape-kiwi-1.5"
)

func TestPropagate_HappyPathSingle(t *testing.T) {
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v (%s)", err, out.Error)
}
if out.Propagated != 1 || out.TargetsExamined != 1 || out.SkippedConflict != 0 || out.SkippedNoOverlap != 0 {
t.Fatalf("counts wrong: %+v", out)
}

df := readFile(t, path)
target := findLabel(df, fTo, 100, 125)
if target == nil {
t.Fatal("target label missing")
}
if target.Species != "Kiwi" || target.CallType != "Male" || target.Certainty != 90 {
t.Errorf("target not updated correctly: species=%q calltype=%q cert=%d", target.Species, target.CallType, target.Certainty)
}
if df.Meta.Reviewer != "Skraak" {
t.Errorf("reviewer = %q, want Skraak", df.Meta.Reviewer)
}
}

func TestPropagate_NoOverlap(t *testing.T) {
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(500, 525, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 0 || out.TargetsExamined != 1 || out.SkippedNoOverlap != 1 {
t.Fatalf("counts wrong: %+v", out)
}
df := readFile(t, path)
target := findLabel(df, fTo, 500, 525)
if target.Certainty != 70 {
t.Errorf("target should not be modified, cert=%d", target.Certainty)
}
if df.Meta.Reviewer != "David" {
t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)
}
}

func TestPropagate_SourceWrongSpecies_Ignored(t *testing.T) {
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Weka", "", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 0 || out.SkippedNoOverlap != 1 {
t.Fatalf("counts wrong: %+v", out)
}
}

func TestPropagate_SourceWrongCertainty_Ignored(t *testing.T) {
// cert=70 and cert=0 source labels must NOT count as sources.
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 70)),
seg(200, 225, lbl(fFrom, "Don't Know", "", 0)),
seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
seg(200, 225, lbl(fTo, "Kiwi", "Male", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 0 || out.SkippedNoOverlap != 2 {
t.Fatalf("counts wrong: %+v", out)
}
}

func TestPropagate_SourceWrongFilter_Ignored(t *testing.T) {
path := writeFile(t,
seg(100, 125, lbl("some-other-filter", "Kiwi", "Male", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !out.FiltersMissing || out.Propagated != 0 || out.TargetsExamined != 0 {
t.Fatalf("expected FiltersMissing=true with zero counts, got: %+v", out)
}
}

func TestPropagate_TargetCert100_NotTouched(t *testing.T) {
// Target with cert=100 is human-verified — must NOT be overwritten.
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Male", 100)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.TargetsExamined != 0 || out.Propagated != 0 {
t.Fatalf("cert=100 target must not be examined: %+v", out)
}
df := readFile(t, path)
if df.Meta.Reviewer != "David" {
t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)
}
}

func TestPropagate_TargetCert90_NotTouched(t *testing.T) {
// Target with cert=90 (already propagated earlier) must NOT be re-propagated.
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Female", 90)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.TargetsExamined != 0 || out.Propagated != 0 {
t.Fatalf("cert=90 target must not be examined: %+v", out)
}
df := readFile(t, path)
target := findLabel(df, fTo, 100, 125)
if target.Certainty != 90 || target.CallType != "Female" {
t.Errorf("cert=90 target was modified: %+v", target)
}
}

func TestPropagate_TargetCert0_Propagated(t *testing.T) {
// Target at cert=0 ("Don't Know" / "Noise") SHOULD be propagated when an
// overlapping cert=100 source exists — rescues labels from the noise bucket
// so they surface for review even if occasionally wrong.
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(100, 125, lbl(fTo, "Don't Know", "", 0)),
seg(200, 225, lbl(fFrom, "Kiwi", "Female", 100)),
seg(200, 225, lbl(fTo, "Noise", "", 0)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.TargetsExamined != 2 || out.Propagated != 2 {
t.Fatalf("cert=0 targets must be propagated: %+v", out)
}
df := readFile(t, path)
for _, c := range []struct {
start, end float64
calltype string
}{{100, 125, "Male"}, {200, 225, "Female"}} {
l := findLabel(df, fTo, c.start, c.end)
if l == nil || l.Species != "Kiwi" || l.CallType != c.calltype || l.Certainty != 90 {
t.Errorf("at %v-%v got %+v, want Kiwi+%s cert=90", c.start, c.end, l, c.calltype)
}
}
}

func TestPropagate_MultipleSourcesAgree(t *testing.T) {
// Two overlapping sources with same calltype → propagate.
path := writeFile(t,
seg(100, 110, lbl(fFrom, "Kiwi", "Male", 100)),
seg(105, 120, lbl(fFrom, "Kiwi", "Male", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 1 || out.SkippedConflict != 0 {
t.Fatalf("counts wrong: %+v", out)
}
df := readFile(t, path)
target := findLabel(df, fTo, 100, 125)
if target.CallType != "Male" {
t.Errorf("calltype should be Male, got %q", target.CallType)
}
}

func TestPropagate_MultipleSourcesConflict(t *testing.T) {
// Two overlapping sources with different calltypes → conflict, skip, report.
path := writeFile(t,
seg(100, 110, lbl(fFrom, "Kiwi", "Male", 100)),
seg(115, 120, lbl(fFrom, "Kiwi", "Female", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 0 || out.SkippedConflict != 1 {
t.Fatalf("expected 1 conflict skip: %+v", out)
}
if len(out.Conflicts) != 1 {
t.Fatalf("expected 1 conflict report, got %d", len(out.Conflicts))
}
if out.Conflicts[0].TargetStart != 100 || out.Conflicts[0].TargetEnd != 125 {
t.Errorf("conflict target wrong: %+v", out.Conflicts[0])
}
if len(out.Conflicts[0].SourceChoices) != 2 {
t.Errorf("expected 2 source choices, got %d", len(out.Conflicts[0].SourceChoices))
}
// Target must NOT be modified.
df := readFile(t, path)
target := findLabel(df, fTo, 100, 125)
if target.CallType != "Duet" || target.Certainty != 70 {
t.Errorf("conflicted target was modified: %+v", target)
}
if df.Meta.Reviewer != "David" {
t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)
}
}

func TestPropagate_EmptyCallTypePropagates(t *testing.T) {
// Source with empty calltype → target gets empty calltype.
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Male", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 1 {
t.Fatalf("expected propagated=1: %+v", out)
}
df := readFile(t, path)
target := findLabel(df, fTo, 100, 125)
if target.CallType != "" {
t.Errorf("calltype should be cleared, got %q", target.CallType)
}
if target.Species != "Kiwi" || target.Certainty != 90 {
t.Errorf("target fields wrong: %+v", target)
}
}

func TestPropagate_SpeciesOverride(t *testing.T) {
// Target species was different from --species; must be overwritten.
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(100, 125, lbl(fTo, "Don't Know", "", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 1 {
t.Fatalf("expected propagated=1: %+v", out)
}
df := readFile(t, path)
target := findLabel(df, fTo, 100, 125)
if target.Species != "Kiwi" || target.CallType != "Male" || target.Certainty != 90 {
t.Errorf("target not overwritten correctly: %+v", target)
}
}

func TestPropagate_OverlapBoundaryExclusive(t *testing.T) {
// Segments touching at a point (src ends exactly where tgt starts) do NOT overlap.
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(125, 150, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 0 || out.SkippedNoOverlap != 1 {
t.Fatalf("touching boundary must not count as overlap: %+v", out)
}
}

func TestPropagate_OverlapPartial(t *testing.T) {
// 1-second overlap is enough.
path := writeFile(t,
seg(100, 126, lbl(fFrom, "Kiwi", "Male", 100)),
seg(125, 150, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 1 {
t.Fatalf("expected propagated=1: %+v", out)
}
}

func TestPropagate_SupersetEitherDirection(t *testing.T) {
// Source engulfs target.
path1 := writeFile(t,
seg(100, 200, lbl(fFrom, "Kiwi", "Male", 100)),
seg(110, 150, lbl(fTo, "Kiwi", "Duet", 70)),
)
if out, _ := CallsPropagate(CallsPropagateInput{File: path1, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}); out.Propagated != 1 {
t.Errorf("source-engulfs-target: %+v", out)
}

// Target engulfs source.
path2 := writeFile(t,
seg(110, 150, lbl(fFrom, "Kiwi", "Male", 100)),
seg(100, 200, lbl(fTo, "Kiwi", "Duet", 70)),
)
if out, _ := CallsPropagate(CallsPropagateInput{File: path2, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}); out.Propagated != 1 {
t.Errorf("target-engulfs-source: %+v", out)
}
}

func TestPropagate_MissingFlags(t *testing.T) {
cases := []struct {
name string
in CallsPropagateInput
}{
{"no file", CallsPropagateInput{FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}},
{"no from", CallsPropagateInput{File: "x", ToFilter: fTo, Species: "Kiwi"}},
{"no to", CallsPropagateInput{File: "x", FromFilter: fFrom, Species: "Kiwi"}},
{"no species", CallsPropagateInput{File: "x", FromFilter: fFrom, ToFilter: fTo}},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
_, err := CallsPropagate(c.in)
if err == nil {
t.Errorf("expected error")
}
})
}
}

func TestPropagate_SameFromAndTo(t *testing.T) {
_, err := CallsPropagate(CallsPropagateInput{
File: "x", FromFilter: fFrom, ToFilter: fFrom, Species: "Kiwi",
})
if err == nil {
t.Error("expected error when --from == --to")
}
}

func TestPropagate_NonexistentFile(t *testing.T) {
_, err := CallsPropagate(CallsPropagateInput{
File: "/nonexistent/path.data", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err == nil {
t.Error("expected error for nonexistent file")
}
}

func TestPropagate_RealisticMixed(t *testing.T) {
// Mimics the 20260228_211500.WAV.data case: cert=0 "Don't Know" and cert=100 Kiwi sources
// coexist; only cert=100 Kiwi gets propagated.
path := writeFile(t,
// Sources (kiwi-1.2)
seg(45, 52.5, lbl(fFrom, "Don't Know", "", 0)),
seg(142.5, 177.5, lbl(fFrom, "Kiwi", "Male", 100)),
seg(195, 217.5, lbl(fFrom, "Don't Know", "", 0)),
seg(647.5, 682.5, lbl(fFrom, "Kiwi", "Female", 100)),
seg(815, 855, lbl(fFrom, "Kiwi", "Duet", 100)),
// Targets (kiwi-1.5)
seg(147.5, 167.5, lbl(fTo, "Kiwi", "Male", 70)),
seg(647.5, 672.5, lbl(fTo, "Kiwi", "Female", 70)),
seg(815, 852.5, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.TargetsExamined != 3 || out.Propagated != 3 || out.SkippedConflict != 0 {
t.Fatalf("counts wrong: %+v", out)
}
df := readFile(t, path)
expect := []struct {
start, end float64
calltype string
}{
{147.5, 167.5, "Male"},
{647.5, 672.5, "Female"},
{815, 852.5, "Duet"},
}
for _, e := range expect {
l := findLabel(df, fTo, e.start, e.end)
if l == nil || l.Certainty != 90 || l.CallType != e.calltype || l.Species != "Kiwi" {
t.Errorf("at %v-%v got %+v, want Kiwi+%s cert=90", e.start, e.end, l, e.calltype)
}
}
}

func TestPropagate_NoWriteIfNothingChanged(t *testing.T) {
// File with only non-target segments should not be rewritten (reviewer unchanged).
path := writeFile(t,
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
)

out, err := CallsPropagate(CallsPropagateInput{
File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.Propagated != 0 || out.TargetsExamined != 0 {
t.Fatalf("expected no activity: %+v", out)
}
df := readFile(t, path)
if df.Meta.Reviewer != "David" {
t.Errorf("reviewer should not be touched, got %q", df.Meta.Reviewer)
}
}

// writeFileAt is like writeFile but puts the file inside an existing dir
// with a caller-provided basename (must end in .data).
func writeFileAt(t *testing.T, dir, base string, segs ...*utils.Segment) string {
t.Helper()
path := filepath.Join(dir, base)
df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "ML", Reviewer: "David", Duration: 3600},
Segments: segs,
}
if err := df.Write(path); err != nil {
t.Fatalf("write fixture: %v", err)
}
return path
}

func TestPropagateFolder_AggregatesAndSkipsMissing(t *testing.T) {
dir := t.TempDir()

// File A: both filters present, one clean propagation.
aPath := writeFileAt(t, dir, "a.wav.data",
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
)
// File B: only target filter — missing source, must be skipped silently.
bPath := writeFileAt(t, dir, "b.wav.data",
seg(200, 225, lbl(fTo, "Kiwi", "Duet", 70)),
)
// File C: only source filter — missing target, must be skipped silently.
writeFileAt(t, dir, "c.wav.data",
seg(300, 325, lbl(fFrom, "Kiwi", "Male", 100)),
)
// File D: both filters, but no overlap → targets examined, none propagated.
dPath := writeFileAt(t, dir, "d.wav.data",
seg(400, 425, lbl(fFrom, "Kiwi", "Male", 100)),
seg(500, 525, lbl(fTo, "Kiwi", "Duet", 70)),
)

out, err := CallsPropagateFolder(CallsPropagateFolderInput{
Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}

if out.FilesTotal != 4 {
t.Errorf("FilesTotal: got %d, want 4", out.FilesTotal)
}
if out.FilesWithBothFilters != 2 {
t.Errorf("FilesWithBothFilters: got %d, want 2", out.FilesWithBothFilters)
}
if out.FilesSkippedNoFilter != 2 {
t.Errorf("FilesSkippedNoFilter: got %d, want 2", out.FilesSkippedNoFilter)
}
if out.FilesChanged != 1 {
t.Errorf("FilesChanged: got %d, want 1", out.FilesChanged)
}
if out.FilesErrored != 0 {
t.Errorf("FilesErrored: got %d, want 0", out.FilesErrored)
}
if out.TargetsExamined != 2 {
t.Errorf("TargetsExamined: got %d, want 2", out.TargetsExamined)
}
if out.Propagated != 1 {
t.Errorf("Propagated: got %d, want 1", out.Propagated)
}
if out.SkippedNoOverlap != 1 {
t.Errorf("SkippedNoOverlap: got %d, want 1", out.SkippedNoOverlap)
}

// File A was changed; check on-disk state.
aDf := readFile(t, aPath)
if aDf.Meta.Reviewer != "Skraak" {
t.Errorf("a.wav.data reviewer: got %q, want Skraak", aDf.Meta.Reviewer)
}
if l := findLabel(aDf, fTo, 100, 125); l == nil || l.Certainty != 90 || l.CallType != "Male" {
t.Errorf("a.wav.data target label: got %+v, want cert=90 calltype=Male", l)
}

// File B was skipped — reviewer untouched.
bDf := readFile(t, bPath)
if bDf.Meta.Reviewer != "David" {
t.Errorf("b.wav.data reviewer should not be touched, got %q", bDf.Meta.Reviewer)
}

// File D had no overlap — reviewer untouched, target still cert=70.
dDf := readFile(t, dPath)
if dDf.Meta.Reviewer != "David" {
t.Errorf("d.wav.data reviewer should not be touched, got %q", dDf.Meta.Reviewer)
}
if l := findLabel(dDf, fTo, 500, 525); l == nil || l.Certainty != 70 {
t.Errorf("d.wav.data target label should be unchanged cert=70, got %+v", l)
}
}

func TestPropagateFolder_EmptyFolder(t *testing.T) {
dir := t.TempDir()
out, err := CallsPropagateFolder(CallsPropagateFolderInput{
Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.FilesTotal != 0 || out.Propagated != 0 {
t.Errorf("expected empty result, got %+v", out)
}
}

func TestPropagateFolder_MissingRequiredFlags(t *testing.T) {
dir := t.TempDir()
cases := []CallsPropagateFolderInput{
{Folder: "", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"},
{Folder: dir, FromFilter: "", ToFilter: fTo, Species: "Kiwi"},
{Folder: dir, FromFilter: fFrom, ToFilter: "", Species: "Kiwi"},
{Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: ""},
{Folder: dir, FromFilter: fFrom, ToFilter: fFrom, Species: "Kiwi"},
}
for i, in := range cases {
if _, err := CallsPropagateFolder(in); err == nil {
t.Errorf("case %d: expected error for input %+v", i, in)
}
}
}

func TestPropagateFolder_NonexistentFolder(t *testing.T) {
_, err := CallsPropagateFolder(CallsPropagateFolderInput{
Folder: "/nonexistent/path/xyz", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err == nil {
t.Fatal("expected error for nonexistent folder")
}
}

func TestPropagateFolder_ConflictsTaggedWithFile(t *testing.T) {
dir := t.TempDir()
// Two sources with different calltypes both overlapping one target.
writeFileAt(t, dir, "conflict.wav.data",
seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
seg(110, 130, lbl(fFrom, "Kiwi", "Female", 100)),
seg(100, 130, lbl(fTo, "Kiwi", "", 70)),
)

out, err := CallsPropagateFolder(CallsPropagateFolderInput{
Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if out.SkippedConflict != 1 || len(out.Conflicts) != 1 {
t.Fatalf("expected one conflict, got %+v", out)
}
if out.Conflicts[0].File == "" {
t.Errorf("conflict should be tagged with file path, got %+v", out.Conflicts[0])
}
}
file addition: calls_propagate.go (----------)

[0.248737]

package tools

import (
"fmt"
"os"

"skraak/utils"
)

type CallsPropagateInput struct {
File string `json:"file"`
FromFilter string `json:"from_filter"`
ToFilter string `json:"to_filter"`
Species string `json:"species"`
}

type CallsPropagateOutput struct {
File string `json:"file"`
FromFilter string `json:"from_filter"`
ToFilter string `json:"to_filter"`
Species string `json:"species"`
FiltersMissing bool `json:"filters_missing,omitempty"`
TargetsExamined int `json:"targets_examined"`
Propagated int `json:"propagated"`
SkippedNoOverlap int `json:"skipped_no_overlap"`
SkippedConflict int `json:"skipped_conflict"`
Conflicts []PropagateConflict `json:"conflicts,omitempty"`
Changes []PropagateChange `json:"changes,omitempty"`
Error string `json:"error,omitempty"`
}

type CallsPropagateFolderInput struct {
Folder string `json:"folder"`
FromFilter string `json:"from_filter"`
ToFilter string `json:"to_filter"`
Species string `json:"species"`
}

type CallsPropagateFolderOutput struct {
Folder string `json:"folder"`
FromFilter string `json:"from_filter"`
ToFilter string `json:"to_filter"`
Species string `json:"species"`
FilesTotal int `json:"files_total"`
FilesWithBothFilters int `json:"files_with_both_filters"`
FilesSkippedNoFilter int `json:"files_skipped_no_filter"`
FilesChanged int `json:"files_changed"`
FilesErrored int `json:"files_errored"`
TargetsExamined int `json:"targets_examined"`
Propagated int `json:"propagated"`
SkippedNoOverlap int `json:"skipped_no_overlap"`
SkippedConflict int `json:"skipped_conflict"`
Conflicts []PropagateConflict `json:"conflicts,omitempty"`
Errors []CallsPropagateOutput `json:"errors,omitempty"`
Error string `json:"error,omitempty"`
}

type PropagateConflict struct {
File string `json:"file,omitempty"`
TargetStart float64 `json:"target_start"`
TargetEnd float64 `json:"target_end"`
TargetCallType string `json:"target_calltype,omitempty"`
SourceChoices []PropagateSourceChoice `json:"source_choices"`
}

type PropagateSourceChoice struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Species string `json:"species"`
CallType string `json:"calltype,omitempty"`
}

type PropagateChange struct {
TargetStart float64 `json:"target_start"`
TargetEnd float64 `json:"target_end"`
PrevSpecies string `json:"prev_species"`
PrevCallType string `json:"prev_calltype,omitempty"`
PrevCertainty int `json:"prev_certainty"`
NewSpecies string `json:"new_species"`
NewCallType string `json:"new_calltype,omitempty"`
NewCertainty int `json:"new_certainty"`
}

// CallsPropagate copies verified classifications (certainty==100) from one filter's
// segments to overlapping target segments of another filter, within a single .data file.
// Target labels with certainty==70 (ML-unverified) or certainty==0 (Don't Know / Noise)
// are updated — targets at certainty==100 (human-verified) and certainty==90 (already
// propagated) are left alone. Only source labels matching --species are considered.
// Propagated target labels are set to certainty=90 and file reviewer is set to "Skraak".
func CallsPropagate(input CallsPropagateInput) (CallsPropagateOutput, error) {
output := CallsPropagateOutput{
File: input.File,
FromFilter: input.FromFilter,
ToFilter: input.ToFilter,
Species: input.Species,
}

if input.File == "" {
output.Error = "--file is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.FromFilter == "" {
output.Error = "--from is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.ToFilter == "" {
output.Error = "--to is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.Species == "" {
output.Error = "--species is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.FromFilter == input.ToFilter {
output.Error = "--from and --to must differ"
return output, fmt.Errorf("%s", output.Error)
}

if _, err := os.Stat(input.File); os.IsNotExist(err) {
output.Error = fmt.Sprintf("file not found: %s", input.File)
return output, fmt.Errorf("%s", output.Error)
}

df, err := utils.ParseDataFile(input.File)
if err != nil {
output.Error = fmt.Sprintf("parse %s: %v", input.File, err)
return output, fmt.Errorf("%s", output.Error)
}

// Fast path: skip files that don't contain both filters at all.
hasFrom, hasTo := false, false
for _, seg := range df.Segments {
for _, lbl := range seg.Labels {
if lbl.Filter == input.FromFilter {
hasFrom = true
}
if lbl.Filter == input.ToFilter {
hasTo = true
}
if hasFrom && hasTo {
break
}
}
if hasFrom && hasTo {
break
}
}
if !hasFrom || !hasTo {
output.FiltersMissing = true
return output, nil
}

type sourceRef struct {
seg *utils.Segment
label *utils.Label
}
var sources []sourceRef
for _, seg := range df.Segments {
for _, lbl := range seg.Labels {
if lbl.Filter == input.FromFilter && lbl.Species == input.Species && lbl.Certainty == 100 {
sources = append(sources, sourceRef{seg: seg, label: lbl})
break
}
}
}

changed := false
for _, tSeg := range df.Segments {
var toLabel *utils.Label
for _, lbl := range tSeg.Labels {
if lbl.Filter == input.ToFilter && (lbl.Certainty == 70 || lbl.Certainty == 0) {
toLabel = lbl
break
}
}
if toLabel == nil {
continue
}
output.TargetsExamined++

var overlaps []sourceRef
for _, s := range sources {
if s.seg.StartTime < tSeg.EndTime && tSeg.StartTime < s.seg.EndTime {
overlaps = append(overlaps, s)
}
}
if len(overlaps) == 0 {
output.SkippedNoOverlap++
continue
}

agreedCallType := overlaps[0].label.CallType
conflict := false
for _, s := range overlaps[1:] {
if s.label.CallType != agreedCallType {
conflict = true
break
}
}
if conflict {
output.SkippedConflict++
choices := make([]PropagateSourceChoice, 0, len(overlaps))
for _, s := range overlaps {
choices = append(choices, PropagateSourceChoice{
Start: s.seg.StartTime,
End: s.seg.EndTime,
Species: s.label.Species,
CallType: s.label.CallType,
})
}
output.Conflicts = append(output.Conflicts, PropagateConflict{
TargetStart: tSeg.StartTime,
TargetEnd: tSeg.EndTime,
TargetCallType: toLabel.CallType,
SourceChoices: choices,
})
continue
}

change := PropagateChange{
TargetStart: tSeg.StartTime,
TargetEnd: tSeg.EndTime,
PrevSpecies: toLabel.Species,
PrevCallType: toLabel.CallType,
PrevCertainty: toLabel.Certainty,
NewSpecies: input.Species,
NewCallType: agreedCallType,
NewCertainty: 90,
}

toLabel.Species = input.Species
toLabel.CallType = agreedCallType
toLabel.Certainty = 90
changed = true

output.Propagated++
output.Changes = append(output.Changes, change)
}

if changed {
df.Meta.Reviewer = "Skraak"
if err := df.Write(input.File); err != nil {
output.Error = fmt.Sprintf("write %s: %v", input.File, err)
return output, fmt.Errorf("%s", output.Error)
}
}

return output, nil
}

// CallsPropagateFolder runs CallsPropagate against every .data file in a folder,
// aggregating counts. Files that do not contain both --from and --to filters are
// skipped silently (counted as files_skipped_no_filter). Parse/write errors on
// individual files are collected in Errors; they don't abort the run.
func CallsPropagateFolder(input CallsPropagateFolderInput) (CallsPropagateFolderOutput, error) {
output := CallsPropagateFolderOutput{
Folder: input.Folder,
FromFilter: input.FromFilter,
ToFilter: input.ToFilter,
Species: input.Species,
}

if input.Folder == "" {
output.Error = "--folder is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.FromFilter == "" {
output.Error = "--from is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.ToFilter == "" {
output.Error = "--to is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.Species == "" {
output.Error = "--species is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.FromFilter == input.ToFilter {
output.Error = "--from and --to must differ"
return output, fmt.Errorf("%s", output.Error)
}

info, err := os.Stat(input.Folder)
if err != nil {
output.Error = fmt.Sprintf("folder not found: %s", input.Folder)
return output, fmt.Errorf("%s", output.Error)
}
if !info.IsDir() {
output.Error = fmt.Sprintf("not a directory: %s", input.Folder)
return output, fmt.Errorf("%s", output.Error)
}

files, err := utils.FindDataFiles(input.Folder)
if err != nil {
output.Error = fmt.Sprintf("list .data files: %v", err)
return output, fmt.Errorf("%s", output.Error)
}
output.FilesTotal = len(files)

for _, f := range files {
fileOut, err := CallsPropagate(CallsPropagateInput{
File: f,
FromFilter: input.FromFilter,
ToFilter: input.ToFilter,
Species: input.Species,
})
if err != nil {
output.FilesErrored++
output.Errors = append(output.Errors, fileOut)
continue
}
if fileOut.FiltersMissing {
output.FilesSkippedNoFilter++
continue
}
output.FilesWithBothFilters++
output.TargetsExamined += fileOut.TargetsExamined
output.Propagated += fileOut.Propagated
output.SkippedNoOverlap += fileOut.SkippedNoOverlap
output.SkippedConflict += fileOut.SkippedConflict
if fileOut.Propagated > 0 {
output.FilesChanged++
}
for _, c := range fileOut.Conflicts {
c.File = f
output.Conflicts = append(output.Conflicts, c)
}
}

return output, nil
}
file addition: calls_modify_test.go (----------)

[0.248737]

package tools

import (
"path/filepath"
"testing"

"skraak/utils"
)

func TestCallsModifyBookmark(t *testing.T) {
// Create a temp .data file with a bookmarked segment
tmpDir := t.TempDir()
tmpFile := filepath.Join(tmpDir, "test.data")

df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "test", Duration: 60},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 15.0,
FreqLow: 100,
FreqHigh: 5000,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 80, Filter: "myfilter", CallType: "Duet", Bookmark: true},
},
},
},
}
if err := df.Write(tmpFile); err != nil {
t.Fatalf("failed to write test file: %v", err)
}

// Test 1: Adding bookmark when already true should do nothing
bookmark := true
result, err := CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "10-15",
Certainty: 80,
Bookmark: &bookmark,
})

// Should return error "no changes needed"
if err == nil {
t.Errorf("expected error 'no changes needed' when bookmark already true, got nil")
}
if result.Error != "No changes needed: all values already match" {
t.Errorf("expected 'no changes needed' error, got: %s", result.Error)
}

// Verify bookmark is still true in the file
df2, err := utils.ParseDataFile(tmpFile)
if err != nil {
t.Fatalf("failed to parse file: %v", err)
}
if !df2.Segments[0].Labels[0].Bookmark {
t.Errorf("bookmark should still be true, got false")
}
}

func TestCallsModifyBookmarkFalse(t *testing.T) {
// Create a temp .data file WITHOUT a bookmark
tmpDir := t.TempDir()
tmpFile := filepath.Join(tmpDir, "test.data")

df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "test", Duration: 60},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 15.0,
FreqLow: 100,
FreqHigh: 5000,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 80, Filter: "myfilter", CallType: "Duet", Bookmark: false},
},
},
},
}
if err := df.Write(tmpFile); err != nil {
t.Fatalf("failed to write test file: %v", err)
}

// Test: Adding bookmark when false should set it to true
bookmark := true
result, err := CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "10-15",
Certainty: 80,
Bookmark: &bookmark,
})

if err != nil {
t.Errorf("unexpected error: %v", err)
}
if result.Bookmark == nil || !*result.Bookmark {
t.Errorf("expected bookmark=true in result, got %v", result.Bookmark)
}

// Verify bookmark is true in the file
df2, err := utils.ParseDataFile(tmpFile)
if err != nil {
t.Fatalf("failed to parse file: %v", err)
}
if !df2.Segments[0].Labels[0].Bookmark {
t.Errorf("bookmark should be true, got false")
}
}

func TestCallsModifyCommentAdditive(t *testing.T) {
// Create a temp .data file with an existing comment
tmpDir := t.TempDir()
tmpFile := filepath.Join(tmpDir, "test.data")

df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "test", Duration: 60},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 15.0,
FreqLow: 100,
FreqHigh: 5000,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Comment: "First observation"},
},
},
},
}
if err := df.Write(tmpFile); err != nil {
t.Fatalf("failed to write test file: %v", err)
}

// Test: Adding comment should be additive
result, err := CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "10-15",
Certainty: 80,
Comment: "Good example",
})

if err != nil {
t.Errorf("unexpected error: %v", err)
}

expectedComment := "First observation | Good example"
if result.Comment != expectedComment {
t.Errorf("expected comment=%q, got %q", expectedComment, result.Comment)
}

// Verify comment in file
df2, err := utils.ParseDataFile(tmpFile)
if err != nil {
t.Fatalf("failed to parse file: %v", err)
}
if df2.Segments[0].Labels[0].Comment != expectedComment {
t.Errorf("expected comment in file=%q, got %q", expectedComment, df2.Segments[0].Labels[0].Comment)
}
}

func TestCallsModifyCommentAdditiveMultiple(t *testing.T) {
// Create a temp .data file and add multiple comments
tmpDir := t.TempDir()
tmpFile := filepath.Join(tmpDir, "test.data")

df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "test", Duration: 60},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 15.0,
FreqLow: 100,
FreqHigh: 5000,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 80, Filter: "myfilter"},
},
},
},
}
if err := df.Write(tmpFile); err != nil {
t.Fatalf("failed to write test file: %v", err)
}

// Add first comment
_, err := CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "10-15",
Certainty: 80,
Comment: "First",
})
if err != nil {
t.Fatalf("unexpected error on first comment: %v", err)
}

// Add second comment
_, err = CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "10-15",
Certainty: 80,
Comment: "Second",
})
if err != nil {
t.Fatalf("unexpected error on second comment: %v", err)
}

// Add third comment
result, err := CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "10-15",
Certainty: 80,
Comment: "Third",
})
if err != nil {
t.Fatalf("unexpected error on third comment: %v", err)
}

expectedComment := "First | Second | Third"
if result.Comment != expectedComment {
t.Errorf("expected comment=%q, got %q", expectedComment, result.Comment)
}
}

func TestCallsModifyCommentTooLong(t *testing.T) {
// Create a temp .data file with an existing long comment
tmpDir := t.TempDir()
tmpFile := filepath.Join(tmpDir, "test.data")

existingComment := "This is a fairly long existing comment that takes up space"
df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "test", Duration: 60},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 15.0,
FreqLow: 100,
FreqHigh: 5000,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Comment: existingComment},
},
},
},
}
if err := df.Write(tmpFile); err != nil {
t.Fatalf("failed to write test file: %v", err)
}

// Test: Adding a long comment that would exceed 140 chars should fail
longNewComment := "This is another very long comment that when combined with the existing one will exceed the limit"
result, err := CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "10-15",
Certainty: 80,
Comment: longNewComment,
})

if err == nil {
t.Errorf("expected error for combined comment exceeding 140 chars, got nil")
}
if result.Error == "" {
t.Errorf("expected error message, got empty")
}

// Verify original comment is preserved
df2, err := utils.ParseDataFile(tmpFile)
if err != nil {
t.Fatalf("failed to parse file: %v", err)
}
if df2.Segments[0].Labels[0].Comment != existingComment {
t.Errorf("original comment should be preserved, got %q", df2.Segments[0].Labels[0].Comment)
}
}

func TestCallsModifyPreservesBookmarkOnOtherChange(t *testing.T) {
// Create a temp .data file with a bookmark
tmpDir := t.TempDir()
tmpFile := filepath.Join(tmpDir, "test.data")

df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "test", Duration: 60},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 15.0,
FreqLow: 100,
FreqHigh: 5000,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Bookmark: true},
},
},
},
}
if err := df.Write(tmpFile); err != nil {
t.Fatalf("failed to write test file: %v", err)
}

// Change certainty (without passing --bookmark) - bookmark should be preserved
result, err := CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "10-15",
Certainty: 100,
// No Bookmark set
})

if err != nil {
t.Errorf("unexpected error: %v", err)
}
if result.Bookmark != nil {
t.Errorf("bookmark should not be in output when not changed, got %v", result.Bookmark)
}

// Verify bookmark is still true in the file
df2, err := utils.ParseDataFile(tmpFile)
if err != nil {
t.Fatalf("failed to parse file: %v", err)
}
if !df2.Segments[0].Labels[0].Bookmark {
t.Errorf("bookmark should still be true after changing certainty, got false")
}
}

func TestCallsModifyInvalidSegment(t *testing.T) {
tmpDir := t.TempDir()
tmpFile := filepath.Join(tmpDir, "test.data")

df := &utils.DataFile{
Meta: &utils.DataMeta{Operator: "test", Duration: 60},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 15.0,
FreqLow: 100,
FreqHigh: 5000,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 80, Filter: "myfilter"},
},
},
},
}
if err := df.Write(tmpFile); err != nil {
t.Fatalf("failed to write test file: %v", err)
}

// Test: Non-existent segment should error
result, err := CallsModify(CallsModifyInput{
File: tmpFile,
Reviewer: "tester",
Filter: "myfilter",
Segment: "99-100",
Certainty: 80,
})

if err == nil {
t.Errorf("expected error for non-existent segment, got nil")
}
if result.Error == "" {
t.Errorf("expected error message, got empty")
}
}
file addition: calls_modify.go (----------)

[0.248737]

package tools

import (
"fmt"
"math"
"os"
"strings"

"skraak/utils"
)

// CallsModifyInput defines the input for the modify tool
type CallsModifyInput struct {
File string `json:"file"`
Reviewer string `json:"reviewer"`
Filter string `json:"filter"`
Segment string `json:"segment"`
Certainty int `json:"certainty"`
Species string `json:"species"`
Bookmark *bool `json:"bookmark"`
Comment string `json:"comment"`
}

// CallsModifyOutput defines the output for the modify tool
type CallsModifyOutput struct {
File string `json:"file"`
SegmentStart int `json:"segment_start"`
SegmentEnd int `json:"segment_end"`
Species string `json:"species,omitempty"`
CallType string `json:"calltype,omitempty"`
Certainty int `json:"certainty,omitempty"`
Bookmark *bool `json:"bookmark,omitempty"`
Comment string `json:"comment,omitempty"`
PreviousValue string `json:"previous_value,omitempty"`
Error string `json:"error,omitempty"`
}

// CallsModify modifies a label in a .data file
func CallsModify(input CallsModifyInput) (CallsModifyOutput, error) {
var output CallsModifyOutput

// Validate required flags
if input.File == "" {
output.Error = "--file is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.Reviewer == "" {
output.Error = "--reviewer is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.Filter == "" {
output.Error = "--filter is required"
return output, fmt.Errorf("%s", output.Error)
}
if input.Segment == "" {
output.Error = "--segment is required"
return output, fmt.Errorf("%s", output.Error)
}

// Parse segment time range
startTime, endTime, err := parseSegmentRange(input.Segment)
if err != nil {
output.Error = err.Error()
return output, fmt.Errorf("%s", output.Error)
}

// Validate comment (max 140 chars, ASCII only)
if len(input.Comment) > 140 {
output.Error = "--comment must be 140 characters or less"
return output, fmt.Errorf("%s", output.Error)
}
for i, r := range input.Comment {
if r > 127 {
output.Error = fmt.Sprintf("--comment must be ASCII only (non-ASCII at position %d)", i)
return output, fmt.Errorf("%s", output.Error)
}
}

output.File = input.File
output.SegmentStart = startTime
output.SegmentEnd = endTime

// Check file exists
if _, err := os.Stat(input.File); os.IsNotExist(err) {
output.Error = fmt.Sprintf("File not found: %s", input.File)
return output, fmt.Errorf("%s", output.Error)
}

// Parse .data file
dataFile, err := utils.ParseDataFile(input.File)
if err != nil {
output.Error = fmt.Sprintf("Failed to parse file: %v", err)
return output, fmt.Errorf("%s", output.Error)
}

// Find matching segment (also checks filter to handle duplicate time ranges)
segment := findSegment(dataFile.Segments, startTime, endTime, input.Filter)
if segment == nil {
output.Error = fmt.Sprintf("No segment found matching time range %d-%d", startTime, endTime)
return output, fmt.Errorf("%s", output.Error)
}

// Find label matching filter
var targetLabel *utils.Label
for _, label := range segment.Labels {
if label.Filter == input.Filter {
targetLabel = label
break
}
}

if targetLabel == nil {
output.Error = fmt.Sprintf("No label found with filter '%s' in segment %d-%d", input.Filter, startTime, endTime)
return output, fmt.Errorf("%s", output.Error)
}

// Store previous value for output
output.PreviousValue = formatLabel(targetLabel)

// Calculate new species/calltype
var newSpecies, newCallType string
if input.Species != "" {
if strings.Contains(input.Species, "+") {
parts := strings.SplitN(input.Species, "+", 2)
newSpecies = parts[0]
newCallType = parts[1]
} else {
newSpecies = input.Species
newCallType = "" // Clear calltype
}
} else {
newSpecies = targetLabel.Species
newCallType = targetLabel.CallType
}

// Check if anything would change
speciesChanging := newSpecies != targetLabel.Species || newCallType != targetLabel.CallType
certaintyChanging := input.Certainty != targetLabel.Certainty
bookmarkChanging := input.Bookmark != nil && *input.Bookmark != targetLabel.Bookmark
commentChanging := input.Comment != "" // Any non-empty comment will be added

if !speciesChanging && !certaintyChanging && !bookmarkChanging && !commentChanging {
output.Error = "No changes needed: all values already match"
return output, fmt.Errorf("%s", output.Error)
}

// Update reviewer on file metadata
dataFile.Meta.Reviewer = input.Reviewer

// Update species/calltype
targetLabel.Species = newSpecies
targetLabel.CallType = newCallType
output.Species = newSpecies
output.CallType = newCallType

// Update certainty
targetLabel.Certainty = input.Certainty
output.Certainty = input.Certainty

// Update bookmark (only if it would change - never toggle away from true)
if input.Bookmark != nil && *input.Bookmark != targetLabel.Bookmark {
targetLabel.Bookmark = *input.Bookmark
output.Bookmark = input.Bookmark
}

// Update comment (additive - append to existing comment, never destroy)
if input.Comment != "" {
var newComment string
if targetLabel.Comment != "" {
newComment = targetLabel.Comment + " | " + input.Comment
} else {
newComment = input.Comment
}
// Check length after combining
if len(newComment) > 140 {
output.Error = fmt.Sprintf("Combined comment exceeds 140 characters (%d)", len(newComment))
return output, fmt.Errorf("%s", output.Error)
}
targetLabel.Comment = newComment
output.Comment = newComment
}

// Save file
if err := dataFile.Write(input.File); err != nil {
output.Error = fmt.Sprintf("Failed to save file: %v", err)
return output, fmt.Errorf("%s", output.Error)
}

return output, nil
}

// parseSegmentRange parses "12-15" format into start and end integers
func parseSegmentRange(s string) (int, int, error) {
parts := strings.Split(s, "-")
if len(parts) != 2 {
return 0, 0, fmt.Errorf("invalid segment format: %s (expected start-end, e.g., 12-15)", s)
}

var start, end int
if _, err := fmt.Sscanf(parts[0], "%d", &start); err != nil {
return 0, 0, fmt.Errorf("invalid start time: %s", parts[0])
}
if _, err := fmt.Sscanf(parts[1], "%d", &end); err != nil {
return 0, 0, fmt.Errorf("invalid end time: %s", parts[1])
}

if start < 0 || end < 0 {
return 0, 0, fmt.Errorf("times must be non-negative")
}
if start >= end {
return 0, 0, fmt.Errorf("start time must be less than end time")
}

return start, end, nil
}

// findSegment finds a segment matching the time range using floor/ceil matching.
// It also checks that the segment contains a label with the specified filter,
// so that duplicate segments (same time range, different filters) are resolved correctly.
func findSegment(segments []*utils.Segment, startTime, endTime int, filter string) *utils.Segment {
for _, seg := range segments {
segStart := int(math.Floor(seg.StartTime))
segEnd := int(math.Ceil(seg.EndTime))
if segEnd == segStart {
segEnd = segStart + 1 // minimum 1 second
}
if segStart == startTime && segEnd == endTime {
for _, label := range seg.Labels {
if label.Filter == filter {
return seg
}
}
}
}
return nil
}

// formatLabel formats a label for display
func formatLabel(label *utils.Label) string {
result := label.Species
if label.CallType != "" {
result += "+" + label.CallType
}
result += fmt.Sprintf(" (%d%%)", label.Certainty)
return result
}
file addition: calls_from_raven.go (----------)

[0.248737]

package tools

import (
"bufio"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"sync/atomic"

"skraak/utils"
)

// CallsFromRavenInput defines the input for the calls-from-raven tool
type CallsFromRavenInput struct {
Folder string `json:"folder"`
File string `json:"file"`
Delete bool `json:"delete"`
ProgressHandler ProgressHandler `json:"-"` // Optional progress callback
}

// CallsFromRavenOutput defines the output for the calls-from-raven tool
type CallsFromRavenOutput struct {
Calls []ClusteredCall `json:"calls"`
TotalCalls int `json:"total_calls"`
SpeciesCount map[string]int `json:"species_count"`
DataFilesWritten int `json:"data_files_written"`
DataFilesSkipped int `json:"data_files_skipped"`
FilesProcessed int `json:"files_processed"`
FilesDeleted int `json:"files_deleted"`
Filter string `json:"filter"`
Error *string `json:"error,omitempty"`
}

// RavenSelection represents a single Raven selection
type RavenSelection struct {
StartTime float64
EndTime float64
FreqLow float64
FreqHigh float64
Species string
}

// ravenJob represents a single Raven file to process
type ravenJob struct {
ravenFile string
}

// ravenResult represents the result of processing a single Raven file
type ravenResult struct {
ravenFile string
calls []ClusteredCall
written bool
skipped bool
err error
}

// CallsFromRaven processes Raven selection files and writes .data files
func CallsFromRaven(input CallsFromRavenInput) (CallsFromRavenOutput, error) {
var output CallsFromRavenOutput
output.Filter = "Raven"

// Collect Raven files to process
var ravenFiles []string
if input.File != "" {
ravenFiles = []string{input.File}
} else if input.Folder != "" {
var err error
ravenFiles, err = findRavenFiles(input.Folder)
if err != nil {
errMsg := fmt.Sprintf("Failed to find Raven files: %v", err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}
} else {
errMsg := "Either --folder or --file must be specified"
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

if len(ravenFiles) == 0 {
errMsg := "No Raven files found"
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

// Single file or small batch: process sequentially (avoid goroutine overhead)
if len(ravenFiles) < 10 {
return callsFromRavenSequential(input, ravenFiles)
}

// Large batch: parallel processing with DirCache
return callsFromRavenParallel(input, ravenFiles)
}

// callsFromRavenSequential processes Raven files one at a time (for small batches)
func callsFromRavenSequential(input CallsFromRavenInput, ravenFiles []string) (CallsFromRavenOutput, error) {
var output CallsFromRavenOutput
output.Filter = "Raven"

// Build DirCache once for the folder (even sequential benefits from avoiding repeated dir scans)
dirCaches := make(map[string]*DirCache)
if input.Folder != "" {
dirCaches[input.Folder] = NewDirCache(input.Folder)
}

speciesCount := make(map[string]int)
var allCalls []ClusteredCall
dataFilesWritten := 0
dataFilesSkipped := 0
filesProcessed := 0
filesDeleted := 0

for _, ravenFile := range ravenFiles {
dir := filepath.Dir(ravenFile)
cache := dirCaches[dir]
if cache == nil {
cache = NewDirCache(dir)
dirCaches[dir] = cache
}

calls, written, skipped, err := processRavenFileCached(ravenFile, cache)
if err != nil {
errMsg := fmt.Sprintf("Error processing %s: %v", ravenFile, err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

if written {
dataFilesWritten++
}
if skipped {
dataFilesSkipped++
}

for _, call := range calls {
allCalls = append(allCalls, call)
speciesCount[call.EbirdCode]++
}

filesProcessed++

// Delete if requested and successfully processed
if input.Delete && written {
if err := os.Remove(ravenFile); err != nil {
errMsg := fmt.Sprintf("Failed to delete %s: %v", ravenFile, err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}
filesDeleted++
}

if input.ProgressHandler != nil {
input.ProgressHandler(filesProcessed, len(ravenFiles), filepath.Base(ravenFile))
}
}

// Sort all calls by file, then start time
sort.Slice(allCalls, func(i, j int) bool {
if allCalls[i].File != allCalls[j].File {
return allCalls[i].File < allCalls[j].File
}
return allCalls[i].StartTime < allCalls[j].StartTime
})

output.Calls = allCalls
output.TotalCalls = len(allCalls)
output.SpeciesCount = speciesCount
output.DataFilesWritten = dataFilesWritten
output.DataFilesSkipped = dataFilesSkipped
output.FilesProcessed = filesProcessed
output.FilesDeleted = filesDeleted

return output, nil
}

// callsFromRavenParallel processes Raven files concurrently using a worker pool and DirCache
func callsFromRavenParallel(input CallsFromRavenInput, ravenFiles []string) (CallsFromRavenOutput, error) {
var output CallsFromRavenOutput
output.Filter = "Raven"

total := len(ravenFiles)
var processed atomic.Int32

// Build DirCache for the folder
dirCaches := &sync.Map{}
if input.Folder != "" {
cache := NewDirCache(input.Folder)
dirCaches.Store(input.Folder, cache)
}

// Create job and result channels
jobs := make(chan ravenJob, total)
results := make(chan ravenResult, total)

// Start workers
var wg sync.WaitGroup
for range DOT_DATA_WORKERS {
wg.Add(1)
go ravenWorker(dirCaches, jobs, results, &wg)
}

// Send jobs
for _, ravenFile := range ravenFiles {
jobs <- ravenJob{ravenFile: ravenFile}
}
close(jobs)

// Wait for workers to finish, then close results
go func() {
wg.Wait()
close(results)
}()

// Collect results with progress reporting
speciesCount := make(map[string]int)
var allCalls []ClusteredCall
dataFilesWritten := 0
dataFilesSkipped := 0
filesProcessed := 0
filesDeleted := 0
var firstErr error

for result := range results {
if result.err != nil && firstErr == nil {
firstErr = result.err
}

if result.written {
dataFilesWritten++
}
if result.skipped {
dataFilesSkipped++
}

for _, call := range result.calls {
allCalls = append(allCalls, call)
speciesCount[call.EbirdCode]++
}

filesProcessed++

// Delete if requested and successfully processed
if input.Delete && result.written {
if err := os.Remove(result.ravenFile); err != nil {
if firstErr == nil {
firstErr = fmt.Errorf("failed to delete %s: %w", result.ravenFile, err)
}
} else {
filesDeleted++
}
}

if input.ProgressHandler != nil {
current := int(processed.Add(1))
input.ProgressHandler(current, total, filepath.Base(result.ravenFile))
}
}

if firstErr != nil {
errMsg := firstErr.Error()
output.Error = &errMsg
return output, firstErr
}

// Sort all calls by file, then start time
sort.Slice(allCalls, func(i, j int) bool {
if allCalls[i].File != allCalls[j].File {
return allCalls[i].File < allCalls[j].File
}
return allCalls[i].StartTime < allCalls[j].StartTime
})

output.Calls = allCalls
output.TotalCalls = len(allCalls)
output.SpeciesCount = speciesCount
output.DataFilesWritten = dataFilesWritten
output.DataFilesSkipped = dataFilesSkipped
output.FilesProcessed = filesProcessed
output.FilesDeleted = filesDeleted

return output, nil
}

// ravenWorker processes Raven files from the jobs channel
func ravenWorker(dirCaches *sync.Map, jobs <-chan ravenJob, results chan<- ravenResult, wg *sync.WaitGroup) {
defer wg.Done()

for job := range jobs {
dir := filepath.Dir(job.ravenFile)

// Get or create DirCache for this directory
var cache *DirCache
if cached, ok := dirCaches.Load(dir); ok {
cache = cached.(*DirCache)
} else {
cache = NewDirCache(dir)
dirCaches.Store(dir, cache)
}

calls, written, skipped, err := processRavenFileCached(job.ravenFile, cache)
results <- ravenResult{
ravenFile: job.ravenFile,
calls: calls,
written: written,
skipped: skipped,
err: err,
}
}
}

// findRavenFiles finds all Raven selection files in a folder
func findRavenFiles(folder string) ([]string, error) {
var files []string

entries, err := os.ReadDir(folder)
if err != nil {
return nil, err
}

for _, entry := range entries {
name := entry.Name()
if strings.HasSuffix(name, ".selections.txt") {
files = append(files, filepath.Join(folder, name))
}
}

return files, nil
}

// processRavenFileCached processes a single Raven selection file using a DirCache for WAV lookup
func processRavenFileCached(ravenFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
// Open file
file, err := os.Open(ravenFile)
if err != nil {
return nil, false, false, fmt.Errorf("failed to open file: %w", err)
}
defer func() { _ = file.Close() }()

// Read header and selections (tab-separated)
scanner := bufio.NewScanner(file)

// Read header line
if !scanner.Scan() {
return nil, false, false, fmt.Errorf("empty file")
}
header := strings.Split(scanner.Text(), "\t")

// Find column indices
beginTimeIdx := -1
endTimeIdx := -1
lowFreqIdx := -1
highFreqIdx := -1
speciesIdx := -1

for i, col := range header {
switch col {
case "Begin Time (s)":
beginTimeIdx = i
case "End Time (s)":
endTimeIdx = i
case "Low Freq (Hz)":
lowFreqIdx = i
case "High Freq (Hz)":
highFreqIdx = i
case "Species":
speciesIdx = i
}
}

if beginTimeIdx == -1 || endTimeIdx == -1 || speciesIdx == -1 {
return nil, false, false, fmt.Errorf("missing required columns in Raven file")
}

// Read selections
var selections []RavenSelection
for scanner.Scan() {
line := scanner.Text()
if line == "" {
continue
}

fields := strings.Split(line, "\t")
if len(fields) <= speciesIdx {
continue
}

var sel RavenSelection
if _, err := fmt.Sscanf(fields[beginTimeIdx], "%f", &sel.StartTime); err != nil {
return nil, false, false, fmt.Errorf("failed to parse begin time %q: %w", fields[beginTimeIdx], err)
}
if _, err := fmt.Sscanf(fields[endTimeIdx], "%f", &sel.EndTime); err != nil {
return nil, false, false, fmt.Errorf("failed to parse end time %q: %w", fields[endTimeIdx], err)
}
if lowFreqIdx >= 0 && lowFreqIdx < len(fields) {
if _, err := fmt.Sscanf(fields[lowFreqIdx], "%f", &sel.FreqLow); err != nil {
return nil, false, false, fmt.Errorf("failed to parse low freq %q: %w", fields[lowFreqIdx], err)
}
}
if highFreqIdx >= 0 && highFreqIdx < len(fields) {
if _, err := fmt.Sscanf(fields[highFreqIdx], "%f", &sel.FreqHigh); err != nil {
return nil, false, false, fmt.Errorf("failed to parse high freq %q: %w", fields[highFreqIdx], err)
}
}
sel.Species = fields[speciesIdx]

selections = append(selections, sel)
}

if err := scanner.Err(); err != nil {
return nil, false, false, fmt.Errorf("error reading file: %w", err)
}

if len(selections) == 0 {
return nil, false, true, nil // No selections, skip
}

// Derive WAV path from Raven filename
// "20230610_150000.Table.1.selections.txt" -> "20230610_150000"
base := filepath.Base(ravenFile)
// Remove .selections.txt
nameWithoutSuffix := strings.TrimSuffix(base, ".selections.txt")
// Remove .Table.X (or similar pattern)
idx := strings.Index(nameWithoutSuffix, ".Table.")
if idx > 0 {
nameWithoutSuffix = nameWithoutSuffix[:idx]
}

// Find WAV file using DirCache (O(1) lookup instead of O(N) directory scan)
var wavPath string
if cache != nil {
wavPath = cache.FindWAV(nameWithoutSuffix)
} else {
wavPath = findWAVFile(filepath.Dir(ravenFile), nameWithoutSuffix)
}
if wavPath == "" {
return nil, false, true, nil // WAV not found, skip
}

// Check if WAV exists (to get sample rate and duration)
sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
if err != nil {
return nil, false, true, nil // Skip if WAV not found or invalid
}

dataPath := wavPath + ".data"

// Convert selections to segments
segments := buildRavenSegments(selections, sampleRate)

// Build metadata
meta := AviaNZMeta{
Operator: "Raven",
Duration: duration,
}
reviewer := "None"
meta.Reviewer = &reviewer

// Write .data file (safe write)
if err := writeDotDataFileSafe(dataPath, segments, "Raven", meta); err != nil {
return nil, false, false, err
}

// Convert to ClusteredCalls for output
var calls []ClusteredCall
for _, sel := range selections {
calls = append(calls, ClusteredCall{
File: wavPath,
StartTime: sel.StartTime,
EndTime: sel.EndTime,
EbirdCode: sel.Species,
Segments: 1,
})
}

return calls, true, false, nil
}

// buildRavenSegments converts Raven selections to AviaNZ segments
func buildRavenSegments(selections []RavenSelection, sampleRate int) []AviaNZSegment {
var segments []AviaNZSegment

for _, sel := range selections {
labels := []AviaNZLabel{
{
Species: sel.Species,
Certainty: 70, // Default certainty for Raven (no confidence metric)
Filter: "Raven",
},
}

// Use frequency range from Raven, or full band if not specified
freqLow := sel.FreqLow
freqHigh := sel.FreqHigh
if freqLow == 0 && freqHigh == 0 {
freqHigh = float64(sampleRate)
}

segment := AviaNZSegment{
sel.StartTime,
sel.EndTime,
freqLow,
freqHigh,
labels,
}
segments = append(segments, segment)
}

return segments
}
file addition: calls_from_preds_test.go (----------)

[0.248737]

package tools

import (
"os"
"path/filepath"
"testing"

"skraak/utils"
)

func TestCallsFromPreds_EmptyFilterError(t *testing.T) {
// Create a temp CSV file
tmpDir := t.TempDir()
csvPath := filepath.Join(tmpDir, "preds.csv")
csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
t.Fatal(err)
}

// Create a dummy WAV file (minimal valid WAV)
wavPath := filepath.Join(tmpDir, "test.wav")
createMinimalWAV(t, wavPath, 44100, 10.0)

// Test with empty filter (should error)
input := CallsFromPredsInput{
CSVPath: csvPath,
Filter: "",
WriteDotData: true,
ProgressHandler: nil,
}

output, err := CallsFromPreds(input)

// Should return error
if err == nil {
t.Error("expected error for empty filter, got nil")
}
if output.Error == nil || *output.Error == "" {
t.Error("expected error message in output, got empty")
}
}

func TestCallsFromPreds_NewDataFile(t *testing.T) {
// Create a temp CSV file
tmpDir := t.TempDir()
csvPath := filepath.Join(tmpDir, "predsST_test-filter_2025-01-01.csv")
csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
t.Fatal(err)
}

// Create a dummy WAV file
wavPath := filepath.Join(tmpDir, "test.wav")
createMinimalWAV(t, wavPath, 44100, 10.0)

// Test with filter parsed from filename
input := CallsFromPredsInput{
CSVPath: csvPath,
Filter: "", // Will parse from filename
WriteDotData: true,
ProgressHandler: nil,
}

output, err := CallsFromPreds(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.DataFilesWritten != 1 {
t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
}
if output.Filter != "test-filter" {
t.Errorf("expected filter 'test-filter', got '%s'", output.Filter)
}

// Verify .data file was created
dataPath := wavPath + ".data"
if _, err := os.Stat(dataPath); os.IsNotExist(err) {
t.Error("expected .data file to be created")
}

// Verify content
df, err := utils.ParseDataFile(dataPath)
if err != nil {
t.Fatalf("failed to parse .data file: %v", err)
}
if len(df.Segments) != 1 {
t.Errorf("expected 1 segment, got %d", len(df.Segments))
}
if len(df.Segments[0].Labels) != 1 {
t.Errorf("expected 1 label, got %d", len(df.Segments[0].Labels))
}
if df.Segments[0].Labels[0].Filter != "test-filter" {
t.Errorf("expected filter 'test-filter', got '%s'", df.Segments[0].Labels[0].Filter)
}
}

func TestCallsFromPreds_ExistingDataFileSameFilter(t *testing.T) {
// Create a temp CSV file
tmpDir := t.TempDir()
csvPath := filepath.Join(tmpDir, "predsST_existing-filter_2025-01-01.csv")
csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
t.Fatal(err)
}

// Create a dummy WAV file
wavPath := filepath.Join(tmpDir, "test.wav")
createMinimalWAV(t, wavPath, 44100, 10.0)

// Create existing .data file with same filter
dataPath := wavPath + ".data"
existingData := `[
{"Operator": "Manual", "Reviewer": "David", "Duration": 10.0},
[5.0, 8.0, 0, 44100, [{"species": "morepork", "certainty": 90, "filter": "existing-filter"}]]
]`
if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
t.Fatal(err)
}

// Test with same filter (should error)
input := CallsFromPredsInput{
CSVPath: csvPath,
Filter: "", // Will parse from filename -> "existing-filter"
WriteDotData: true,
ProgressHandler: nil,
}

output, err := CallsFromPreds(input)

// Should return error
if err == nil {
t.Error("expected error for same filter, got nil")
}
if output.Error == nil {
t.Error("expected error message in output")
}

// Verify original .data file is unchanged
df, err := utils.ParseDataFile(dataPath)
if err != nil {
t.Fatalf("failed to parse .data file: %v", err)
}
if len(df.Segments) != 1 {
t.Errorf("expected original 1 segment, got %d", len(df.Segments))
}
if df.Segments[0].Labels[0].Species != "morepork" {
t.Errorf("expected original species 'morepork', got '%s'", df.Segments[0].Labels[0].Species)
}
}

func TestCallsFromPreds_ExistingDataFileDifferentFilter(t *testing.T) {
// Create a temp CSV file
tmpDir := t.TempDir()
csvPath := filepath.Join(tmpDir, "predsST_new-filter_2025-01-01.csv")
csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
t.Fatal(err)
}

// Create a dummy WAV file
wavPath := filepath.Join(tmpDir, "test.wav")
createMinimalWAV(t, wavPath, 44100, 10.0)

// Create existing .data file with different filter
dataPath := wavPath + ".data"
existingData := `[
{"Operator": "Manual", "Reviewer": "David", "Duration": 10.0},
[5.0, 8.0, 0, 44100, [{"species": "morepork", "certainty": 90, "filter": "old-filter"}]]
]`
if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
t.Fatal(err)
}

// Test with different filter (should merge)
input := CallsFromPredsInput{
CSVPath: csvPath,
Filter: "", // Will parse from filename -> "new-filter"
WriteDotData: true,
ProgressHandler: nil,
}

output, err := CallsFromPreds(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.DataFilesWritten != 1 {
t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
}

// Verify .data file has merged content
df, err := utils.ParseDataFile(dataPath)
if err != nil {
t.Fatalf("failed to parse .data file: %v", err)
}
if len(df.Segments) != 2 {
t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
}

// Check segments are sorted by start time
if df.Segments[0].StartTime > df.Segments[1].StartTime {
t.Error("expected segments to be sorted by start time")
}

// Check both filters are present
filters := make(map[string]bool)
for _, seg := range df.Segments {
for _, label := range seg.Labels {
filters[label.Filter] = true
}
}
if !filters["old-filter"] {
t.Error("expected 'old-filter' to be present")
}
if !filters["new-filter"] {
t.Error("expected 'new-filter' to be present")
}
}

func TestCallsFromPreds_ExistingDataFileParseError(t *testing.T) {
// Create a temp CSV file
tmpDir := t.TempDir()
csvPath := filepath.Join(tmpDir, "predsST_test-filter_2025-01-01.csv")
csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
t.Fatal(err)
}

// Create a dummy WAV file
wavPath := filepath.Join(tmpDir, "test.wav")
createMinimalWAV(t, wavPath, 44100, 10.0)

// Create corrupted .data file
dataPath := wavPath + ".data"
corruptedData := `this is not valid json`
if err := os.WriteFile(dataPath, []byte(corruptedData), 0644); err != nil {
t.Fatal(err)
}

// Test (should error due to parse failure)
input := CallsFromPredsInput{
CSVPath: csvPath,
Filter: "",
WriteDotData: true,
ProgressHandler: nil,
}

output, err := CallsFromPreds(input)

// Should return error
if err == nil {
t.Error("expected error for corrupted .data file, got nil")
}
if output.Error == nil {
t.Error("expected error message in output")
}

// Verify original file is unchanged
content, err := os.ReadFile(dataPath)
if err != nil {
t.Fatal(err)
}
if string(content) != corruptedData {
t.Error("expected corrupted file to remain unchanged")
}
}

func TestCallsFromPreds_ExplicitFilter(t *testing.T) {
// Create a temp CSV file with non-standard name
tmpDir := t.TempDir()
csvPath := filepath.Join(tmpDir, "predictions.csv")
csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
t.Fatal(err)
}

// Create a dummy WAV file
wavPath := filepath.Join(tmpDir, "test.wav")
createMinimalWAV(t, wavPath, 44100, 10.0)

// Test with explicit filter
input := CallsFromPredsInput{
CSVPath: csvPath,
Filter: "my-custom-filter",
WriteDotData: true,
ProgressHandler: nil,
}

output, err := CallsFromPreds(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.Filter != "my-custom-filter" {
t.Errorf("expected filter 'my-custom-filter', got '%s'", output.Filter)
}

// Verify .data file uses explicit filter
dataPath := wavPath + ".data"
df, err := utils.ParseDataFile(dataPath)
if err != nil {
t.Fatalf("failed to parse .data file: %v", err)
}
if df.Segments[0].Labels[0].Filter != "my-custom-filter" {
t.Errorf("expected filter 'my-custom-filter' in .data file, got '%s'", df.Segments[0].Labels[0].Filter)
}
}

func TestCallsFromPreds_NonParsableFilenameNoFilter(t *testing.T) {
// Create a temp CSV file with non-standard name that can't be parsed
tmpDir := t.TempDir()
csvPath := filepath.Join(tmpDir, "random_name.csv")
csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
t.Fatal(err)
}

// Create a dummy WAV file
wavPath := filepath.Join(tmpDir, "test.wav")
createMinimalWAV(t, wavPath, 44100, 10.0)

// Test with no filter and non-parsable filename (should error)
input := CallsFromPredsInput{
CSVPath: csvPath,
Filter: "",
WriteDotData: true,
ProgressHandler: nil,
}

output, err := CallsFromPreds(input)

// Should return error
if err == nil {
t.Error("expected error for unparsable filename with no filter, got nil")
}
if output.Error == nil {
t.Error("expected error message in output")
}
}

// createMinimalWAV creates a minimal valid WAV file for testing
func createMinimalWAV(t *testing.T, path string, sampleRate int, duration float64) {
t.Helper()

numSamples := int(float64(sampleRate) * duration)
dataSize := numSamples * 2 // 16-bit mono

// WAV header (44 bytes)
header := make([]byte, 44)

// RIFF header
copy(header[0:4], "RIFF")
totalSize := uint32(36 + dataSize)
header[4] = byte(totalSize)
header[5] = byte(totalSize >> 8)
header[6] = byte(totalSize >> 16)
header[7] = byte(totalSize >> 24)
copy(header[8:12], "WAVE")

// fmt chunk
copy(header[12:16], "fmt ")
chunkSize := uint32(16)
header[16] = byte(chunkSize)
header[17] = byte(chunkSize >> 8)
header[18] = byte(chunkSize >> 16)
header[19] = byte(chunkSize >> 24)
audioFormat := uint16(1) // PCM
header[20] = byte(audioFormat)
header[21] = byte(audioFormat >> 8)
numChannels := uint16(1)
header[22] = byte(numChannels)
header[23] = byte(numChannels >> 8)
header[24] = byte(sampleRate)
header[25] = byte(sampleRate >> 8)
header[26] = byte(sampleRate >> 16)
header[27] = byte(sampleRate >> 24)
byteRate := uint32(sampleRate * 2)
header[28] = byte(byteRate)
header[29] = byte(byteRate >> 8)
header[30] = byte(byteRate >> 16)
header[31] = byte(byteRate >> 24)
blockAlign := uint16(2)
header[32] = byte(blockAlign)
header[33] = byte(blockAlign >> 8)
bitsPerSample := uint16(16)
header[34] = byte(bitsPerSample)
header[35] = byte(bitsPerSample >> 8)

// data chunk
copy(header[36:40], "data")
header[40] = byte(dataSize)
header[41] = byte(dataSize >> 8)
header[42] = byte(dataSize >> 16)
header[43] = byte(dataSize >> 24)

// Create file with header and silence
file, err := os.Create(path)
if err != nil {
t.Fatal(err)
}
defer file.Close()

if _, err := file.Write(header); err != nil {
t.Fatal(err)
}

// Write silence (zeros)
silence := make([]byte, dataSize)
if _, err := file.Write(silence); err != nil {
t.Fatal(err)
}
}
file addition: calls_from_preds.go (----------)

[0.248737]

package tools

import (
"encoding/csv"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"

"skraak/utils"
)

// Constants for clustering algorithm
const (
CLUSTER_GAP_MULTIPLIER = 2 // 3 Gap threshold = CLUSTER_GAP_MULTIPLIER * clip_duration. 3 for kiwi
MIN_DETECTIONS_PER_CLUSTER = 0 // 1 = filter out single detections (used for kiwi, they have long calls 30s), 0 = let single detections pass through
DEFAULT_CERTAINTY = 70 // .data certainty:70
DOT_DATA_WORKERS = 8 // Number of parallel workers for .data file writing
)

// ClusteredCall represents a clustered bird call detection
type ClusteredCall struct {
File string `json:"file"`
StartTime float64 `json:"start_time"`
EndTime float64 `json:"end_time"`
EbirdCode string `json:"ebird_code"`
Segments int `json:"segments"`
}

// CallsFromPredsInput defines the input for the calls-from-preds tool
type CallsFromPredsInput struct {
CSVPath string `json:"csv_path"`
Filter string `json:"filter"`
WriteDotData bool `json:"write_dot_data"`
GapMultiplier int `json:"gap_multiplier"`
MinDetections int `json:"min_detections"`
ProgressHandler ProgressHandler `json:"-"` // Optional progress callback (not serialized)
}

// ProgressHandler is a callback function for reporting progress during long operations
// processed: number of items processed so far
// total: total number of items to process
// message: optional status message
type ProgressHandler func(processed, total int, message string)

// CallsFromPredsOutput defines the output for the calls-from-preds tool
type CallsFromPredsOutput struct {
Calls []ClusteredCall `json:"calls"`
TotalCalls int `json:"total_calls"`
ClipDuration float64 `json:"clip_duration"`
GapThreshold float64 `json:"gap_threshold"`
SpeciesCount map[string]int `json:"species_count"`
DataFilesWritten int `json:"data_files_written"`
DataFilesSkipped int `json:"data_files_skipped"`
Filter string `json:"filter"`
Error *string `json:"error,omitempty"`
}

// AviaNZ .data file types

// AviaNZMeta is the metadata element in a .data file
type AviaNZMeta struct {
Operator string `json:"Operator"`
Reviewer *string `json:"Reviewer,omitempty"`
Duration float64 `json:"Duration"`
}

// AviaNZLabel represents a species label in a segment
type AviaNZLabel struct {
Species string `json:"species"`
Certainty int `json:"certainty"`
Filter string `json:"filter"`
}

// AviaNZSegment represents a detection segment [start, end, freq_low, freq_high, labels]
type AviaNZSegment [5]any

// CallsFromPreds reads a predictions CSV and clusters detections into continuous bird calls
func CallsFromPreds(input CallsFromPredsInput) (CallsFromPredsOutput, error) {
var output CallsFromPredsOutput

// Determine filter: use provided filter, or parse from CSV filename
filter := input.Filter
if filter == "" {
filter = ParseFilterFromFilename(input.CSVPath)
}
// Filter must not be empty
if filter == "" {
errMsg := "Filter must be specified via --filter flag or parsable from CSV filename"
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}
output.Filter = filter

// Open CSV file
file, err := os.Open(input.CSVPath)
if err != nil {
errMsg := fmt.Sprintf("Failed to open CSV file: %v", err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}
defer func() { _ = file.Close() }()

// Read CSV
reader := csv.NewReader(file)
reader.ReuseRecord = true // Memory optimization for large files

// Read header
header, err := reader.Read()
if err != nil {
errMsg := fmt.Sprintf("Failed to read CSV header: %v", err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

// Find column indices
fileIdx := -1
startTimeIdx := -1
endTimeIdx := -1
var ebirdCodes []string
var ebirdIdx []int

// Columns to ignore (not ebird codes)
ignoredColumns := map[string]bool{
"NotKiwi": true,
"0.0": true,
}

for i, col := range header {
switch col {
case "file":
fileIdx = i
case "start_time":
startTimeIdx = i
case "end_time":
endTimeIdx = i
default:
// Skip ignored columns
if ignoredColumns[col] {
continue
}
// All other columns are ebird codes
ebirdCodes = append(ebirdCodes, col)
ebirdIdx = append(ebirdIdx, i)
}
}

if fileIdx == -1 || startTimeIdx == -1 || endTimeIdx == -1 {
errMsg := "CSV must have 'file', 'start_time', and 'end_time' columns"
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

if len(ebirdCodes) == 0 {
errMsg := "CSV must have at least one ebird code column"
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

// Read all rows and organize by (file, ebird_code) -> start_times
// Using maps for efficient grouping
type FileEbirdKey struct {
File string
EbirdCode string
}
detections := make(map[FileEbirdKey][]float64)
clipDuration := 0.0

// Read first row to get clip duration
record, err := reader.Read()
if err != nil && err != io.EOF {
errMsg := fmt.Sprintf("Failed to read first CSV row: %v", err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

if err != io.EOF {
startTime, _ := strconv.ParseFloat(record[startTimeIdx], 64)
endTime, _ := strconv.ParseFloat(record[endTimeIdx], 64)
clipDuration = endTime - startTime
output.ClipDuration = clipDuration

// Process first row
fileName := record[fileIdx]
for i, idx := range ebirdIdx {
if record[idx] == "1" {
key := FileEbirdKey{File: fileName, EbirdCode: ebirdCodes[i]}
detections[key] = append(detections[key], startTime)
}
}

// Read remaining rows
for {
record, err := reader.Read()
if err == io.EOF {
break
}
if err != nil {
errMsg := fmt.Sprintf("Failed to read CSV row: %v", err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

startTime, _ := strconv.ParseFloat(record[startTimeIdx], 64)
fileName := record[fileIdx]

for i, idx := range ebirdIdx {
if record[idx] == "1" {
key := FileEbirdKey{File: fileName, EbirdCode: ebirdCodes[i]}
detections[key] = append(detections[key], startTime)
}
}
}
}

// Calculate gap threshold
gapMultiplier := CLUSTER_GAP_MULTIPLIER
if input.GapMultiplier > 0 {
gapMultiplier = input.GapMultiplier
}
minDetections := MIN_DETECTIONS_PER_CLUSTER
if input.MinDetections >= 0 {
minDetections = input.MinDetections
}
gapThreshold := float64(gapMultiplier) * clipDuration
output.GapThreshold = gapThreshold

// Cluster detections by (file, ebird_code)
var allCalls []ClusteredCall
speciesCount := make(map[string]int)

for key, startTimes := range detections {
// Sort start times
sort.Float64s(startTimes)

// Cluster consecutive detections
clusters := clusterStartTimes(startTimes, gapThreshold)

// Convert clusters to calls
for _, cluster := range clusters {
if len(cluster) <= minDetections {
continue
}

call := ClusteredCall{
File: key.File,
StartTime: cluster[0],
EndTime: cluster[len(cluster)-1] + clipDuration,
EbirdCode: key.EbirdCode,
Segments: len(cluster),
}
allCalls = append(allCalls, call)
speciesCount[key.EbirdCode]++
}
}

// Sort calls by file, then start time
sort.Slice(allCalls, func(i, j int) bool {
if allCalls[i].File != allCalls[j].File {
return allCalls[i].File < allCalls[j].File
}
return allCalls[i].StartTime < allCalls[j].StartTime
})

output.Calls = allCalls
output.TotalCalls = len(allCalls)
output.SpeciesCount = speciesCount

// Write .data files if requested
if input.WriteDotData {
dataFilesWritten, dataFilesSkipped, err := writeDotFiles(input.CSVPath, filter, allCalls, input.ProgressHandler)
if err != nil {
// Return error - this includes clobber protection and parse errors
errMsg := fmt.Sprintf("Error writing .data files: %v", err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}
output.DataFilesWritten = dataFilesWritten
output.DataFilesSkipped = dataFilesSkipped
}

return output, nil
}

// extractFilename extracts just the filename from a path
// "./C05/2025-11-08/20250518_210000.WAV" -> "20250518_210000.WAV"
func extractFilename(path string) string {
return filepath.Base(path)
}

// DirCache caches directory entries for fast WAV file lookup.
// Scans the directory once and builds a map from lowercased basename to full filename.
// Safe for concurrent read-only use after construction.
type DirCache struct {
dir string
wavMap map[string]string // lowercase basename -> filename with original case (e.g. "20230610_150000" -> "20230610_150000.WAV")
dirMap map[string]string // lowercase basename -> filename for any file (used by from-raven for .selections.txt etc.)
}

// NewDirCache creates a DirCache by scanning the directory once.
func NewDirCache(dir string) *DirCache {
entries, err := os.ReadDir(dir)
if err != nil {
return &DirCache{dir: dir, wavMap: make(map[string]string), dirMap: make(map[string]string)}
}
wavMap := make(map[string]string, len(entries))
dirMap := make(map[string]string, len(entries))
for _, entry := range entries {
if entry.IsDir() {
continue
}
name := entry.Name()
ext := filepath.Ext(name)
base := strings.TrimSuffix(name, ext)
dirMap[strings.ToLower(base)] = name
if strings.EqualFold(ext, ".wav") {
wavMap[strings.ToLower(base)] = name
}
}
return &DirCache{dir: dir, wavMap: wavMap, dirMap: dirMap}
}

// FindWAV looks up a WAV file by basename (case-insensitive).
// Returns the full path with correct case, or empty string if not found.
func (dc *DirCache) FindWAV(baseName string) string {
if name, ok := dc.wavMap[strings.ToLower(baseName)]; ok {
return filepath.Join(dc.dir, name)
}
return ""
}

// FindFile looks up any file by basename (case-insensitive).
// Returns the full path with correct case, or empty string if not found.
func (dc *DirCache) FindFile(baseName string) string {
if name, ok := dc.dirMap[strings.ToLower(baseName)]; ok {
return filepath.Join(dc.dir, name)
}
return ""
}

// findWAVFile finds a WAV file in the directory with case-insensitive matching.
// baseName is the filename without extension (e.g., "20230610_150000").
// Returns the full path with correct case, or empty string if not found.
// Deprecated: Use DirCache.FindWAV for batch operations to avoid repeated directory scans.
func findWAVFile(dir, baseName string) string {
entries, err := os.ReadDir(dir)
if err != nil {
return ""
}
for _, entry := range entries {
if entry.IsDir() {
continue
}
name := entry.Name()
ext := filepath.Ext(name)
nameNoExt := strings.TrimSuffix(name, ext)
if nameNoExt == baseName && strings.EqualFold(ext, ".wav") {
return filepath.Join(dir, name)
}
}
return ""
}

// writeDotFiles writes AviaNZ .data files for each audio file with calls
// Uses parallel workers for improved performance on large batches
func writeDotFiles(csvPath, filter string, calls []ClusteredCall, progress ProgressHandler) (int, int, error) {
// Base directory is the directory containing the CSV file
csvDir := filepath.Dir(csvPath)

// Group calls by file (using extracted filename)
callsByFile := make(map[string][]ClusteredCall)
for _, call := range calls {
filename := extractFilename(call.File)
callsByFile[filename] = append(callsByFile[filename], call)
}

// Report initial progress
if progress != nil {
progress(0, len(callsByFile), "Processing WAV files")
}

// If small batch, process sequentially (avoid goroutine overhead)
if len(callsByFile) < 10 {
return writeDotFilesSequential(csvDir, filter, callsByFile, progress)
}

// Parallel processing for larger batches
return writeDotFilesParallel(csvDir, filter, callsByFile, progress)
}

// dotDataJob represents a single file to process
type dotDataJob struct {
filename string
fileCalls []ClusteredCall
}

// dotDataResult represents the result of processing a single file
type dotDataResult struct {
filename string
written bool
err error
}

// writeDotFilesSequential processes files one at a time (for small batches)
func writeDotFilesSequential(csvDir, filter string, callsByFile map[string][]ClusteredCall, progress ProgressHandler) (int, int, error) {
dataFilesWritten := 0
dataFilesSkipped := 0
total := len(callsByFile)
processed := 0

for filename, fileCalls := range callsByFile {
// Find WAV file with correct case
baseName := strings.TrimSuffix(filename, filepath.Ext(filename))
wavPath := findWAVFile(csvDir, baseName)
if wavPath == "" {
dataFilesSkipped++
processed++
if progress != nil {
progress(processed, total, "")
}
continue
}

dataPath := wavPath + ".data"

sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
if err != nil {
dataFilesSkipped++
processed++
if progress != nil {
progress(processed, total, "")
}
continue
}

// Build segments and metadata
meta, segments := buildAviaNZMetaAndSegments(fileCalls, filter, duration, sampleRate)

if err := writeDotDataFileSafe(dataPath, segments, filter, meta); err != nil {
return dataFilesWritten, dataFilesSkipped, fmt.Errorf("failed to write %s: %w", dataPath, err)
}

dataFilesWritten++
processed++
if progress != nil {
progress(processed, total, "")
}
}

return dataFilesWritten, dataFilesSkipped, nil
}

// writeDotFilesParallel processes files concurrently using a worker pool
func writeDotFilesParallel(csvDir, filter string, callsByFile map[string][]ClusteredCall, progress ProgressHandler) (int, int, error) {
total := len(callsByFile)
var processed atomic.Int32

// Create job channel
jobs := make(chan dotDataJob, len(callsByFile))
results := make(chan dotDataResult, len(callsByFile))

// Start workers
var wg sync.WaitGroup
for range DOT_DATA_WORKERS {
wg.Add(1)
go dotDataWorker(csvDir, filter, jobs, results, &wg)
}

// Send jobs
for filename, fileCalls := range callsByFile {
jobs <- dotDataJob{filename: filename, fileCalls: fileCalls}
}
close(jobs)

// Wait for workers to finish
go func() {
wg.Wait()
close(results)
}()

// Collect results with progress reporting
dataFilesWritten := 0
dataFilesSkipped := 0
var firstErr error

for result := range results {
if result.err != nil && firstErr == nil {
firstErr = result.err
}
if result.written {
dataFilesWritten++
} else {
dataFilesSkipped++
}

// Report progress
if progress != nil {
current := int(processed.Add(1))
progress(current, total, "")
}
}

return dataFilesWritten, dataFilesSkipped, firstErr
}

// dotDataWorker processes files from the jobs channel
func dotDataWorker(csvDir, filter string, jobs <-chan dotDataJob, results chan<- dotDataResult, wg *sync.WaitGroup) {
defer wg.Done()

for job := range jobs {
// Find WAV file with correct case
baseName := strings.TrimSuffix(job.filename, filepath.Ext(job.filename))
wavPath := findWAVFile(csvDir, baseName)
if wavPath == "" {
results <- dotDataResult{filename: job.filename, written: false, err: nil}
continue
}

dataPath := wavPath + ".data"

sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
if err != nil {
results <- dotDataResult{filename: job.filename, written: false, err: nil}
continue
}

// Build segments and metadata
meta, segments := buildAviaNZMetaAndSegments(job.fileCalls, filter, duration, sampleRate)

if err := writeDotDataFileSafe(dataPath, segments, filter, meta); err != nil {
results <- dotDataResult{filename: job.filename, written: false, err: fmt.Errorf("failed to write %s: %w", dataPath, err)}
continue
}

results <- dotDataResult{filename: job.filename, written: true, err: nil}
}
}

// buildAviaNZMetaAndSegments creates metadata and segments for a .data file
func buildAviaNZMetaAndSegments(calls []ClusteredCall, filter string, duration float64, sampleRate int) (AviaNZMeta, []AviaNZSegment) {
// Create metadata
reviewer := "None"
meta := AviaNZMeta{
Operator: "Auto",
Reviewer: &reviewer,
Duration: duration,
}

// Build segments array
var segments []AviaNZSegment
for _, call := range calls {
// Create labels for this segment
labels := []AviaNZLabel{
{
Species: call.EbirdCode,
Certainty: DEFAULT_CERTAINTY,
Filter: filter,
},
}

// Create segment: [start, end, freq_low, freq_high, labels]
// freq_low=0, freq_high=sampleRate for full-band segments
segment := AviaNZSegment{
call.StartTime,
call.EndTime,
0, // freq_low
sampleRate, // freq_high (full band)
labels,
}
segments = append(segments, segment)
}

return meta, segments
}

// writeAviaNZDataFile writes a new .data file to disk (does not check for existing files)
func writeAviaNZDataFile(path string, data []any) error {
file, err := os.Create(path)
if err != nil {
return fmt.Errorf("failed to create file: %w", err)
}
defer func() { _ = file.Close() }()

encoder := json.NewEncoder(file)
encoder.SetIndent("", "") // No indentation for compact output

if err := encoder.Encode(data); err != nil {
return fmt.Errorf("failed to encode JSON: %w", err)
}

return nil
}

// writeDotDataFileSafe safely writes or merges .data files
// - If file doesn't exist: write new file
// - If file exists with same filter: return error (refuse to clobber)
// - If file exists with different filter: merge segments and write
// - If file exists but can't be parsed: return error (refuse to clobber)
func writeDotDataFileSafe(path string, newSegments []AviaNZSegment, filter string, meta AviaNZMeta) error {
// Check if file exists
if _, err := os.Stat(path); err == nil {
// File exists - parse and check
existing, err := utils.ParseDataFile(path)
if err != nil {
return fmt.Errorf("cannot parse existing %s: %w (refusing to clobber)", path, err)
}

// Check for duplicate filter
for _, seg := range existing.Segments {
if seg.HasFilterLabel(filter) {
return fmt.Errorf("%s already contains filter '%s' (refusing to clobber)", path, filter)
}
}

// Append new segments (different filter - safe to merge)
for _, newSeg := range newSegments {
seg := convertAviaNZSegment(newSeg, filter)
existing.Segments = append(existing.Segments, seg)
}

// Sort by start time
sort.Slice(existing.Segments, func(i, j int) bool {
return existing.Segments[i].StartTime < existing.Segments[j].StartTime
})

return existing.Write(path)
}

// File doesn't exist - write new
data := buildDataFileFromSegments(meta, newSegments)
return writeAviaNZDataFile(path, data)
}

// convertAviaNZSegment converts an AviaNZSegment to utils.Segment
func convertAviaNZSegment(seg AviaNZSegment, filter string) *utils.Segment {
labels := seg[4].([]AviaNZLabel)
utilsLabels := make([]*utils.Label, len(labels))
for i, l := range labels {
utilsLabels[i] = &utils.Label{
Species: l.Species,
Certainty: l.Certainty,
Filter: filter,
}
}

// Handle freq values (could be int or float64 depending on how they were created)
var freqLow, freqHigh float64
switch v := seg[2].(type) {
case int:
freqLow = float64(v)
case float64:
freqLow = v
}
switch v := seg[3].(type) {
case int:
freqHigh = float64(v)
case float64:
freqHigh = v
}

return &utils.Segment{
StartTime: seg[0].(float64),
EndTime: seg[1].(float64),
FreqLow: freqLow,
FreqHigh: freqHigh,
Labels: utilsLabels,
}
}

// buildDataFileFromSegments builds the data file structure from meta and segments
func buildDataFileFromSegments(meta AviaNZMeta, segments []AviaNZSegment) []any {
result := make([]any, 0, 1+len(segments))
result = append(result, meta)
for _, seg := range segments {
result = append(result, seg)
}
return result
}

// ParseFilterFromFilename extracts filter name from preds CSV filename
// "predsST_opensoundscape-kiwi-1.2_2025-11-12.csv" -> "opensoundscape-kiwi-1.2"
// Returns empty string if parsing fails
func ParseFilterFromFilename(csvPath string) string {
filename := filepath.Base(csvPath)
// Remove .csv extension
name := strings.TrimSuffix(filename, ".csv")

// Split on underscore
parts := strings.Split(name, "_")
if len(parts) == 3 {
return parts[1]
}

return ""
}

// clusterStartTimes groups consecutive start times into clusters
// where the gap between consecutive times is <= gapThreshold
func clusterStartTimes(startTimes []float64, gapThreshold float64) [][]float64 {
if len(startTimes) == 0 {
return nil
}

var clusters [][]float64
currentCluster := []float64{startTimes[0]}

for i := 1; i < len(startTimes); i++ {
gap := startTimes[i] - startTimes[i-1]
if gap <= gapThreshold {
// Same cluster
currentCluster = append(currentCluster, startTimes[i])
} else {
// New cluster
clusters = append(clusters, currentCluster)
currentCluster = []float64{startTimes[i]}
}
}
// Don't forget the last cluster
clusters = append(clusters, currentCluster)

return clusters
}
file addition: calls_from_birda_raven_test.go (----------)

[0.248737]

package tools

import (
"os"
"path/filepath"
"testing"

"skraak/utils"
)

// ============================================
// BirdNET Tests
// ============================================

func TestCallsFromBirda_NewDataFile(t *testing.T) {
tmpDir := t.TempDir()

// Create a minimal WAV file
wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

// Create BirdNET results file
birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Turdus migratorius,American Robin,0.85,/some/path/test.WAV\n"
if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromBirdaInput{
File: birdaPath,
}

output, err := CallsFromBirda(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.DataFilesWritten != 1 {
t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
}
if output.Filter != "BirdNET" {
t.Errorf("expected filter 'BirdNET', got '%s'", output.Filter)
}
if output.TotalCalls != 1 {
t.Errorf("expected 1 call, got %d", output.TotalCalls)
}

// Verify .data file was created
dataPath := wavPath + ".data"
df, err := utils.ParseDataFile(dataPath)
if err != nil {
t.Fatalf("failed to parse .data file: %v", err)
}
if len(df.Segments) != 1 {
t.Errorf("expected 1 segment, got %d", len(df.Segments))
}
if df.Segments[0].Labels[0].Filter != "BirdNET" {
t.Errorf("expected filter 'BirdNET', got '%s'", df.Segments[0].Labels[0].Filter)
}
if df.Segments[0].Labels[0].Certainty != 85 {
t.Errorf("expected certainty 85, got %d", df.Segments[0].Labels[0].Certainty)
}
}

func TestCallsFromBirda_ExistingSameFilter(t *testing.T) {
tmpDir := t.TempDir()

wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

dataPath := wavPath + ".data"
existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Existing Bird", "certainty": 90, "filter": "BirdNET"}]]]`
if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
t.Fatal(err)
}

birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,New Bird,New Bird,0.85,test.WAV\n"
if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromBirdaInput{File: birdaPath}
output, err := CallsFromBirda(input)

if err == nil {
t.Error("expected error for same filter, got nil")
}
if output.Error == nil {
t.Error("expected error message in output")
}
}

func TestCallsFromBirda_ExistingDifferentFilter(t *testing.T) {
tmpDir := t.TempDir()

wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

dataPath := wavPath + ".data"
existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Kiwi", "certainty": 90, "filter": "Manual"}]]]`
if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
t.Fatal(err)
}

birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Robin,Robin,0.85,test.WAV\n"
if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromBirdaInput{File: birdaPath}
output, err := CallsFromBirda(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.DataFilesWritten != 1 {
t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
}

df, err := utils.ParseDataFile(dataPath)
if err != nil {
t.Fatalf("failed to parse .data file: %v", err)
}
if len(df.Segments) != 2 {
t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
}
}

func TestCallsFromBirda_DeleteOption(t *testing.T) {
tmpDir := t.TempDir()

wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Robin,Robin,0.85,test.WAV\n"
if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromBirdaInput{File: birdaPath, Delete: true}
output, err := CallsFromBirda(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.FilesDeleted != 1 {
t.Errorf("expected 1 file deleted, got %d", output.FilesDeleted)
}
if _, err := os.Stat(birdaPath); !os.IsNotExist(err) {
t.Error("expected BirdNET file to be deleted")
}
}

func TestCallsFromBirda_FolderMode(t *testing.T) {
tmpDir := t.TempDir()

for i := range 2 {
wavPath := filepath.Join(tmpDir, "test"+string(rune('0'+i))+".WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

birdaPath := filepath.Join(tmpDir, "test"+string(rune('0'+i))+".BirdNET.results.csv")
birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Bird,Bird,0.85,test.WAV\n"
if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
t.Fatal(err)
}
}

input := CallsFromBirdaInput{Folder: tmpDir}
output, err := CallsFromBirda(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.FilesProcessed != 2 {
t.Errorf("expected 2 files processed, got %d", output.FilesProcessed)
}
if output.DataFilesWritten != 2 {
t.Errorf("expected 2 data files written, got %d", output.DataFilesWritten)
}
}

// ============================================
// Raven Tests
// ============================================

func TestCallsFromRaven_NewDataFile(t *testing.T) {
tmpDir := t.TempDir()

wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n"
if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromRavenInput{File: ravenPath}
output, err := CallsFromRaven(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.DataFilesWritten != 1 {
t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
}
if output.Filter != "Raven" {
t.Errorf("expected filter 'Raven', got '%s'", output.Filter)
}

dataPath := wavPath + ".data"
df, err := utils.ParseDataFile(dataPath)
if err != nil {
t.Fatalf("failed to parse .data file: %v", err)
}
if df.Segments[0].FreqLow != 1000 {
t.Errorf("expected freq_low 1000, got %f", df.Segments[0].FreqLow)
}
if df.Segments[0].FreqHigh != 5000 {
t.Errorf("expected freq_high 5000, got %f", df.Segments[0].FreqHigh)
}
}

func TestCallsFromRaven_ExistingSameFilter(t *testing.T) {
tmpDir := t.TempDir()

wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

dataPath := wavPath + ".data"
existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Existing", "certainty": 90, "filter": "Raven"}]]]`
if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
t.Fatal(err)
}

ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tNew\n"
if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromRavenInput{File: ravenPath}
output, err := CallsFromRaven(input)

if err == nil {
t.Error("expected error for same filter, got nil")
}
if output.Error == nil {
t.Error("expected error message in output")
}
}

func TestCallsFromRaven_ExistingDifferentFilter(t *testing.T) {
tmpDir := t.TempDir()

wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

dataPath := wavPath + ".data"
existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Kiwi", "certainty": 90, "filter": "BirdNET"}]]]`
if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
t.Fatal(err)
}

ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tMorepork\n"
if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromRavenInput{File: ravenPath}
output, err := CallsFromRaven(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.DataFilesWritten != 1 {
t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
}

df, err := utils.ParseDataFile(dataPath)
if err != nil {
t.Fatalf("failed to parse .data file: %v", err)
}
if len(df.Segments) != 2 {
t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
}
}

func TestCallsFromRaven_DeleteOption(t *testing.T) {
tmpDir := t.TempDir()

wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n"
if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromRavenInput{File: ravenPath, Delete: true}
output, err := CallsFromRaven(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.FilesDeleted != 1 {
t.Errorf("expected 1 file deleted, got %d", output.FilesDeleted)
}
if _, err := os.Stat(ravenPath); !os.IsNotExist(err) {
t.Error("expected Raven file to be deleted")
}
}

func TestCallsFromRaven_MultipleSelections(t *testing.T) {
tmpDir := t.TempDir()

wavPath := filepath.Join(tmpDir, "test.WAV")
createMinimalWAV(t, wavPath, 16000, 60.0)

ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n2\tSpectrogram 1\t1\t10.0\t15.0\t2000\t6000\tMorepork\n3\tSpectrogram 1\t1\t20.0\t25.0\t1500\t4500\tTui\n"
if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
t.Fatal(err)
}

input := CallsFromRavenInput{File: ravenPath}
output, err := CallsFromRaven(input)

if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if output.TotalCalls != 3 {
t.Errorf("expected 3 calls, got %d", output.TotalCalls)
}
if output.SpeciesCount["Kiwi"] != 1 || output.SpeciesCount["Morepork"] != 1 || output.SpeciesCount["Tui"] != 1 {
t.Errorf("unexpected species count: %v", output.SpeciesCount)
}
}
file addition: calls_from_birda.go (----------)

[0.248737]

package tools

import (
"encoding/csv"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"sync/atomic"

"skraak/utils"
)

// CallsFromBirdaInput defines the input for the calls-from-birda tool
type CallsFromBirdaInput struct {
Folder string `json:"folder"`
File string `json:"file"`
Delete bool `json:"delete"`
ProgressHandler ProgressHandler `json:"-"` // Optional progress callback
}

// CallsFromBirdaOutput defines the output for the calls-from-birda tool
type CallsFromBirdaOutput struct {
Calls []ClusteredCall `json:"calls"`
TotalCalls int `json:"total_calls"`
SpeciesCount map[string]int `json:"species_count"`
DataFilesWritten int `json:"data_files_written"`
DataFilesSkipped int `json:"data_files_skipped"`
FilesProcessed int `json:"files_processed"`
FilesDeleted int `json:"files_deleted"`
Filter string `json:"filter"`
Error *string `json:"error,omitempty"`
}

// BirdNETDetection represents a single BirdNET detection
type BirdNETDetection struct {
StartTime float64
EndTime float64
ScientificName string
CommonName string
Confidence float64
WAVPath string
}

// birdaJob represents a single BirdNET file to process
type birdaJob struct {
birdaFile string
}

// birdaResult represents the result of processing a single BirdNET file
type birdaResult struct {
birdaFile string
calls []ClusteredCall
written bool
skipped bool
err error
}

// CallsFromBirda processes BirdNET results files and writes .data files
func CallsFromBirda(input CallsFromBirdaInput) (CallsFromBirdaOutput, error) {
var output CallsFromBirdaOutput
output.Filter = "BirdNET"

// Collect BirdNET files to process
var birdaFiles []string
if input.File != "" {
birdaFiles = []string{input.File}
} else if input.Folder != "" {
var err error
birdaFiles, err = findBirdaFiles(input.Folder)
if err != nil {
errMsg := fmt.Sprintf("Failed to find BirdNET files: %v", err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}
} else {
errMsg := "Either --folder or --file must be specified"
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

if len(birdaFiles) == 0 {
errMsg := "No BirdNET files found"
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

// Single file or small batch: process sequentially (avoid goroutine overhead)
if len(birdaFiles) < 10 {
return callsFromBirdaSequential(input, birdaFiles)
}

// Large batch: parallel processing with DirCache
return callsFromBirdaParallel(input, birdaFiles)
}

// callsFromBirdaSequential processes BirdNET files one at a time (for small batches)
func callsFromBirdaSequential(input CallsFromBirdaInput, birdaFiles []string) (CallsFromBirdaOutput, error) {
var output CallsFromBirdaOutput
output.Filter = "BirdNET"

// Build DirCache once for the folder
dirCaches := make(map[string]*DirCache)
if input.Folder != "" {
dirCaches[input.Folder] = NewDirCache(input.Folder)
}

speciesCount := make(map[string]int)
var allCalls []ClusteredCall
dataFilesWritten := 0
dataFilesSkipped := 0
filesProcessed := 0
filesDeleted := 0

for _, birdaFile := range birdaFiles {
dir := filepath.Dir(birdaFile)
cache := dirCaches[dir]
if cache == nil {
cache = NewDirCache(dir)
dirCaches[dir] = cache
}

calls, written, skipped, err := processBirdaFileCached(birdaFile, cache)
if err != nil {
errMsg := fmt.Sprintf("Error processing %s: %v", birdaFile, err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}

if written {
dataFilesWritten++
}
if skipped {
dataFilesSkipped++
}

for _, call := range calls {
allCalls = append(allCalls, call)
speciesCount[call.EbirdCode]++
}

filesProcessed++

// Delete if requested and successfully processed
if input.Delete && written {
if err := os.Remove(birdaFile); err != nil {
errMsg := fmt.Sprintf("Failed to delete %s: %v", birdaFile, err)
output.Error = &errMsg
return output, fmt.Errorf("%s", errMsg)
}
filesDeleted++
}

if input.ProgressHandler != nil {
input.ProgressHandler(filesProcessed, len(birdaFiles), filepath.Base(birdaFile))
}
}

// Sort all calls by file, then start time
sort.Slice(allCalls, func(i, j int) bool {
if allCalls[i].File != allCalls[j].File {
return allCalls[i].File < allCalls[j].File
}
return allCalls[i].StartTime < allCalls[j].StartTime
})

output.Calls = allCalls
output.TotalCalls = len(allCalls)
output.SpeciesCount = speciesCount
output.DataFilesWritten = dataFilesWritten
output.DataFilesSkipped = dataFilesSkipped
output.FilesProcessed = filesProcessed
output.FilesDeleted = filesDeleted

return output, nil
}

// callsFromBirdaParallel processes BirdNET files concurrently using a worker pool and DirCache
func callsFromBirdaParallel(input CallsFromBirdaInput, birdaFiles []string) (CallsFromBirdaOutput, error) {
var output CallsFromBirdaOutput
output.Filter = "BirdNET"

total := len(birdaFiles)
var processed atomic.Int32

// Build DirCache for the folder
dirCaches := &sync.Map{}
if input.Folder != "" {
cache := NewDirCache(input.Folder)
dirCaches.Store(input.Folder, cache)
}

// Create job and result channels
jobs := make(chan birdaJob, total)
results := make(chan birdaResult, total)

// Start workers
var wg sync.WaitGroup
for range DOT_DATA_WORKERS {
wg.Add(1)
go birdaWorker(dirCaches, jobs, results, &wg)
}

// Send jobs
for _, birdaFile := range birdaFiles {
jobs <- birdaJob{birdaFile: birdaFile}
}
close(jobs)

// Wait for workers to finish, then close results
go func() {
wg.Wait()
close(results)
}()

// Collect results with progress reporting
speciesCount := make(map[string]int)
var allCalls []ClusteredCall
dataFilesWritten := 0
dataFilesSkipped := 0
filesProcessed := 0
filesDeleted := 0
var firstErr error

for result := range results {
if result.err != nil && firstErr == nil {
firstErr = result.err
}

if result.written {
dataFilesWritten++
}
if result.skipped {
dataFilesSkipped++
}

for _, call := range result.calls {
allCalls = append(allCalls, call)
speciesCount[call.EbirdCode]++
}

filesProcessed++

// Delete if requested and successfully processed
if input.Delete && result.written {
if err := os.Remove(result.birdaFile); err != nil {
if firstErr == nil {
firstErr = fmt.Errorf("failed to delete %s: %w", result.birdaFile, err)
}
} else {
filesDeleted++
}
}

if input.ProgressHandler != nil {
current := int(processed.Add(1))
input.ProgressHandler(current, total, filepath.Base(result.birdaFile))
}
}

if firstErr != nil {
errMsg := firstErr.Error()
output.Error = &errMsg
return output, firstErr
}

// Sort all calls by file, then start time
sort.Slice(allCalls, func(i, j int) bool {
if allCalls[i].File != allCalls[j].File {
return allCalls[i].File < allCalls[j].File
}
return allCalls[i].StartTime < allCalls[j].StartTime
})

output.Calls = allCalls
output.TotalCalls = len(allCalls)
output.SpeciesCount = speciesCount
output.DataFilesWritten = dataFilesWritten
output.DataFilesSkipped = dataFilesSkipped
output.FilesProcessed = filesProcessed
output.FilesDeleted = filesDeleted

return output, nil
}

// birdaWorker processes BirdNET files from the jobs channel
func birdaWorker(dirCaches *sync.Map, jobs <-chan birdaJob, results chan<- birdaResult, wg *sync.WaitGroup) {
defer wg.Done()

for job := range jobs {
dir := filepath.Dir(job.birdaFile)

// Get or create DirCache for this directory
var cache *DirCache
if cached, ok := dirCaches.Load(dir); ok {
cache = cached.(*DirCache)
} else {
cache = NewDirCache(dir)
dirCaches.Store(dir, cache)
}

calls, written, skipped, err := processBirdaFileCached(job.birdaFile, cache)
results <- birdaResult{
birdaFile: job.birdaFile,
calls: calls,
written: written,
skipped: skipped,
err: err,
}
}
}

// findBirdaFiles finds all BirdNET results files in a folder
func findBirdaFiles(folder string) ([]string, error) {
var files []string

entries, err := os.ReadDir(folder)
if err != nil {
return nil, err
}

for _, entry := range entries {
name := entry.Name()
if strings.HasSuffix(name, ".BirdNET.results.csv") {
files = append(files, filepath.Join(folder, name))
}
}

return files, nil
}

// processBirdaFileCached processes a single BirdNET results file using a DirCache for WAV lookup
func processBirdaFileCached(birdaFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
// Open and parse CSV
file, err := os.Open(birdaFile)
if err != nil {
return nil, false, false, fmt.Errorf("failed to open file: %w", err)
}
defer func() { _ = file.Close() }()

// Create CSV reader
reader := csv.NewReader(file)

// Read header
header, err := reader.Read()
if err != nil {
return nil, false, false, fmt.Errorf("failed to read header: %w", err)
}

// Find column indices (handle BOM prefix)
startIdx := -1
endIdx := -1
commonNameIdx := -1
confidenceIdx := -1
fileIdx := -1

for i, col := range header {
// Remove BOM if present
col = strings.TrimPrefix(col, "\ufeff")
switch col {
case "Start (s)":
startIdx = i
case "End (s)":
endIdx = i
case "Common name":
commonNameIdx = i
case "Confidence":
confidenceIdx = i
case "File":
fileIdx = i
}
}

if startIdx == -1 || endIdx == -1 || commonNameIdx == -1 || confidenceIdx == -1 {
return nil, false, false, fmt.Errorf("missing required columns in BirdNET file")
}

// Read detections
var detections []BirdNETDetection
for {
record, err := reader.Read()
if err == io.EOF {
break
}
if err != nil {
return nil, false, false, fmt.Errorf("failed to read record: %w", err)
}

var det BirdNETDetection
if _, err := fmt.Sscanf(record[startIdx], "%f", &det.StartTime); err != nil {
return nil, false, false, fmt.Errorf("failed to parse start time %q: %w", record[startIdx], err)
}
if _, err := fmt.Sscanf(record[endIdx], "%f", &det.EndTime); err != nil {
return nil, false, false, fmt.Errorf("failed to parse end time %q: %w", record[endIdx], err)
}
det.CommonName = record[commonNameIdx]
if _, err := fmt.Sscanf(record[confidenceIdx], "%f", &det.Confidence); err != nil {
return nil, false, false, fmt.Errorf("failed to parse confidence %q: %w", record[confidenceIdx], err)
}
if fileIdx >= 0 && fileIdx < len(record) {
det.WAVPath = record[fileIdx]
}

detections = append(detections, det)
}

if len(detections) == 0 {
return nil, false, true, nil // No detections, skip
}

// Determine WAV path and .data path
var wavPath string
dir := filepath.Dir(birdaFile)
base := filepath.Base(birdaFile)
baseName := strings.TrimSuffix(base, ".BirdNET.results.csv")

if detections[0].WAVPath != "" {
// Check if the path from File column exists
if _, err := os.Stat(detections[0].WAVPath); err == nil {
wavPath = detections[0].WAVPath
}
}

// If not found from File column, search with DirCache
if wavPath == "" {
if cache != nil {
wavPath = cache.FindWAV(baseName)
} else {
wavPath = findWAVFile(dir, baseName)
}
}

if wavPath == "" {
return nil, false, true, nil // WAV not found, skip
}

// Check if WAV exists (to get sample rate and duration)
sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
if err != nil {
return nil, false, true, nil // Skip if WAV not found or invalid
}

dataPath := wavPath + ".data"

// Convert detections to segments
segments := buildBirdNETSegments(detections, sampleRate)

// Build metadata
meta := AviaNZMeta{
Operator: "BirdNET",
Duration: duration,
}
reviewer := "None"
meta.Reviewer = &reviewer

// Write .data file (safe write)
if err := writeDotDataFileSafe(dataPath, segments, "BirdNET", meta); err != nil {
return nil, false, false, err
}

// Convert to ClusteredCalls for output
var calls []ClusteredCall
for _, det := range detections {
calls = append(calls, ClusteredCall{
File: wavPath,
StartTime: det.StartTime,
EndTime: det.EndTime,
EbirdCode: det.CommonName,
Segments: 1,
})
}

return calls, true, false, nil
}

// buildBirdNETSegments converts BirdNET detections to AviaNZ segments
func buildBirdNETSegments(detections []BirdNETDetection, sampleRate int) []AviaNZSegment {
var segments []AviaNZSegment

for _, det := range detections {
// Convert confidence (0.0-1.0) to certainty (0-100)
certainty := min(max(int(det.Confidence*100), 0), 100)

labels := []AviaNZLabel{
{
Species: det.CommonName,
Certainty: certainty,
Filter: "BirdNET",
},
}

segment := AviaNZSegment{
det.StartTime,
det.EndTime,
0, // freq_low
sampleRate, // freq_high (full band)
labels,
}
segments = append(segments, segment)
}

return segments
}
file addition: calls_detect_anomalies_test.go (----------)

[0.248737]

package tools

import (
"os"
"path/filepath"
"testing"
)

func TestDetectAnomalies_LabelMismatch(t *testing.T) {
dir := t.TempDir()

// Same time range, different calltypes across two models
data := `[{"Operator":"test"},` +
`[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-a"},` +
`{"species":"Kiwi","calltype":"Male","certainty":100,"filter":"model-b"}]]]`
if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
t.Fatal(err)
}

out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
if err != nil {
t.Fatal(err)
}
if out.LabelMismatches != 1 {
t.Errorf("expected 1 label mismatch, got %d", out.LabelMismatches)
}
if out.CertaintyMismatches != 0 {
t.Errorf("expected 0 certainty mismatches, got %d", out.CertaintyMismatches)
}
if out.Anomalies[0].Type != "label_mismatch" {
t.Errorf("expected label_mismatch, got %s", out.Anomalies[0].Type)
}
}

func TestDetectAnomalies_CertaintyMismatch(t *testing.T) {
dir := t.TempDir()

// Same time range, same labels, different certainty
data := `[{"Operator":"test"},` +
`[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":90,"filter":"model-a"},` +
`{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-b"}]]]`
if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
t.Fatal(err)
}

out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
if err != nil {
t.Fatal(err)
}
if out.CertaintyMismatches != 1 {
t.Errorf("expected 1 certainty mismatch, got %d", out.CertaintyMismatches)
}
if out.LabelMismatches != 0 {
t.Errorf("expected 0 label mismatches, got %d", out.LabelMismatches)
}
}

func TestDetectAnomalies_NoAnomalyWhenAgreement(t *testing.T) {
dir := t.TempDir()

data := `[{"Operator":"test"},` +
`[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-a"},` +
`{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-b"}]]]`
if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
t.Fatal(err)
}

out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
if err != nil {
t.Fatal(err)
}
if out.AnomaliesTotal != 0 {
t.Errorf("expected 0 anomalies, got %d", out.AnomaliesTotal)
}
}

func TestDetectAnomalies_LonelySegmentSkipped(t *testing.T) {
dir := t.TempDir()

// model-a has a segment, model-b has no segment in this file
data := `[{"Operator":"test"},` +
`[0,10,100,1000,[{"species":"Kiwi","certainty":100,"filter":"model-a"}]]]`
if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
t.Fatal(err)
}

out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
if err != nil {
t.Fatal(err)
}
if out.AnomaliesTotal != 0 {
t.Errorf("lonely segment should be skipped, got %d anomalies", out.AnomaliesTotal)
}
if out.FilesWithAllModels != 0 {
t.Errorf("file missing a model should not count as FilesWithAllModels")
}
}

func TestDetectAnomalies_FailsWithOneModel(t *testing.T) {
dir := t.TempDir()
_, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a"}})
if err == nil {
t.Error("expected error with only 1 model")
}
}
file addition: calls_detect_anomalies.go (----------)

[0.248737]

package tools

import (
"fmt"
"os"
"path/filepath"

"skraak/utils"
)

type DetectAnomaliesInput struct {
Folder string
Models []string // at least 2 filter names
Species []string // optional scope; empty = all species
}

type DetectAnomaliesOutput struct {
Folder string `json:"folder"`
Models []string `json:"models"`
FilesExamined int `json:"files_examined"`
FilesWithAllModels int `json:"files_with_all_models"`
AnomaliesTotal int `json:"anomalies_total"`
LabelMismatches int `json:"label_mismatches"`
CertaintyMismatches int `json:"certainty_mismatches"`
Anomalies []Anomaly `json:"anomalies,omitempty"`
Error string `json:"error,omitempty"`
}

type Anomaly struct {
File string `json:"file"`
Type string `json:"type"` // "label_mismatch" | "certainty_mismatch"
Segments []AnomalySegment `json:"segments"`
}

type AnomalySegment struct {
Model string `json:"model"`
Start float64 `json:"start"`
End float64 `json:"end"`
Species string `json:"species"`
CallType string `json:"calltype,omitempty"`
Certainty int `json:"certainty"`
}

// DetectAnomalies compares corresponding segments across multiple ML model filters
// within each .data file. Segments are matched by time overlap (same logic as propagate).
// Lonely segments (no overlap in one or more models) are silently skipped.
// Anomalies are flagged when overlapping segments disagree on species+calltype,
// or when labels match but certainty values differ.
func DetectAnomalies(input DetectAnomaliesInput) (DetectAnomaliesOutput, error) {
folder := filepath.Clean(input.Folder)
output := DetectAnomaliesOutput{
Folder: folder,
Models: input.Models,
}

if len(input.Models) < 2 {
output.Error = "at least 2 --model values required"
return output, fmt.Errorf("%s", output.Error)
}
for i, a := range input.Models {
for j, b := range input.Models {
if i != j && a == b {
output.Error = "duplicate --model values are not allowed"
return output, fmt.Errorf("%s", output.Error)
}
}
}

info, err := os.Stat(input.Folder)
if err != nil {
output.Error = fmt.Sprintf("folder not found: %s", input.Folder)
return output, fmt.Errorf("%s", output.Error)
}
if !info.IsDir() {
output.Error = fmt.Sprintf("not a directory: %s", input.Folder)
return output, fmt.Errorf("%s", output.Error)
}

files, err := utils.FindDataFiles(folder)
if err != nil {
output.Error = fmt.Sprintf("list .data files: %v", err)
return output, fmt.Errorf("%s", output.Error)
}

scopeSet := make(map[string]bool, len(input.Species))
for _, s := range input.Species {
scopeSet[s] = true
}

for _, path := range files {
df, err := utils.ParseDataFile(path)
if err != nil {
continue
}
output.FilesExamined++

anomalies := detectAnomaliesInFile(df, path, input.Models, scopeSet)
if anomalies == nil {
// file didn't have all models present
continue
}
output.FilesWithAllModels++
for _, a := range anomalies {
if a.Type == "label_mismatch" {
output.LabelMismatches++
} else {
output.CertaintyMismatches++
}
}
output.Anomalies = append(output.Anomalies, anomalies...)
}
output.AnomaliesTotal = len(output.Anomalies)
return output, nil
}

// labeledSeg pairs a segment with the specific label matching the model filter.
type labeledSeg struct {
seg *utils.Segment
label *utils.Label
}

// detectAnomaliesInFile returns nil if the file doesn't contain all required models.
func detectAnomaliesInFile(df *utils.DataFile, path string, models []string, scope map[string]bool) []Anomaly {
// Collect ALL labeled segments per model — no scope filtering here.
// Scope is applied to anchor selection only, so a "Don't Know" label in model[1]
// against a "Kiwi" anchor in model[0] is correctly surfaced as a label_mismatch.
modelSegs := make(map[string][]labeledSeg, len(models))
for _, seg := range df.Segments {
for _, lbl := range seg.Labels {
for _, model := range models {
if lbl.Filter == model {
modelSegs[model] = append(modelSegs[model], labeledSeg{seg: seg, label: lbl})
break
}
}
}
}

// Skip file if any model is entirely absent.
for _, model := range models {
if len(modelSegs[model]) == 0 {
return nil
}
}

var anomalies []Anomaly

// Use models[0] as anchor. Scope filtering applies here only — other models
// contribute whatever they actually say for the overlapping time range.
for _, anchor := range modelSegs[models[0]] {
if len(scope) > 0 {
key := anchor.label.Species
if anchor.label.CallType != "" {
key += "+" + anchor.label.CallType
}
if !scope[key] && !scope[anchor.label.Species] {
continue
}
}

// Find overlapping segments in every other model.
matches := make(map[string][]labeledSeg, len(models)-1)
lonely := false
for _, model := range models[1:] {
for _, candidate := range modelSegs[model] {
if overlaps(anchor.seg, candidate.seg) {
matches[model] = append(matches[model], candidate)
}
}
if len(matches[model]) == 0 {
lonely = true
break
}
}
if lonely {
continue
}

// Build comparison group: anchor + first overlapping match per other model
// (consistent with propagate's approach).
group := []labeledSeg{anchor}
for _, model := range models[1:] {
group = append(group, matches[model][0])
}

// Check species+calltype agreement.
refSpecies := group[0].label.Species
refCallType := group[0].label.CallType
labelMatch := true
for _, ls := range group[1:] {
if ls.label.Species != refSpecies || ls.label.CallType != refCallType {
labelMatch = false
break
}
}

if !labelMatch {
anomalies = append(anomalies, Anomaly{File: path, Type: "label_mismatch", Segments: buildAnomalySegs(group, models)})
continue
}

// Labels agree — check certainty.
refCertainty := group[0].label.Certainty
for _, ls := range group[1:] {
if ls.label.Certainty != refCertainty {
anomalies = append(anomalies, Anomaly{File: path, Type: "certainty_mismatch", Segments: buildAnomalySegs(group, models)})
break
}
}
}

return anomalies
}

func buildAnomalySegs(group []labeledSeg, models []string) []AnomalySegment {
segs := make([]AnomalySegment, len(group))
for i, ls := range group {
segs[i] = AnomalySegment{
Model: models[i],
Start: ls.seg.StartTime,
End: ls.seg.EndTime,
Species: ls.label.Species,
CallType: ls.label.CallType,
Certainty: ls.label.Certainty,
}
}
return segs
}

// overlaps returns true if two segments share any time overlap.
func overlaps(a, b *utils.Segment) bool {
return a.StartTime < b.EndTime && b.StartTime < a.EndTime
}
file addition: calls_clip_labels_test.go (----------)

[0.248737]

package tools

import (
"encoding/csv"
"os"
"path/filepath"
"strings"
"testing"

"skraak/utils"
)

// --- test helpers (test file only) ---

func writeDataFile(t *testing.T, dir, name string, df *utils.DataFile) {
t.Helper()
if err := df.Write(filepath.Join(dir, name)); err != nil {
t.Fatalf("write .data file %s: %v", name, err)
}
}

func writeMapping(t *testing.T, dir, json string) {
t.Helper()
if err := os.WriteFile(filepath.Join(dir, "mapping.json"), []byte(json), 0644); err != nil {
t.Fatalf("write mapping.json: %v", err)
}
}

// parseCSV reads the output CSV, returning header and rows.
func parseCSV(t *testing.T, path string) ([]string, [][]string) {
t.Helper()
f, err := os.Open(path)
if err != nil {
t.Fatalf("open CSV %s: %v", path, err)
}
defer f.Close()
r := csv.NewReader(f)
header, err := r.Read()
if err != nil {
t.Fatalf("read header: %v", err)
}
rows, err := r.ReadAll()
if err != nil {
t.Fatalf("read rows: %v", err)
}
return header, rows
}

// clipLabels calls CallsClipLabels with standard test parameters.
func clipLabels(t *testing.T, dir string, extra ...func(*CallsClipLabelsInput)) CallsClipLabelsOutput {
t.Helper()
input := CallsClipLabelsInput{
Folder: dir,
MappingPath: filepath.Join(dir, "mapping.json"),
OutputPath: filepath.Join(dir, "clip_labels.csv"),
ClipDuration: 5,
ClipOverlap: 0,
MinLabelOverlap: 0.25,
FinalClip: "full",
}
for _, fn := range extra {
fn(&input)
}
out, err := CallsClipLabels(input)
if err != nil {
t.Fatalf("CallsClipLabels: %v", err)
}
return out
}

// --- tests ---

func TestClipLabels_RealClassTrue(t *testing.T) {
dir := t.TempDir()
writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 20},
Segments: []*utils.Segment{
{
StartTime: 3, EndTime: 8, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
},
},
})
writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)

out := clipLabels(t, dir)
header, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))

// Header: file, start_time, end_time, Kiwi
if len(header) != 4 || header[3] != "Kiwi" {
t.Fatalf("header = %v, want [..., Kiwi]", header)
}

// Clip 0-5 overlaps segment 3-8 by 2s ≥ 0.25 → Kiwi=True
// Clip 5-10 overlaps segment 3-8 by 3s ≥ 0.25 → Kiwi=True
// Clip 10-15, 15-20 → Kiwi=False
kiwiCol := 3
for i, row := range rows {
switch row[1] {
case "0.0", "5.0":
if row[kiwiCol] != "True" {
t.Errorf("row %d (start=%s): Kiwi=%s, want True", i, row[1], row[kiwiCol])
}
case "10.0", "15.0":
if row[kiwiCol] != "False" {
t.Errorf("row %d (start=%s): Kiwi=%s, want False", i, row[1], row[kiwiCol])
}
}
}
if out.PerClassTrueCount["Kiwi"] != 2 {
t.Errorf("PerClassTrueCount[Kiwi] = %d, want 2", out.PerClassTrueCount["Kiwi"])
}
}

func TestClipLabels_GapClipsAllFalse(t *testing.T) {
dir := t.TempDir()
// 15s file, Kiwi segment 0-5 only → clips 5-10 and 10-15 are gaps
writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 15},
Segments: []*utils.Segment{
{
StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
},
},
})
writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)

out := clipLabels(t, dir)
if out.ClipsAllFalseGap != 2 {
t.Errorf("ClipsAllFalseGap = %d, want 2", out.ClipsAllFalseGap)
}
if out.PerClassTrueCount["Kiwi"] != 1 {
t.Errorf("PerClassTrueCount[Kiwi] = %d, want 1", out.PerClassTrueCount["Kiwi"])
}
if out.RowsWritten != 3 {
t.Errorf("RowsWritten = %d, want 3", out.RowsWritten)
}
}

func TestClipLabels_NegativeOverridesPositive(t *testing.T) {
dir := t.TempDir()
// Kiwi segment 0-8, Not segment 0-4 → clip 0-5 overlaps both → __NEGATIVE__ wins
// Clip 5-10 overlaps only Kiwi (3s) → True
writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 10},
Segments: []*utils.Segment{
{
StartTime: 0, EndTime: 8, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
},
{
StartTime: 0, EndTime: 4, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Not", Certainty: 100, Filter: "f1"}},
},
},
})
writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Not":{"species":"__NEGATIVE__"}}`)

out := clipLabels(t, dir)
if out.ClipsNegative != 1 {
t.Errorf("ClipsNegative = %d, want 1", out.ClipsNegative)
}

_, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))
// Clip 0-5: negative hit → all-False (Not overlaps 0-4 by 4s)
if rows[0][3] != "False" {
t.Errorf("clip 0-5 Kiwi = %s, want False (overridden by __NEGATIVE__)", rows[0][3])
}
// Clip 5-10: only Kiwi overlaps (3s) → True
if rows[1][3] != "True" {
t.Errorf("clip 5-10 Kiwi = %s, want True", rows[1][3])
}
}

func TestClipLabels_IgnoreExcludesClip(t *testing.T) {
dir := t.TempDir()
// Don't Know segment 0-5, Kiwi segment 6-10
// Clip 0-5 overlaps __IGNORE__ → excluded
// Clip 5-10 overlaps Kiwi → emitted with True
writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 15},
Segments: []*utils.Segment{
{
StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Don't Know", Certainty: 0, Filter: "f1"}},
},
{
StartTime: 6, EndTime: 10, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
},
},
})
writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Don't Know":{"species":"__IGNORE__"}}`)

out := clipLabels(t, dir)
if out.ClipsIgnored != 1 {
t.Errorf("ClipsIgnored = %d, want 1", out.ClipsIgnored)
}
if out.SegmentsIgnored != 1 {
t.Errorf("SegmentsIgnored = %d, want 1", out.SegmentsIgnored)
}
// Only 2 rows: clip 5-10 (Kiwi=True) and clip 10-15 (gap)
if out.RowsWritten != 2 {
t.Errorf("RowsWritten = %d, want 2", out.RowsWritten)
}
}

func TestClipLabels_FilterRestrictsLabels(t *testing.T) {
dir := t.TempDir()
// Same time range, two filters. Only "wanted" should contribute.
writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 10},
Segments: []*utils.Segment{
{
StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 100, Filter: "wanted"},
{Species: "Not", Certainty: 100, Filter: "unwanted"},
},
},
},
})
writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Not":{"species":"__NEGATIVE__"}}`)

out := clipLabels(t, dir, func(in *CallsClipLabelsInput) { in.Filter = "wanted" })
// Only Kiwi from "wanted" filter → clip 0-5 should be Kiwi=True
// Not from "unwanted" filter should be ignored → no __NEGATIVE__ override
if out.ClipsNegative != 0 {
t.Errorf("ClipsNegative = %d, want 0 (Not filter excluded)", out.ClipsNegative)
}
if out.PerClassTrueCount["Kiwi"] != 1 {
t.Errorf("PerClassTrueCount[Kiwi] = %d, want 1", out.PerClassTrueCount["Kiwi"])
}
}

func TestClipLabels_MappingCoverageError(t *testing.T) {
dir := t.TempDir()
writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 10},
Segments: []*utils.Segment{
{
StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Mystery", Certainty: 100, Filter: "f1"}},
},
},
})
writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)

input := CallsClipLabelsInput{
Folder: dir,
MappingPath: filepath.Join(dir, "mapping.json"),
OutputPath: filepath.Join(dir, "clip_labels.csv"),
ClipDuration: 5,
ClipOverlap: 0,
MinLabelOverlap: 0.25,
FinalClip: "full",
}
_, err := CallsClipLabels(input)
if err == nil {
t.Fatal("expected error for missing species in mapping")
}
if !strings.Contains(err.Error(), "Mystery") {
t.Errorf("error should mention missing species, got: %v", err)
}
}

func TestClipLabels_AppendMode(t *testing.T) {
dir := t.TempDir()
writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)

// First file
writeDataFile(t, dir, "a.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 5},
Segments: []*utils.Segment{
{
StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
},
},
})
out1 := clipLabels(t, dir)
if out1.RowsWritten != 1 {
t.Fatalf("first run: RowsWritten = %d, want 1", out1.RowsWritten)
}

// Second run on same output file but with a different input folder
// Simulate append by running again — should fail on duplicate
_, err := CallsClipLabels(CallsClipLabelsInput{
Folder: dir,
MappingPath: filepath.Join(dir, "mapping.json"),
OutputPath: filepath.Join(dir, "clip_labels.csv"),
ClipDuration: 5,
ClipOverlap: 0,
MinLabelOverlap: 0.25,
FinalClip: "full",
})
if err == nil {
t.Fatal("expected duplicate error on second run with same folder")
}
if !strings.Contains(err.Error(), "duplicate") {
t.Errorf("error should mention duplicate, got: %v", err)
}
}

func TestClipLabels_MultipleFiles(t *testing.T) {
dir := t.TempDir()
writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)

writeDataFile(t, dir, "a.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 10},
Segments: []*utils.Segment{
{
StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
},
},
})
writeDataFile(t, dir, "b.wav.data", &utils.DataFile{
Meta: &utils.DataMeta{Duration: 5},
Segments: []*utils.Segment{
{
StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
},
},
})

out := clipLabels(t, dir)
if out.DataFilesParsed != 2 {
t.Errorf("DataFilesParsed = %d, want 2", out.DataFilesParsed)
}
// a: 2 clips (0-5, 5-10), b: 1 clip (0-5) = 3 total
if out.RowsWritten != 3 {
t.Errorf("RowsWritten = %d, want 3", out.RowsWritten)
}

_, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))
files := map[string]int{}
for _, r := range rows {
files[r[0]]++
}
if len(files) != 2 {
t.Errorf("expected 2 distinct files in CSV, got %d", len(files))
}
}
file addition: calls_clip_labels.go (----------)

[0.248737]

package tools

import (
"encoding/csv"
"fmt"
"io"
"os"
"path/filepath"
"slices"
"sort"
"strconv"
"strings"

"skraak/utils"
)

// CallsClipLabelsInput configures the clip-labels exporter.
type CallsClipLabelsInput struct {
Folder string `json:"folder"`
MappingPath string `json:"mapping"`
Filter string `json:"filter,omitempty"`
OutputPath string `json:"output"`
ClipDuration float64 `json:"clip_duration"`
ClipOverlap float64 `json:"clip_overlap"`
MinLabelOverlap float64 `json:"min_label_overlap"`
FinalClip string `json:"final_clip"`
}

// CallsClipLabelsOutput summarises a run.
type CallsClipLabelsOutput struct {
Folder string `json:"folder"`
OutputPath string `json:"output"`
Filter string `json:"filter,omitempty"`
Classes []string `json:"classes"`
DataFilesParsed int `json:"data_files_parsed"`
ClipsNegative int `json:"clips_negative"` // emitted, all-False because of __NEGATIVE__
ClipsIgnored int `json:"clips_ignored"` // excluded from output because of __IGNORE__ overlap
SegmentsIgnored int `json:"segments_ignored"` // segments whose species maps to __IGNORE__
ClipsAllFalseGap int `json:"clips_all_false_gap"` // emitted, all-False because no overlap
PerClassTrueCount map[string]int `json:"per_class_true_count"`
AppendedToFile bool `json:"appended_to_file"`
ExistingRowsFound int `json:"existing_rows_found"`
RowsWritten int `json:"rows_written"`
}

// resolvedSeg is a segment that has been classified by the mapping and is
// ready for overlap-checking against clip windows.
type resolvedSeg struct {
start, end float64
kind utils.MappingKind
classIdx int // valid only when kind == utils.MappingReal
}

// clipDisposition describes the outcome for a single clip window.
type clipDisposition int

const (
dispoLabelled clipDisposition = iota // at least one class column is True
dispoNegative // __NEGATIVE__ hit, all class columns False
dispoGap // no segment overlaps, all class columns False
dispoIgnored // __IGNORE__ hit, clip excluded from output
)

// clipLabelsRow is one row of the output CSV.
type clipLabelsRow struct {
file string
start float64
end float64
flags []bool
}

// rowKey is used for duplicate detection.
type rowKey struct {
file string
start string
end string
}

// CallsClipLabels reads .data files from a single folder and writes a CSV in
// OpenSoundScape's clip_labels format: one row per clip per file, with one
// True/False column per class in the mapping.
//
// Mirrors BoxedAnnotations.clip_labels(): every clip window is emitted; a
// column is True when any annotation of that class overlaps the window by
// ≥ min_label_overlap seconds. Sentinel mappings (__NEGATIVE__, __IGNORE__)
// get no column and contribute no labels.
func CallsClipLabels(input CallsClipLabelsInput) (CallsClipLabelsOutput, error) {
out := CallsClipLabelsOutput{
Folder: input.Folder,
OutputPath: input.OutputPath,
PerClassTrueCount: map[string]int{},
}

// Validate parameters.
finalClipMode, err := utils.ParseFinalClipMode(input.FinalClip)
if err != nil {
return out, err
}
if input.ClipDuration <= 0 {
return out, fmt.Errorf("--clip-duration must be > 0, got %v", input.ClipDuration)
}
if input.ClipOverlap < 0 || input.ClipOverlap >= input.ClipDuration {
return out, fmt.Errorf("--clip-overlap must be in [0, clip-duration), got %v", input.ClipOverlap)
}
if input.MinLabelOverlap <= 0 {
return out, fmt.Errorf("--min-label-overlap must be > 0, got %v", input.MinLabelOverlap)
}

// Load mapping.
mapping, err := utils.LoadMappingFile(input.MappingPath)
if err != nil {
return out, fmt.Errorf("load mapping %s: %w", input.MappingPath, err)
}

// Output classes: the unique canonical (non-sentinel) class names from mapping.json.
classes := mapping.Classes()
if len(classes) == 0 {
return out, fmt.Errorf("mapping.json has no real (non-sentinel) classes")
}
out.Classes = classes
out.Filter = input.Filter
classIdx := map[string]int{}
for i, c := range classes {
classIdx[c] = i
}

// Find and parse .data files.
dataPaths, err := utils.FindDataFiles(input.Folder)
if err != nil {
return out, fmt.Errorf("scan folder %s: %w", input.Folder, err)
}
if len(dataPaths) == 0 {
return out, fmt.Errorf("no .data files found in %s", input.Folder)
}

type parsedFile struct {
path string
df *utils.DataFile
}
parsed := make([]parsedFile, 0, len(dataPaths))
speciesSeen := map[string]bool{}
for _, p := range dataPaths {
df, err := utils.ParseDataFile(p)
if err != nil {
return out, fmt.Errorf("parse %s: %w", p, err)
}
if df.Meta == nil || df.Meta.Duration <= 0 {
return out, fmt.Errorf("missing or non-positive Duration in %s (cannot generate clips)", p)
}
for _, seg := range df.Segments {
for _, lbl := range seg.Labels {
if input.Filter != "" && lbl.Filter != input.Filter {
continue
}
speciesSeen[lbl.Species] = true
}
}
parsed = append(parsed, parsedFile{path: p, df: df})
}
out.DataFilesParsed = len(parsed)

// Mapping coverage check.
if missing := mapping.ValidateCoversSpecies(speciesSeen); len(missing) > 0 {
return out, fmt.Errorf("mapping.json is missing entries for species: %s\n(run /data-mapping to regenerate)", strings.Join(missing, ", "))
}

// Append-mode: read existing header + (file,start,end) tuples if any.
expectedHeader := append([]string{"file", "start_time", "end_time"}, classes...)
existing, appendMode, err := loadExistingRows(input.OutputPath, expectedHeader)
if err != nil {
return out, err
}
out.AppendedToFile = appendMode
out.ExistingRowsFound = len(existing)

// Path-rendering: relative to cwd.
cwd, err := os.Getwd()
if err != nil {
return out, fmt.Errorf("getwd: %w", err)
}
folderAbs, err := filepath.Abs(input.Folder)
if err != nil {
return out, fmt.Errorf("abs %s: %w", input.Folder, err)
}

// Process each file.
rows := make([]clipLabelsRow, 0, 1024)
for _, pf := range parsed {
fileRows, err := processClipLabelsFile(pf.path, pf.df, mapping, classIdx, classes, input, finalClipMode, cwd, folderAbs, &out)
if err != nil {
return out, err
}
rows = append(rows, fileRows...)
}

// Dedup pass — within new rows AND against existing CSV.
dedup := make(map[rowKey]bool, len(existing)+len(rows))
for k := range existing {
dedup[k] = true
}
for _, r := range rows {
k := rowKey{file: r.file, start: formatTime(r.start), end: formatTime(r.end)}
if dedup[k] {
return out, fmt.Errorf("duplicate clip detected: file=%s start=%s end=%s", k.file, k.start, k.end)
}
dedup[k] = true
}

// Write CSV.
if err := writeRows(input.OutputPath, expectedHeader, rows, appendMode); err != nil {
return out, err
}
out.RowsWritten = len(rows)

sort.Strings(out.Classes)
return out, nil
}

// processClipLabelsFile generates clip-labels rows for a single .data file.
func processClipLabelsFile(
path string,
df *utils.DataFile,
mapping utils.MappingFile,
classIdx map[string]int,
classes []string,
input CallsClipLabelsInput,
finalClipMode utils.FinalClipMode,
cwd, folderAbs string,
out *CallsClipLabelsOutput,
) ([]clipLabelsRow, error) {
windows, err := utils.GenerateClipTimes(
df.Meta.Duration,
input.ClipDuration,
input.ClipOverlap,
finalClipMode,
10,
)
if err != nil {
return nil, fmt.Errorf("generate clip windows for %s: %w", path, err)
}
if len(windows) == 0 {
return nil, nil
}

// Resolve segments against the mapping. Skip:
// - filter mismatch (when --filter set)
// - annotation duration < min_label_overlap
// - species not in mapping
segs := make([]resolvedSeg, 0, len(df.Segments))
for _, seg := range df.Segments {
if seg.EndTime-seg.StartTime < input.MinLabelOverlap {
continue
}
for _, lbl := range seg.Labels {
if input.Filter != "" && lbl.Filter != input.Filter {
continue
}
canon, kind, ok := mapping.Classify(lbl.Species)
if !ok {
continue
}
switch kind {
case utils.MappingIgn:
out.SegmentsIgnored++
segs = append(segs, resolvedSeg{
start: seg.StartTime, end: seg.EndTime, kind: kind,
})
case utils.MappingNeg:
segs = append(segs, resolvedSeg{
start: seg.StartTime, end: seg.EndTime, kind: kind,
})
case utils.MappingReal:
idx, present := classIdx[canon]
if !present {
continue
}
segs = append(segs, resolvedSeg{
start: seg.StartTime, end: seg.EndTime, kind: kind, classIdx: idx,
})
}
}
}

// Compute relative path for the WAV file.
wavName := strings.TrimSuffix(filepath.Base(path), ".data")
wavAbs := filepath.Join(folderAbs, wavName)
rel, err := filepath.Rel(cwd, wavAbs)
if err != nil {
rel = wavAbs
}
// Ensure relative paths start with ./ to match OPSO / pandas convention.
if rel != "" && !filepath.IsAbs(rel) && !strings.HasPrefix(rel, "."+string(filepath.Separator)) {
rel = "." + string(filepath.Separator) + rel
}

// Label each clip window.
var rows []clipLabelsRow
for _, w := range windows {
dispo, classHits := classifyClip(w, segs, input.MinLabelOverlap, len(classes))

if dispo == dispoIgnored {
out.ClipsIgnored++
continue
}

row := clipLabelsRow{
file: rel,
start: w.Start,
end: w.End,
flags: make([]bool, len(classes)),
}

switch dispo {
case dispoNegative:
out.ClipsNegative++
// flags stay all-False — __NEGATIVE__ overrides positives
case dispoGap:
out.ClipsAllFalseGap++
case dispoLabelled:
for i, hit := range classHits {
if hit {
row.flags[i] = true
out.PerClassTrueCount[classes[i]]++
}
}
}
rows = append(rows, row)
}

return rows, nil
}

// classifyClip determines the disposition of a single clip window against
// the resolved segments. Priority: __IGNORE__ > __NEGATIVE__ > class labels.
func classifyClip(w utils.ClipWindow, segs []resolvedSeg, minLabelOverlap float64, nClasses int) (clipDisposition, []bool) {
ignoreHit := false
negativeHit := false
classHits := make([]bool, nClasses)

for _, s := range segs {
if overlapSeconds(s.start, s.end, w.Start, w.End) < minLabelOverlap {
continue
}
switch s.kind {
case utils.MappingIgn:
ignoreHit = true
case utils.MappingNeg:
negativeHit = true
case utils.MappingReal:
classHits[s.classIdx] = true
}
}

if ignoreHit {
return dispoIgnored, nil
}
if negativeHit {
return dispoNegative, classHits
}
for _, hit := range classHits {
if hit {
return dispoLabelled, classHits
}
}
return dispoGap, classHits
}

// loadExistingRows reads an existing output CSV and returns its row keys
// (for deduplication) and whether we're in append mode.
func loadExistingRows(outputPath string, expectedHeader []string) (map[rowKey]bool, bool, error) {
fi, err := os.Stat(outputPath)
if err != nil {
if os.IsNotExist(err) {
return nil, false, nil
}
return nil, false, fmt.Errorf("stat %s: %w", outputPath, err)
}
if fi.Size() == 0 {
return nil, false, nil
}

f, err := os.Open(outputPath)
if err != nil {
return nil, false, fmt.Errorf("open existing %s: %w", outputPath, err)
}
defer func() { _ = f.Close() }()

r := csv.NewReader(f)
r.FieldsPerRecord = -1

header, err := r.Read()
if err != nil {
return nil, false, fmt.Errorf("read header of existing %s: %w", outputPath, err)
}
if !slices.Equal(header, expectedHeader) {
return nil, false, fmt.Errorf("column-set mismatch in existing %s\n existing: %s\n new: %s",
outputPath, strings.Join(header, ","), strings.Join(expectedHeader, ","))
}

existing := map[rowKey]bool{}
for {
rec, err := r.Read()
if err == io.EOF {
break
}
if err != nil {
return nil, false, fmt.Errorf("read row of existing %s: %w", outputPath, err)
}
if len(rec) < 3 {
return nil, false, fmt.Errorf("malformed row in existing %s: %v", outputPath, rec)
}
existing[rowKey{file: rec[0], start: rec[1], end: rec[2]}] = true
}

return existing, true, nil
}

// overlapSeconds returns the duration of overlap between two half-open intervals.
func overlapSeconds(aStart, aEnd, bStart, bEnd float64) float64 {
lo := max(aStart, bStart)
hi := min(aEnd, bEnd)
if hi <= lo {
return 0
}
return hi - lo
}

// formatTime renders a float to match pandas' default float repr in to_csv:
// always at least one decimal place, no trailing zeros beyond what's needed.
// e.g. 5 → "5.0", 5.5 → "5.5", 3.5001250000 → "3.500125".
func formatTime(v float64) string {
s := strconv.FormatFloat(v, 'f', -1, 64)
if !strings.ContainsRune(s, '.') {
s += ".0"
}
return s
}

// writeRows writes the clip-labels rows to a CSV file.
func writeRows(path string, header []string, rows []clipLabelsRow, appendMode bool) error {
var f *os.File
var err error
if appendMode {
f, err = os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0644)
} else {
f, err = os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
}
if err != nil {
return fmt.Errorf("open %s for write: %w", path, err)
}
defer func() { _ = f.Close() }()

w := csv.NewWriter(f)
if !appendMode {
if err := w.Write(header); err != nil {
return fmt.Errorf("write header: %w", err)
}
}

if len(rows) == 0 {
w.Flush()
return w.Error()
}
rec := make([]string, 3+len(rows[0].flags))
for _, r := range rows {
rec[0] = r.file
rec[1] = formatTime(r.start)
rec[2] = formatTime(r.end)
for i, b := range r.flags {
if b {
rec[3+i] = "True"
} else {
rec[3+i] = "False"
}
}
if err := w.Write(rec); err != nil {
return fmt.Errorf("write row: %w", err)
}
}
w.Flush()
return w.Error()
}
file addition: calls_clip_bench_test.go (----------)

[0.248737]

package tools

import (
"encoding/binary"
"math"
"os"
"testing"

"skraak/utils"
)

const benchWAV = "../audio/20211028_211500.WAV"

// ==================== WAV I/O ====================

func BenchmarkReadWAV(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_, _, err := utils.ReadWAVSamples(benchWAV)
if err != nil {
b.Fatal(err)
}
}
}

func BenchmarkConvertToFloat64_16bit(b *testing.B) {
// Simulate 16-bit mono WAV data (same size as test file: 14.32M samples)
numSamples := 14320000
data := make([]byte, numSamples*2)
for i := range numSamples {
binary.LittleEndian.PutUint16(data[i*2:], uint16(i%65536))
}
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = convertToFloat64Bench(data, 16, 1)
}
}

// Duplicate of convertToFloat64 for benchmarking (unexported in utils)
func convertToFloat64Bench(data []byte, bitsPerSample, channels int) []float64 {
bytesPerSample := bitsPerSample / 8
blockAlign := bytesPerSample * channels
numSamples := len(data) / blockAlign
samples := make([]float64, numSamples)
for i := range numSamples {
offset := i * blockAlign
sample := int16(binary.LittleEndian.Uint16(data[offset : offset+2]))
samples[i] = float64(sample) / 32768.0
}
return samples
}

func BenchmarkWriteWAV(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
b.Logf("segment samples=%d", len(segSamples))
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
f, _ := os.CreateTemp("", "bench_*.wav")
utils.WriteWAVFile(f.Name(), segSamples, sr)
f.Close()
os.Remove(f.Name())
}
}

// ==================== Resample ====================

func BenchmarkResampleRate_48k(b *testing.B) {
samples, _, _ := utils.ReadWAVSamples(benchWAV)
b.Logf("resampling %d samples 48000->16000", len(samples))
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
utils.ResampleRate(samples, 48000, 16000)
}
}

func BenchmarkResampleRate_250k(b *testing.B) {
samples, _, _ := utils.ReadWAVSamples(benchWAV)
b.Logf("resampling %d samples 250000->16000", len(samples))
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
utils.ResampleRate(samples, 250000, 16000)
}
}

// ==================== Spectrogram pipeline ====================

func BenchmarkExtractSegment(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
b.Logf("full file: %d samples, sr=%d", len(samples), sr)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
seg := utils.ExtractSegmentSamples(samples, sr, 872, 895)
if len(seg) == 0 {
b.Fatal("empty segment")
}
}
}

func BenchmarkPowerSpectrumFFT_512(b *testing.B) {
n := 512
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
frameData := make([]float64, n)
power := make([]float64, n/2+1)
scratch := make([]complex128, n)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
// Simulate the windowing step (Hann) + FFT
for j := range n {
frameData[j] = segSamples[j] * 0.5 * (1.0 - math.Cos(2.0*math.Pi*float64(j)/float64(n-1)))
}
utils.PowerSpectrumFFT(frameData, power, scratch)
}
}

func BenchmarkSpectrogram_23s(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
cfg := utils.DefaultSpectrogramConfig(16000)
b.Logf("segment samples=%d, windowSize=%d, hopSize=%d", len(segSamples), cfg.WindowSize, cfg.HopSize)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
spect := utils.GenerateSpectrogram(segSamples, cfg)
if spect == nil {
b.Fatal("nil spectrogram")
}
}
}

func BenchmarkSpectrogram_60s(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 0, 60)
cfg := utils.DefaultSpectrogramConfig(16000)
b.Logf("60s segment samples=%d", len(segSamples))
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
spect := utils.GenerateSpectrogram(segSamples, cfg)
if spect == nil {
b.Fatal("nil spectrogram")
}
}
}

// ==================== Image creation & resize ====================

func BenchmarkCreateGrayscaleImage(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
cfg := utils.DefaultSpectrogramConfig(16000)
spect := utils.GenerateSpectrogram(segSamples, cfg)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
img := utils.CreateGrayscaleImage(spect)
if img == nil {
b.Fatal("nil image")
}
}
}

func BenchmarkCreateRGBImage(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
cfg := utils.DefaultSpectrogramConfig(16000)
spect := utils.GenerateSpectrogram(segSamples, cfg)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
colorData := utils.ApplyL4Colormap(spect)
img := utils.CreateRGBImage(colorData)
if img == nil {
b.Fatal("nil image")
}
}
}

func BenchmarkApplyL4Colormap(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
cfg := utils.DefaultSpectrogramConfig(16000)
spect := utils.GenerateSpectrogram(segSamples, cfg)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
colorData := utils.ApplyL4Colormap(spect)
if colorData == nil {
b.Fatal("nil colormap")
}
}
}

func BenchmarkResizeGray224(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
cfg := utils.DefaultSpectrogramConfig(16000)
spect := utils.GenerateSpectrogram(segSamples, cfg)
img := utils.CreateGrayscaleImage(spect)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
resized := utils.ResizeImage(img, 224, 224)
if resized == nil {
b.Fatal("nil resize")
}
}
}

func BenchmarkResizeGray448(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
cfg := utils.DefaultSpectrogramConfig(16000)
spect := utils.GenerateSpectrogram(segSamples, cfg)
img := utils.CreateGrayscaleImage(spect)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
resized := utils.ResizeImage(img, 448, 448)
if resized == nil {
b.Fatal("nil resize")
}
}
}

// ==================== PNG write ====================

func BenchmarkWritePNG_224(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
cfg := utils.DefaultSpectrogramConfig(16000)
spect := utils.GenerateSpectrogram(segSamples, cfg)
img := utils.CreateGrayscaleImage(spect)
resized := utils.ResizeImage(img, 224, 224)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
f, _ := os.CreateTemp("", "bench_*.png")
utils.WritePNG(resized, f)
f.Close()
os.Remove(f.Name())
}
}

// ==================== Full pipeline ====================

func BenchmarkFullPipelineGray224(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
outputSR := sr
if sr > 16000 {
segSamples = utils.ResampleRate(segSamples, sr, 16000)
outputSR = 16000
}
cfg := utils.DefaultSpectrogramConfig(outputSR)
spect := utils.GenerateSpectrogram(segSamples, cfg)
img := utils.CreateGrayscaleImage(spect)
resized := utils.ResizeImage(img, 224, 224)
f, _ := os.CreateTemp("", "bench_*.png")
utils.WritePNG(resized, f)
f.Close()
os.Remove(f.Name())
utils.WriteWAVFile(f.Name(), segSamples, outputSR)
os.Remove(f.Name())
_ = resized
}
}

func BenchmarkFullPipelineColor448(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
outputSR := sr
if sr > 16000 {
segSamples = utils.ResampleRate(segSamples, sr, 16000)
outputSR = 16000
}
cfg := utils.DefaultSpectrogramConfig(outputSR)
spect := utils.GenerateSpectrogram(segSamples, cfg)
colorData := utils.ApplyL4Colormap(spect)
img := utils.CreateRGBImage(colorData)
resized := utils.ResizeImage(img, 448, 448)
f, _ := os.CreateTemp("", "bench_*.png")
utils.WritePNG(resized, f)
f.Close()
os.Remove(f.Name())
utils.WriteWAVFile(f.Name(), segSamples, outputSR)
os.Remove(f.Name())
_ = resized
}
}

func BenchmarkFullPipelineWavOnly(b *testing.B) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
outputSR := sr
if sr > 16000 {
segSamples = utils.ResampleRate(segSamples, sr, 16000)
outputSR = 16000
}
f, _ := os.CreateTemp("", "bench_*.wav")
utils.WriteWAVFile(f.Name(), segSamples, outputSR)
f.Close()
os.Remove(f.Name())
}
}

// ==================== Data dimension report ====================

func TestPipelineDimensions(t *testing.T) {
samples, sr, _ := utils.ReadWAVSamples(benchWAV)
segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)

t.Logf("Input: %d samples, sr=%d, segment=%d samples (%.1fs)",
len(samples), sr, len(segSamples), float64(len(segSamples))/float64(sr))

cfg := utils.DefaultSpectrogramConfig(16000)
numFrames := (len(segSamples)-cfg.WindowSize)/cfg.HopSize + 1
numBins := cfg.WindowSize/2 + 1
t.Logf("Spectrogram: %d freq bins x %d time frames = %d values",
numBins, numFrames, numBins*numFrames)

spect := utils.GenerateSpectrogram(segSamples, cfg)
t.Logf("Output: %d x %d (freq x time)", len(spect), len(spect[0]))

img := utils.CreateGrayscaleImage(spect)
t.Logf("Grayscale image: %dx%d pixels, %d bytes",
img.Bounds().Dx(), img.Bounds().Dy(), img.Bounds().Dx()*img.Bounds().Dy())

resized := utils.ResizeImage(img, 224, 224)
t.Logf("Resized 224: %dx%d", resized.Bounds().Dx(), resized.Bounds().Dy())

resized448 := utils.ResizeImage(img, 448, 448)
t.Logf("Resized 448: %dx%d", resized448.Bounds().Dx(), resized448.Bounds().Dy())
}
file addition: calls_clip.go (----------)

[0.248737]

package tools

import (
"fmt"
"image"
"math"
"os"
"path/filepath"
"runtime"
"strings"
"sync"

"skraak/utils"
)

// CallsClipInput defines the input for the clip tool
type CallsClipInput struct {
File string `json:"file"`
Folder string `json:"folder"`
Output string `json:"output"`
Prefix string `json:"prefix"`
Filter string `json:"filter"`
Species string `json:"species"`
Certainty int `json:"certainty"`
Size int `json:"size"`
Color bool `json:"color"`
WavOnly bool `json:"wav_only"`
Night bool `json:"night"`
Day bool `json:"day"`
Lat float64 `json:"lat"`
Lng float64 `json:"lng"`
Timezone string `json:"timezone"`
}

// CallsClipOutput defines the output for the clip tool
type CallsClipOutput struct {
FilesProcessed int `json:"files_processed"`
SegmentsClipped int `json:"segments_clipped"`
NightSkipped int `json:"night_skipped,omitempty"`
DaySkipped int `json:"day_skipped,omitempty"`
OutputFiles []string `json:"output_files"`
Errors []string `json:"errors,omitempty"`
}

// CallsClip processes .data files and generates audio/image clips for matching segments
func CallsClip(input CallsClipInput) (CallsClipOutput, error) {
var output CallsClipOutput

// Validate required flags
if input.File == "" && input.Folder == "" {
output.Errors = append(output.Errors, "either --file or --folder is required")
return output, fmt.Errorf("missing required flag: --file or --folder")
}
if input.Output == "" {
output.Errors = append(output.Errors, "--output is required")
return output, fmt.Errorf("missing required flag: --output")
}
if input.Prefix == "" {
output.Errors = append(output.Errors, "--prefix is required")
return output, fmt.Errorf("missing required flag: --prefix")
}

// Parse species+calltype
speciesName, callType := utils.ParseSpeciesCallType(input.Species)

// Get list of .data files
var filePaths []string
var err error

if input.File != "" {
filePaths = []string{input.File}
} else {
filePaths, err = utils.FindDataFiles(input.Folder)
if err != nil {
output.Errors = append(output.Errors, fmt.Sprintf("failed to find .data files: %v", err))
return output, err
}
}

if len(filePaths) == 0 {
output.Errors = append(output.Errors, "no .data files found")
return output, fmt.Errorf("no .data files found")
}

// Create output folder if it doesn't exist
if err := os.MkdirAll(input.Output, 0755); err != nil {
output.Errors = append(output.Errors, fmt.Sprintf("failed to create output folder: %v", err))
return output, err
}

// Clamp image size to valid range
imgSize := utils.ClampImageSize(input.Size)

// Process .data files (parallel for larger batches)
if len(filePaths) <= 2 {
// Sequential for small batches
for _, dataPath := range filePaths {
clips, skipped, errs := processFile(dataPath, input.Output, input.Prefix, input.Filter, speciesName, callType, input.Certainty, imgSize, input.Color, input.WavOnly, input.Night, input.Day, input.Lat, input.Lng, input.Timezone)
output.SegmentsClipped += len(clips)
if input.Night {
output.NightSkipped += skipped
} else {
output.DaySkipped += skipped
}
output.OutputFiles = append(output.OutputFiles, clips...)
output.Errors = append(output.Errors, errs...)
if len(clips) > 0 || len(errs) == 0 {
output.FilesProcessed++
}
}
} else {
// Parallel file processing
type fileResult struct {
clips []string
skipped int
errs []string
}

workers := min(runtime.NumCPU(), 8, len(filePaths))
jobs := make(chan string, len(filePaths))
results := make(chan fileResult, len(filePaths))

var wg sync.WaitGroup
for range workers {
wg.Go(func() {
for dataPath := range jobs {
clips, skipped, errs := processFile(dataPath, input.Output, input.Prefix, input.Filter, speciesName, callType, input.Certainty, imgSize, input.Color, input.WavOnly, input.Night, input.Day, input.Lat, input.Lng, input.Timezone)
results <- fileResult{clips: clips, skipped: skipped, errs: errs}
}
})
}

for _, dataPath := range filePaths {
jobs <- dataPath
}
close(jobs)

go func() {
wg.Wait()
close(results)
}()

for r := range results {
output.SegmentsClipped += len(r.clips)
if input.Night {
output.NightSkipped += r.skipped
} else {
output.DaySkipped += r.skipped
}
output.OutputFiles = append(output.OutputFiles, r.clips...)
output.Errors = append(output.Errors, r.errs...)
if len(r.clips) > 0 || len(r.errs) == 0 {
output.FilesProcessed++
}
}
}

return output, nil
}

// processFile processes a single .data file and returns generated clips, time-filter-skipped count, and errors
func processFile(dataPath, outputDir, prefix, filter, speciesName, callType string, certainty, imgSize int, color, wavOnly, night, day bool, lat, lng float64, timezone string) ([]string, int, []string) {
var clips []string
var errors []string

// Parse .data file
dataFile, err := utils.ParseDataFile(dataPath)
if err != nil {
errors = append(errors, fmt.Sprintf("%s: failed to parse: %v", dataPath, err))
return nil, 0, errors
}

// Get WAV basename (without path and extensions)
wavPath := filepath.Clean(strings.TrimSuffix(dataPath, ".data"))
basename := filepath.Base(wavPath)
basename = strings.TrimSuffix(basename, filepath.Ext(basename))

// Filter segments
var matchingSegments []*utils.Segment
for _, seg := range dataFile.Segments {
if seg.SegmentMatchesFilters(filter, speciesName, callType, certainty) {
matchingSegments = append(matchingSegments, seg)
}
}

if len(matchingSegments) == 0 {
return nil, 0, nil // No matches, not an error
}

// Day/night filter: check WAV header only (cheaper than reading full audio).
// Skip recordings in the wrong time-of-day before paying the cost of ReadWAVSamples.
if night || day {
result, err := IsNight(IsNightInput{
FilePath: wavPath,
Lat: lat,
Lng: lng,
Timezone: timezone,
})
if err != nil {
fmt.Fprintf(os.Stderr, "warning: skipping %s (isnight error: %v)\n", wavPath, err)
return nil, 0, nil
}
if night && !result.SolarNight {
fmt.Fprintf(os.Stderr, "skipped (daytime): %s\n", wavPath)
return nil, 1, nil
}
if day && !result.DiurnalActive {
fmt.Fprintf(os.Stderr, "skipped (nighttime): %s\n", wavPath)
return nil, 1, nil
}
}

// Read WAV samples once
samples, sampleRate, err := utils.ReadWAVSamples(wavPath)
if err != nil {
errors = append(errors, fmt.Sprintf("%s: failed to read WAV: %v", dataPath, err))
return nil, 0, errors
}

// Process matching segments (parallel for larger batches)
if len(matchingSegments) <= 2 {
for _, seg := range matchingSegments {
clipFiles, err := generateClip(samples, sampleRate, outputDir, prefix, basename, seg.StartTime, seg.EndTime, imgSize, color, wavOnly)
if err != nil {
errors = append(errors, fmt.Sprintf("%s: segment %.0f-%.0f: %v", dataPath, seg.StartTime, seg.EndTime, err))
continue
}
clips = append(clips, clipFiles...)
}
} else {
type segResult struct {
clips []string
err string
}

workers := min(runtime.NumCPU(), len(matchingSegments))
jobs := make(chan *utils.Segment, len(matchingSegments))
results := make(chan segResult, len(matchingSegments))

var wg sync.WaitGroup
for range workers {
wg.Go(func() {
for seg := range jobs {
clipFiles, err := generateClip(samples, sampleRate, outputDir, prefix, basename, seg.StartTime, seg.EndTime, imgSize, color, wavOnly)
if err != nil {
results <- segResult{err: fmt.Sprintf("%s: segment %.0f-%.0f: %v", dataPath, seg.StartTime, seg.EndTime, err)}
} else {
results <- segResult{clips: clipFiles}
}
}
})
}

for _, seg := range matchingSegments {
jobs <- seg
}
close(jobs)

go func() {
wg.Wait()
close(results)
}()

for r := range results {
if r.err != "" {
errors = append(errors, r.err)
} else {
clips = append(clips, r.clips...)
}
}
}

return clips, 0, errors
}

// generateClip generates PNG and WAV files for a segment
func generateClip(samples []float64, sampleRate int, outputDir, prefix, basename string, startTime, endTime float64, imgSize int, color, wavOnly bool) ([]string, error) {
var files []string

// Calculate integer times for filename
startInt := int(math.Floor(startTime))
endInt := int(math.Ceil(endTime))

// Build base filename
baseName := fmt.Sprintf("%s_%s_%d_%d", prefix, basename, startInt, endInt)
wavPath := filepath.Join(outputDir, baseName+".wav")

// Extract segment samples
segSamples := utils.ExtractSegmentSamples(samples, sampleRate, startTime, endTime)
if len(segSamples) == 0 {
return nil, fmt.Errorf("no samples in segment")
}

// Determine output sample rate (downsample if > 16kHz)
outputSampleRate := sampleRate
if sampleRate > utils.DefaultMaxSampleRate {
segSamples = utils.ResampleRate(segSamples, sampleRate, utils.DefaultMaxSampleRate)
outputSampleRate = utils.DefaultMaxSampleRate
}

// Generate spectrogram and PNG unless --wav-only
if !wavOnly {
pngPath := filepath.Join(outputDir, baseName+".png")

spectSampleRate := outputSampleRate
config := utils.DefaultSpectrogramConfig(spectSampleRate)
spectrogram := utils.GenerateSpectrogram(segSamples, config)
if spectrogram == nil {
return nil, fmt.Errorf("failed to generate spectrogram")
}

// Create image (grayscale or color)
var img image.Image
if color {
colorData := utils.ApplyL4Colormap(spectrogram)
img = utils.CreateRGBImage(colorData)
} else {
img = utils.CreateGrayscaleImage(spectrogram)
}
if img == nil {
return nil, fmt.Errorf("failed to create image")
}

resized := utils.ResizeImage(img, imgSize, imgSize)

// Write PNG (O_EXCL fails atomically if file exists)
pngFile, err := os.OpenFile(pngPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0644)
if err != nil {
if os.IsExist(err) {
return nil, fmt.Errorf("file already exists: %s", pngPath)
}
return nil, fmt.Errorf("failed to create PNG: %w", err)
}
if err := utils.WritePNG(resized, pngFile); err != nil {
_ = pngFile.Close()
return nil, fmt.Errorf("failed to write PNG: %w", err)
}
if err := pngFile.Close(); err != nil {
return nil, fmt.Errorf("failed to close PNG: %w", err)
}
files = append(files, pngPath)
}

// Write WAV
if err := utils.WriteWAVFile(wavPath, segSamples, outputSampleRate); err != nil {
return nil, fmt.Errorf("failed to write WAV: %w", err)
}
files = append(files, wavPath)

return files, nil
}
file addition: calls_classify_test.go (----------)

[0.248737]

package tools

import (
"testing"

"skraak/utils"
)

func NewClassifyState(config ClassifyConfig, dataFiles []*utils.DataFile) *ClassifyState {
hasFilter := config.Filter != "" || config.Species != "" || config.Certainty >= 0
cached := make([][]*utils.Segment, len(dataFiles))
for i, df := range dataFiles {
if !hasFilter {
cached[i] = df.Segments
} else {
for _, seg := range df.Segments {
if seg.SegmentMatchesFilters(config.Filter, config.Species, config.CallType, config.Certainty) {
cached[i] = append(cached[i], seg)
}
}
}
}
total := 0
for _, segs := range cached {
total += len(segs)
}
return &ClassifyState{
Config: config,
DataFiles: dataFiles,
filteredSegs: cached,
totalSegs: total,
}
}

func TestParseKeyBuffer(t *testing.T) {
bindings := []KeyBinding{
{Key: "k", Species: "Kiwi"},
{Key: "d", Species: "Kiwi", CallType: "Duet"},
{Key: "n", Species: "Don't Know"},
{Key: "p", Species: "Morepork"},
}

state := NewClassifyState(ClassifyConfig{Bindings: bindings, Certainty: -1}, nil)

tests := []struct {
key string
want *BindingResult
wantNil bool
}{
{"k", &BindingResult{Species: "Kiwi"}, false},
{"d", &BindingResult{Species: "Kiwi", CallType: "Duet"}, false},
{"n", &BindingResult{Species: "Don't Know"}, false},
{"p", &BindingResult{Species: "Morepork"}, false},
{"x", nil, true}, // unknown key
}

for _, tt := range tests {
got := state.ParseKeyBuffer(tt.key)
if tt.wantNil {
if got != nil {
t.Errorf("ParseKeyBuffer(%q) = %v, want nil", tt.key, got)
}
} else {
if got == nil {
t.Errorf("ParseKeyBuffer(%q) = nil, want %+v", tt.key, tt.want)
continue
}
if got.Species != tt.want.Species {
t.Errorf("ParseKeyBuffer(%q).Species = %q, want %q", tt.key, got.Species, tt.want.Species)
}
if got.CallType != tt.want.CallType {
t.Errorf("ParseKeyBuffer(%q).CallType = %q, want %q", tt.key, got.CallType, tt.want.CallType)
}
}
}
}

func TestApplyBinding(t *testing.T) {
bindings := []KeyBinding{
{Key: "k", Species: "Kiwi"},
{Key: "n", Species: "Don't Know"},
{Key: "d", Species: "Kiwi", CallType: "Duet"},
}

df := &utils.DataFile{
Meta: &utils.DataMeta{},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 20.0,
Labels: []*utils.Label{
{Species: "Unknown", Certainty: 50, Filter: "test-filter", CallType: "OldType"},
},
},
},
}

state := NewClassifyState(ClassifyConfig{
Filter: "test-filter",
Reviewer: "David",
Bindings: bindings,
Certainty: -1,
}, []*utils.DataFile{df})

// Apply "k" = Kiwi (no calltype, should remove existing calltype)
result := &BindingResult{Species: "Kiwi"}
state.ApplyBinding(result)

// Check label was updated
if len(df.Segments[0].Labels) != 1 {
t.Errorf("expected 1 label, got %d", len(df.Segments[0].Labels))
}
if df.Segments[0].Labels[0].Species != "Kiwi" {
t.Errorf("expected Species=Kiwi, got %s", df.Segments[0].Labels[0].Species)
}
if df.Segments[0].Labels[0].Certainty != 100 {
t.Errorf("expected Certainty=100, got %d", df.Segments[0].Labels[0].Certainty)
}
if df.Segments[0].Labels[0].CallType != "" {
t.Errorf("expected CallType='', got %s (should be removed)", df.Segments[0].Labels[0].CallType)
}
if df.Meta.Reviewer != "David" {
t.Errorf("expected Reviewer=David, got %s", df.Meta.Reviewer)
}

// Apply "d" = Kiwi/Duet (should set calltype)
result = &BindingResult{Species: "Kiwi", CallType: "Duet"}
state.ApplyBinding(result)

if df.Segments[0].Labels[0].CallType != "Duet" {
t.Errorf("expected CallType=Duet, got %s", df.Segments[0].Labels[0].CallType)
}

// Apply "n" = Don't Know (certainty should be 0)
result = &BindingResult{Species: "Don't Know"}
state.ApplyBinding(result)

if df.Segments[0].Labels[0].Species != "Don't Know" {
t.Errorf("expected Species=Don't Know, got %s", df.Segments[0].Labels[0].Species)
}
if df.Segments[0].Labels[0].Certainty != 0 {
t.Errorf("expected Certainty=0 for Don't Know, got %d", df.Segments[0].Labels[0].Certainty)
}
}

func TestApplyBindingCallTypeRemoval(t *testing.T) {
bindings := []KeyBinding{
{Key: "k", Species: "Kiwi"}, // no calltype
}

df := &utils.DataFile{
Meta: &utils.DataMeta{},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 20.0,
Labels: []*utils.Label{
{Species: "Kiwi", Certainty: 100, Filter: "test-filter", CallType: "Male"},
},
},
},
}

state := NewClassifyState(ClassifyConfig{
Filter: "test-filter",
Reviewer: "David",
Bindings: bindings,
Certainty: -1,
}, []*utils.DataFile{df})

// Apply "k" = Kiwi (should remove Male calltype)
result := &BindingResult{Species: "Kiwi"}
state.ApplyBinding(result)

if df.Segments[0].Labels[0].CallType != "" {
t.Errorf("expected CallType='', got %s (should be removed)", df.Segments[0].Labels[0].CallType)
}
}

func TestConfirmLabelDontKnow(t *testing.T) {
df := &utils.DataFile{
Meta: &utils.DataMeta{},
Segments: []*utils.Segment{
{
StartTime: 10.0,
EndTime: 20.0,
Labels: []*utils.Label{
{Species: "Don't Know", Certainty: 0, Filter: "test-filter"},
},
},
},
}

state := NewClassifyState(ClassifyConfig{
Filter: "test-filter",
Reviewer: "David",
Certainty: -1,
}, []*utils.DataFile{df})

// ConfirmLabel on Don't Know should be a no-op
if state.ConfirmLabel() {
t.Error("ConfirmLabel() should return false for Don't Know (certainty=0)")
}

label := df.Segments[0].Labels[0]
if label.Species != "Don't Know" {
t.Errorf("Species should remain Don't Know, got %s", label.Species)
}
if label.Certainty != 0 {
t.Errorf("Certainty should remain 0, got %d", label.Certainty)
}
if state.Dirty {
t.Error("State should not be dirty after confirming Don't Know")
}
}
file addition: calls_classify_load_test.go (----------)

[0.248737]

package tools

import (
"os"
"path/filepath"
"testing"
)

func TestLoadDataFilesFiltersFilesWithNoMatchingSegments(t *testing.T) {
// Create a temp directory with test .data files
tempDir := t.TempDir()

// File 1: Kiwi segments
file1 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]]]`
if err := os.WriteFile(filepath.Join(tempDir, "file1.data"), []byte(file1), 0644); err != nil {
t.Fatal(err)
}

// File 2: Tomtit segments only
file2 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Tomtit", "certainty": 90}]]]`
if err := os.WriteFile(filepath.Join(tempDir, "file2.data"), []byte(file2), 0644); err != nil {
t.Fatal(err)
}

// File 3: Kiwi segments
file3 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]]]`
if err := os.WriteFile(filepath.Join(tempDir, "file3.data"), []byte(file3), 0644); err != nil {
t.Fatal(err)
}

// Test 1: No filter - should load all 3 files
config1 := ClassifyConfig{Folder: tempDir, Certainty: -1}
state1, err := LoadDataFiles(config1)
if err != nil {
t.Fatal(err)
}
if len(state1.DataFiles) != 3 {
t.Errorf("No filter: expected 3 files, got %d", len(state1.DataFiles))
}
if state1.TotalSegments() != 3 {
t.Errorf("No filter: expected 3 segments total, got %d", state1.TotalSegments())
}

// Test 2: Filter by Species "Kiwi" - should load only files 1 and 3
config2 := ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1}
state2, err := LoadDataFiles(config2)
if err != nil {
t.Fatal(err)
}
if len(state2.DataFiles) != 2 {
t.Errorf("Species=Kiwi: expected 2 files, got %d", len(state2.DataFiles))
}
if state2.TotalSegments() != 2 {
t.Errorf("Species=Kiwi: expected 2 segments total, got %d", state2.TotalSegments())
}

// Test 3: Filter by Species "Tomtit" - should load only file 2
config3 := ClassifyConfig{Folder: tempDir, Species: "Tomtit", Certainty: -1}
state3, err := LoadDataFiles(config3)
if err != nil {
t.Fatal(err)
}
if len(state3.DataFiles) != 1 {
t.Errorf("Species=Tomtit: expected 1 file, got %d", len(state3.DataFiles))
}
if state3.TotalSegments() != 1 {
t.Errorf("Species=Tomtit: expected 1 segment total, got %d", state3.TotalSegments())
}

// Test 4: Filter by non-existent species - should return empty file list
// (handled gracefully by caller in cmd/calls_classify.go)
config4 := ClassifyConfig{Folder: tempDir, Species: "NonExistent", Certainty: -1}
state4, err := LoadDataFiles(config4)
if err != nil {
t.Fatalf("Species=NonExistent: unexpected error: %v", err)
}
if len(state4.DataFiles) != 0 {
t.Errorf("Species=NonExistent: expected 0 files, got %d", len(state4.DataFiles))
}
if state4.TotalSegments() != 0 {
t.Errorf("Species=NonExistent: expected 0 segments, got %d", state4.TotalSegments())
}
}

func TestLoadDataFilesWithMixedSegments(t *testing.T) {
// Create a temp directory with a file containing mixed segment types
tempDir := t.TempDir()

// File with multiple segments: some Kiwi, some Tomtit
file := `[
{"Operator": "test"},
[0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]],
[10, 20, 100, 1000, [{"species": "Tomtit", "certainty": 80}]],
[20, 30, 100, 1000, [{"species": "Kiwi", "certainty": 95}]]
]`
if err := os.WriteFile(filepath.Join(tempDir, "mixed.data"), []byte(file), 0644); err != nil {
t.Fatal(err)
}

// Filter by Species "Kiwi" - should show 2 segments from the file
config := ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1}
state, err := LoadDataFiles(config)
if err != nil {
t.Fatal(err)
}
if len(state.DataFiles) != 1 {
t.Errorf("Expected 1 file, got %d", len(state.DataFiles))
}
if state.TotalSegments() != 2 {
t.Errorf("Species=Kiwi: expected 2 segments, got %d", state.TotalSegments())
}

// The DataFile should still have all 3 segments internally
// but cached filtered segments should return only the Kiwi ones
if len(state.DataFiles[0].Segments) != 3 {
t.Errorf("DataFile should have 3 segments internally, got %d", len(state.DataFiles[0].Segments))
}

// TotalSegments uses cached filtered segments
if state.TotalSegments() != 2 {
t.Errorf("TotalSegments should return 2 Kiwi segments, got %d", state.TotalSegments())
}
}

// Test that the original DataFile segments are not modified (immutable filtering)
func TestFilteringDoesNotModifyOriginalSegments(t *testing.T) {
tempDir := t.TempDir()

file := `[
{"Operator": "test"},
[0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]],
[10, 20, 100, 1000, [{"species": "Tomtit", "certainty": 80}]]
]`
if err := os.WriteFile(filepath.Join(tempDir, "test.data"), []byte(file), 0644); err != nil {
t.Fatal(err)
}

config := ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1}
state, err := LoadDataFiles(config)
if err != nil {
t.Fatal(err)
}

// Original segments should be untouched
originalSegments := state.DataFiles[0].Segments
if len(originalSegments) != 2 {
t.Errorf("Original should have 2 segments, got %d", len(originalSegments))
}

// Verify all original segments are preserved
species := []string{}
for _, seg := range originalSegments {
if len(seg.Labels) > 0 {
species = append(species, seg.Labels[0].Species)
}
}
if len(species) != 2 || species[0] != "Kiwi" || species[1] != "Tomtit" {
t.Errorf("Original segments should have both species, got %v", species)
}
}

func TestLoadDataFilesCertaintyPruning(t *testing.T) {
// Create a temp directory with test .data files
tempDir := t.TempDir()

// File 1: certainty 70
file1 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 70}]]]`
if err := os.WriteFile(filepath.Join(tempDir, "file1.data"), []byte(file1), 0644); err != nil {
t.Fatal(err)
}

// File 2: certainty 100
file2 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 100}]]]`
if err := os.WriteFile(filepath.Join(tempDir, "file2.data"), []byte(file2), 0644); err != nil {
t.Fatal(err)
}

// Filter by certainty 100 - should load only file2
config := ClassifyConfig{Folder: tempDir, Certainty: 100}
state, err := LoadDataFiles(config)
if err != nil {
t.Fatal(err)
}
if len(state.DataFiles) != 1 {
t.Errorf("Certainty=100: expected 1 file, got %d", len(state.DataFiles))
}
if state.TotalSegments() != 1 {
t.Errorf("Certainty=100: expected 1 segment, got %d", state.TotalSegments())
}

// CurrentSegment should work (not nil) because file1 was pruned
seg := state.CurrentSegment()
if seg == nil {
t.Error("CurrentSegment should not be nil after pruning")
}
}
file addition: calls_classify_filter_test.go (----------)

[0.248737]

package tools

import (
"math/rand"
"testing"

"skraak/utils"
)

func TestTotalSegmentsRespectsFilters(t *testing.T) {
// Create test data files with different species and filters
df1 := &utils.DataFile{
FilePath: "/test/file1.data",
Segments: []*utils.Segment{
{
StartTime: 0,
EndTime: 10,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0"},
},
},
{
StartTime: 10,
EndTime: 20,
Labels: []*utils.Label{
{Species: "Tomtit", Filter: "model-1.0"},
},
},
},
}

df2 := &utils.DataFile{
FilePath: "/test/file2.data",
Segments: []*utils.Segment{
{
StartTime: 0,
EndTime: 10,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0"},
},
},
},
}

// Test 1: No filters - should count all segments (3)
state1 := NewClassifyState(ClassifyConfig{Certainty: -1}, []*utils.DataFile{df1, df2})
if got := state1.TotalSegments(); got != 3 {
t.Errorf("No filters: expected 3 segments, got %d", got)
}

// Test 2: Filter by species "Kiwi" - should count only Kiwi segments (2)
state2 := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df1, df2})
if got := state2.TotalSegments(); got != 2 {
t.Errorf("Species=Kiwi: expected 2 segments, got %d", got)
}

// Test 3: Filter by species "Tomtit" - should count only Tomtit segments (1)
state3 := NewClassifyState(ClassifyConfig{Species: "Tomtit", Certainty: -1}, []*utils.DataFile{df1, df2})
if got := state3.TotalSegments(); got != 1 {
t.Errorf("Species=Tomtit: expected 1 segment, got %d", got)
}

// Test 4: Filter by filter name "model-1.0" - should count all segments (3)
state4 := NewClassifyState(ClassifyConfig{Filter: "model-1.0", Certainty: -1}, []*utils.DataFile{df1, df2})
if got := state4.TotalSegments(); got != 3 {
t.Errorf("Filter=model-1.0: expected 3 segments, got %d", got)
}

// Test 5: Filter by non-existent species - should count 0
state5 := NewClassifyState(ClassifyConfig{Species: "NonExistent", Certainty: -1}, []*utils.DataFile{df1, df2})
if got := state5.TotalSegments(); got != 0 {
t.Errorf("Species=NonExistent: expected 0 segments, got %d", got)
}

// Test 6: Combined filter + species
df3 := &utils.DataFile{
FilePath: "/test/file3.data",
Segments: []*utils.Segment{
{
StartTime: 0,
EndTime: 10,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0", CallType: "Duet"},
},
},
{
StartTime: 10,
EndTime: 20,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-2.0", CallType: "Male"},
},
},
},
}
state6 := NewClassifyState(ClassifyConfig{Filter: "model-1.0", Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df3})
if got := state6.TotalSegments(); got != 1 {
t.Errorf("Filter=model-1.0 + Species=Kiwi: expected 1 segment, got %d", got)
}
}

func TestCurrentSegmentNumberWithFilters(t *testing.T) {
// Create test data files
df1 := &utils.DataFile{
FilePath: "/test/file1.data",
Segments: []*utils.Segment{
{
StartTime: 0,
EndTime: 10,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0"},
},
},
{
StartTime: 10,
EndTime: 20,
Labels: []*utils.Label{
{Species: "Tomtit", Filter: "model-1.0"},
},
},
},
}

df2 := &utils.DataFile{
FilePath: "/test/file2.data",
Segments: []*utils.Segment{
{
StartTime: 0,
EndTime: 10,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0"},
},
},
},
}

// Test: Filter by species "Kiwi", at file 2, segment 0
// Should report current segment as 2 (first Kiwi in df1 + first Kiwi in df2)
state := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df1, df2})
state.FileIdx = 1 // at df2
state.SegmentIdx = 0

if got := state.CurrentSegmentNumber(); got != 2 {
t.Errorf("Species=Kiwi, at file 2, seg 0: expected current segment 2, got %d", got)
}
}

func TestCertaintyFiltering(t *testing.T) {
// Create test data files with different certainty levels
df := &utils.DataFile{
FilePath: "/test/file1.data",
Segments: []*utils.Segment{
{
StartTime: 0,
EndTime: 10,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0", Certainty: 70},
},
},
{
StartTime: 10,
EndTime: 20,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0", Certainty: 100},
},
},
{
StartTime: 20,
EndTime: 30,
Labels: []*utils.Label{
{Species: "Tomtit", Filter: "model-1.0", Certainty: 70},
},
},
},
}

// Test 1: Filter by certainty 70 - should get 2 segments
state1 := NewClassifyState(ClassifyConfig{Certainty: 70}, []*utils.DataFile{df})
if got := state1.TotalSegments(); got != 2 {
t.Errorf("Certainty=70: expected 2 segments, got %d", got)
}

// Test 2: Filter by certainty 100 - should get 1 segment
state2 := NewClassifyState(ClassifyConfig{Certainty: 100}, []*utils.DataFile{df})
if got := state2.TotalSegments(); got != 1 {
t.Errorf("Certainty=100: expected 1 segment, got %d", got)
}

// Test 3: Filter by certainty 0 - should get 0 segments
state3 := NewClassifyState(ClassifyConfig{Certainty: 0}, []*utils.DataFile{df})
if got := state3.TotalSegments(); got != 0 {
t.Errorf("Certainty=0: expected 0 segments, got %d", got)
}

// Test 4: Combined species + certainty
state4 := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: 70}, []*utils.DataFile{df})
if got := state4.TotalSegments(); got != 1 {
t.Errorf("Species=Kiwi + Certainty=70: expected 1 segment, got %d", got)
}
}

func TestSampling(t *testing.T) {
makeSegs := func(n int) []*utils.Segment {
s := make([]*utils.Segment, n)
for i := range s {
s[i] = &utils.Segment{StartTime: float64(i), EndTime: float64(i + 1)}
}
return s
}

df1 := &utils.DataFile{FilePath: "/test/f1.data", Segments: makeSegs(6)}
df2 := &utils.DataFile{FilePath: "/test/f2.data", Segments: makeSegs(4)}
kept := []*utils.DataFile{df1, df2}
cached := [][]*utils.Segment{df1.Segments, df2.Segments}

countTotal := func(c [][]*utils.Segment) int {
n := 0
for _, s := range c {
n += len(s)
}
return n
}

// 50% of 10 → 5
k, c := applySampling(kept, cached, 50, rand.New(rand.NewSource(42)))
if got := countTotal(c); got != 5 {
t.Errorf("sample 50%%: expected 5, got %d", got)
}
// Files must be in original chronological order
for i := 1; i < len(k); i++ {
if k[i].FilePath < k[i-1].FilePath {
t.Errorf("sample 50%%: files out of order at index %d", i)
}
}

// 10% of 10 → 1
_, c2 := applySampling(kept, cached, 10, rand.New(rand.NewSource(42)))
if got := countTotal(c2); got != 1 {
t.Errorf("sample 10%%: expected 1, got %d", got)
}

// 1% of 10 → clamp to 1
_, c3 := applySampling(kept, cached, 1, rand.New(rand.NewSource(42)))
if got := countTotal(c3); got != 1 {
t.Errorf("sample 1%%: expected 1 (clamped), got %d", got)
}

// 99% of 10 → 9
_, c4 := applySampling(kept, cached, 99, rand.New(rand.NewSource(42)))
if got := countTotal(c4); got != 9 {
t.Errorf("sample 99%%: expected 9, got %d", got)
}
}

func TestCertaintyPruning(t *testing.T) {
// Simulate the bug: first file has no matching certainty segments
df1 := &utils.DataFile{
FilePath: "/test/file1.data",
Segments: []*utils.Segment{
{
StartTime: 0,
EndTime: 10,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0", Certainty: 70},
},
},
},
}

df2 := &utils.DataFile{
FilePath: "/test/file2.data",
Segments: []*utils.Segment{
{
StartTime: 0,
EndTime: 10,
Labels: []*utils.Label{
{Species: "Kiwi", Filter: "model-1.0", Certainty: 100},
},
},
},
}

// Without pruning (old bug): file1 is first, has no certainty=100 segments
// CurrentSegment() would return nil even though TotalSegments() > 0
state := NewClassifyState(ClassifyConfig{Certainty: 100}, []*utils.DataFile{df1, df2})

// TotalSegments should be 1 (only file2 has certainty 100)
if got := state.TotalSegments(); got != 1 {
t.Errorf("Certainty=100: expected 1 segment, got %d", got)
}

// CurrentSegment should work if files are properly pruned
// Note: this test assumes LoadDataFiles does the pruning
// Here we test the state after manual construction
}
file addition: calls_classify.go (----------)

[0.248737]

package tools

import (
"fmt"
"math/rand"
"os"
"path/filepath"
"slices"
"sort"
"strings"
"time"

"skraak/utils"
)

// KeyBinding maps a key to a species/calltype
type KeyBinding struct {
Key string // single char: "k", "n", "p"
Species string // "Kiwi", "Don't Know", "Morepork"
CallType string // "Duet", "Female", "Male" (optional)
}

// ClassifyConfig holds the configuration for classification
type ClassifyConfig struct {
Folder string
File string
Filter string
Species string // scope to this species (optional)
CallType string // scope to this calltype within species (optional)
Certainty int // scope to this certainty value, -1 = no filter (optional)
Sample int // random sample percentage 1-99, -1 = no sampling, 100 = no-op
Goto string // goto this file on startup (optional, basename match)
Reviewer string
Color bool
ImageSize int // spectrogram display size in pixels (0 = default)
Sixel bool
ITerm bool
Bindings []KeyBinding
// SecondaryBindings maps a primary binding key to per-species calltype
// keys. Invoked via Shift+primary-key: the species is labeled without
// advancing, and the next key is interpreted as a calltype.
SecondaryBindings map[string]map[string]string
Night bool
Day bool
Lat float64
Lng float64
Timezone string
}

// ClassifyState holds the current state for TUI
type ClassifyState struct {
Config ClassifyConfig
DataFiles []*utils.DataFile
filteredSegs [][]*utils.Segment // cached at load time, parallel to DataFiles
totalSegs int // pre-computed total segment count
FileIdx int
SegmentIdx int
Dirty bool
Player *utils.AudioPlayer
PlaybackSpeed float64 // Current playback speed (1.0 = normal, 0.5 = half speed)
TimeFilteredCount int // files skipped by --night or --day filter
}

// BindingResult represents parsed key result
type BindingResult struct {
Species string
CallType string // empty string = remove calltype
}

// LoadDataFiles loads all .data files for classification
func LoadDataFiles(config ClassifyConfig) (*ClassifyState, error) {
var filePaths []string
var err error

if config.File != "" {
filePaths = []string{config.File}
} else {
filePaths, err = utils.FindDataFiles(config.Folder)
if err != nil {
return nil, fmt.Errorf("find data files: %w", err)
}
}

if len(filePaths) == 0 {
return nil, fmt.Errorf("no .data files found")
}

// Parse all files
dataFiles := make([]*utils.DataFile, 0, len(filePaths))
for _, path := range filePaths {
df, err := utils.ParseDataFile(path)
if err != nil {
continue // skip invalid files
}
dataFiles = append(dataFiles, df)
}

if len(dataFiles) == 0 {
return nil, fmt.Errorf("no valid .data files")
}

// Sort files by name (earliest to latest by filename timestamp)
sort.Slice(dataFiles, func(i, j int) bool {
return dataFiles[i].FilePath < dataFiles[j].FilePath
})

// Compute filtered segments once, remove files with no matches
hasFilter := config.Filter != "" || config.Species != "" || config.Certainty >= 0
var kept []*utils.DataFile
var cachedSegs [][]*utils.Segment
var timeFiltered int

for _, df := range dataFiles {
var segs []*utils.Segment
if !hasFilter {
segs = df.Segments
} else {
for _, seg := range df.Segments {
if seg.SegmentMatchesFilters(config.Filter, config.Species, config.CallType, config.Certainty) {
segs = append(segs, seg)
}
}
if len(segs) == 0 {
continue // skip files with no matching segments
}
}
// Day/night filter: runs after segment filter to avoid IsNight on irrelevant files.
if config.Night || config.Day {
wavPath := filepath.Clean(strings.TrimSuffix(df.FilePath, ".data"))
result, err := IsNight(IsNightInput{
FilePath: wavPath,
Lat: config.Lat,
Lng: config.Lng,
Timezone: config.Timezone,
})
if err != nil {
fmt.Fprintf(os.Stderr, "warning: skipping %s (isnight error: %v)\n", wavPath, err)
timeFiltered++
continue
}
if config.Night && !result.SolarNight {
timeFiltered++
continue
}
if config.Day && !result.DiurnalActive {
timeFiltered++
continue
}
}
kept = append(kept, df)
cachedSegs = append(cachedSegs, segs)
}

// Phase 4 - Random sampling (last filter step, preserves chronological order)
if config.Sample > 0 && config.Sample < 100 {
rng := rand.New(rand.NewSource(time.Now().UnixNano()))
kept, cachedSegs = applySampling(kept, cachedSegs, config.Sample, rng)
}

total := 0
for _, segs := range cachedSegs {
total += len(segs)
}

state := &ClassifyState{
Config: config,
DataFiles: kept,
filteredSegs: cachedSegs,
totalSegs: total,
TimeFilteredCount: timeFiltered,
}

// Handle --goto: find file by basename and set initial position
if config.Goto != "" {
found := false
for i, df := range state.DataFiles {
base := df.FilePath[strings.LastIndex(df.FilePath, "/")+1:]
if base == config.Goto {
state.FileIdx = i
found = true
break
}
}
if !found {
return nil, fmt.Errorf("goto file not found (or has no matching segments): %s", config.Goto)
}
}

return state, nil
}

// applySampling randomly selects sample% of segments from the filtered set.
// The returned files and segments preserve the original chronological order.
func applySampling(kept []*utils.DataFile, cachedSegs [][]*utils.Segment, sample int, rng *rand.Rand) ([]*utils.DataFile, [][]*utils.Segment) {
flat := make([]struct{ fileIdx, segIdx int }, 0)
for fi, segs := range cachedSegs {
for si := range segs {
flat = append(flat, struct{ fileIdx, segIdx int }{fi, si})
}
}

targetCount := max(len(flat)*sample/100, 1)

rng.Shuffle(len(flat), func(i, j int) { flat[i], flat[j] = flat[j], flat[i] })
selected := flat[:targetCount]

// Restore chronological order before rebuilding
sort.Slice(selected, func(i, j int) bool {
if selected[i].fileIdx != selected[j].fileIdx {
return selected[i].fileIdx < selected[j].fileIdx
}
return selected[i].segIdx < selected[j].segIdx
})

newCached := make([][]*utils.Segment, len(cachedSegs))
for _, ref := range selected {
newCached[ref.fileIdx] = append(newCached[ref.fileIdx], cachedSegs[ref.fileIdx][ref.segIdx])
}

var newKept []*utils.DataFile
var finalCached [][]*utils.Segment
for i, segs := range newCached {
if len(segs) > 0 {
newKept = append(newKept, kept[i])
finalCached = append(finalCached, segs)
}
}
return newKept, finalCached
}

// FilteredSegs returns the cached filtered segments parallel to DataFiles.
func (s *ClassifyState) FilteredSegs() [][]*utils.Segment {
return s.filteredSegs
}

// CurrentFile returns the current data file
func (s *ClassifyState) CurrentFile() *utils.DataFile {
if s.FileIdx >= len(s.DataFiles) {
return nil
}
return s.DataFiles[s.FileIdx]
}

// CurrentSegment returns the current segment
func (s *ClassifyState) CurrentSegment() *utils.Segment {
if s.FileIdx >= len(s.filteredSegs) {
return nil
}
segs := s.filteredSegs[s.FileIdx]
if s.SegmentIdx >= len(segs) {
return nil
}
return segs[s.SegmentIdx]
}

// TotalSegments returns total segments to review
func (s *ClassifyState) TotalSegments() int {
return s.totalSegs
}

// CurrentSegmentNumber returns 1-based segment number
func (s *ClassifyState) CurrentSegmentNumber() int {
count := 0
for i := 0; i < s.FileIdx; i++ {
count += len(s.filteredSegs[i])
}
return count + s.SegmentIdx + 1
}

// NextSegment moves to the next segment, returns false if at end
func (s *ClassifyState) NextSegment() bool {
if s.FileIdx >= len(s.filteredSegs) {
return false
}

segs := s.filteredSegs[s.FileIdx]
if s.SegmentIdx+1 < len(segs) {
s.SegmentIdx++
return true
}

// Move to next file
if s.FileIdx+1 < len(s.DataFiles) {
s.FileIdx++
s.SegmentIdx = 0
return true
}

return false
}

// PrevSegment moves to the previous segment, returns false if at start
func (s *ClassifyState) PrevSegment() bool {
if s.SegmentIdx > 0 {
s.SegmentIdx--
return true
}

// Move to previous file
if s.FileIdx > 0 {
s.FileIdx--
segs := s.filteredSegs[s.FileIdx]
s.SegmentIdx = max(len(segs)-1, 0)
return true
}

return false
}

// ParseKeyBuffer parses a single key into binding result
func (s *ClassifyState) ParseKeyBuffer(key string) *BindingResult {
for _, b := range s.Config.Bindings {
if b.Key == key {
return &BindingResult{
Species: b.Species,
CallType: b.CallType,
}
}
}
return nil
}

// SetComment sets the comment on the current segment's filter label.
// Returns the previous comment (for undo) or empty string if none.
func (s *ClassifyState) SetComment(comment string) string {
seg := s.CurrentSegment()
if seg == nil {
return ""
}

df := s.CurrentFile()
if df == nil {
return ""
}

// Set reviewer
df.Meta.Reviewer = s.Config.Reviewer

// Get labels matching filter
filterLabels := seg.GetFilterLabels(s.Config.Filter)

var oldComment string
if len(filterLabels) == 0 {
// No matching labels, add new one with comment
label := &utils.Label{
Species: "Don't Know",
Certainty: 0,
Filter: s.Config.Filter,
Comment: comment,
}
seg.Labels = append(seg.Labels, label)
} else {
// Set comment on first matching label
oldComment = filterLabels[0].Comment
filterLabels[0].Comment = comment
}

s.Dirty = true
return oldComment
}

// GetCurrentComment returns the comment on the current segment's filter label.
func (s *ClassifyState) GetCurrentComment() string {
seg := s.CurrentSegment()
if seg == nil {
return ""
}

filterLabels := seg.GetFilterLabels(s.Config.Filter)
if len(filterLabels) == 0 {
return ""
}
return filterLabels[0].Comment
}

// ApplyBinding applies a binding result to the current segment
func (s *ClassifyState) ApplyBinding(result *BindingResult) {
seg := s.CurrentSegment()
if seg == nil {
return
}

df := s.CurrentFile()
if df == nil {
return
}

// Set reviewer
df.Meta.Reviewer = s.Config.Reviewer

// Get labels matching filter
filterLabels := seg.GetFilterLabels(s.Config.Filter)

// Determine certainty: 0 for Don't Know, 100 for others
certainty := 100
if result.Species == "Don't Know" {
certainty = 0
}

if len(filterLabels) == 0 {
// No matching labels, add new one
seg.Labels = append(seg.Labels, &utils.Label{
Species: result.Species,
Certainty: certainty,
Filter: s.Config.Filter,
CallType: result.CallType,
})
} else {
// Edit first matching label, remove rest
filterLabels[0].Species = result.Species
filterLabels[0].Certainty = certainty
filterLabels[0].CallType = result.CallType // always set (empty = remove)

// Remove extra matching labels
if len(filterLabels) > 1 {
var newLabels []*utils.Label
for _, l := range seg.Labels {
keep := !slices.Contains(filterLabels[1:], l)
if keep {
newLabels = append(newLabels, l)
}
}
seg.Labels = newLabels
}
}

// Re-sort labels
sort.Slice(seg.Labels, func(i, j int) bool {
return seg.Labels[i].Species < seg.Labels[j].Species
})

s.Dirty = true
}

// ApplyCallTypeOnly sets the CallType on the current segment's first
// filter-matching label. Used after a Shift+primary keypress labeled the
// species and we now receive the secondary key for the calltype.
// No-op if there is no matching label to update.
func (s *ClassifyState) ApplyCallTypeOnly(callType string) {
seg := s.CurrentSegment()
if seg == nil {
return
}
df := s.CurrentFile()
if df == nil {
return
}
filterLabels := seg.GetFilterLabels(s.Config.Filter)
if len(filterLabels) == 0 {
return
}
df.Meta.Reviewer = s.Config.Reviewer
filterLabels[0].CallType = callType
s.Dirty = true
}

// HasSecondary reports whether the given primary key has any secondary
// (calltype) bindings configured.
func (s *ClassifyState) HasSecondary(primaryKey string) bool {
return len(s.Config.SecondaryBindings[primaryKey]) > 0
}

// ConfirmLabel upgrades the current segment's existing filter label certainty
// to 100. Returns true if a write is needed (label existed and was below 100).
// Returns false for Don't Know (certainty=0) — confirming a Don't Know is a no-op;
// the caller should just advance to the next segment.
func (s *ClassifyState) ConfirmLabel() bool {
seg := s.CurrentSegment()
if seg == nil {
return false
}
filterLabels := seg.GetFilterLabels(s.Config.Filter)
if len(filterLabels) == 0 {
return false
}
if filterLabels[0].Certainty == 0 {
return false
}
if filterLabels[0].Certainty == 100 {
return false
}
df := s.CurrentFile()
if df == nil {
return false
}
df.Meta.Reviewer = s.Config.Reviewer
filterLabels[0].Certainty = 100
s.Dirty = true
return true
}

// Save saves the current file
func (s *ClassifyState) Save() error {
df := s.CurrentFile()
if df == nil {
return nil
}

if !s.Dirty {
return nil
}

err := df.Write(df.FilePath)
if err != nil {
return err
}

s.Dirty = false
return nil
}

// getFilterLabel returns the label matching the current filter, or first label if no filter.
func (s *ClassifyState) getFilterLabel(seg *utils.Segment) *utils.Label {
if s.Config.Filter == "" {
if len(seg.Labels) > 0 {
return seg.Labels[0]
}
return nil
}
for _, label := range seg.Labels {
if label.Filter == s.Config.Filter {
return label
}
}
return nil
}

// getOrCreateFilterLabel gets existing label or creates new one for the current filter.
func (s *ClassifyState) getOrCreateFilterLabel(seg *utils.Segment) *utils.Label {
label := s.getFilterLabel(seg)
if label != nil {
return label
}
// Create new label
label = &utils.Label{
Species: "Don't Know",
Certainty: 0,
Filter: s.Config.Filter,
}
seg.Labels = append(seg.Labels, label)
s.Dirty = true
return label
}

// HasBookmark returns true if current segment has a bookmark on the filter label.
func (s *ClassifyState) HasBookmark() bool {
seg := s.CurrentSegment()
if seg == nil {
return false
}
label := s.getFilterLabel(seg)
return label != nil && label.Bookmark
}

// ToggleBookmark toggles the bookmark on the current segment's filter label.
func (s *ClassifyState) ToggleBookmark() {
seg := s.CurrentSegment()
if seg == nil {
return
}

df := s.CurrentFile()
if df == nil {
return
}

// Set reviewer
df.Meta.Reviewer = s.Config.Reviewer

label := s.getOrCreateFilterLabel(seg)
label.Bookmark = !label.Bookmark
s.Dirty = true
}

// NextBookmark navigates to the next bookmark, wrapping around if needed.
// Returns false if no bookmarks found (back at start position).
func (s *ClassifyState) NextBookmark() bool {
startFile := s.FileIdx
startSeg := s.SegmentIdx
first := true

for {
// Advance to next segment
if !s.NextSegment() {
// Wrap to start of folder
s.FileIdx = 0
s.SegmentIdx = 0
}

// Check if we've looped back to start
if !first && s.FileIdx == startFile && s.SegmentIdx == startSeg {
return false // full circle, no bookmark found
}
first = false

// Check if current segment has bookmark
if s.hasFilterBookmark() {
return true
}
}
}

// PrevBookmark navigates to the previous bookmark, wrapping around if needed.
// Returns false if no bookmarks found (back at start position).
func (s *ClassifyState) PrevBookmark() bool {
startFile := s.FileIdx
startSeg := s.SegmentIdx
first := true

for {
// Move to previous segment
if !s.PrevSegment() {
// Wrap to end of folder
s.FileIdx = len(s.DataFiles) - 1
segs := s.filteredSegs[s.FileIdx]
s.SegmentIdx = max(len(segs)-1, 0)
}

// Check if we've looped back to start
if !first && s.FileIdx == startFile && s.SegmentIdx == startSeg {
return false // full circle, no bookmark found
}
first = false

// Check if current segment has bookmark
if s.hasFilterBookmark() {
return true
}
}
}

// hasFilterBookmark checks if current segment has bookmark on filter-matching label.
func (s *ClassifyState) hasFilterBookmark() bool {
seg := s.CurrentSegment()
if seg == nil {
return false
}
label := s.getFilterLabel(seg)
return label != nil && label.Bookmark
}

// FormatLabels formats labels for display
func FormatLabels(labels []*utils.Label, filter string) string {
var parts []string
for _, l := range labels {
if filter != "" && l.Filter != filter {
continue
}
part := l.Species
if l.CallType != "" {
part += "/" + l.CallType
}
part += fmt.Sprintf(" (%d%%)", l.Certainty)
if l.Filter != "" {
part += " [" + l.Filter + "]"
}
if l.Comment != "" {
part += fmt.Sprintf(" \"%s\"", l.Comment)
}
parts = append(parts, part)
}
return strings.Join(parts, ", ")
}
file addition: bulk_file_import.go (----------)

[0.248737]

package tools

import (
"context"
"database/sql"
"encoding/csv"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"

"skraak/db"
"skraak/utils"
)

// BulkFileImportInput defines the input parameters for the bulk_file_import tool
type BulkFileImportInput struct {
DatasetID string `json:"dataset_id"`
CSVPath string `json:"csv_path"`
LogFilePath string `json:"log_file_path"`
}

// BulkFileImportOutput defines the output structure for the bulk_file_import tool
type BulkFileImportOutput struct {
TotalLocations int `json:"total_locations"`
ClustersCreated int `json:"clusters_created"`
ClustersExisting int `json:"clusters_existing"`
TotalFilesScanned int `json:"total_files_scanned"`
FilesImported int `json:"files_imported"`
FilesDuplicate int `json:"files_duplicate"`
FilesError int `json:"files_error"`
ProcessingTime string `json:"processing_time"`
Errors []string `json:"errors,omitempty"`
}

// bulkLocationData holds CSV row data for a location
type bulkLocationData struct {
LocationName string
LocationID string
DirectoryPath string
DateRange string
SampleRate int
FileCount int
}

// bulkImportStats tracks import statistics for a single cluster
type bulkImportStats struct {
TotalFiles int
ImportedFiles int
DuplicateFiles int
ErrorFiles int
}

// progressLogger handles writing to both log file and internal buffer
type progressLogger struct {
file *os.File
buffer *strings.Builder
}

// Log writes a formatted message with timestamp to both log file and buffer
func (l *progressLogger) Log(format string, args ...any) {
timestamp := time.Now().Format("2006-01-02 15:04:05")
message := fmt.Sprintf(format, args...)
line := fmt.Sprintf("[%s] %s\n", timestamp, message)

// Write to file; log write failures are non-fatal for import progress
if _, err := l.file.WriteString(line); err != nil {
fmt.Fprintf(os.Stderr, "Warning: log write failed: %v\n", err)
}
if err := l.file.Sync(); err != nil {
fmt.Fprintf(os.Stderr, "Warning: log sync failed: %v\n", err)
}

// Also keep in memory for potential error reporting
l.buffer.WriteString(line)
}

// BulkFileImport imports WAV files across multiple locations using CSV specification
func BulkFileImport(
ctx context.Context,
input BulkFileImportInput,
) (BulkFileImportOutput, error) {
startTime := time.Now()
var output BulkFileImportOutput

// Open log file
logFile, err := os.OpenFile(input.LogFilePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
if err != nil {
return output, fmt.Errorf("failed to open log file: %w", err)
}
defer func() { _ = logFile.Close() }()

logger := &progressLogger{
file: logFile,
buffer: &strings.Builder{},
}

logger.Log("Starting bulk file import for dataset %s", input.DatasetID)

// Phase 0: Validate input
logger.Log("Validating input parameters...")
if err := bulkValidateInput(input); err != nil {
logger.Log("ERROR: Validation failed: %v", err)
output.Errors = []string{fmt.Sprintf("validation failed: %v", err)}
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("validation failed: %w", err)
}
logger.Log("Validation complete")

// Phase 1: Read CSV
logger.Log("Reading CSV file: %s", input.CSVPath)
locations, err := bulkReadCSV(input.CSVPath)
if err != nil {
logger.Log("ERROR: Failed to read CSV: %v", err)
output.Errors = []string{fmt.Sprintf("failed to read CSV: %v", err)}
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("failed to read CSV: %w", err)
}
logger.Log("Loaded %d locations from CSV", len(locations))
output.TotalLocations = len(locations)

// Phase 1.5: Validate all location_ids belong to the dataset
logger.Log("Validating location_ids belong to dataset...")
readDB, err := db.OpenReadOnlyDB(dbPath)
if err != nil {
logger.Log("ERROR: Failed to open database: %v", err)
output.Errors = []string{fmt.Sprintf("failed to open database: %v", err)}
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("failed to open database: %w", err)
}

locationErrors := bulkValidateLocationsBelongToDataset(readDB, locations, input.DatasetID)
readDB.Close()

if len(locationErrors) > 0 {
for _, locErr := range locationErrors {
logger.Log("ERROR: %s", locErr)
}
output.Errors = locationErrors
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("location validation failed: %d location(s) do not belong to dataset %s", len(locationErrors), input.DatasetID)
}
logger.Log("Location validation complete")

// Phase 2: Create/Validate Clusters
logger.Log("=== Phase 1: Creating/Validating Clusters ===")
clusterIDMap := make(map[string]string) // "locationID|dateRange" -> clusterID

database, err := db.OpenWriteableDB(dbPath)
if err != nil {
logger.Log("ERROR: Failed to open database: %v", err)
output.Errors = []string{fmt.Sprintf("failed to open database: %v", err)}
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

for i, loc := range locations {
logger.Log("[%d/%d] Processing location: %s", i+1, len(locations), loc.LocationName)

// Check if cluster already exists
var existingClusterID string
err := database.QueryRow(`
SELECT id FROM cluster
WHERE location_id = ? AND name = ? AND active = true
`, loc.LocationID, loc.DateRange).Scan(&existingClusterID)

var clusterID string
if err == sql.ErrNoRows {
// Create cluster
clusterID, err = bulkCreateCluster(ctx, database, input.DatasetID, loc.LocationID, loc.DateRange, loc.SampleRate)
if err != nil {
errMsg := fmt.Sprintf("Failed to create cluster for location %s: %v", loc.LocationName, err)
logger.Log("ERROR: %s", errMsg)
output.Errors = append(output.Errors, errMsg)
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("failed to create cluster: %w", err)
}
logger.Log(" Created cluster: %s", clusterID)
output.ClustersCreated++
} else if err != nil {
errMsg := fmt.Sprintf("Failed to check cluster for location %s: %v", loc.LocationName, err)
logger.Log("ERROR: %s", errMsg)
output.Errors = append(output.Errors, errMsg)
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("failed to check cluster: %w", err)
} else {
clusterID = existingClusterID
logger.Log(" Using existing cluster: %s", clusterID)
output.ClustersExisting++
}

compositeKey := loc.LocationID + "|" + loc.DateRange
clusterIDMap[compositeKey] = clusterID
}

logger.Log("=== Phase 2: Importing Files ===")

totalImported := 0
totalDuplicates := 0
totalErrors := 0
totalScanned := 0

for i, loc := range locations {
compositeKey := loc.LocationID + "|" + loc.DateRange
clusterID, ok := clusterIDMap[compositeKey]
if !ok {
continue // Should not happen, but safety check
}

logger.Log("[%d/%d] Importing files for: %s", i+1, len(locations), loc.LocationName)
logger.Log(" Directory: %s", loc.DirectoryPath)

// Check if directory exists
if _, err := os.Stat(loc.DirectoryPath); os.IsNotExist(err) {
logger.Log(" WARNING: Directory not found, skipping")
continue
}

// Import files
stats, err := bulkImportFilesForCluster(database, logger, loc.DirectoryPath, input.DatasetID, loc.LocationID, clusterID)
if err != nil {
errMsg := fmt.Sprintf("Failed to import files for location %s: %v", loc.LocationName, err)
logger.Log("ERROR: %s", errMsg)
output.Errors = append(output.Errors, errMsg)
output.TotalFilesScanned = totalScanned
output.FilesImported = totalImported
output.FilesDuplicate = totalDuplicates
output.FilesError = totalErrors
output.ProcessingTime = time.Since(startTime).String()
return output, fmt.Errorf("failed to import files: %w", err)
}

logger.Log(" Scanned: %d files", stats.TotalFiles)
logger.Log(" Imported: %d, Duplicates: %d", stats.ImportedFiles, stats.DuplicateFiles)
if stats.ErrorFiles > 0 {
logger.Log(" Errors: %d files", stats.ErrorFiles)
}

totalScanned += stats.TotalFiles
totalImported += stats.ImportedFiles
totalDuplicates += stats.DuplicateFiles
totalErrors += stats.ErrorFiles
}

logger.Log("=== Import Complete ===")
logger.Log("Total files scanned: %d", totalScanned)
logger.Log("Files imported: %d", totalImported)
logger.Log("Duplicates skipped: %d", totalDuplicates)
logger.Log("Errors: %d", totalErrors)
logger.Log("Processing time: %s", time.Since(startTime).Round(time.Second))

output.TotalFilesScanned = totalScanned
output.FilesImported = totalImported
output.FilesDuplicate = totalDuplicates
output.FilesError = totalErrors
output.ProcessingTime = time.Since(startTime).String()

return output, nil
}

// bulkValidateInput validates input parameters
func bulkValidateInput(input BulkFileImportInput) error {
// Validate ID format first (fast fail before DB queries)
if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
return err
}

// Verify CSV file exists
if _, err := os.Stat(input.CSVPath); err != nil {
return fmt.Errorf("CSV file not accessible: %w", err)
}

// Verify log file path is writable
logDir := filepath.Dir(input.LogFilePath)
if _, err := os.Stat(logDir); err != nil {
return fmt.Errorf("log file directory not accessible: %w", err)
}

// Open database for validation queries
database, err := db.OpenReadOnlyDB(dbPath)
if err != nil {
return fmt.Errorf("failed to open database: %w", err)
}
defer database.Close()

// Verify dataset exists and is active
var datasetExists bool
err = database.QueryRow("SELECT EXISTS(SELECT 1 FROM dataset WHERE id = ? AND active = true)", input.DatasetID).Scan(&datasetExists)
if err != nil {
return fmt.Errorf("failed to query dataset: %w", err)
}
if !datasetExists {
return fmt.Errorf("dataset not found or inactive: %s", input.DatasetID)
}

// Verify dataset is 'structured' type (file imports only support structured datasets)
if err := utils.ValidateDatasetTypeForImport(database, input.DatasetID); err != nil {
return err
}

return nil
}

// bulkValidateLocationsBelongToDataset validates that all unique location_ids in the CSV belong to the dataset
func bulkValidateLocationsBelongToDataset(dbConn *sql.DB, locations []bulkLocationData, datasetID string) []string {
var errors []string

// Collect unique location_ids
uniqueLocations := make(map[string]bool)
for _, loc := range locations {
uniqueLocations[loc.LocationID] = true
}

// Validate each unique location_id
for locationID := range uniqueLocations {
if err := utils.ValidateLocationBelongsToDataset(dbConn, locationID, datasetID); err != nil {
errors = append(errors, err.Error())
}
}

return errors
}

// bulkReadCSV reads and parses the CSV file
func bulkReadCSV(path string) ([]bulkLocationData, error) {
file, err := os.Open(path)
if err != nil {
return nil, err
}
defer func() { _ = file.Close() }()

reader := csv.NewReader(file)
records, err := reader.ReadAll()
if err != nil {
return nil, err
}

if len(records) == 0 {
return nil, fmt.Errorf("CSV file is empty")
}

var locations []bulkLocationData
for i, record := range records {
if i == 0 {
continue // Skip header
}

if len(record) < 6 {
return nil, fmt.Errorf("CSV row %d has insufficient columns (expected 6, got %d)", i+1, len(record))
}

// Validate required string fields are non-empty
locationName := strings.TrimSpace(record[0])
if locationName == "" {
return nil, fmt.Errorf("empty location_name in row %d", i+1)
}
directoryPath := strings.TrimSpace(record[2])
if directoryPath == "" {
return nil, fmt.Errorf("empty directory_path in row %d", i+1)
}
dateRange := strings.TrimSpace(record[3])
if dateRange == "" {
return nil, fmt.Errorf("empty date_range in row %d", i+1)
}

// Validate location_id format
locationID := record[1]
if err := utils.ValidateShortID(locationID, "location_id"); err != nil {
return nil, fmt.Errorf("invalid location_id in row %d: %v", i+1, err)
}

sampleRate, err := strconv.Atoi(record[4])
if err != nil {
return nil, fmt.Errorf("invalid sample_rate in row %d: %v", i+1, err)
}

// Validate sample rate is in reasonable range
if err := utils.ValidateSampleRate(sampleRate); err != nil {
return nil, fmt.Errorf("invalid sample_rate in row %d: %v", i+1, err)
}

fileCount, err := strconv.Atoi(record[5])
if err != nil {
return nil, fmt.Errorf("invalid file_count in row %d: %v", i+1, err)
}

locations = append(locations, bulkLocationData{
LocationName: locationName,
LocationID: locationID,
DirectoryPath: directoryPath,
DateRange: dateRange,
SampleRate: sampleRate,
FileCount: fileCount,
})
}

return locations, nil
}

// bulkCreateCluster creates a new cluster in the database
func bulkCreateCluster(ctx context.Context, database *sql.DB, datasetID, locationID, name string, sampleRate int) (string, error) {
// Generate a 12-character nanoid
clusterID, err := utils.GenerateShortID()
if err != nil {
return "", fmt.Errorf("failed to generate cluster ID: %v", err)
}
now := time.Now().UTC()

// Get location name for the path
var locationName string
err = database.QueryRow("SELECT name FROM location WHERE id = ?", locationID).Scan(&locationName)
if err != nil {
return "", fmt.Errorf("failed to get location name: %v", err)
}

// Normalize path: replace spaces and special characters
path := strings.ReplaceAll(locationName, " ", "_")
path = strings.ReplaceAll(path, "/", "_")

tx, err := db.BeginLoggedTx(ctx, database, "bulk_file_import")
if err != nil {
return "", fmt.Errorf("failed to begin transaction: %w", err)
}
defer tx.Rollback()

_, err = tx.ExecContext(ctx, `
INSERT INTO cluster (id, dataset_id, location_id, name, path, sample_rate, active, created_at, last_modified)
VALUES (?, ?, ?, ?, ?, ?, true, ?, ?)
`, clusterID, datasetID, locationID, name, path, sampleRate, now, now)
if err != nil {
return "", fmt.Errorf("failed to insert cluster: %w", err)
}

if err = tx.Commit(); err != nil {
return "", fmt.Errorf("failed to commit cluster creation: %w", err)
}

return clusterID, nil
}

// bulkImportFilesForCluster imports all WAV files for a single cluster
func bulkImportFilesForCluster(database *sql.DB, logger *progressLogger, folderPath, datasetID, locationID, clusterID string) (*bulkImportStats, error) {
stats := &bulkImportStats{}

// Check if directory exists
if _, err := os.Stat(folderPath); os.IsNotExist(err) {
logger.Log(" WARNING: Directory not found, skipping")
return stats, nil
}

// Import the cluster (SAME LOGIC AS import_files.go)
logger.Log(" Importing cluster %s", clusterID)
clusterOutput, err := utils.ImportCluster(database, utils.ClusterImportInput{
FolderPath: folderPath,
DatasetID: datasetID,
LocationID: locationID,
ClusterID: clusterID,
Recursive: true,
})
if err != nil {
return nil, err
}

// Map to bulk import stats
stats.TotalFiles = clusterOutput.TotalFiles
stats.ImportedFiles = clusterOutput.ImportedFiles
stats.DuplicateFiles = clusterOutput.SkippedFiles
stats.ErrorFiles = clusterOutput.FailedFiles

// Log errors
for i, fileErr := range clusterOutput.Errors {
if i < 5 { // Log first 5
logger.Log(" ERROR: %s: %s", fileErr.FileName, fileErr.Error)
}
}

logger.Log(" Complete: %d imported, %d duplicates, %d errors", stats.ImportedFiles, stats.DuplicateFiles, stats.ErrorFiles)

return stats, nil
}
file addition: shell_scripts (d--r------)

[2.1]
file addition: test_write_tools.sh (---r------)

[0.638309]

#!/bin/bash
# Test skraak create/update commands for dataset, location, cluster, pattern
# Usage: ./test_write_tools.sh
# Uses fresh copy of production DB in /tmp (auto-cleaned)

source "$(dirname "$0")/test_lib.sh"

echo "=== Testing create/update CLI Commands ==="
echo ""

check_binary

# Create fresh test database
DB_PATH=$(fresh_test_db)
trap "cleanup_test_db '$DB_PATH'" EXIT
echo "Using fresh test database: $DB_PATH"
echo ""

SKRAAK="$PROJECT_DIR/skraak"

# === PART 1: CREATE MODE ===
echo "=== PART 1: CREATE MODE ==="
echo ""

# Test 1: Create pattern
echo "Test 1: Create pattern"
result=$($SKRAAK create pattern --db "$DB_PATH" --record 60 --sleep 300 2>&1)
PATTERN_ID=$(echo "$result" | jq -r '.pattern.id // empty')
if [ -n "$PATTERN_ID" ]; then
echo -e "${GREEN}✓${NC} Create pattern (ID: $PATTERN_ID)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Create pattern failed: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 2: Create pattern with negative values (should fail)
echo ""
echo "Test 2: Create pattern with negative values (should fail)"
result=$($SKRAAK create pattern --db "$DB_PATH" --record -10 --sleep 300 2>&1 || true)
if echo "$result" | grep -qi "error\|must be positive\|validation"; then
echo -e "${GREEN}✓${NC} Reject negative pattern values"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Should have rejected negative values: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 3: Create dataset
echo ""
echo "Test 3: Create dataset"
result=$($SKRAAK create dataset --db "$DB_PATH" --name "Test Dataset 2026" --description "Automated test" --type structured 2>&1)
DATASET_ID=$(echo "$result" | jq -r '.dataset.id // empty')
if [ -n "$DATASET_ID" ]; then
echo -e "${GREEN}✓${NC} Create dataset (ID: $DATASET_ID)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Create dataset failed: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 4: Create dataset with invalid type (should fail)
echo ""
echo "Test 4: Create dataset with invalid type (should fail)"
result=$($SKRAAK create dataset --db "$DB_PATH" --name "Bad Dataset" --type invalid_type 2>&1 || true)
if echo "$result" | grep -qi "error\|invalid\|must be"; then
echo -e "${GREEN}✓${NC} Reject invalid dataset type"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Should have rejected invalid type: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 5: Create location
echo ""
echo "Test 5: Create location"
result=$($SKRAAK create location --db "$DB_PATH" --dataset "$DATASET_ID" --name "Test Location" --lat -41.2865 --lon 174.7762 --timezone Pacific/Auckland 2>&1)
LOCATION_ID=$(echo "$result" | jq -r '.location.id // empty')
if [ -n "$LOCATION_ID" ]; then
echo -e "${GREEN}✓${NC} Create location (ID: $LOCATION_ID)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Create location failed: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 6: Create location with invalid latitude (should fail)
echo ""
echo "Test 6: Create location with invalid latitude (should fail)"
result=$($SKRAAK create location --db "$DB_PATH" --dataset "$DATASET_ID" --name "Bad Location" --lat 999 --lon 174.7762 --timezone Pacific/Auckland 2>&1 || true)
if echo "$result" | grep -qi "error\|latitude\|must be"; then
echo -e "${GREEN}✓${NC} Reject invalid coordinates"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Should have rejected invalid coordinates: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 7: Create cluster
echo ""
echo "Test 7: Create cluster"
result=$($SKRAAK create cluster --db "$DB_PATH" --dataset "$DATASET_ID" --location "$LOCATION_ID" --name "Test Cluster" --sample-rate 250000 2>&1)
CLUSTER_ID=$(echo "$result" | jq -r '.cluster.id // empty')
if [ -n "$CLUSTER_ID" ]; then
echo -e "${GREEN}✓${NC} Create cluster (ID: $CLUSTER_ID)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Create cluster failed: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 8: Create cluster with negative sample rate (should fail)
echo ""
echo "Test 8: Create cluster with negative sample rate (should fail)"
result=$($SKRAAK create cluster --db "$DB_PATH" --dataset "$DATASET_ID" --location "$LOCATION_ID" --name "Bad Cluster" --sample-rate -1000 2>&1 || true)
if echo "$result" | grep -qi "error\|sample.rate\|must be positive\|validation"; then
echo -e "${GREEN}✓${NC} Reject negative sample rate"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Should have rejected negative sample rate: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# === PART 2: UPDATE MODE ===
echo ""
echo "=== PART 2: UPDATE MODE ==="
echo ""

# Test 9: Update dataset name
echo "Test 9: Update dataset name (ID: $DATASET_ID)"
echo " NOTE: Skipped due to DuckDB FK limitation on UPDATE"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
echo -e "${GREEN}✓${NC} Update dataset (skipped - DuckDB FK limitation)"

# Test 10: Update location
echo ""
echo "Test 10: Update location coordinates"
result=$($SKRAAK update location --db "$DB_PATH" --id "$LOCATION_ID" --lat -41.2900 --lon 174.7800 2>&1)
if echo "$result" | jq -e '.location.id' >/dev/null 2>&1; then
echo -e "${GREEN}✓${NC} Update location"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Update location failed: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 11: Update cluster
echo ""
echo "Test 11: Update cluster name"
result=$($SKRAAK update cluster --db "$DB_PATH" --id "$CLUSTER_ID" --name "Updated Cluster Name" 2>&1)
if echo "$result" | jq -e '.cluster.id' >/dev/null 2>&1; then
echo -e "${GREEN}✓${NC} Update cluster"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Update cluster failed: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 12: Update pattern
echo ""
echo "Test 12: Update pattern durations"
result=$($SKRAAK update pattern --db "$DB_PATH" --id "$PATTERN_ID" --record 120 --sleep 600 2>&1)
if echo "$result" | jq -e '.pattern.id' >/dev/null 2>&1; then
echo -e "${GREEN}✓${NC} Update pattern"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Update pattern failed: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 13: Update with invalid ID
echo ""
echo "Test 13: Update with non-existent ID (should fail)"
result=$($SKRAAK update dataset --db "$DB_PATH" --id "NOTAREALID123" --name "Should Fail" 2>&1 || true)
if echo "$result" | grep -qi "error\|not found\|does not exist"; then
echo -e "${GREEN}✓${NC} Reject non-existent ID"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Should have rejected non-existent ID: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

echo ""
print_summary
file addition: test_time.sh (---r------)

[0.638309]

#!/bin/bash
# Test skraak time command
# Usage: ./test_time.sh
# No database required

source "$(dirname "$0")/test_lib.sh"

echo "=== Testing skraak time ==="
echo ""

check_binary

# Test 1: Get current time
echo "Test 1: Get current time"
result=$($PROJECT_DIR/skraak time 2>&1)

time_val=$(echo "$result" | jq -r '.time // empty')
timezone=$(echo "$result" | jq -r '.timezone // empty')
unix_ts=$(echo "$result" | jq -r '.unix // empty')

if [ -n "$time_val" ] && [ -n "$timezone" ] && [ -n "$unix_ts" ]; then
echo -e "${GREEN}✓${NC} time returns all fields"
echo " Time: $time_val"
echo " Timezone: $timezone"
echo " Unix: $unix_ts"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} time missing fields"
echo " Output: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 2: Unix timestamp is a valid number
echo ""
echo "Test 2: Unix timestamp is numeric and recent"
if [ "$unix_ts" -gt 1700000000 ] 2>/dev/null; then
echo -e "${GREEN}✓${NC} Unix timestamp is reasonable ($unix_ts)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Unix timestamp looks wrong ($unix_ts)"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 3: Time is valid RFC3339
echo ""
echo "Test 3: Time is valid RFC3339 format"
if echo "$time_val" | grep -qE '^[0-9]{4}-[0-9]{2}-[0-9]{2}T'; then
echo -e "${GREEN}✓${NC} Time is RFC3339 format"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Time format unexpected: $time_val"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

echo ""
print_summary
file addition: test_sql_output.txt (----------)

[0.638309]

[0;31mError: Database not found at ../db/test.duckdb[0m
file addition: test_sql_limit.sh (---r------)

[0.638309]

#!/bin/bash
# Test execute_sql "limited" flag behavior
# Usage: ./test_sql_limit.sh [db_path]
# Default: ../db/test.duckdb (ALWAYS USE TEST DATABASE!)
#
# This tests the fix for the bug where "limited" was always false
# even when results were truncated.

source "$(dirname "$0")/test_lib.sh"

# Get absolute paths before changing directory
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# Convert DB_PATH to absolute path (before we cd later)
DB_PATH_ARG="${1:-$PROJECT_DIR/db/test.duckdb}"
if [[ "$DB_PATH_ARG" = /* ]]; then
DB_PATH="$DB_PATH_ARG"
else
DB_PATH="$(cd "$(dirname "$DB_PATH_ARG")" && pwd)/$(basename "$DB_PATH_ARG")"
fi

if [ ! -f "$DB_PATH" ]; then
echo -e "${RED}Error: Database not found at $DB_PATH${NC}"
exit 1
fi

echo "=== Testing execute_sql 'limited' Flag ==="
echo ""
echo "Database: $DB_PATH"
echo ""

check_binary

# Navigate to the project directory where skraak binary is located
cd "$PROJECT_DIR" || exit 1

# Helper to run CLI command and capture JSON output (stderr discarded)
run_cli() {
"$PROJECT_DIR/skraak" "$@" 2>/dev/null || true
}

# Count total files in database for test planning
FILE_COUNT=$(run_cli sql --db "$DB_PATH" "SELECT COUNT(*) as cnt FROM file WHERE active = true" | jq -r '.rows[0].cnt // 0')
LOCATION_COUNT=$(run_cli sql --db "$DB_PATH" "SELECT COUNT(*) as cnt FROM location WHERE active = true" | jq -r '.rows[0].cnt // 0')
DATASET_COUNT=$(run_cli sql --db "$DB_PATH" "SELECT COUNT(*) as cnt FROM dataset WHERE active = true" | jq -r '.rows[0].cnt // 0')

echo "Database stats:"
echo " Files: $FILE_COUNT"
echo " Locations: $LOCATION_COUNT"
echo " Datasets: $DATASET_COUNT"
echo ""

# We need at least some files to test truncation
if [ "$FILE_COUNT" -lt 100 ]; then
echo -e "${YELLOW}Warning: Need at least 100 files to test truncation. Have $FILE_COUNT.${NC}"
echo "Some tests may be skipped."
echo ""
fi

TESTS_RUN=0
TESTS_PASSED=0
TESTS_FAILED=0

# Test helper: check limited flag and row count
test_limit_flag() {
local name="$1"
local expected_limited="$2"
local expected_row_count="$3"
local result="$4"

((TESTS_RUN++)) || true

# Note: jq '//' operator treats false as empty, so check for boolean explicitly
local actual_limited=$(echo "$result" | jq -r 'if has("limited") then (.limited | tostring) else "missing" end')
local actual_row_count=$(echo "$result" | jq -r '.row_count // -1')

if [ "$actual_limited" = "$expected_limited" ] && [ "$actual_row_count" -eq "$expected_row_count" ]; then
echo -e "${GREEN}✓${NC} $name"
echo " row_count=$actual_row_count, limited=$actual_limited"
((TESTS_PASSED++)) || true
return 0
else
echo -e "${RED}✗${NC} $name"
echo " Expected: row_count=$expected_row_count, limited=$expected_limited"
echo " Actual: row_count=$actual_row_count, limited=$actual_limited"
((TESTS_FAILED++)) || true
return 1
fi
}

# Test helper: check query_executed field
test_query_reported() {
local name="$1"
local expected_query_fragment="$2"
local result="$3"

((TESTS_RUN++)) || true

local query=$(echo "$result" | jq -r '.query_executed // ""')

if echo "$query" | grep -q "$expected_query_fragment"; then
echo -e "${GREEN}✓${NC} $name"
echo " query: $query"
((TESTS_PASSED++)) || true
return 0
else
echo -e "${RED}✗${NC} $name"
echo " Expected fragment: $expected_query_fragment"
echo " Actual query: $query"
((TESTS_FAILED++)) || true
return 1
fi
}

echo "=== Test 1: Auto-limit with truncation ==="
echo "Query without LIMIT on large table should trigger truncation"
if [ "$FILE_COUNT" -ge 100 ]; then
result=$(run_cli sql --db "$DB_PATH" "SELECT * FROM file WHERE active = true")
test_limit_flag "Auto-limit truncates results" "true" "1000" "$result"
test_query_reported "Query shows effective limit 1000" "LIMIT 1000" "$result"
else
echo -e "${YELLOW}⊘${NC} Skipped (need >= 100 files)"
fi
echo ""

echo "=== Test 2: Auto-limit without truncation ==="
echo "Query without LIMIT on small table should not truncate"
result=$(run_cli sql --db "$DB_PATH" "SELECT * FROM dataset WHERE active = true")
EXPECTED_ROWS=$DATASET_COUNT
test_limit_flag "Auto-limit no truncation" "false" "$EXPECTED_ROWS" "$result"
echo ""

echo "=== Test 3: User-provided LIMIT preserved ==="
echo "User's own LIMIT clause should be preserved"
result=$(run_cli sql --db "$DB_PATH" "SELECT * FROM file WHERE active = true LIMIT 5")
test_limit_flag "User LIMIT: limited=false" "false" "5" "$result"
test_query_reported "User LIMIT preserved in query" "LIMIT 5$" "$result"
echo ""

echo "=== Test 4: User LIMIT equal to default ==="
echo "User LIMIT 1000 should work (not double-limited)"
result=$(run_cli sql --db "$DB_PATH" "SELECT * FROM file WHERE active = true LIMIT 1000")
test_limit_flag "User LIMIT 1000: limited=false" "false" "1000" "$result"
test_query_reported "User LIMIT 1000 preserved" "LIMIT 1000$" "$result"
echo ""

echo "=== Test 5: Explicit --limit parameter with truncation ==="
echo "Using --limit 100 should truncate if table has > 100 rows"
if [ "$FILE_COUNT" -ge 100 ]; then
result=$(run_cli sql --db "$DB_PATH" --limit 100 "SELECT * FROM file WHERE active = true")
test_limit_flag "--limit 100 truncates" "true" "100" "$result"
test_query_reported "Query shows LIMIT 100" "LIMIT 100" "$result"
else
echo -e "${YELLOW}⊘${NC} Skipped (need >= 100 files)"
fi
echo ""

echo "=== Test 6: Explicit --limit parameter without truncation ==="
echo "Using --limit larger than table should not truncate"
result=$(run_cli sql --db "$DB_PATH" --limit 100 "SELECT * FROM dataset WHERE active = true")
EXPECTED_ROWS=$DATASET_COUNT
test_limit_flag "--limit > table size: no truncation" "false" "$EXPECTED_ROWS" "$result"
echo ""

echo "=== Test 7: Empty result set ==="
echo "Query returning no rows should have limited=false"
result=$(run_cli sql --db "$DB_PATH" "SELECT * FROM dataset WHERE id = 'NONEXISTENT_ID_12345'")
test_limit_flag "Empty result: limited=false" "false" "0" "$result"
echo ""

echo "=== Test 8: Small --limit with small table ==="
echo "--limit 1 on datasets should work correctly"
result=$(run_cli sql --db "$DB_PATH" --limit 1 "SELECT * FROM dataset WHERE active = true")
if [ "$DATASET_COUNT" -gt 1 ]; then
test_limit_flag "--limit 1 truncates (table has $DATASET_COUNT)" "true" "1" "$result"
else
test_limit_flag "--limit 1 no truncation (table has $DATASET_COUNT)" "false" "$DATASET_COUNT" "$result"
fi
echo ""

echo "=== Summary ==="
echo "Tests run: $TESTS_RUN"
echo -e "Passed: ${GREEN}$TESTS_PASSED${NC}"
if [ "$TESTS_FAILED" -gt 0 ]; then
echo -e "Failed: ${RED}$TESTS_FAILED${NC}"
exit 1
else
echo -e "Failed: $TESTS_FAILED"
fi
file addition: test_sql.sh (---r------)

[0.638309]

#!/bin/bash
# Test skraak sql command with various queries
# Usage: ./test_sql.sh [db_path]
# Default: uses test.duckdb (read-only tests)

source "$(dirname "$0")/test_lib.sh"

DB_PATH="${1:-$DEFAULT_TEST_DB}"

if [ ! -f "$DB_PATH" ]; then
echo -e "${RED}Error: Database not found at $DB_PATH${NC}"
exit 1
fi

echo "=== Testing skraak sql ==="
echo "Database: $DB_PATH"
echo ""

check_binary

# Helper to run CLI command and capture JSON output
run_cli() {
"$PROJECT_DIR/skraak" "$@" 2>/dev/null || true
}

# Test 1: Simple SELECT
echo "Test 1: Simple SELECT query"
result=$(run_cli sql --db "$DB_PATH" "SELECT id, name FROM dataset WHERE active = true LIMIT 5")
row_count=$(echo "$result" | jq -r '.row_count // -1')
if [ "$row_count" -ge 0 ]; then
echo -e "${GREEN}✓${NC} Simple SELECT returns results (row_count=$row_count)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Simple SELECT failed"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 2: SELECT with --limit parameter
echo ""
echo "Test 2: SELECT with --limit parameter"
result=$(run_cli sql --db "$DB_PATH" --limit 3 "SELECT id, name FROM location WHERE active = true")
row_count=$(echo "$result" | jq -r '.row_count // -1')
if [ "$row_count" -ge 0 ] && [ "$row_count" -le 3 ]; then
echo -e "${GREEN}✓${NC} SELECT with --limit works (row_count=$row_count)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} SELECT with --limit failed (row_count=$row_count)"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 3: JOIN query
echo ""
echo "Test 3: JOIN query across tables"
result=$(run_cli sql --db "$DB_PATH" "SELECT d.name, COUNT(l.id) as cnt FROM dataset d LEFT JOIN location l ON d.id = l.dataset_id WHERE d.active = true GROUP BY d.name LIMIT 5")
row_count=$(echo "$result" | jq -r '.row_count // -1')
if [ "$row_count" -ge 0 ]; then
echo -e "${GREEN}✓${NC} JOIN query works (row_count=$row_count)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} JOIN query failed"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 4: Aggregate with GROUP BY
echo ""
echo "Test 4: Aggregate with GROUP BY"
result=$(run_cli sql --db "$DB_PATH" "SELECT type, COUNT(*) as cnt FROM dataset WHERE active = true GROUP BY type")
row_count=$(echo "$result" | jq -r '.row_count // -1')
if [ "$row_count" -ge 0 ]; then
echo -e "${GREEN}✓${NC} Aggregate query works (row_count=$row_count)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Aggregate query failed"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 5: CTE (WITH clause)
echo ""
echo "Test 5: CTE with WITH clause"
result=$(run_cli sql --db "$DB_PATH" "WITH active_datasets AS (SELECT id, name FROM dataset WHERE active = true) SELECT * FROM active_datasets LIMIT 3")
row_count=$(echo "$result" | jq -r '.row_count // -1')
if [ "$row_count" -ge 0 ]; then
echo -e "${GREEN}✓${NC} CTE query works (row_count=$row_count)"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} CTE query failed"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 6: INSERT attempt (should fail)
echo ""
echo "Test 6: INSERT blocked (security)"
result=$("$PROJECT_DIR/skraak" sql --db "$DB_PATH" "INSERT INTO dataset (id, name) VALUES ('test', 'test')" 2>&1 || true)
if echo "$result" | grep -qi "error\|forbidden\|only SELECT\|only WITH"; then
echo -e "${GREEN}✓${NC} INSERT correctly rejected"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} INSERT should have been rejected"
echo " Output: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 7: SQL injection attempt (should fail)
echo ""
echo "Test 7: SQL injection blocked (security)"
result=$("$PROJECT_DIR/skraak" sql --db "$DB_PATH" "SELECT * FROM dataset; DROP TABLE dataset;" 2>&1 || true)
if echo "$result" | grep -qi "error\|forbidden\|only SELECT\|only WITH"; then
echo -e "${GREEN}✓${NC} SQL injection correctly rejected"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} SQL injection should have been rejected"
echo " Output: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 8: DELETE attempt (should fail)
echo ""
echo "Test 8: DELETE blocked (security)"
result=$("$PROJECT_DIR/skraak" sql --db "$DB_PATH" "DELETE FROM dataset WHERE id = 'test'" 2>&1 || true)
if echo "$result" | grep -qi "error\|forbidden\|only SELECT\|only WITH"; then
echo -e "${GREEN}✓${NC} DELETE correctly rejected"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} DELETE should have been rejected"
echo " Output: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 9: DROP attempt (should fail)
echo ""
echo "Test 9: DROP blocked (security)"
result=$("$PROJECT_DIR/skraak" sql --db "$DB_PATH" "DROP TABLE dataset" 2>&1 || true)
if echo "$result" | grep -qi "error\|forbidden\|only SELECT\|only WITH"; then
echo -e "${GREEN}✓${NC} DROP correctly rejected"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} DROP should have been rejected"
echo " Output: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

echo ""
print_summary
file addition: test_lib.sh (---r------)

[0.638309]

#!/bin/bash
# Shared library for shell test scripts
# Source this file: source ./test_lib.sh

set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Test counters
TESTS_RUN=0
TESTS_PASSED=0
TESTS_FAILED=0

# Project paths
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
PRODUCTION_DB="$PROJECT_DIR/db/skraak.duckdb"
DEFAULT_TEST_DB="$PROJECT_DIR/db/test.duckdb"

# Check that skraak binary exists
check_binary() {
if [ ! -f "$PROJECT_DIR/skraak" ]; then
echo -e "${RED}Error: skraak binary not found. Run 'go build' first.${NC}"
exit 1
fi
}

# Create fresh test database from production
# Returns path to fresh test DB (in /tmp)
fresh_test_db() {
if [ ! -f "$PRODUCTION_DB" ]; then
echo -e "${RED}Error: Production database not found at $PRODUCTION_DB${NC}"
exit 1
fi

local test_db="/tmp/skraak_test_$$.duckdb"
cp "$PRODUCTION_DB" "$test_db"
echo "$test_db"
}

# Cleanup test database
cleanup_test_db() {
local db_path="$1"
if [ -n "$db_path" ] && [ -f "$db_path" ]; then
rm -f "$db_path"
# Also remove DuckDB temp files
rm -f "${db_path}.wal" "${db_path}.tmp" 2>/dev/null || true
fi
}

# Print test summary
print_summary() {
echo ""
echo "=== Summary ==="
echo -e "Tests run: $TESTS_RUN"
echo -e "Passed: ${GREEN}$TESTS_PASSED${NC}"
if [ "$TESTS_FAILED" -gt 0 ]; then
echo -e "Failed: ${RED}$TESTS_FAILED${NC}"
else
echo -e "Failed: $TESTS_FAILED"
fi

if [ "$TESTS_FAILED" -gt 0 ]; then
return 1
fi
return 0
}
file addition: test_import.sh (---r------)

[0.638309]

#!/bin/bash
# Test import folder validation
# Usage: ./test_import.sh
# Uses fresh copy of production DB in /tmp (auto-cleaned)

source "$(dirname "$0")/test_lib.sh"

echo "=== Testing import folder validation ==="
echo ""

check_binary

# Create fresh test database
DB_PATH=$(fresh_test_db)
trap "cleanup_test_db '$DB_PATH'" EXIT
echo "Using fresh test database: $DB_PATH"
echo ""

SKRAAK="$PROJECT_DIR/skraak"

# Get test IDs from database
DATASET_ID=$($SKRAAK sql --db "$DB_PATH" "SELECT id FROM dataset WHERE active = true LIMIT 1" 2>/dev/null | jq -r '.rows[0].id // empty')
LOCATION_ID=$($SKRAAK sql --db "$DB_PATH" "SELECT id FROM location WHERE active = true LIMIT 1" 2>/dev/null | jq -r '.rows[0].id // empty')
CLUSTER_ID=$($SKRAAK sql --db "$DB_PATH" "SELECT id FROM cluster WHERE active = true LIMIT 1" 2>/dev/null | jq -r '.rows[0].id // empty')

if [ -z "$DATASET_ID" ] || [ -z "$LOCATION_ID" ] || [ -z "$CLUSTER_ID" ]; then
echo -e "${RED}Error: Could not find test entities in database${NC}"
exit 1
fi
echo " Dataset: $DATASET_ID"
echo " Location: $LOCATION_ID"
echo " Cluster: $CLUSTER_ID"
echo ""

# Test 1: Non-existent folder (should fail)
echo "Test 1: Non-existent folder (should fail)"
result=$($SKRAAK import folder --db "$DB_PATH" --dataset "$DATASET_ID" --location "$LOCATION_ID" --cluster "$CLUSTER_ID" --folder /nonexistent/folder 2>&1 || true)
if echo "$result" | grep -qi "error\|not accessible\|not found\|no such"; then
echo -e "${GREEN}✓${NC} Reject non-existent folder"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Should have rejected non-existent folder: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 2: Invalid location ID (should fail)
echo ""
echo "Test 2: Invalid location_id (should fail)"
result=$($SKRAAK import folder --db "$DB_PATH" --dataset "$DATASET_ID" --location "INVALID123456" --cluster "$CLUSTER_ID" --folder /tmp 2>&1 || true)
if echo "$result" | grep -qi "error\|not found\|invalid\|validation"; then
echo -e "${GREEN}✓${NC} Reject invalid location_id"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Should have rejected invalid location_id: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

# Test 3: Missing required flags (should fail)
echo ""
echo "Test 3: Missing --cluster flag (should fail)"
result=$($SKRAAK import folder --db "$DB_PATH" --dataset "$DATASET_ID" --location "$LOCATION_ID" --folder /tmp 2>&1 || true)
if echo "$result" | grep -qi "error\|required\|missing"; then
echo -e "${GREEN}✓${NC} Reject missing required flag"
((TESTS_RUN++)) || true
((TESTS_PASSED++)) || true
else
echo -e "${RED}✗${NC} Should have rejected missing flag: $result"
((TESTS_RUN++)) || true
((TESTS_FAILED++)) || true
fi

echo ""
print_summary

echo ""
echo "Note: These tests validate error handling only."
echo "Actual file import requires real WAV files and valid paths."
echo ""
echo "For bulk import, use the CLI tool:"
echo " skraak import bulk --db ./db/skraak.duckdb --dataset abc123 --csv import.csv --log progress.log"
file addition: test_export.sh (---r------)

[0.638309]

#!/bin/bash

# Test export dataset functionality
# Usage: ./test_export.sh [db_path]

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
SKRAAK="$PROJECT_DIR/skraak"

DB_PATH="${1:-$PROJECT_DIR/db/test.duckdb}"
EXPORT_DB="/tmp/skraak_export_test_$$.duckdb"

echo "=== Testing Export Dataset ==="
echo "Database: $DB_PATH"
echo ""

# Clean up any existing export
rm -f "$EXPORT_DB" "$EXPORT_DB.events.jsonl"

# Get a dataset ID to export
echo "Test 1: Get dataset ID..."
DATASET_ID=$("$SKRAAK" sql --db "$DB_PATH" "SELECT id FROM dataset WHERE active = true LIMIT 1" | jq -r '.rows[0].id')
if [ -z "$DATASET_ID" ] || [ "$DATASET_ID" = "null" ]; then
echo "ERROR: No active dataset found"
exit 1
fi
echo " Dataset ID: $DATASET_ID"

# Test dry-run
echo ""
echo "Test 2: Dry-run export..."
OUTPUT=$("$SKRAAK" export dataset --db "$DB_PATH" --id "$DATASET_ID" --output "$EXPORT_DB" --dry-run)
echo "$OUTPUT" | jq -r '.message'
DRY_RUN=$(echo "$OUTPUT" | jq -r '.dry_run')
if [ "$DRY_RUN" != "true" ]; then
echo "ERROR: dry_run should be true"
exit 1
fi
echo " ✓ Dry-run works"

# Verify no file created
if [ -f "$EXPORT_DB" ]; then
echo "ERROR: Export file should not exist after dry-run"
exit 1
fi
echo " ✓ No file created in dry-run mode"

# Test actual export
# Note this test fails if exporting from a db with FK constraints removed
echo ""
echo "Test 3: Export dataset..."
OUTPUT=$("$SKRAAK" export dataset --db "$DB_PATH" --id "$DATASET_ID" --output "$EXPORT_DB" --force)
echo "$OUTPUT" | jq -r '.message'

# Verify export file exists
if [ ! -f "$EXPORT_DB" ]; then
echo "ERROR: Export file not created"
exit 1
fi
echo " ✓ Export file created"

# Verify event log file exists
if [ ! -f "$EXPORT_DB.events.jsonl" ]; then
echo "ERROR: Event log file not created"
exit 1
fi
echo " ✓ Event log file created"

# Verify row counts
echo ""
echo "Test 4: Verify row counts..."
FILE_COUNT=$(echo "$OUTPUT" | jq -r '.row_counts.file')
EXPORTED_COUNT=$("$SKRAAK" sql --db "$EXPORT_DB" "SELECT COUNT(*) as count FROM file" | jq -r '.rows[0].count')
if [ "$FILE_COUNT" != "$EXPORTED_COUNT" ]; then
echo "ERROR: File count mismatch: expected $FILE_COUNT, got $EXPORTED_COUNT"
exit 1
fi
echo " ✓ Row counts match ($FILE_COUNT files)"

# Verify dataset
echo ""
echo "Test 5: Verify dataset..."
DATASET_COUNT=$("$SKRAAK" sql --db "$EXPORT_DB" "SELECT COUNT(*) as count FROM dataset WHERE id = '$DATASET_ID'" | jq -r '.rows[0].count')
if [ "$DATASET_COUNT" != "1" ]; then
echo "ERROR: Dataset not found in export"
exit 1
fi
echo " ✓ Dataset found in export"

# Test error handling - dataset not found
echo ""
echo "Test 6: Test error handling..."
ERROR=$("$SKRAAK" export dataset --db "$DB_PATH" --id "NOTAREALID" --output "$EXPORT_DB" 2>&1 || true)
if [[ ! "$ERROR" =~ "dataset not found" ]]; then
echo "ERROR: Should report dataset not found"
echo "$ERROR"
exit 1
fi
echo " ✓ Error handling works for missing dataset"

# Test --force overwrite
echo ""
echo "Test 7: Test --force overwrite..."
OUTPUT=$("$SKRAAK" export dataset --db "$DB_PATH" --id "$DATASET_ID" --output "$EXPORT_DB" --force 2>&1)
if [[ "$OUTPUT" =~ "error" ]]; then
echo "ERROR: Should not error with --force"
echo "$OUTPUT"
exit 1
fi
echo " ✓ --force overwrite works"

# Test error without --force
echo ""
echo "Test 8: Test error without --force..."
ERROR=$("$SKRAAK" export dataset --db "$DB_PATH" --id "$DATASET_ID" --output "$EXPORT_DB" 2>&1 || true)
if [[ ! "$ERROR" =~ "file exists" ]]; then
echo "ERROR: Should report file exists"
echo "$ERROR"
exit 1
fi
echo " ✓ Error handling works for existing file"

# Clean up
rm -f "$EXPORT_DB" "$EXPORT_DB.events.jsonl"

echo ""
echo "=== All tests passed ==="
file addition: test_event_log.sh (---r------)

[0.638309]

#!/bin/bash
# Test event log functionality
# Usage: ./test_event_log.sh [database_path]

set -e

DB="${1:-/home/david/go/src/skraak/db/test.duckdb}"
LOG="$DB.events.jsonl"
SKRAAK="${SKRAAK:-../skraak}"

echo "=== Testing Event Log ==="
echo "Database: $DB"
echo "Event log: $LOG"
echo ""

# Clean up
rm -f "$LOG"

# Check if database exists and has schema
if [ ! -f "$DB" ]; then
echo "Error: Database $DB does not exist"
exit 1
fi

# Test 1: Create dataset
echo "Test 1: Create dataset..."
RESULT=$($SKRAAK create dataset --db "$DB" --name "EventLogTest_$(date +%s)" --type structured 2>&1)
DATASET_ID=$(echo "$RESULT" | jq -r '.dataset.id')
echo " Created dataset: $DATASET_ID"

# Check event log
if [ ! -f "$LOG" ]; then
echo " ERROR: Event log not created!"
exit 1
fi

EVENT_COUNT=$(wc -l < "$LOG")
if [ "$EVENT_COUNT" -lt 1 ]; then
echo " ERROR: No events logged!"
exit 1
fi
echo " Event log has $EVENT_COUNT entry/entries"

# Test 2: Verify event structure
echo ""
echo "Test 2: Verify event structure..."
EVENT=$(head -1 "$LOG")
echo "$EVENT" | jq -e '.id' > /dev/null && echo " ✓ Has id"
echo "$EVENT" | jq -e '.timestamp' > /dev/null && echo " ✓ Has timestamp"
echo "$EVENT" | jq -e '.tool' > /dev/null && echo " ✓ Has tool"
echo "$EVENT" | jq -e '.queries' > /dev/null && echo " ✓ Has queries"
echo "$EVENT" | jq -e '.success' > /dev/null && echo " ✓ Has success"

# Test 3: Create location
echo ""
echo "Test 3: Create location..."
RESULT=$($SKRAAK create location --db "$DB" --dataset "$DATASET_ID" --name "TestLoc_$(date +%s)" --lat -36.85 --lon 174.76 --timezone Pacific/Auckland 2>&1)
LOCATION_ID=$(echo "$RESULT" | jq -r '.location.id')
echo " Created location: $LOCATION_ID"

# Test 4: Verify multiple events
EVENT_COUNT=$(wc -l < "$LOG")
if [ "$EVENT_COUNT" -lt 2 ]; then
echo " ERROR: Expected at least 2 events, got $EVENT_COUNT"
exit 1
fi
echo " Event log has $EVENT_COUNT entries"

# Test 5: Dry-run replay
echo ""
echo "Test 5: Dry-run replay..."
$SKRAAK replay events --db "$DB" --log "$LOG" --dry-run > /dev/null 2>&1
echo " ✓ Dry-run succeeded"

# Test 6: Verify replay command flags
echo ""
echo "Test 6: Verify replay flags..."
$SKRAAK replay events --db "$DB" --log "$LOG" --last 1 --dry-run > /dev/null 2>&1
echo " ✓ --last flag works"

echo ""
echo "=== All tests passed ==="
echo ""
echo "Event log contents:"
cat "$LOG" | jq -c '{id, tool, queries: (.queries | length), success}'
file addition: test_db_state.sh (---r------)

[0.638309]

#!/bin/bash
# Verify database state - check table counts and referential integrity
# Usage: ./test_db_state.sh [db_path]
# Default: uses test.duckdb

source "$(dirname "$0")/test_lib.sh"

DB_PATH="${1:-$DEFAULT_TEST_DB}"

if [ ! -f "$DB_PATH" ]; then
echo -e "${RED}Error: Database not found at $DB_PATH${NC}"
exit 1
fi

echo "=== Database State Verification ==="
echo "Database: $DB_PATH"
echo ""

check_binary

sql() {
"$PROJECT_DIR/skraak" sql --db "$DB_PATH" "$1" 2>/dev/null
}

cnt() {
sql "$1" | jq -r '.rows[0].cnt // "error"'
}

# Assert a query returns 0 rows (integrity violation check)
check_zero() {
local name="$1"
local query="$2"
local count
count=$(cnt "$query")
((TESTS_RUN++)) || true
if [ "$count" = "0" ]; then
echo -e " ${GREEN}✓${NC} $name"
((TESTS_PASSED++)) || true
else
echo -e " ${RED}✗${NC} $name: $count violation(s)"
((TESTS_FAILED++)) || true
fi
}

# ── Counts ────────────────────────────────────────────────────────────────────
echo "Table Counts:"
echo " Datasets: $(cnt 'SELECT COUNT(*) AS cnt FROM dataset WHERE active = true')"
echo " Locations: $(cnt 'SELECT COUNT(*) AS cnt FROM location WHERE active = true')"
echo " Clusters: $(cnt 'SELECT COUNT(*) AS cnt FROM cluster WHERE active = true')"
echo " Files: $(cnt 'SELECT COUNT(*) AS cnt FROM file WHERE active = true')"
echo " File-Dataset: $(cnt 'SELECT COUNT(*) AS cnt FROM file_dataset')"
echo " Segments: $(cnt 'SELECT COUNT(*) AS cnt FROM segment WHERE active = true')"
echo " Labels: $(cnt 'SELECT COUNT(*) AS cnt FROM label WHERE active = true')"
echo " Label subtypes: $(cnt 'SELECT COUNT(*) AS cnt FROM label_subtype WHERE active = true')"
echo " Moth metadata: $(cnt 'SELECT COUNT(*) AS cnt FROM moth_metadata WHERE active = true')"
echo " File metadata: $(cnt 'SELECT COUNT(*) AS cnt FROM file_metadata WHERE active = true')"
echo " Label metadata: $(cnt 'SELECT COUNT(*) AS cnt FROM label_metadata WHERE active = true')"
echo ""

# ── Location hierarchy ────────────────────────────────────────────────────────
echo "Location hierarchy:"
check_zero "location.dataset_id → dataset" \
"SELECT COUNT(*) AS cnt FROM location l LEFT JOIN dataset d ON l.dataset_id = d.id WHERE d.id IS NULL"
check_zero "cluster.location_id → location" \
"SELECT COUNT(*) AS cnt FROM cluster c LEFT JOIN location l ON c.location_id = l.id WHERE l.id IS NULL"
check_zero "cluster.dataset_id → dataset" \
"SELECT COUNT(*) AS cnt FROM cluster c LEFT JOIN dataset d ON c.dataset_id = d.id WHERE d.id IS NULL"
check_zero "cluster.cyclic_recording_pattern_id → cyclic_recording_pattern" \
"SELECT COUNT(*) AS cnt FROM cluster c LEFT JOIN cyclic_recording_pattern p ON c.cyclic_recording_pattern_id = p.id WHERE c.cyclic_recording_pattern_id IS NOT NULL AND p.id IS NULL"
echo ""

# ── File linkage ──────────────────────────────────────────────────────────────
echo "File linkage:"
check_zero "file.location_id → location" \
"SELECT COUNT(*) AS cnt FROM file f LEFT JOIN location l ON f.location_id = l.id WHERE f.location_id IS NOT NULL AND l.id IS NULL"
check_zero "file.cluster_id → cluster" \
"SELECT COUNT(*) AS cnt FROM file f LEFT JOIN cluster c ON f.cluster_id = c.id WHERE f.cluster_id IS NOT NULL AND c.id IS NULL"
check_zero "file_dataset.file_id → file" \
"SELECT COUNT(*) AS cnt FROM file_dataset fd LEFT JOIN file f ON fd.file_id = f.id WHERE f.id IS NULL"
check_zero "file_dataset.dataset_id → dataset" \
"SELECT COUNT(*) AS cnt FROM file_dataset fd LEFT JOIN dataset d ON fd.dataset_id = d.id WHERE d.id IS NULL"
check_zero "active files have file_dataset entry" \
"SELECT COUNT(*) AS cnt FROM file f LEFT JOIN file_dataset fd ON f.id = fd.file_id WHERE f.active = true AND fd.file_id IS NULL"
check_zero "file_dataset count >= active file count" \
"SELECT CASE WHEN (SELECT COUNT(*) FROM file_dataset) >= (SELECT COUNT(*) FROM file WHERE active = true) THEN 0 ELSE 1 END AS cnt"
check_zero "file_metadata.file_id → file" \
"SELECT COUNT(*) AS cnt FROM file_metadata fm LEFT JOIN file f ON fm.file_id = f.id WHERE f.id IS NULL"
echo ""

# ── Segment integrity ─────────────────────────────────────────────────────────
echo "Segment integrity:"
check_zero "segment.file_id → file" \
"SELECT COUNT(*) AS cnt FROM segment s LEFT JOIN file f ON s.file_id = f.id WHERE f.id IS NULL"
check_zero "segment.dataset_id → dataset" \
"SELECT COUNT(*) AS cnt FROM segment s LEFT JOIN dataset d ON s.dataset_id = d.id WHERE d.id IS NULL"
check_zero "active segments on inactive files" \
"SELECT COUNT(*) AS cnt FROM segment s JOIN file f ON s.file_id = f.id WHERE s.active = true AND f.active = false"
echo ""

# ── Label integrity ───────────────────────────────────────────────────────────
echo "Label integrity:"
check_zero "label.segment_id → segment" \
"SELECT COUNT(*) AS cnt FROM label l LEFT JOIN segment s ON l.segment_id = s.id WHERE s.id IS NULL"
check_zero "label.species_id → species" \
"SELECT COUNT(*) AS cnt FROM label l LEFT JOIN species sp ON l.species_id = sp.id WHERE sp.id IS NULL"
check_zero "label.filter_id → filter" \
"SELECT COUNT(*) AS cnt FROM label l LEFT JOIN filter f ON l.filter_id = f.id WHERE f.id IS NULL"
check_zero "active labels on inactive segments" \
"SELECT COUNT(*) AS cnt FROM label l JOIN segment s ON l.segment_id = s.id WHERE l.active = true AND s.active = false"
check_zero "label_metadata.label_id → label" \
"SELECT COUNT(*) AS cnt FROM label_metadata lm LEFT JOIN label l ON lm.label_id = l.id WHERE l.id IS NULL"
echo ""

# ── Label subtype integrity ───────────────────────────────────────────────────
echo "Label subtype integrity:"
check_zero "label_subtype.label_id → label" \
"SELECT COUNT(*) AS cnt FROM label_subtype ls LEFT JOIN label l ON ls.label_id = l.id WHERE l.id IS NULL"
check_zero "label_subtype.calltype_id → call_type" \
"SELECT COUNT(*) AS cnt FROM label_subtype ls LEFT JOIN call_type ct ON ls.calltype_id = ct.id WHERE ct.id IS NULL"
check_zero "label_subtype.filter_id → filter" \
"SELECT COUNT(*) AS cnt FROM label_subtype ls LEFT JOIN filter f ON ls.filter_id = f.id WHERE ls.filter_id IS NOT NULL AND f.id IS NULL"
echo ""

# ── Reference table integrity ─────────────────────────────────────────────────
echo "Reference table integrity:"
check_zero "call_type.species_id → species" \
"SELECT COUNT(*) AS cnt FROM call_type ct LEFT JOIN species sp ON ct.species_id = sp.id WHERE sp.id IS NULL"
echo ""

# ── Summary ───────────────────────────────────────────────────────────────────
echo "Summary: $TESTS_PASSED/$TESTS_RUN checks passed"

if [ "$TESTS_FAILED" -gt 0 ]; then
exit 1
fi
file addition: test_clip_labels.sh (---r------)

[0.638309]

#!/bin/bash
# Test skraak calls clip-labels
# Compares output against reference CSVs in clip-labels_test_data/
#
# Two test cases:
# 1. Normal (OPSO-equivalent): output matches clip_labels_opso.csv
# 2. __IGNORE__ mapping: D03 clips overlapping the ignored segment are excluded,
# but the file is not dropped entirely
#
# Note: removes clip_labels.csv and clip_labels_ignore.csv before each run
# because the command appends and checks for duplicates.

source "$(dirname "$0")/test_lib.sh"

TEST_DIR="$SCRIPT_DIR/clip-labels_test_data"

echo "=== Testing skraak calls clip-labels ==="
echo ""

check_binary

cd "$TEST_DIR"

# ── Test 1: OPSO-equivalent output ──────────────────────────────────────
echo "Test 1: OPSO-equivalent output"

rm -f ./clip_labels.csv

"$PROJECT_DIR/skraak" calls clip-labels \
--folder . --mapping ./mapping.json \
--clip-duration 5 --clip-overlap 0 --min-label-overlap 0.25 --final-clip full \
--output ./clip_labels.csv 2>/dev/null

# Compare: sort both, skip header
diff_output=$(diff <(tail -n +2 clip_labels_opso.csv | sort) \
<(tail -n +2 clip_labels.csv | sort))

if [ -z "$diff_output" ]; then
echo -e " ${GREEN}✓${NC} clip_labels.csv matches clip_labels_opso.csv (sorted, prefix-normalised)"
((TESTS_PASSED++)) || true
else
echo -e " ${RED}✗${NC} clip_labels.csv differs from clip_labels_opso.csv"
echo "$diff_output" | head -20
((TESTS_FAILED++)) || true
fi
((TESTS_RUN++)) || true

# ── Test 2: __IGNORE__ mapping ──────────────────────────────────────────
echo "Test 2: __IGNORE__ mapping (D03 segment skipped, file kept)"

rm -f ./clip_labels_ignore.csv

"$PROJECT_DIR/skraak" calls clip-labels \
--folder . --mapping ./mapping_ignore.json \
--clip-duration 5 --clip-overlap 0 --min-label-overlap 0.25 --final-clip full \
--output ./clip_labels_ignore.csv 2>/dev/null

# With __IGNORE__, clips overlapping the Don't Know segment (777-860s)
# in D03 are excluded, but D03's other clips are still emitted.
# The non-D03 rows should be identical to opso.
diff_output=$(diff <(grep -v "D03" clip_labels_opso.csv | sort) \
<(grep -v "D03" clip_labels_ignore.csv | sort))

if [ -z "$diff_output" ]; then
echo -e " ${GREEN}✓${NC} non-D03 rows match between ignore and opso"
((TESTS_PASSED++)) || true
else
echo -e " ${RED}✗${NC} non-D03 rows differ between ignore and opso"
echo "$diff_output" | head -20
((TESTS_FAILED++)) || true
fi
((TESTS_RUN++)) || true

# Verify D03 IS present in ignore output (file not dropped)
if grep -q "D03" clip_labels_ignore.csv; then
echo -e " ${GREEN}✓${NC} D03 rows present in clip_labels_ignore.csv (file not dropped)"
((TESTS_PASSED++)) || true
else
echo -e " ${RED}✗${NC} D03 rows missing from clip_labels_ignore.csv (file should be kept)"
((TESTS_FAILED++)) || true
fi
((TESTS_RUN++)) || true

# Verify D03 clips overlapping the __IGNORE__ segment (775-860s) are excluded
d03_ignore=$(grep "D03" clip_labels_ignore.csv | wc -l)
d03_opso=$(grep "D03" clip_labels_opso.csv | wc -l)
if [ "$d03_ignore" -lt "$d03_opso" ]; then
echo -e " ${GREEN}✓${NC} D03 clips reduced: $d03_ignore in ignore vs $d03_opso in opso (overlapping clips excluded)"
((TESTS_PASSED++)) || true
else
echo -e " ${RED}✗${NC} D03 clips not reduced: $d03_ignore in ignore vs $d03_opso in opso"
((TESTS_FAILED++)) || true
fi
((TESTS_RUN++)) || true

# Verify no D03 clips in the 775-860s range appear in ignore output
d03_in_range=$(grep "D03" clip_labels_ignore.csv | awk -F, '{split($2,a,"."); if ($2+0 >= 775 && $2+0 < 860) print}' | wc -l)
if [ "$d03_in_range" -eq 0 ]; then
echo -e " ${GREEN}✓${NC} No D03 clips in 775-860s range (correctly excluded)"
((TESTS_PASSED++)) || true
else
echo -e " ${RED}✗${NC} Found $d03_in_range D03 clips in 775-860s range (should be excluded)"
((TESTS_FAILED++)) || true
fi
((TESTS_RUN++)) || true

echo ""
print_summary
file addition: test_calls_from_preds.sh (---r------)

[0.638309]

#!/bin/bash
# Test script for: skraak calls from-preds
# Compares output against reference JSON files (verified with Julia)
#
# Usage: ./test_calls_from_preds.sh
#
# Tests:
# 1. predsST_opensoundscape-kiwi-1.2_2025-11-12.csv (single species: Kiwi)
# 2. preds1_opensoundscape-multi-1.0_2025-07-22.csv (multi-species)
#
# The calls array is compared as a SET (order-independent), matching
# the Julia issetequal() verification used by the author.

set -euo pipefail

# Setup paths
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
DATA_DIR="$SCRIPT_DIR/data"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Test counters
TESTS_RUN=0
TESTS_PASSED=0
TESTS_FAILED=0

# Check binary exists
if [ ! -f "$PROJECT_DIR/skraak" ]; then
echo -e "${RED}Error: skraak binary not found. Run 'go build' first.${NC}"
exit 1
fi

# Compare calls arrays as sets (order-independent)
# Returns 0 if equal, 1 if different
# Usage: compare_calls_as_set <actual.json> <expected.json>
compare_calls_as_set() {
local actual="$1"
local expected="$2"

# Extract calls array and sort by all fields to get canonical order
# Then compare as arrays
local actual_sorted
local expected_sorted

actual_sorted=$(jq '(.calls // []) | sort_by(.file, .start_time, .end_time, .ebird_code, .segments)' "$actual" 2>/dev/null)
expected_sorted=$(jq '(.calls // []) | sort_by(.file, .start_time, .end_time, .ebird_code, .segments)' "$expected" 2>/dev/null)

if [ "$actual_sorted" = "$expected_sorted" ]; then
return 0
else
return 1
fi
}

# Compare metadata fields (clip_duration, gap_threshold, total_calls, species_count)
# Returns 0 if all match, 1 if any differ
# Usage: compare_metadata <actual.json> <expected.json>
compare_metadata() {
local actual="$1"
local expected="$2"

# Check each metadata field
local clip_dur_act clip_dur_exp
local gap_thr_act gap_thr_exp
local total_act total_exp
local species_act species_exp

clip_dur_act=$(jq -r '.clip_duration // "null"' "$actual")
clip_dur_exp=$(jq -r '.clip_duration // "null"' "$expected")

gap_thr_act=$(jq -r '.gap_threshold // "null"' "$actual")
gap_thr_exp=$(jq -r '.gap_threshold // "null"' "$expected")

total_act=$(jq -r '.total_calls // "null"' "$actual")
total_exp=$(jq -r '.total_calls // "null"' "$expected")

species_act=$(jq -r '.species_count' "$actual")
species_exp=$(jq -r '.species_count' "$expected")

local all_match=true

if [ "$clip_dur_act" != "$clip_dur_exp" ]; then
echo " clip_duration: expected=$clip_dur_exp, actual=$clip_dur_act"
all_match=false
fi

if [ "$gap_thr_act" != "$gap_thr_exp" ]; then
echo " gap_threshold: expected=$gap_thr_exp, actual=$gap_thr_act"
all_match=false
fi

if [ "$total_act" != "$total_exp" ]; then
echo " total_calls: expected=$total_exp, actual=$total_act"
all_match=false
fi

if [ "$species_act" != "$species_exp" ]; then
echo " species_count differs"
all_match=false
fi

if [ "$all_match" = true ]; then
return 0
else
return 1
fi
}

# Run a single test case
# Usage: run_test <csv_name> <csv_path> <expected_json_path>
run_test() {
local name="$1"
local csv_path="$2"
local expected_json="$3"

((TESTS_RUN++)) || true

echo ""
echo "Testing: $name"
echo " CSV: $(basename "$csv_path")"
echo " Expected: $(basename "$expected_json")"

# Create temp files for actual output
local actual_json stderr_output
actual_json=$(mktemp --suffix=.json)
stderr_output=$(mktemp --suffix=.txt)

# Run the command (capture stdout to file, stderr to variable)
echo " Running: skraak calls from-preds --csv ..."

if ! "$PROJECT_DIR/skraak" calls from-preds --csv "$csv_path" --dot-data=false --gap-multiplier 3 --min-detections 1 > "$actual_json" 2>"$stderr_output"; then
echo -e " ${RED}✗ Command failed${NC}"
cat "$stderr_output"
rm -f "$stderr_output"
((TESTS_FAILED++)) || true
return
fi

# Show progress from stderr
cat "$stderr_output" | head -3
rm -f "$stderr_output"

# Check if output is valid JSON
if ! jq empty "$actual_json" 2>/dev/null; then
echo -e " ${RED}✗ Output is not valid JSON${NC}"
((TESTS_FAILED++)) || true
return
fi

# Compare calls array as set (PRIMARY CHECK)
local calls_match=false
if compare_calls_as_set "$actual_json" "$expected_json"; then
calls_match=true
fi

# Compare metadata
local metadata_match=false
local metadata_diff=""
if compare_metadata "$actual_json" "$expected_json"; then
metadata_match=true
fi

# Report results
if [ "$calls_match" = true ]; then
echo -e " ${GREEN}✓ Calls array matches (set comparison)${NC}"

# Show summary stats
local call_count
call_count=$(jq '.calls | length' "$actual_json")
local species_count
species_count=$(jq '.species_count | keys | length' "$actual_json")
echo " $call_count calls across $species_count species"

if [ "$metadata_match" = true ]; then
echo -e " ${GREEN}✓ Metadata matches${NC}"
((TESTS_PASSED++)) || true
else
echo -e " ${YELLOW}⚠ Metadata differs (calls array is primary)${NC}"
compare_metadata "$actual_json" "$expected_json"
# Still count as passed since calls match
((TESTS_PASSED++)) || true
fi
else
echo -e " ${RED}✗ Calls array differs${NC}"

# Show diff stats
local actual_count expected_count
actual_count=$(jq '.calls | length' "$actual_json")
expected_count=$(jq '.calls | length' "$expected_json")
echo " Actual calls: $actual_count, Expected calls: $expected_count"

# Find calls in expected but not in actual (skip for large arrays to avoid hang)
if [ "$actual_count" -lt 10000 ] && [ "$expected_count" -lt 10000 ]; then
local missing extra
missing=$(jq -n --slurpfile exp "$expected_json" --slurpfile act "$actual_json" \
'([$exp[0].calls | .[] | {file, start_time, end_time, ebird_code, segments}] | sort) - ([$act[0].calls | .[] | {file, start_time, end_time, ebird_code, segments}] | sort) | length')
extra=$(jq -n --slurpfile exp "$expected_json" --slurpfile act "$actual_json" \
'([$act[0].calls | .[] | {file, start_time, end_time, ebird_code, segments}] | sort) - ([$exp[0].calls | .[] | {file, start_time, end_time, ebird_code, segments}] | sort) | length')

echo " Missing from actual: $missing calls"
echo " Extra in actual: $extra calls"
else
echo " (skipping detailed diff — arrays too large)"
fi

((TESTS_FAILED++)) || true
fi

# Cleanup temp files
rm -f "$actual_json" "$stderr_output"
}

# Print summary
print_summary() {
echo ""
echo "=== Summary ==="
echo "Tests run: $TESTS_RUN"
echo -e "Passed: ${GREEN}$TESTS_PASSED${NC}"
if [ "$TESTS_FAILED" -gt 0 ]; then
echo -e "Failed: ${RED}$TESTS_FAILED${NC}"
return 1
else
echo -e "Failed: $TESTS_FAILED"
return 0
fi
}

# Main
echo "=== Testing: skraak calls from-preds ==="
echo "Comparing calls arrays as SETS (order-independent)"

# Test 1: predsST (kiwi single species)
run_test \
"predsST (single species: Kiwi)" \
"$DATA_DIR/predsST_opensoundscape-kiwi-1.2_2025-11-12.csv" \
"$DATA_DIR/predsST_opensoundscape-kiwi-1.2_2025-11-12.json"

# Test 2: preds1 (multi-species)
run_test \
"preds1 (multi-species)" \
"$DATA_DIR/preds1_opensoundscape-multi-1.0_2025-07-22.csv" \
"$DATA_DIR/preds1_opensoundscape-multi-1.0_2025-07-22.json"

print_summary
file addition: test_bulk_import.sh (---r------)

[0.638309]

#!/bin/bash
# Test bulk_file_import CLI command
# Usage: ./test_bulk_import.sh [db_path]
# Default: /home/david/go/src/skraak/db/test.duckdb (ALWAYS USE TEST DATABASE!)

source "$(dirname "$0")/test_lib.sh"

# Get absolute paths before changing directory
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
DB_PATH="${1:-$PROJECT_DIR/db/test.duckdb}"

if [ ! -f "$DB_PATH" ]; then
echo -e "${RED}Error: Database not found at $DB_PATH${NC}"
exit 1
fi

echo "=== Testing bulk_file_import CLI Command ==="
echo ""
echo "Database: $DB_PATH"
echo ""

check_binary

# Navigate to the project directory where skraak binary is located
cd "$PROJECT_DIR" || exit 1

# Helper to run CLI command and capture JSON output
run_cli() {
"$PROJECT_DIR/skraak" "$@" 2>/dev/null || true
}

run_cli_with_stderr() {
"$PROJECT_DIR/skraak" "$@" 2>&1 || true
}

# Helper to check for error in CLI output
cli_is_error() {
local output="$1"
# CLI outputs errors to stderr with "Error:" prefix
if echo "$output" | grep -q '"error"' 2>/dev/null; then
return 0
fi
# Also check for error in JSON output
if echo "$output" | jq -e '.error // empty' >/dev/null 2>&1; then
return 0
fi
return 1
}

echo "Step 1: Create test dataset and locations"
echo "------------------------------------------"

# Create a test dataset using CLI
echo -n "Creating test dataset... "
DATASET_RESULT=$(run_cli create dataset --db "$DB_PATH" --name "Bulk Import Test Dataset" --type structured --description "Dataset for testing bulk import")
DATASET_ID=$(echo "$DATASET_RESULT" | jq -r '.dataset.id // empty')
if [ -n "$DATASET_ID" ]; then
echo -e "${GREEN}✓${NC} Created dataset: $DATASET_ID"
else
echo -e "${RED}✗${NC} Failed to create dataset"
echo "$DATASET_RESULT" | jq '.'
exit 1
fi

# Create test location A
echo -n "Creating test location A... "
LOCATION_A_RESULT=$(run_cli create location --db "$DB_PATH" --dataset "$DATASET_ID" --name "Test Location A" --lat -41.2865 --lon 174.7762 --timezone "Pacific/Auckland" --description "Test site A")
LOCATION_A_ID=$(echo "$LOCATION_A_RESULT" | jq -r '.location.id // empty')
if [ -n "$LOCATION_A_ID" ]; then
echo -e "${GREEN}✓${NC} Created location A: $LOCATION_A_ID"
else
echo -e "${RED}✗${NC} Failed to create location A"
echo "$LOCATION_A_RESULT" | jq '.'
exit 1
fi

# Create test location B
echo -n "Creating test location B... "
LOCATION_B_RESULT=$(run_cli create location --db "$DB_PATH" --dataset "$DATASET_ID" --name "Test Location B" --lat -36.8485 --lon 174.7633 --timezone "Pacific/Auckland" --description "Test site B")
LOCATION_B_ID=$(echo "$LOCATION_B_RESULT" | jq -r '.location.id // empty')
if [ -n "$LOCATION_B_ID" ]; then
echo -e "${GREEN}✓${NC} Created location B: $LOCATION_B_ID"
else
echo -e "${RED}✗${NC} Failed to create location B"
echo "$LOCATION_B_RESULT" | jq '.'
exit 1
fi
echo ""

echo "Step 2: Create test CSV file"
echo "-----------------------------"

# Create test CSV with sample data
CSV_FILE="/tmp/test_bulk_import_$$.csv"
LOG_FILE="/tmp/test_bulk_import_$$.log"

cat > "$CSV_FILE" << EOF
location_name,location_id,directory_path,date_range,sample_rate,file_count
Test Location A,$LOCATION_A_ID,/nonexistent/path/a,2024-01,250000,0
Test Location B,$LOCATION_B_ID,/nonexistent/path/b,2024-02,384000,0
EOF

echo -e "${GREEN}✓${NC} Created test CSV at $CSV_FILE"
echo "Contents:"
cat "$CSV_FILE"
echo ""

echo "Step 3: Test bulk_file_import CLI command"
echo "------------------------------------------"

# Note: Directories don't exist, so no files will be imported
# This validates:
# - CSV parsing
# - Location ID validation
# - Cluster auto-creation logic
# - JSON output format

echo "Running bulk import (directories don't exist)..."
IMPORT_RESULT=$(run_cli_with_stderr import bulk --db "$DB_PATH" --dataset "$DATASET_ID" --csv "$CSV_FILE" --log "$LOG_FILE")

# Extract just the JSON output (last lines starting with {)
JSON_OUTPUT=$(echo "$IMPORT_RESULT" | grep -A 100 '^{' | head -20)

# Check for valid JSON output with expected structure
FILES_IMPORTED=$(echo "$JSON_OUTPUT" | jq -r '.files_imported // empty' 2>/dev/null)
if [ -n "$FILES_IMPORTED" ]; then
echo -e "${GREEN}✓${NC} Tool executed successfully"
echo " Files imported: $FILES_IMPORTED"
echo " Total locations: $(echo "$JSON_OUTPUT" | jq -r '.total_locations')"
echo " Processing time: $(echo "$JSON_OUTPUT" | jq -r '.processing_time')"
else
# Check for error
if echo "$IMPORT_RESULT" | grep -qi "error"; then
echo -e "${YELLOW}?${NC} Tool returned error:"
echo "$IMPORT_RESULT" | grep -i "error" | head -3
else
echo -e "${RED}✗${NC} Unexpected result:"
echo "$IMPORT_RESULT" | head -5
fi
fi
echo ""

# Check if log file was created
if [ -f "$LOG_FILE" ]; then
echo -e "${GREEN}✓${NC} Log file created at $LOG_FILE"
echo " Log entries: $(wc -l < "$LOG_FILE")"
rm -f "$LOG_FILE"
else
echo -e "${YELLOW}ℹ${NC} Log file not created (expected if no files processed)"
fi
echo ""

echo "Step 4: Test validation - invalid CSV path"
echo "-------------------------------------------"
INVALID_CSV=$(run_cli_with_stderr import bulk --db "$DB_PATH" --dataset "$DATASET_ID" --csv "/nonexistent/file.csv" --log "$LOG_FILE")
if echo "$INVALID_CSV" | grep -qi "error\|no such file\|not found\|not accessible"; then
echo -e "${GREEN}✓${NC} Correctly rejected non-existent CSV file"
else
echo -e "${RED}✗${NC} Should have rejected non-existent CSV"
echo "$INVALID_CSV" | head -3
fi
echo ""

echo "Step 5: Test validation - invalid dataset ID"
echo "---------------------------------------------"
INVALID_DATASET=$(run_cli_with_stderr import bulk --db "$DB_PATH" --dataset "INVALID_ID_123" --csv "$CSV_FILE" --log "$LOG_FILE")
if echo "$INVALID_DATASET" | grep -qi "error\|not found\|no such\|does not exist"; then
echo -e "${GREEN}✓${NC} Correctly rejected invalid dataset ID"
else
echo -e "${RED}✗${NC} Should have rejected invalid dataset ID"
echo "$INVALID_DATASET" | head -3
fi
echo ""

echo "Step 6: Test validation - missing required flags"
echo "-------------------------------------------------"
MISSING_FLAGS=$(run_cli_with_stderr import bulk --db "$DB_PATH" --dataset "$DATASET_ID")
if echo "$MISSING_FLAGS" | grep -qi "missing\|required"; then
echo -e "${GREEN}✓${NC} Correctly rejected missing required flags"
else
echo -e "${RED}✗${NC} Should have rejected missing required flags"
echo "$MISSING_FLAGS" | head -3
fi
echo ""

echo "=== TEST SUMMARY ==="
echo "Bulk import CLI command validation complete!"
echo "Note: Directory errors are expected (using non-existent paths)"
echo "The test validates CSV parsing and validation logic."
echo ""

# Cleanup
echo "Cleaning up test files..."
rm -f "$CSV_FILE" "$LOG_FILE"
echo -e "${GREEN}✓${NC} Cleanup complete"
echo ""
file addition: data (d--r------)

[0.638309]
file addition: clip-labels_test_data (d--r------)

[0.638309]
file addition: commands.md (---r------)

[0.698617]

For OPSO equivalent output:
```
skraak calls clip-labels --folder . --mapping ./mapping.json \
--clip-duration 5 --clip-overlap 0 --min-label-overlap 0.25 --final-clip full \
--output ./clip_labels.csv
```
clip_labels.csv == clip_labels_opso.csv (reference file, is correct)

For __IGNORE__
```
skraak calls clip-labels --folder . --mapping ./mapping_ignore.json \
--clip-duration 5 --clip-overlap 0 --min-label-overlap 0.25 --final-clip full \
--output ./clip_labels_ignore.csv
```
D03_2022-12-17_20221022_043000.wav should be ignored, but otherwise it should have the same data in it as clip_labels_opso.csv.
file addition: clip_labels_opso.csv.bak (---r------)

[0.698617]

file,start_time,end_time,Kiwi
./D03_2022-12-17_20221022_043000.wav,0.0,5.0,False
./D03_2022-12-17_20221022_043000.wav,5.0,10.0,False
./D03_2022-12-17_20221022_043000.wav,10.0,15.0,False
./D03_2022-12-17_20221022_043000.wav,15.0,20.0,False
./D03_2022-12-17_20221022_043000.wav,20.0,25.0,False
./D03_2022-12-17_20221022_043000.wav,25.0,30.0,False
./D03_2022-12-17_20221022_043000.wav,30.0,35.0,False
./D03_2022-12-17_20221022_043000.wav,35.0,40.0,False
./D03_2022-12-17_20221022_043000.wav,40.0,45.0,False
./D03_2022-12-17_20221022_043000.wav,45.0,50.0,False
./D03_2022-12-17_20221022_043000.wav,50.0,55.0,False
./D03_2022-12-17_20221022_043000.wav,55.0,60.0,False
./D03_2022-12-17_20221022_043000.wav,60.0,65.0,False
./D03_2022-12-17_20221022_043000.wav,65.0,70.0,False
./D03_2022-12-17_20221022_043000.wav,70.0,75.0,False
./D03_2022-12-17_20221022_043000.wav,75.0,80.0,False
./D03_2022-12-17_20221022_043000.wav,80.0,85.0,False
./D03_2022-12-17_20221022_043000.wav,85.0,90.0,False
./D03_2022-12-17_20221022_043000.wav,90.0,95.0,False
./D03_2022-12-17_20221022_043000.wav,95.0,100.0,False
./D03_2022-12-17_20221022_043000.wav,100.0,105.0,False
./D03_2022-12-17_20221022_043000.wav,105.0,110.0,False
./D03_2022-12-17_20221022_043000.wav,110.0,115.0,False
./D03_2022-12-17_20221022_043000.wav,115.0,120.0,False
./D03_2022-12-17_20221022_043000.wav,120.0,125.0,False
./D03_2022-12-17_20221022_043000.wav,125.0,130.0,False
./D03_2022-12-17_20221022_043000.wav,130.0,135.0,False
./D03_2022-12-17_20221022_043000.wav,135.0,140.0,False
./D03_2022-12-17_20221022_043000.wav,140.0,145.0,False
./D03_2022-12-17_20221022_043000.wav,145.0,150.0,False
./D03_2022-12-17_20221022_043000.wav,150.0,155.0,False
./D03_2022-12-17_20221022_043000.wav,155.0,160.0,False
./D03_2022-12-17_20221022_043000.wav,160.0,165.0,False
./D03_2022-12-17_20221022_043000.wav,165.0,170.0,False
./D03_2022-12-17_20221022_043000.wav,170.0,175.0,False
./D03_2022-12-17_20221022_043000.wav,175.0,180.0,False
./D03_2022-12-17_20221022_043000.wav,180.0,185.0,False
./D03_2022-12-17_20221022_043000.wav,185.0,190.0,False
./D03_2022-12-17_20221022_043000.wav,190.0,195.0,False
./D03_2022-12-17_20221022_043000.wav,195.0,200.0,False
./D03_2022-12-17_20221022_043000.wav,200.0,205.0,False
./D03_2022-12-17_20221022_043000.wav,205.0,210.0,False
./D03_2022-12-17_20221022_043000.wav,210.0,215.0,False
./D03_2022-12-17_20221022_043000.wav,215.0,220.0,False
./D03_2022-12-17_20221022_043000.wav,220.0,225.0,False
./D03_2022-12-17_20221022_043000.wav,225.0,230.0,False
./D03_2022-12-17_20221022_043000.wav,230.0,235.0,False
./D03_2022-12-17_20221022_043000.wav,235.0,240.0,False
./D03_2022-12-17_20221022_043000.wav,240.0,245.0,False
./D03_2022-12-17_20221022_043000.wav,245.0,250.0,False
./D03_2022-12-17_20221022_043000.wav,250.0,255.0,False
./D03_2022-12-17_20221022_043000.wav,255.0,260.0,False
./D03_2022-12-17_20221022_043000.wav,260.0,265.0,False
./D03_2022-12-17_20221022_043000.wav,265.0,270.0,False
./D03_2022-12-17_20221022_043000.wav,270.0,275.0,False
./D03_2022-12-17_20221022_043000.wav,275.0,280.0,False
./D03_2022-12-17_20221022_043000.wav,280.0,285.0,False
./D03_2022-12-17_20221022_043000.wav,285.0,290.0,False
./D03_2022-12-17_20221022_043000.wav,290.0,295.0,False
./D03_2022-12-17_20221022_043000.wav,295.0,300.0,False
./D03_2022-12-17_20221022_043000.wav,300.0,305.0,False
./D03_2022-12-17_20221022_043000.wav,305.0,310.0,False
./D03_2022-12-17_20221022_043000.wav,310.0,315.0,False
./D03_2022-12-17_20221022_043000.wav,315.0,320.0,False
./D03_2022-12-17_20221022_043000.wav,320.0,325.0,False
./D03_2022-12-17_20221022_043000.wav,325.0,330.0,False
./D03_2022-12-17_20221022_043000.wav,330.0,335.0,False
./D03_2022-12-17_20221022_043000.wav,335.0,340.0,False
./D03_2022-12-17_20221022_043000.wav,340.0,345.0,False
./D03_2022-12-17_20221022_043000.wav,345.0,350.0,False
./D03_2022-12-17_20221022_043000.wav,350.0,355.0,False
./D03_2022-12-17_20221022_043000.wav,355.0,360.0,False
./D03_2022-12-17_20221022_043000.wav,360.0,365.0,False
./D03_2022-12-17_20221022_043000.wav,365.0,370.0,False
./D03_2022-12-17_20221022_043000.wav,370.0,375.0,False
./D03_2022-12-17_20221022_043000.wav,375.0,380.0,False
./D03_2022-12-17_20221022_043000.wav,380.0,385.0,False
./D03_2022-12-17_20221022_043000.wav,385.0,390.0,False
./D03_2022-12-17_20221022_043000.wav,390.0,395.0,False
./D03_2022-12-17_20221022_043000.wav,395.0,400.0,False
./D03_2022-12-17_20221022_043000.wav,400.0,405.0,False
./D03_2022-12-17_20221022_043000.wav,405.0,410.0,False
./D03_2022-12-17_20221022_043000.wav,410.0,415.0,False
./D03_2022-12-17_20221022_043000.wav,415.0,420.0,False
./D03_2022-12-17_20221022_043000.wav,420.0,425.0,False
./D03_2022-12-17_20221022_043000.wav,425.0,430.0,False
./D03_2022-12-17_20221022_043000.wav,430.0,435.0,False
./D03_2022-12-17_20221022_043000.wav,435.0,440.0,False
./D03_2022-12-17_20221022_043000.wav,440.0,445.0,False
./D03_2022-12-17_20221022_043000.wav,445.0,450.0,False
./D03_2022-12-17_20221022_043000.wav,450.0,455.0,False
./D03_2022-12-17_20221022_043000.wav,455.0,460.0,False
./D03_2022-12-17_20221022_043000.wav,460.0,465.0,False
./D03_2022-12-17_20221022_043000.wav,465.0,470.0,False
./D03_2022-12-17_20221022_043000.wav,470.0,475.0,False
./D03_2022-12-17_20221022_043000.wav,475.0,480.0,False
./D03_2022-12-17_20221022_043000.wav,480.0,485.0,False
./D03_2022-12-17_20221022_043000.wav,485.0,490.0,False
./D03_2022-12-17_20221022_043000.wav,490.0,495.0,False
./D03_2022-12-17_20221022_043000.wav,495.0,500.0,False
./D03_2022-12-17_20221022_043000.wav,500.0,505.0,False
./D03_2022-12-17_20221022_043000.wav,505.0,510.0,False
./D03_2022-12-17_20221022_043000.wav,510.0,515.0,False
./D03_2022-12-17_20221022_043000.wav,515.0,520.0,False
./D03_2022-12-17_20221022_043000.wav,520.0,525.0,False
./D03_2022-12-17_20221022_043000.wav,525.0,530.0,False
./D03_2022-12-17_20221022_043000.wav,530.0,535.0,False
./D03_2022-12-17_20221022_043000.wav,535.0,540.0,False
./D03_2022-12-17_20221022_043000.wav,540.0,545.0,False
./D03_2022-12-17_20221022_043000.wav,545.0,550.0,False
./D03_2022-12-17_20221022_043000.wav,550.0,555.0,False
./D03_2022-12-17_20221022_043000.wav,555.0,560.0,False
./D03_2022-12-17_20221022_043000.wav,560.0,565.0,False
./D03_2022-12-17_20221022_043000.wav,565.0,570.0,False
./D03_2022-12-17_20221022_043000.wav,570.0,575.0,False
./D03_2022-12-17_20221022_043000.wav,575.0,580.0,False
./D03_2022-12-17_20221022_043000.wav,580.0,585.0,False
./D03_2022-12-17_20221022_043000.wav,585.0,590.0,False
./D03_2022-12-17_20221022_043000.wav,590.0,595.0,False
./D03_2022-12-17_20221022_043000.wav,595.0,600.0,False
./D03_2022-12-17_20221022_043000.wav,600.0,605.0,False
./D03_2022-12-17_20221022_043000.wav,605.0,610.0,False
./D03_2022-12-17_20221022_043000.wav,610.0,615.0,False
./D03_2022-12-17_20221022_043000.wav,615.0,620.0,False
./D03_2022-12-17_20221022_043000.wav,620.0,625.0,False
./D03_2022-12-17_20221022_043000.wav,625.0,630.0,False
./D03_2022-12-17_20221022_043000.wav,630.0,635.0,False
./D03_2022-12-17_20221022_043000.wav,635.0,640.0,False
./D03_2022-12-17_20221022_043000.wav,640.0,645.0,False
./D03_2022-12-17_20221022_043000.wav,645.0,650.0,False
./D03_2022-12-17_20221022_043000.wav,650.0,655.0,False
./D03_2022-12-17_20221022_043000.wav,655.0,660.0,False
./D03_2022-12-17_20221022_043000.wav,660.0,665.0,False
./D03_2022-12-17_20221022_043000.wav,665.0,670.0,False
./D03_2022-12-17_20221022_043000.wav,670.0,675.0,False
./D03_2022-12-17_20221022_043000.wav,675.0,680.0,False
./D03_2022-12-17_20221022_043000.wav,680.0,685.0,False
./D03_2022-12-17_20221022_043000.wav,685.0,690.0,False
./D03_2022-12-17_20221022_043000.wav,690.0,695.0,False
./D03_2022-12-17_20221022_043000.wav,695.0,700.0,False
./D03_2022-12-17_20221022_043000.wav,700.0,705.0,False
./D03_2022-12-17_20221022_043000.wav,705.0,710.0,False
./D03_2022-12-17_20221022_043000.wav,710.0,715.0,False
./D03_2022-12-17_20221022_043000.wav,715.0,720.0,False
./D03_2022-12-17_20221022_043000.wav,720.0,725.0,False
./D03_2022-12-17_20221022_043000.wav,725.0,730.0,False
./D03_2022-12-17_20221022_043000.wav,730.0,735.0,False
./D03_2022-12-17_20221022_043000.wav,735.0,740.0,False
./D03_2022-12-17_20221022_043000.wav,740.0,745.0,False
./D03_2022-12-17_20221022_043000.wav,745.0,750.0,False
./D03_2022-12-17_20221022_043000.wav,750.0,755.0,False
./D03_2022-12-17_20221022_043000.wav,755.0,760.0,False
./D03_2022-12-17_20221022_043000.wav,760.0,765.0,False
./D03_2022-12-17_20221022_043000.wav,765.0,770.0,False
./D03_2022-12-17_20221022_043000.wav,770.0,775.0,False
./D03_2022-12-17_20221022_043000.wav,775.0,780.0,False
./D03_2022-12-17_20221022_043000.wav,780.0,785.0,False
./D03_2022-12-17_20221022_043000.wav,785.0,790.0,False
./D03_2022-12-17_20221022_043000.wav,790.0,795.0,False
./D03_2022-12-17_20221022_043000.wav,795.0,800.0,False
./D03_2022-12-17_20221022_043000.wav,800.0,805.0,False
./D03_2022-12-17_20221022_043000.wav,805.0,810.0,False
./D03_2022-12-17_20221022_043000.wav,810.0,815.0,False
./D03_2022-12-17_20221022_043000.wav,815.0,820.0,False
./D03_2022-12-17_20221022_043000.wav,820.0,825.0,False
./D03_2022-12-17_20221022_043000.wav,825.0,830.0,False
./D03_2022-12-17_20221022_043000.wav,830.0,835.0,False
./D03_2022-12-17_20221022_043000.wav,835.0,840.0,False
./D03_2022-12-17_20221022_043000.wav,840.0,845.0,False
./D03_2022-12-17_20221022_043000.wav,845.0,850.0,False
./D03_2022-12-17_20221022_043000.wav,850.0,855.0,False
./D03_2022-12-17_20221022_043000.wav,855.0,860.0,False
./D03_2022-12-17_20221022_043000.wav,860.0,865.0,False
./D03_2022-12-17_20221022_043000.wav,865.0,870.0,False
./D03_2022-12-17_20221022_043000.wav,870.0,875.0,False
./D03_2022-12-17_20221022_043000.wav,875.0,880.0,False
./D03_2022-12-17_20221022_043000.wav,880.0,885.0,False
./D03_2022-12-17_20221022_043000.wav,885.0,890.0,False
./D03_2022-12-17_20221022_043000.wav,890.0,895.0,False
./TF_3-20200512_181509.wav,0.0,5.0,False
./TF_3-20200512_181509.wav,5.0,10.0,False
./TF_3-20200512_181509.wav,10.0,15.0,False
./TF_3-20200512_181509.wav,15.0,20.0,False
./TF_3-20200512_181509.wav,20.0,25.0,False
./TF_3-20200512_181509.wav,25.0,30.0,False
./TF_3-20200512_181509.wav,30.0,35.0,False
./TF_3-20200512_181509.wav,35.0,40.0,False
./TF_3-20200512_181509.wav,40.0,45.0,False
./TF_3-20200512_181509.wav,45.0,50.0,False
./TF_3-20200512_181509.wav,50.0,55.0,False
./TF_3-20200512_181509.wav,55.0,60.0,False
./TF_3-20200512_181509.wav,60.0,65.0,False
./TF_3-20200512_181509.wav,65.0,70.0,False
./TF_3-20200512_181509.wav,70.0,75.0,False
./TF_3-20200512_181509.wav,75.0,80.0,False
./TF_3-20200512_181509.wav,80.0,85.0,False
./TF_3-20200512_181509.wav,85.0,90.0,False
./TF_3-20200512_181509.wav,90.0,95.0,False
./TF_3-20200512_181509.wav,95.0,100.0,False
./TF_3-20200512_181509.wav,100.0,105.0,False
./TF_3-20200512_181509.wav,105.0,110.0,False
./TF_3-20200512_181509.wav,110.0,115.0,False
./TF_3-20200512_181509.wav,115.0,120.0,False
./TF_3-20200512_181509.wav,120.0,125.0,False
./TF_3-20200512_181509.wav,125.0,130.0,False
./TF_3-20200512_181509.wav,130.0,135.0,False
./TF_3-20200512_181509.wav,135.0,140.0,False
./TF_3-20200512_181509.wav,140.0,145.0,False
./TF_3-20200512_181509.wav,145.0,150.0,False
./TF_3-20200512_181509.wav,150.0,155.0,False
./TF_3-20200512_181509.wav,155.0,160.0,False
./TF_3-20200512_181509.wav,160.0,165.0,False
./TF_3-20200512_181509.wav,165.0,170.0,False
./TF_3-20200512_181509.wav,170.0,175.0,False
./TF_3-20200512_181509.wav,175.0,180.0,False
./TF_3-20200512_181509.wav,180.0,185.0,False
./TF_3-20200512_181509.wav,185.0,190.0,False
./TF_3-20200512_181509.wav,190.0,195.0,False
./TF_3-20200512_181509.wav,195.0,200.0,False
./TF_3-20200512_181509.wav,200.0,205.0,False
./TF_3-20200512_181509.wav,205.0,210.0,False
./TF_3-20200512_181509.wav,210.0,215.0,False
./TF_3-20200512_181509.wav,215.0,220.0,False
./TF_3-20200512_181509.wav,220.0,225.0,False
./TF_3-20200512_181509.wav,225.0,230.0,True
./TF_3-20200512_181509.wav,230.0,235.0,True
./TF_3-20200512_181509.wav,235.0,240.0,True
./TF_3-20200512_181509.wav,240.0,245.0,True
./TF_3-20200512_181509.wav,245.0,250.0,True
./TF_3-20200512_181509.wav,250.0,255.0,True
./TF_3-20200512_181509.wav,255.0,260.0,True
./TF_3-20200512_181509.wav,260.0,265.0,True
./TF_3-20200512_181509.wav,265.0,270.0,True
./TF_3-20200512_181509.wav,270.0,275.0,False
./TF_3-20200512_181509.wav,275.0,280.0,False
./TF_3-20200512_181509.wav,280.0,285.0,False
./TF_3-20200512_181509.wav,285.0,290.0,False
./TF_3-20200512_181509.wav,290.0,295.0,False
./TF_3-20200512_181509.wav,295.0,300.0,False
./TF_3-20200512_181509.wav,300.0,305.0,False
./TF_3-20200512_181509.wav,305.0,310.0,False
./TF_3-20200512_181509.wav,310.0,315.0,False
./TF_3-20200512_181509.wav,315.0,320.0,False
./TF_3-20200512_181509.wav,320.0,325.0,False
./TF_3-20200512_181509.wav,325.0,330.0,False
./TF_3-20200512_181509.wav,330.0,335.0,False
./TF_3-20200512_181509.wav,335.0,340.0,False
./TF_3-20200512_181509.wav,340.0,345.0,False
./TF_3-20200512_181509.wav,345.0,350.0,False
./TF_3-20200512_181509.wav,350.0,355.0,False
./TF_3-20200512_181509.wav,355.0,360.0,False
./TF_3-20200512_181509.wav,360.0,365.0,False
./TF_3-20200512_181509.wav,365.0,370.0,False
./TF_3-20200512_181509.wav,370.0,375.0,False
./TF_3-20200512_181509.wav,375.0,380.0,False
./TF_3-20200512_181509.wav,380.0,385.0,False
./TF_3-20200512_181509.wav,385.0,390.0,False
./TF_3-20200512_181509.wav,390.0,395.0,True
./TF_3-20200512_181509.wav,395.0,400.0,True
./TF_3-20200512_181509.wav,400.0,405.0,True
./TF_3-20200512_181509.wav,405.0,410.0,True
./TF_3-20200512_181509.wav,410.0,415.0,True
./TF_3-20200512_181509.wav,415.0,420.0,False
./TF_3-20200512_181509.wav,420.0,425.0,False
./TF_3-20200512_181509.wav,425.0,430.0,False
./TF_3-20200512_181509.wav,430.0,435.0,False
./TF_3-20200512_181509.wav,435.0,440.0,False
./TF_3-20200512_181509.wav,440.0,445.0,False
./TF_3-20200512_181509.wav,445.0,450.0,False
./TF_3-20200512_181509.wav,450.0,455.0,False
./TF_3-20200512_181509.wav,455.0,460.0,False
./TF_3-20200512_181509.wav,460.0,465.0,False
./TF_3-20200512_181509.wav,465.0,470.0,False
./TF_3-20200512_181509.wav,470.0,475.0,False
./TF_3-20200512_181509.wav,475.0,480.0,False
./TF_3-20200512_181509.wav,480.0,485.0,False
./TF_3-20200512_181509.wav,485.0,490.0,False
./TF_3-20200512_181509.wav,490.0,495.0,False
./TF_3-20200512_181509.wav,495.0,500.0,False
./TF_3-20200512_181509.wav,500.0,505.0,False
./TF_3-20200512_181509.wav,505.0,510.0,False
./TF_3-20200512_181509.wav,510.0,515.0,False
./TF_3-20200512_181509.wav,515.0,520.0,False
./TF_3-20200512_181509.wav,520.0,525.0,False
./TF_3-20200512_181509.wav,525.0,530.0,False
./TF_3-20200512_181509.wav,530.0,535.0,False
./TF_3-20200512_181509.wav,535.0,540.0,False
./TF_3-20200512_181509.wav,540.0,545.0,False
./TF_3-20200512_181509.wav,545.0,550.0,False
./TF_3-20200512_181509.wav,550.0,555.0,False
./TF_3-20200512_181509.wav,555.0,560.0,False
./TF_3-20200512_181509.wav,560.0,565.0,False
./TF_3-20200512_181509.wav,565.0,570.0,False
./TF_3-20200512_181509.wav,570.0,575.0,False
./TF_3-20200512_181509.wav,575.0,580.0,False
./TF_3-20200512_181509.wav,580.0,585.0,False
./TF_3-20200512_181509.wav,585.0,590.0,False
./TF_3-20200512_181509.wav,590.0,595.0,False
./TF_3-20200512_181509.wav,595.0,600.0,False
./TF_3-20200512_181509.wav,600.0,605.0,False
./TF_3-20200512_181509.wav,605.0,610.0,False
./TF_3-20200512_181509.wav,610.0,615.0,False
./TF_3-20200512_181509.wav,615.0,620.0,False
./TF_3-20200512_181509.wav,620.0,625.0,False
./TF_3-20200512_181509.wav,625.0,630.0,False
./TF_3-20200512_181509.wav,630.0,635.0,False
./TF_3-20200512_181509.wav,635.0,640.0,False
./TF_3-20200512_181509.wav,640.0,645.0,False
./TF_3-20200512_181509.wav,645.0,650.0,False
./TF_3-20200512_181509.wav,650.0,655.0,False
./TF_3-20200512_181509.wav,655.0,660.0,False
./TF_3-20200512_181509.wav,660.0,665.0,False
./TF_3-20200512_181509.wav,665.0,670.0,False
./TF_3-20200512_181509.wav,670.0,675.0,False
./TF_3-20200512_181509.wav,675.0,680.0,False
./TF_3-20200512_181509.wav,680.0,685.0,False
./TF_3-20200512_181509.wav,685.0,690.0,False
./TF_3-20200512_181509.wav,690.0,695.0,False
./TF_3-20200512_181509.wav,695.0,700.0,False
./TF_3-20200512_181509.wav,700.0,705.0,False
./TF_3-20200512_181509.wav,705.0,710.0,False
./TF_3-20200512_181509.wav,710.0,715.0,False
./TF_3-20200512_181509.wav,715.0,720.0,False
./TF_3-20200512_181509.wav,720.0,725.0,False
./TF_3-20200512_181509.wav,725.0,730.0,False
./TF_3-20200512_181509.wav,730.0,735.0,False
./TF_3-20200512_181509.wav,735.0,740.0,False
./TF_3-20200512_181509.wav,740.0,745.0,False
./TF_3-20200512_181509.wav,745.0,750.0,False
./TF_3-20200512_181509.wav,750.0,755.0,False
./TF_3-20200512_181509.wav,755.0,760.0,False
./TF_3-20200512_181509.wav,760.0,765.0,False
./TF_3-20200512_181509.wav,765.0,770.0,False
./TF_3-20200512_181509.wav,770.0,775.0,False
./TF_3-20200512_181509.wav,775.0,780.0,False
./TF_3-20200512_181509.wav,780.0,785.0,False
./TF_3-20200512_181509.wav,785.0,790.0,False
./TF_3-20200512_181509.wav,790.0,795.0,False
./TF_3-20200512_181509.wav,795.0,800.0,False
./TF_3-20200512_181509.wav,800.0,805.0,False
./TF_3-20200512_181509.wav,805.0,810.0,False
./TF_3-20200512_181509.wav,810.0,815.0,False
./TF_3-20200512_181509.wav,815.0,820.0,False
./TF_3-20200512_181509.wav,820.0,825.0,False
./TF_3-20200512_181509.wav,825.0,830.0,False
./TF_3-20200512_181509.wav,830.0,835.0,False
./TF_3-20200512_181509.wav,835.0,840.0,False
./TF_3-20200512_181509.wav,840.0,845.0,False
./TF_3-20200512_181509.wav,845.0,850.0,False
./TF_3-20200512_181509.wav,850.0,855.0,False
./TF_3-20200512_181509.wav,855.0,860.0,False
./TF_3-20200512_181509.wav,860.0,865.0,False
./TF_3-20200512_181509.wav,865.0,870.0,False
./TF_3-20200512_181509.wav,870.0,875.0,False
./TF_3-20200512_181509.wav,875.0,880.0,False
./TF_3-20200512_181509.wav,880.0,885.0,False
./TF_3-20200512_181509.wav,885.0,890.0,False
./TF_3-20200512_181509.wav,890.0,895.0,False
./TF_3-20200512_181509.wav,895.0,900.0,False
./NB14-2024-05-05-20240125_054500-207-243.wav,0.0,5.0,False
./NB14-2024-05-05-20240125_054500-207-243.wav,5.0,10.0,False
./NB14-2024-05-05-20240125_054500-207-243.wav,10.0,15.0,False
./NB14-2024-05-05-20240125_054500-207-243.wav,15.0,20.0,False
./NB14-2024-05-05-20240125_054500-207-243.wav,20.0,25.0,False
./NB14-2024-05-05-20240125_054500-207-243.wav,25.0,30.0,False
./NB14-2024-05-05-20240125_054500-207-243.wav,30.0,35.0,False
./NB14-2024-05-05-20240125_054500-207-243.wav,30.000124999999997,35.000125,False
file addition: TF_3-20200512_181509.Table.1.selections.txt (---r------)

[0.698617]

Selection View Channel Begin Time (s) End Time (s) Low Freq (Hz) High Freq (Hz) Species Notes
1 Spectrogram 1 1 390 413 100 7900 Kiwi
2 Spectrogram 1 1 225 268 100 7900 Kiwi
file addition: NB14-2024-05-05-20240125_054500-207-243.Table.1.selections.txt (---r------)

[0.698617]

Selection View Channel Begin Time (s) End Time (s) Low Freq (Hz) High Freq (Hz) Species Notes
1 Spectrogram 1 1 0 36 100 7900 Not
file addition: D03_2022-12-17_20221022_043000.Table.1.selections.txt (---r------)

[0.698617]

Selection View Channel Begin Time (s) End Time (s) Low Freq (Hz) High Freq (Hz) Species Notes
1 Spectrogram 1 1 777.7342008523894 860.2406016351827 1110.0 5242.0 LTC
file addition: TESTING.md (----------)

[0.638309]

# Testing the Skraak MCP Server

## Overview

The Skraak MCP Server provides 10 tools across three categories:
- **Read tools (2)**: `get_current_time`, `execute_sql`
- **Write tools (4)**: `create_or_update_dataset`, `create_or_update_location`, `create_or_update_cluster`, `create_or_update_pattern`
- **Import tools (2 MCP)**: `import_audio_files`, `import_ml_selections`

Plus schema resources.

## Test Scripts

All scripts are in `shell_scripts/` and follow a consistent pattern.

### Read-Only Tests (No DB Modification)

```bash
cd shell_scripts

# Time tool (no database needed)
./test_time.sh

# SQL queries and security validation
./test_sql.sh

# Schema resources
./test_resources.sh

# Database integrity check
./test_db_state.sh
```

### Write Tests (Fresh DB Each Run)

These tests create a fresh copy of `skraak.duckdb` in `/tmp` and clean up automatically.

```bash
cd shell_scripts

# Create/update tools (dataset, location, cluster, pattern)
./test_write_tools.sh

# Import tools validation (error handling)
./test_import.sh
```

## Test Library

All tests source `test_lib.sh` for shared functionality:

```bash
source ./test_lib.sh

# Send MCP request
result=$(send_request "tools/call" '{"name":"execute_sql","arguments":{"query":"SELECT 1"}}')

# Run test with automatic tracking
run_test "Test name" "true" "$result" # true = expect success

# Print summary
print_summary
```

### Key Functions

| Function | Description |
|----------|-------------|
| `send_request <method> <params> [db]` | Send single MCP request |
| `send_requests <db> <req1> <req2>...` | Send multiple requests in one session |
| `run_test <name> <expect_pass> <result>` | Track test pass/fail |
| `get_result <response>` | Extract result from response |
| `is_error <response>` | Check if response is error |
| `fresh_test_db` | Create fresh test DB in /tmp |
| `cleanup_test_db <path>` | Remove test DB and temp files |
| `print_summary` | Print test counts |

## Manual JSON-RPC Testing

You can test manually via stdin:

```bash
./skraak mcp --db ./db/test.duckdb
```

Then type JSON-RPC messages (one per line):

### Initialize
```json
{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"1.0"}}}
```

### List Tools
```json
{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
```

### Execute SQL
```json
{"jsonrpc":"2.0","id":3,"method":"tools/call","params":{"name":"execute_sql","arguments":{"query":"SELECT COUNT(*) FROM dataset WHERE active = true"}}}
```

### Create Dataset
```json
{"jsonrpc":"2.0","id":4,"method":"tools/call","params":{"name":"create_or_update_dataset","arguments":{"name":"Test Dataset","type":"test"}}}
```

### Get Schema Resource
```json
{"jsonrpc":"2.0","id":5,"method":"resources/read","params":{"uri":"schema://full"}}
```

## SQL Query Examples

### Basic Queries

```sql
-- Active datasets
SELECT id, name, type FROM dataset WHERE active = true ORDER BY name

-- Parameterized query
SELECT id, name FROM location WHERE dataset_id = ? AND active = true

-- With limit
SELECT * FROM file WHERE active = true LIMIT 100
```

### JOINs

```sql
-- Dataset hierarchy with counts
SELECT d.name, COUNT(l.id) as locations, COUNT(f.id) as files
FROM dataset d
LEFT JOIN location l ON d.id = l.dataset_id
LEFT JOIN cluster c ON l.id = c.location_id
LEFT JOIN file f ON c.id = f.cluster_id
WHERE d.active = true
GROUP BY d.name
```

### Aggregates

```sql
-- Cluster statistics
SELECT COUNT(*) as files, SUM(duration) as total_seconds, AVG(duration) as avg_seconds
FROM file WHERE cluster_id = ? AND active = true
```

## Running Go Unit Tests

```bash
# All tests
go test ./...

# Specific package
go test ./utils/

# With coverage
go test -cover ./...

# Coverage report
go test -coverprofile=coverage.out ./utils/
go tool cover -html=coverage.out
```

**Test coverage**: 91.5% across 170+ tests

## Troubleshooting

| Issue | Solution |
|-------|----------|
| "skraak binary not found" | Run `go build` in project root |
| "Database not found" | Check path or use default |
| "Error: --db is required" | MCP command needs `--db path` |
| JSON parsing errors | Each message must be on one line |
| No response | Server outputs to stdout; check for errors in stderr |
| Test output too large | Tests print summary, not full output |

## Best Practices

1. **Run from shell_scripts directory**: Scripts use relative paths
2. **Use test.duckdb for manual testing**: Never use skraak.duckdb
3. **Write tests auto-clean**: They use /tmp and trap EXIT
4. **Check exit codes**: Tests return 0 on success, 1 on failure
5. **Run all tests before committing**: Ensures nothing is broken
file addition: README.md (----------)

[0.638309]

# Shell Test Scripts

Comprehensive test suite for the Skraak MCP Server.

## Quick Start

```bash
cd shell_scripts

# Run all tests (recommended)
./test_time.sh && ./test_sql.sh && ./test_resources.sh && \
./test_write_tools.sh && ./test_import.sh && ./test_db_state.sh && \
./test_sql_limit.sh && ./test_export.sh && ./test_event_log.sh && \
./test_calls_from_preds.sh

# Or run individually
./test_time.sh # Time tool (no DB needed)
./test_sql.sh # SQL queries
./test_resources.sh # Schema resources
./test_write_tools.sh # Create/update tools (fresh DB)
./test_import.sh # Import tools validation (fresh DB)
./test_bulk_import.sh # Bulk import CLI validation (to be implemented)
./test_db_state.sh # Database integrity check
./test_sql_limit.sh # SQL row limit enforcement
./test_export.sh # Dataset export (fresh DB) #######
./test_event_log.sh # Transaction event logging
./test_calls_from_preds.sh # Prediction file import
```

## Test Categories

### Read-Only Tests (Safe, Repeatable)

These tests read from the database and don't modify it. Run as many times as you want.

| Script | Description | Default DB |
|--------|-------------|------------|
| `test_time.sh` | Test `get_current_time` tool | None |
| `test_sql.sh` | Test `execute_sql` queries, security | test.duckdb |
| `test_resources.sh` | Test schema resources | test.duckdb |
| `test_db_state.sh` | Verify database integrity | test.duckdb |

### Write Tests (Fresh DB Each Run)

These tests modify the database. They automatically create a fresh copy of the production database in `/tmp` and clean up afterward.

| Script | Description | DB Handling |
|--------|-------------|-------------|
| `test_write_tools.sh` | Test `create_or_update_*` tools | Fresh DB in /tmp |
| `test_import.sh` | Test import tools validation | Fresh DB in /tmp |
| `test_bulk_import.sh` | Test bulk import CLI command | test.duckdb |

## Database Safety

- **Read-only tests**: Use `test.duckdb` (default) or specify path
- **Write tests**: Automatically create fresh DB from `skraak.duckdb` → `/tmp/skraak_test_$$.duckdb`
- **Never touches production**: Write tests are isolated

## Test Library

All scripts source `test_lib.sh` which provides:

- `send_request` - Send MCP request and get response
- `run_test` - Run test with pass/fail tracking
- `print_summary` - Print test results
- `fresh_test_db` - Create fresh test database
- `cleanup_test_db` - Clean up test database

## Running Individual Tests

```bash
# With default test database
./test_sql.sh

# With specific database
./test_sql.sh /path/to/database.duckdb

# Write tests always use fresh DB (no argument needed)
./test_write_tools.sh
```

## Expected Output

Each test prints:
- Test names with ✓ (pass) or ✗ (fail)
- Summary with counts
- Exit code 0 on success, 1 on failure

```
=== Testing execute_sql Tool ===
✓ Simple SELECT
✓ SELECT with limit
✓ Parameterized query
✓ JOIN query
✓ Aggregate query
✓ CTE query
✓ INSERT blocked (correctly rejected)
✓ SQL injection blocked (correctly rejected)
✓ DELETE blocked (correctly rejected)

=== Summary ===
Tests run: 9
Passed: 9
Failed: 0
```

## See Also

- `TESTING.md` - Comprehensive testing documentation
- `test_lib.sh` - Shared test functions
file addition: me.txt (----------)

[2.1]

To Do
=====
Tomtit - Gemma

Go through birdnet categories sample and try to work out what they are
Loop through making changes, Ralph loop

Look at kiwi dataset

New Dataset

test database line update with index+fk v fk only

Read audio tool (pointless atm as most models can't use it)

Bounding Box script.py

to one hot encoded csv for opensoundscape (because python is so slow, and I would have to convert to raven selection.txt first)

day -> civil sunrise to !!civil sunset!!
claude --resume "reject-reserved-key-bindings"

multi label in tui. How?? also cli

Clip from wav when no .data file—skraak save image????

find morepork mewing sound for dataset

segment unstructured import into batches of 10000 files to keep within buffer limits, structured imports should be fine as we are talking 1 sd card (24/7 its 16000 max)

ingest my training datasets

buy a drive to backup mac ~

Update tools could allow setting active to false?? Currently do not

Make freebird to .data tool

SKILLS
======

project/.claude/skills for most then link to project/.agents/skills for pi with:
find .claude/skills -type f -exec bash -c 'mkdir -p "$(dirname ".agents/skills/${1#.claude/skills/}")" && ln -s "$PWD/$1" "$PWD/.agents/skills/${1#.claude/skills/}"' _ {} \;

pi-specific are in ~ somewhere (ok because keeps them seperate) if installed with eg: $pi install npm:@tmustier/pi-ralph-wiggum

call-library: currently have a hard copy in .claude and .pi as I want to edit them in .pi

Labels in opensoundscape multi-species model
=====================
ausbit1 Australasian Bittern
bluduc1 Blue Duck
comcha Common Chaffinch
comred Redpoll (Common)
dunnoc1 Dunnock
eurbla Eurasian Blackbird
eursta European Starling
fernbi1 New Zealand Fernbird
grskiw1 Great Spotted Kiwi/Roroa
gryger1 Gray Gerygone/Grey Warbler
kea1 Kea
liskiw1 Little Spotted Kiwi/Kiwi pukupuku
lotkoe1 Long-tailed Koel/Cuckoo
morepo2 Morepork
nezbel1 New Zealand Bellbird
nezfan1 New Zealand Fantail/Piwakawaka
nezkak1 New Zealand Kaka
nezpig2 New Zealand Pigeon/Kereru
nezrob3 South Island Robin/Kakaruai
nibkiw1 North Island Brown Kiwi/Kiwi-nui
okbkiw1 Okarito Brown Kiwi/Rowi
parake parakeet sp./Kakariki
pipipi1 Pipipi/Brown Creeper
riflem1 Rifleman
saddle3 South Island Saddleback?Tieke
shbcuc1 Shining Bronze-Cuckoo
silver3 Silvereye
sobkiw2 Southern Brown Kiwi (South I.)/Tokoeka
soioys1 South Island Oystercatcher
soiwre1 South Island Wren
sonthr1 Song Thrush
spocra2 Spotless Crake
tomtit1 Tomtit/Miromiro
tui1 Tui
varoys1 Variable Oystercatcher
weka1 Weka
yellow2 Yellowhammer
weta Weta (not bird)
cangoo1 Canada Goose

# Active DB Labels ebird_code
------------------ ----------
Australasian Bittern ausbit1 x
Bellbird nezbel1 x
Chaffinch comcha x
Crake_Spotless spocra2 x
Cuckoo_Shining shbcuc1 x
Duck_Blue_Whio bluduc1 x
Dunnock_Hedge_Sparrow dunnoc1 x
Eurasian Blackbird eurbla x
European Starling eursta x
Fantail nezfan1 x
Fernbird fernbi1 x
Haast Tokoeka sobkiw2 x
Kaka nezkak1 x
Kea kea1 x
Kereru nezpig2 x
Kiwi pukupuku liskiw1 x
Kiwi_Nth_Is_Brown nibkiw1 x
Long-tailed Koel lotkoe1 x
Morepork morepo2 x
Oystercatcher_Variable varoys1 x
Parakeet parake x
Pipipi pipipi1 x Brown Creeper
Redpoll comred x
Rifleman riflem1 x
Robin_Sth_Is nezrob3 x
Roroa grskiw1 x
Rowi okbkiw1 x
S. Fiordland Tokoeka sobkiw1 x
Saddleback_Sth_Is saddle3 x
Silvereye silver3 x
South Island Oystercatcher soioys1 x
South Island Wren soiwre1 X
Thrush_Song sonthr1 x
Tomtit tomtit1 x
Tui tui1 x
Warbler_Grey gryger1 x
Weka_spp weka1 x
Yellowhammer yellow2 x
Check
Don't Know
Fake Kiwi
Korero Gecko x
Question
Weta x
Noise

Keybindings
===========
see ~/.skraak/config.json

TUI cmd
=======
skraak calls classify --folder . --filter opensoundscape-multi-1.0 --species comcha

David's Kiwi Workflow
=====================

- cp data to main drives
- backup audio
- skraak import bulk to get files into db
- Run opensoundscape models on audio
- skraak calls from-preds to make .data files
- Run julia DFMN model (also LSK model for Inge)
- skraak calls classify TUI for kiwi on 1 model
- use minimax to check "Don't Know"
- skraak calls propogate on other models
- use minimax on cert 70 Kiwi and maybe Don't Know
- skraak calls classify on remaining cert 70 Kiwi
- skraak calls classify --sample 10 on cert 90 Kiwi
- skraak calls push-certainty on remaining cert 90 Kiwi if all good
- use minimax skill /detect-anomalies to correct problems
- skraak calls classify to resolve certainty mismatches
- skraak calls summarise
- run skill /data-mapping
- run skill /import-segments

Code stuff
==========

time ./skraak calls from-preds --csv /media/david/SSD4/Twenty_Four_Seven/R620/2024-05-06/preds9_opensoundscape-multi-1.0_2025-07-22.csv > /media/david/SSD4/Twenty_Four_Seven/R620/2024-05-06/preds9_opensoundscape-multi-1.0_2025-07-22.json

for item in a
try
jsonfile = replace(item, ".csv" => ".json")
run(pipeline(`skraak calls from-preds --csv $item --gap-multiplier 3 --min-detections 1`, jsonfile))
catch e
@error "skraak failed on $item" exception=(e, catch_backtrace())
end
end

model = "/media/david/SSD2/Secondary_Models/DFMN_Inge/model_DFMN1-5_CPU_epoch-9-0.9737-2024-10-25.jld2"
labels = Dict(1 => "Duet", 2 => "Female", 3 => "Male", 4 => "Don't Know")
## Check this logic in the code
predict(a, model, labels)

model = "/media/david/SSD2/Secondary_Models/LSK/model_GSK_LSK_DFM_FT_IngeDFMN_1-5_1-0_CPU_epoch-9-0.9745-2025-01-13.jld2"
labels = Dict(1 => "GSK", 2 => "GSK", 3 => "GSK", 4 => "LSK", 5 => "LSK", 6 => "LSK")
## Needed to change the logic
predict(a, model, labels)

model = "/media/david/SSD2/Secondary_Models/DFMN_Pomona/model_DFMN1-5_Pomona3_CPU_epoch-18-0.9785-2025-03-02.jld2"
labels = Dict(1 => "Duet", 2 => "Female", 3 => "Gecko", 4 => "Male", 5 => "Don't Know")
## Check this logic in the code
predict(a, model, labels)

## Change the date
for item in x
try
jsonfile = "$item/segment_summary_2026-04-19.json"
run(pipeline(`skraak calls summarise --folder $item`, jsonfile))
catch e
@error "skraak failed on $item" exception=(e, catch_backtrace())
end
end

skraak calls summarise --folder ./recordings --brief

# print brief summary to repl
for item in a
try
run(pipeline(`skraak calls summarise --folder $item --brief`))
catch e
@error "skraak failed on $item" exception=(e, catch_backtrace())
end
end

# save brief summary to cwd
open("/home/david/summary_2026-04-17.jsonl", "w") do f
for item in a
try
run(pipeline(`skraak calls summarise --folder $item`, `jq 'del(.segments)'`, f))
catch e
@error "skraak failed on $item" exception=(e, catch_backtrace())
end
end
end

OLLAMA
======
ollama run gemma4:e4b
ollama launch pi --model gemma4:e4b # don't do this, it alters pi config

ollama run qwen3.5:9b # uninstalled

ollama list

ollama rm <model-name>
ollama rm qwen3.5:9b

R620/2024-05-06 only
Run Through Gemma

Opensoundscape Hand Classified BirdNET Hand Classified
============== =============== ======= ===============
comcha X X X
eurbla X X X
gryger1 X X none? X White-throated Sparrow (auto), Gray Gerygone
nezfan1 X X NZ Fantail
tomtit1 V. Bad garbage X
nezrob1 X X SI Robin (no types)
kereru
rifleman
silvereye
bellbird
tui
nezkak1 V. Bad(gecko, wing) V Bad, ongoing bellbird
weka1 V. Bad(noise) none
morepo2 many Gecko Also Gecko
lotkoe1 X X X

┌──────┬───────────────────────────┬───────┐
│ Rank │ Species │ Count │
├──────┼───────────────────────────┼───────┤
│ 1 │ White-throated Sparrow │ 5163 │ Gryger
├──────┼───────────────────────────┼───────┤
│ 2 │ New Zealand Bellbird │ 3812 │
├──────┼───────────────────────────┼───────┤
│ 3 │ Superb Lyrebird │ 3645 │ nezbel1+territorial
├──────┼───────────────────────────┼───────┤
│ 4 │ Common Crossbill │ 3247 │
├──────┼───────────────────────────┼───────┤
│ 5 │ Javan Shortwing │ 2824 │
├──────┼───────────────────────────┼───────┤
│ 6 │ Grey Gerygone │ 2286 │ Gryger
├──────┼───────────────────────────┼───────┤
│ 7 │ Yellow-bellied Flycatcher │ 1018 │
├──────┼───────────────────────────┼───────┤
│ 8 │ Tui │ 1004 │
├──────┼───────────────────────────┼───────┤
│ 9 │ Common Redpoll │ 949 │
├──────┼───────────────────────────┼───────┤
│ 10 │ Winter Wren │ 932 │
├──────┼───────────────────────────┼───────┤
│ 11 │ Blue-backed Manakin │ 784 │
├──────┼───────────────────────────┼───────┤
│ 12 │ Hermit Thrush │ 762 │
├──────┼───────────────────────────┼───────┤
│ 13 │ Blue Whistling-Thrush │ 728 │
├──────┼───────────────────────────┼───────┤
│ 14 │ Eastern Wood-Pewee │ 712 │
├──────┼───────────────────────────┼───────┤
│ 15 │ Common Nightingale │ 678 │
├──────┼───────────────────────────┼───────┤
│ 16 │ Red-breasted Flycatcher │ 678 │
├──────┼───────────────────────────┼───────┤
│ 17 │ New Zealand Kaka │ 639 │
├──────┼───────────────────────────┼───────┤
│ 18 │ Common Firecrest │ 608 │
├──────┼───────────────────────────┼───────┤
│ 19 │ New Zealand Fantail │ 583 │ X
├──────┼───────────────────────────┼───────┤
│ 20 │ Tomtit │ 570 │ X
├──────┼───────────────────────────┼───────┤
│ 21 │ Eurasian Golden Oriole │ 548 │
├──────┼───────────────────────────┼───────┤
│ 22 │ Musician Wren │ 526 │
├──────┼───────────────────────────┼───────┤
│ 23 │ White-browed Warbler │ 497 │
├──────┼───────────────────────────┼───────┤
│ 24 │ Cedar Waxwing │ 487 │
├──────┼───────────────────────────┼───────┤
│ 25 │ Iberian Chiffchaff │ 473 │
├──────┼───────────────────────────┼───────┤
│ 26 │ Common Redstart │ 461 │
├──────┼───────────────────────────┼───────┤
│ 27 │ European Greenfinch │ 454 │
├──────┼───────────────────────────┼───────┤
│ 28 │ Wood Thrush │ 432 │
├──────┼───────────────────────────┼───────┤
│ 29 │ Pheasant Cuckoo │ 427 │
├──────┼───────────────────────────┼───────┤
│ 30 │ Western Wood-Pewee │ 399 │
└──────┴───────────────────────────┴───────┘

skraak calls summarise --folder . > call_summary.json

# mapping.json for my big kiwi dataset
{
"Kiwi": {"species": "Kiwi"},
"Geese": {"species": "__NEGATIVE__"},
"Kaka": {"species": "__NEGATIVE__"},
"Kea": {"species": "__NEGATIVE__"},
"LTC": {"species": "__NEGATIVE__"},
"Morepork": {"species": "__NEGATIVE__"},
"Not": {"species": "__NEGATIVE__"},
"Plover": {"species": "__NEGATIVE__"}
}

# make csv to use for training big kiwi dataset
skraak calls clip-labels --folder . --mapping ./mapping.json \
--clip-duration 5 --clip-overlap 0 --min-label-overlap 0.25 --final-clip full \
--output ./clip_labels.csv

Lets manually execute this loop once, when we are happy we will design a ralph loop together to loop through remaining BirdNET classes /grill-me

# Retrieve BirdNET List from folder /media/david/SSD4/Twenty_Four_Seven/R620/2024-05-06/
skraak calls summarise --folder /media/david/SSD4/Twenty_Four_Seven/R620/2024-05-06/ --brief --filter BirdNET 2>/dev/null | jq -r '.filters.BirdNET.species | to_entries | map(select(.key | test("^[A-Z]"))) | sort_by(.value) | .[] | "\(.value)\t\(.key)"'

Start from the top of the BirdNET List and attempt to label a BirdNET class with one of the classes below using skill /call-classification, /call-classification-ollama, /call-library

While there is only a few segments in the BirdNET class, attempt to do this yourself reading data from /call-classification, /call-library. It is your role to work out what this class actually is (BirdNET mislabels many New Zealand Birds), and to assign correct labels wherever possible

When there are many segments in a BirdNET class use skill /call-classification-ollama. choose your reference images carefully. It is your role to work out what this class actually is (BirdNET mislabels many New Zealand Birds), then to use gemma to do the heavy lifting. If Gemma does a poor job of it, it is likely you have chosen the wrong species class.

Keep a .md document with your mappings, BirdNET => code as below

Common Bird List for R620
=========================
comcha Chaffinch
eurbla Blackbird
gryger1 Grey Warbler
kea1 Kea
lotkoe1 Long-tailed Cuckoo
morepo2 Morepork
nezbel1 Bellbird
nezfan1 Fantail
nezkak1 Kaka
nezpig2 Kereru
nezrob3 Kakaruai
pipipi1 Pipipi
riflem1 Rifleman
saddle3 Tieke
silver3 Silvereye
sobkiw2 Fiordland Tokoeka
soioys1 Pied Oystercatcher
tomtit1 Tomtit
tui1 Tui
yefpar3 Kakariki
weta Weta
gecko Korero Gecko

You have access to skills /pi-ralph-wiggum to loop through the BirdNET list, and pi-heartbeat, to set a timer.

Lets manually execute this loop once, when we are happy we will design a ralph loop together to loop through remaining BirdNET classes /grill-me

Category A - Direct/Obvious Mappings

┌──────────────────────┬────────┬─────────┬───────────────────────────────────────┐
│ BirdNET │ Count │ Code │ Notes │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ New Zealand Bellbird │ 3,812 │ nezbel1 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Grey Gerygone │ 2,286 │ gryger1 │ BirdNET's name for Grey Warbler │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Tui │ 1,004 │ tui1 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ New Zealand Kaka │ 603 │ nezkak1 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Morepork │ 287 │ morepo2 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Silvereye │ 248 │ silver3 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Pipipi │ 79 │ pipipi1 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Long-tailed Koel │ 47 │ lotkoe1 │ BirdNET's name for Long-tailed Cuckoo │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Eurasian Blackbird │ 27 │ eurbla │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ North Island Robin │ 132 │ nezrob3 │ Robin = Kakaruai │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ European Robin │ 124 │ nezrob3 │ Same species │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Dunnock │ 89 │ dunnoc1 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Song Thrush │ 173 │ sonthr1 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Common Redpoll │ 949 │ comred │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Common Starling │ 1 │ eursta │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Yellowhammer │ 4 │ yellow2 │ Exact match │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ House Sparrow │ 36 │ — │ House Sparrow not on R620 common list │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Common Magpie │ 320 │ — │ Magpie not on R620 common list │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Eurasian Skylark │ 5 │ — │ Not on R620 list │
├──────────────────────┼────────┼─────────┼───────────────────────────────────────┤
│ Total │ ~9,779 │ │ │
└──────────────────────┴────────┴─────────┴───────────────────────────────────────┘

────────────────────────────────────────────────────────────────────────────────

Category B - Real Mislabels (need classification)

These are BirdNET labels that don't match any NZ species name, and the segments are actually NZ
birds:

┌─────────────────────────────────────┬─────────┬─────────────────────────────────┬──────────┐
│ BirdNET │ Count │ Suspected Code(s) │ Priority │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Common Crossbill │ 3,247 │ comred? comcha? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Javan Shortwing │ 2,824 │ tomtit1? nezrob3? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Yellow-bellied Flycatcher │ 1,018 │ nezfan1? tomtit1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Winter Wren │ 932 │ pipipi1? riflem1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Blue-backed Manakin │ 784 │ riflem1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Hermit Thrush │ 762 │ eurbla? sonthr1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Blue Whistling-Thrush │ 728 │ eurbla? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Eastern Wood-Pewee │ 712 │ tomtit1? nezfan1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Common Nightingale │ 678 │ nezrob3? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Red-breasted Flycatcher │ 678 │ tomtit1? nezfan1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Common Firecrest │ 608 │ silver3? riflem1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Eurasian Golden Oriole │ 548 │ tui1? nezbel1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Musician Wren │ 526 │ pipipi1? │ 🔴 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ White-browed Warbler │ 497 │ gryger1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Cedar Waxwing │ 487 │ eursta? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Iberian Chiffchaff │ 473 │ gryger1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Common Redstart │ 461 │ nezrob3? tomtit1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ European Greenfinch │ 454 │ comcha? comred? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Wood Thrush │ 432 │ eurbla? sonthr1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Pheasant Cuckoo │ 427 │ lotkoe1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Western Wood-Pewee │ 399 │ tomtit1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Greater Racket-tailed Drongo │ 376 │ ? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ White-eared Honeyeater │ 358 │ nezbel1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Broad-winged Hawk │ 351 │ Harrier? (not on list) │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Northern Pygmy-Owl │ 347 │ morepo2? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Black-capped Chickadee │ 345 │ ? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Bartlett's Tinamou │ 344 │ ? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Northern Saw-whet Owl │ 344 │ morepo2? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Varied Thrush │ 332 │ eurbla? sonthr1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Black-faced Antthrush │ 330 │ ? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Lesser Redpoll │ 324 │ comred │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Goldcrest │ 298 │ silver3? riflem1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Eurasian Pygmy-Owl │ 286 │ morepo2? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Common Chiffchaff │ 280 │ gryger1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Eurasian Siskin │ 270 │ comred? comcha? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ White-throated Gerygone │ 263 │ gryger1? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Two-barred Crossbill │ 262 │ comred? comcha? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Grey Shrikethrush │ 260 │ ? │ 🟡 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Little Friarbird │ 166 │ nezbel1? │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Great Tit │ 165 │ tomtit1? │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Golden-bellied Gerygone │ 161 │ gryger1? │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Red Wattlebird │ 151 │ nezbel1? │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Common Kingfisher │ 133 │ — (Kingfisher not on R620 list) │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Rufous Whistler │ 11 │ ? │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Rock Wren │ 15 │ — (Rock Wren not on R620 list) │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Nightingale Wren │ 159 │ ? │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Little Spiderhunter │ 117 │ ? │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ ... and ~1,400 more with count < 10 │ │ │ 🟢 │
├─────────────────────────────────────┼─────────┼─────────────────────────────────┼──────────┤
│ Total │ ~38,000 │ │ │
└─────────────────────────────────────┴─────────┴─────────────────────────────────┴──────────┘
file addition: main.go (----------)

[2.1]

package main

import (
"fmt"
"os"

"skraak/cmd"
)

func main() {
if len(os.Args) < 2 {
printUsage()
os.Exit(1)
}

switch os.Args[1] {
case "import":
cmd.RunImport(os.Args[2:])
case "sql":
cmd.RunSQL(os.Args[2:])
case "create":
cmd.RunCreate(os.Args[2:])
case "update":
cmd.RunUpdate(os.Args[2:])
// Legacy commands removed - use create/update instead
// case "dataset":
// cmd.RunDataset(os.Args[2:])
// case "location":
// cmd.RunLocation(os.Args[2:])
// case "cluster":
// cmd.RunCluster(os.Args[2:])
// case "pattern":
// cmd.RunPattern(os.Args[2:])
case "export":
cmd.RunExport(os.Args[2:])
case "replay":
cmd.RunReplay(os.Args[2:])
case "calls":
cmd.RunCalls(os.Args[2:])
case "xxhash":
cmd.RunXXHash(os.Args[2:])
case "metadata":
cmd.RunMetadata(os.Args[2:])
case "time":
cmd.RunTime(os.Args[2:])
case "isnight":
cmd.RunIsNight(os.Args[2:])
case "prepend":
cmd.RunPrepend(os.Args[2:])
default:
fmt.Fprintf(os.Stderr, "Unknown command: %s\n\n", os.Args[1])
printUsage()
os.Exit(1)
}
}

// printUsage displays command-line usage information for all available commands
func printUsage() {
fmt.Fprintf(os.Stderr, "Usage: %s <command> [options]\n\n", os.Args[0])
fmt.Fprintf(os.Stderr, "Commands:\n")
fmt.Fprintf(os.Stderr, " sql Execute SQL query\n")
fmt.Fprintf(os.Stderr, " calls Extract/analyze bird calls (from-preds, from-brida, from-raven, show-images, classify, summarise)\n")
fmt.Fprintf(os.Stderr, " create Create a new resource (dataset, location, cluster, pattern)\n")
fmt.Fprintf(os.Stderr, " update Update an existing resource (dataset, location, cluster, pattern)\n")
fmt.Fprintf(os.Stderr, " import Import data (folder, bulk, unstructured, segments)\n")
fmt.Fprintf(os.Stderr, " export Export dataset to new database\n")
fmt.Fprintf(os.Stderr, " replay Replay event log into database\n")
fmt.Fprintf(os.Stderr, " xxhash Compute XXH64 hash of a file\n")
fmt.Fprintf(os.Stderr, " metadata Extract WAV file metadata\n")
fmt.Fprintf(os.Stderr, " time Get current time\n")
fmt.Fprintf(os.Stderr, " isnight Check if WAV file was recorded at night\n")
fmt.Fprintf(os.Stderr, " prepend Prepend prefix to WAV files and log.txt\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " %s sql --db ./db/skraak.duckdb \"SELECT COUNT(*) FROM file WHERE active = true\"\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s create dataset --db ./db/skraak.duckdb --name \"Test Dataset\"\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s update location --db ./db/skraak.duckdb --id loc123 --name \"New Name\"\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s replay events --db ./backup.duckdb --log ./skraak.duckdb.events.jsonl\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s calls from-preds --csv predictions.csv > calls.json\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s xxhash --file recording.wav\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s metadata --file recording.wav\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s time\n", os.Args[0])
fmt.Fprintf(os.Stderr, " %s isnight --file recording.wav --lat -36.85 --lng 174.76\n", os.Args[0])
}
file addition: lint_test.go (----------)

[2.1]

package main

import (
"os/exec"
"testing"
)

func TestGolangciLint(t *testing.T) {
cmd := exec.Command("golangci-lint", "run", "./...")
cmd.Dir = "."
out, err := cmd.CombinedOutput()
if err != nil {
t.Errorf("golangci-lint failed:\n%s", out)
}
}

func TestGoFmt(t *testing.T) {
cmd := exec.Command("go", "fmt", "./...")
cmd.Dir = "."
out, err := cmd.CombinedOutput()
if err != nil {
t.Errorf("go fmt failed: %v\n%s", err, out)
}
if len(out) > 0 {
t.Errorf("go fmt produced output (files needed formatting). Run 'go fmt ./...' to fix:\n%s", out)
}
}

func TestDeadcode(t *testing.T) {
cmd := exec.Command("deadcode", "./...")
cmd.Dir = "."
out, err := cmd.CombinedOutput()
if err != nil {
t.Errorf("deadcode failed:\n%s", out)
}
}
file addition: go.sum (----------)

[2.1]

charm.land/bubbletea/v2 v2.0.6 h1:UHN/91OyuhaOFGSrBXQ/hMZD8IO1Uc4BvHlgHXL2WJo=
charm.land/bubbletea/v2 v2.0.6/go.mod h1:MH/D8ZLlN3op37vQvijKuU29g3rqTp+aQapURFonF9g=
charm.land/lipgloss/v2 v2.0.3 h1:yM2zJ4Cf5Y51b7RHIwioil4ApI/aypFXXVHSwlM6RzU=
charm.land/lipgloss/v2 v2.0.3/go.mod h1:7myLU9iG/3xluAWzpY/fSxYYHCgoKTie7laxk6ATwXA=
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/apache/arrow-go/v18 v18.5.1 h1:yaQ6zxMGgf9YCYw4/oaeOU3AULySDlAYDOcnr4LdHdI=
github.com/apache/arrow-go/v18 v18.5.1/go.mod h1:OCCJsmdq8AsRm8FkBSSmYTwL/s4zHW9CqxeBxEytkNE=
github.com/apache/thrift v0.22.0 h1:r7mTJdj51TMDe6RtcmNdQxgn9XcyfGDOzegMDRg47uc=
github.com/apache/thrift v0.22.0/go.mod h1:1e7J/O1Ae6ZQMTYdy9xa3w9k+XHWPfRvdPyJeynQ+/g=
github.com/aymanbagabas/go-udiff v0.4.1 h1:OEIrQ8maEeDBXQDoGCbbTTXYJMYRCRO1fnodZ12Gv5o=
github.com/aymanbagabas/go-udiff v0.4.1/go.mod h1:0L9PGwj20lrtmEMeyw4WKJ/TMyDtvAoK9bf2u/mNo3w=
github.com/bits-and-blooms/bitset v1.24.4 h1:95H15Og1clikBrKr/DuzMXkQzECs1M6hhoGXLwLQOZE=
github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/charmbracelet/colorprofile v0.4.3 h1:QPa1IWkYI+AOB+fE+mg/5/4HRMZcaXex9t5KX76i20Q=
github.com/charmbracelet/colorprofile v0.4.3/go.mod h1:/zT4BhpD5aGFpqQQqw7a+VtHCzu+zrQtt1zhMt9mR4Q=
github.com/charmbracelet/ultraviolet v0.0.0-20260416155717-489999b90468 h1:Q9fO0y1Zo5KB/5Vu8JZoLGm1N3RzF9bNj3Ao3xoR+Ac=
github.com/charmbracelet/ultraviolet v0.0.0-20260416155717-489999b90468/go.mod h1:bAAz7dh/FTYfC+oiHavL4mX1tOIBZ0ZwYjSi3qE6ivM=
github.com/charmbracelet/x/ansi v0.11.7 h1:kzv1kJvjg2S3r9KHo8hDdHFQLEqn4RBCb39dAYC84jI=
github.com/charmbracelet/x/ansi v0.11.7/go.mod h1:9qGpnAVYz+8ACONkZBUWPtL7lulP9No6p1epAihUZwQ=
github.com/charmbracelet/x/exp/golden v0.0.0-20250806222409-83e3a29d542f h1:pk6gmGpCE7F3FcjaOEKYriCvpmIN4+6OS/RD0vm4uIA=
github.com/charmbracelet/x/exp/golden v0.0.0-20250806222409-83e3a29d542f/go.mod h1:IfZAMTHB6XkZSeXUqriemErjAWCCzT0LwjKFYCZyw0I=
github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
github.com/charmbracelet/x/termios v0.1.1 h1:o3Q2bT8eqzGnGPOYheoYS8eEleT5ZVNYNy8JawjaNZY=
github.com/charmbracelet/x/termios v0.1.1/go.mod h1:rB7fnv1TgOPOyyKRJ9o+AsTU/vK5WHJ2ivHeut/Pcwo=
github.com/charmbracelet/x/windows v0.2.2 h1:IofanmuvaxnKHuV04sC0eBy/smG6kIKrWG2/jYn2GuM=
github.com/charmbracelet/x/windows v0.2.2/go.mod h1:/8XtdKZzedat74NQFn0NGlGL4soHB0YQZrETF96h75k=
github.com/clipperhouse/displaywidth v0.11.0 h1:lBc6kY44VFw+TDx4I8opi/EtL9m20WSEFgwIwO+UVM8=
github.com/clipperhouse/displaywidth v0.11.0/go.mod h1:bkrFNkf81G8HyVqmKGxsPufD3JhNl3dSqnGhOoSD/o0=
github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk=
github.com/clipperhouse/uax29/v2 v2.7.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/duckdb/duckdb-go-bindings v0.10502.0 h1:Uhg/dfvPLQv4cH35lMD48hqUcdOh2Z7bcuykjr4qnOA=
github.com/duckdb/duckdb-go-bindings v0.10502.0/go.mod h1:8KF3oEKrmYdSbZnQ1BPTdxAZDHRaM1LEv+oBvL2nSLk=
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.10502.0 h1:1GxSHSI1ef3sCdDVrJ9l8s6aTd7P1K788os9lHrs43g=
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.10502.0/go.mod h1:EnAvZh1kNJHp5yF+M1ZHNEvapnmt6anq1xXHVrAGqMo=
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.10502.0 h1:76gB6UiqKae6JptNiFLjwecD0oR87bXS5u6Lni9hSGI=
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.10502.0/go.mod h1:IGLSeEcFhNeZF16aVjQCULD7TsFZKG5G7SyKJAXKp5c=
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.10502.0 h1:fcBKRy9keR5FLxppDD7ZjQ1EwqTRcA2kPLi2jWilPDw=
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.10502.0/go.mod h1:KAIynZ0GHCS7X5fRyuFnQMg/SZBPK/bS9OCOVojClxw=
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.10502.0 h1:pUwDWLQZIkm/v5aoGIu2cTAsgGqratxklRwP9zzsmiU=
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.10502.0/go.mod h1:81SGOYoEUs8qaAfSk1wRfM5oobrIJ5KI7AzYhK6/bvQ=
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.10502.0 h1:CDPf2ow6pP/9zYXfBdyT8a1GZ69eBWdMt5AhAsVgvyU=
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.10502.0/go.mod h1:K25pJL26ARblGDeuAkrdblFvUen92+CwksLtPEHRqqQ=
github.com/duckdb/duckdb-go/v2 v2.10502.0 h1:YfdiBlXnlRdxIKu1AtBQSRI0/tGhOkIGshKq52+uA7A=
github.com/duckdb/duckdb-go/v2 v2.10502.0/go.mod h1:a/31wL2vx7dJ0isrO+E6o28DBQVaVOMbKxp2BsHTGp0=
github.com/ebitengine/oto/v3 v3.4.0 h1:br0PgASsEWaoWn38b2Goe7m1GKFYfNgnsjSd5Gg+/bQ=
github.com/ebitengine/oto/v3 v3.4.0/go.mod h1:IOleLVD0m+CMak3mRVwsYY8vTctQgOM0iiL6S7Ar7eI=
github.com/ebitengine/purego v0.9.0 h1:mh0zpKBIXDceC63hpvPuGLiJ8ZAa3DfrFTudmfi8A4k=
github.com/ebitengine/purego v0.9.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro=
github.com/go-viper/mapstructure/v2 v2.5.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4=
github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/flatbuffers v25.12.19+incompatible h1:haMV2JRRJCe1998HeW/p0X9UaMTK6SDo0ffLn2+DbLs=
github.com/google/flatbuffers v25.12.19+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4=
github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE=
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
github.com/lucasb-eyer/go-colorful v1.4.0 h1:UtrWVfLdarDgc44HcS7pYloGHJUjHV/4FwW4TvVgFr4=
github.com/lucasb-eyer/go-colorful v1.4.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/madelynnblue/go-dsp v1.0.0 h1:ufzvSGl8IdjCA6BFVUx1cZW/aDiiXxDBWU1MpkrtAiM=
github.com/madelynnblue/go-dsp v1.0.0/go.mod h1:dpf07Rj/u3te6cW3KwRBAqlyjP4InXHhNaYVuY73hHU=
github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE=
github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM=
github.com/mattn/go-runewidth v0.0.23 h1:7ykA0T0jkPpzSvMS5i9uoNn2Xy3R383f9HDx3RybWcw=
github.com/mattn/go-runewidth v0.0.23/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY=
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI=
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE=
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
github.com/pierrec/lz4/v4 v4.1.25 h1:kocOqRffaIbU5djlIBr7Wh+cx82C0vtFb0fOurZHqD0=
github.com/pierrec/lz4/v4 v4.1.25/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/sixdouglas/suncalc v0.0.0-20250114185126-291b1938b70c h1:Lyrtmwq1VO3vK30KXmA4S4u816l/HqyT11d75WR0UiU=
github.com/sixdouglas/suncalc v0.0.0-20250114185126-291b1938b70c/go.mod h1:IxOCrQX3pAL52wPiWuamnWxGcuyWANPyQfwcRb0iDqc=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
github.com/zeebo/xxh3 v1.1.0 h1:s7DLGDK45Dyfg7++yxI0khrfwq9661w9EN78eP/UZVs=
github.com/zeebo/xxh3 v1.1.0/go.mod h1:IisAie1LELR4xhVinxWS5+zf1lA4p0MW4T+w+W07F5s=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU=
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/telemetry v0.0.0-20260209163413-e7419c687ee4 h1:bTLqdHv7xrGlFbvf5/TXNxy/iUwwdkjhqQTJDjW7aj0=
golang.org/x/telemetry v0.0.0-20260209163413-e7419c687ee4/go.mod h1:g5NllXBEermZrmR51cJDQxmJUHUOfRAaNyWBM+R+548=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY=
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90=
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
file addition: go.mod (----------)

[2.1]

module skraak

go 1.26.0

require (
charm.land/bubbletea/v2 v2.0.6
charm.land/lipgloss/v2 v2.0.3
github.com/cespare/xxhash/v2 v2.3.0
github.com/charmbracelet/x/ansi v0.11.7
github.com/duckdb/duckdb-go/v2 v2.10502.0
github.com/ebitengine/oto/v3 v3.4.0
github.com/madelynnblue/go-dsp v1.0.0
github.com/matoous/go-nanoid/v2 v2.1.0
github.com/sixdouglas/suncalc v0.0.0-20250114185126-291b1938b70c
)

require (
github.com/apache/arrow-go/v18 v18.5.1 // indirect
github.com/bits-and-blooms/bitset v1.24.4 // indirect
github.com/charmbracelet/colorprofile v0.4.3 // indirect
github.com/charmbracelet/ultraviolet v0.0.0-20260416155717-489999b90468 // indirect
github.com/charmbracelet/x/term v0.2.2 // indirect
github.com/charmbracelet/x/termios v0.1.1 // indirect
github.com/charmbracelet/x/windows v0.2.2 // indirect
github.com/clipperhouse/displaywidth v0.11.0 // indirect
github.com/clipperhouse/uax29/v2 v2.7.0 // indirect
github.com/duckdb/duckdb-go-bindings v0.10502.0 // indirect
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.10502.0 // indirect
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.10502.0 // indirect
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.10502.0 // indirect
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.10502.0 // indirect
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.10502.0 // indirect
github.com/ebitengine/purego v0.9.0 // indirect
github.com/go-viper/mapstructure/v2 v2.5.0 // indirect
github.com/goccy/go-json v0.10.5 // indirect
github.com/google/flatbuffers v25.12.19+incompatible // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/klauspost/compress v1.18.3 // indirect
github.com/klauspost/cpuid/v2 v2.3.0 // indirect
github.com/lucasb-eyer/go-colorful v1.4.0 // indirect
github.com/mattn/go-runewidth v0.0.23 // indirect
github.com/muesli/cancelreader v0.2.2 // indirect
github.com/pierrec/lz4/v4 v4.1.25 // indirect
github.com/rivo/uniseg v0.4.7 // indirect
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
github.com/zeebo/xxh3 v1.1.0 // indirect
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect
golang.org/x/mod v0.33.0 // indirect
golang.org/x/sync v0.20.0 // indirect
golang.org/x/sys v0.43.0 // indirect
golang.org/x/telemetry v0.0.0-20260209163413-e7419c687ee4 // indirect
golang.org/x/tools v0.42.0 // indirect
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect
)
file addition: db (d--r------)

[2.1]
file addition: types.go (----------)

[0.790921]

package db

import (
"encoding/json"
"time"
)

// DatasetType represents the dataset_type enum from the schema
type DatasetType string

// Dataset type enum constants
const (
DatasetTypeStructured DatasetType = "structured"
DatasetTypeUnstructured DatasetType = "unstructured"
DatasetTypeTest DatasetType = "test"
DatasetTypeTrain DatasetType = "train"
)

// Dataset represents a row from the dataset table
type Dataset struct {
ID string `json:"id"`
Name string `json:"name"`
Description *string `json:"description"` // Pointer for nullable field
CreatedAt time.Time `json:"created_at"`
LastModified time.Time `json:"last_modified"`
Active bool `json:"active"`
Type DatasetType `json:"type"`
}

// MarshalJSON implements custom JSON marshaling for Dataset
// Formats timestamps as RFC3339
func (d Dataset) MarshalJSON() ([]byte, error) {
return json.Marshal(&struct {
ID string `json:"id"`
Name string `json:"name"`
Description *string `json:"description"`
CreatedAt string `json:"created_at"`
LastModified string `json:"last_modified"`
Active bool `json:"active"`
Type DatasetType `json:"type"`
}{
ID: d.ID,
Name: d.Name,
Description: d.Description,
CreatedAt: d.CreatedAt.Format(time.RFC3339),
LastModified: d.LastModified.Format(time.RFC3339),
Active: d.Active,
Type: d.Type,
})
}

// Location represents a row from the location table
type Location struct {
ID string `json:"id"`
DatasetID string `json:"dataset_id"`
Name string `json:"name"`
Latitude float64 `json:"latitude"`
Longitude float64 `json:"longitude"`
Description *string `json:"description"` // nullable
CreatedAt time.Time `json:"created_at"`
LastModified time.Time `json:"last_modified"`
Active bool `json:"active"`
TimezoneID string `json:"timezone_id"`
}

// MarshalJSON implements custom JSON marshaling for Location
// Formats timestamps as RFC3339
func (l Location) MarshalJSON() ([]byte, error) {
return json.Marshal(&struct {
ID string `json:"id"`
DatasetID string `json:"dataset_id"`
Name string `json:"name"`
Latitude float64 `json:"latitude"`
Longitude float64 `json:"longitude"`
Description *string `json:"description"`
CreatedAt string `json:"created_at"`
LastModified string `json:"last_modified"`
Active bool `json:"active"`
TimezoneID string `json:"timezone_id"`
}{
ID: l.ID,
DatasetID: l.DatasetID,
Name: l.Name,
Latitude: l.Latitude,
Longitude: l.Longitude,
Description: l.Description,
CreatedAt: l.CreatedAt.Format(time.RFC3339),
LastModified: l.LastModified.Format(time.RFC3339),
Active: l.Active,
TimezoneID: l.TimezoneID,
})
}

// Cluster represents a row from the cluster table
type Cluster struct {
ID string `json:"id"`
DatasetID string `json:"dataset_id"`
LocationID string `json:"location_id"`
Name string `json:"name"`
Description *string `json:"description"` // nullable
CreatedAt time.Time `json:"created_at"`
LastModified time.Time `json:"last_modified"`
Active bool `json:"active"`
CyclicRecordingPatternID *string `json:"cyclic_recording_pattern_id"` // nullable
SampleRate int `json:"sample_rate"`
}

// MarshalJSON implements custom JSON marshaling for Cluster
// Formats timestamps as RFC3339
func (c Cluster) MarshalJSON() ([]byte, error) {
return json.Marshal(&struct {
ID string `json:"id"`
DatasetID string `json:"dataset_id"`
LocationID string `json:"location_id"`
Name string `json:"name"`
Description *string `json:"description"`
CreatedAt string `json:"created_at"`
LastModified string `json:"last_modified"`
Active bool `json:"active"`
CyclicRecordingPatternID *string `json:"cyclic_recording_pattern_id"`
SampleRate int `json:"sample_rate"`
}{
ID: c.ID,
DatasetID: c.DatasetID,
LocationID: c.LocationID,
Name: c.Name,
Description: c.Description,
CreatedAt: c.CreatedAt.Format(time.RFC3339),
LastModified: c.LastModified.Format(time.RFC3339),
Active: c.Active,
CyclicRecordingPatternID: c.CyclicRecordingPatternID,
SampleRate: c.SampleRate,
})
}

// File represents a row from the file table
type File struct {
ID string `json:"id"`
FileName string `json:"file_name"`
Path *string `json:"path"` // nullable
XXH64Hash string `json:"xxh64_hash"`
LocationID string `json:"location_id"`
TimestampLocal time.Time `json:"timestamp_local"`
ClusterID *string `json:"cluster_id"` // nullable
Duration float64 `json:"duration"`
SampleRate int `json:"sample_rate"`
Description *string `json:"description"` // nullable
MaybeSolarNight *bool `json:"maybe_solar_night"` // nullable
MaybeCivilNight *bool `json:"maybe_civil_night"` // nullable
MoonPhase *float64 `json:"moon_phase"` // nullable
CreatedAt time.Time `json:"created_at"`
LastModified time.Time `json:"last_modified"`
Active bool `json:"active"`
}

// CyclicRecordingPattern represents a row from the cyclic_recording_pattern table
type CyclicRecordingPattern struct {
ID string `json:"id"`
RecordS int `json:"record_s"`
SleepS int `json:"sleep_s"`
CreatedAt time.Time `json:"created_at"`
LastModified time.Time `json:"last_modified"`
Active bool `json:"active"`
}

// MarshalJSON implements custom JSON marshaling for CyclicRecordingPattern
// Formats timestamps as RFC3339
func (p CyclicRecordingPattern) MarshalJSON() ([]byte, error) {
return json.Marshal(&struct {
ID string `json:"id"`
RecordS int `json:"record_s"`
SleepS int `json:"sleep_s"`
CreatedAt string `json:"created_at"`
LastModified string `json:"last_modified"`
Active bool `json:"active"`
}{
ID: p.ID,
RecordS: p.RecordS,
SleepS: p.SleepS,
CreatedAt: p.CreatedAt.Format(time.RFC3339),
LastModified: p.LastModified.Format(time.RFC3339),
Active: p.Active,
})
}

// GainLevel represents the gain_level enum for AudioMoth recordings
type GainLevel string

// AudioMoth gain level enum constants
const (
GainLow GainLevel = "low"
GainLowMedium GainLevel = "low-medium"
GainMedium GainLevel = "medium"
GainMediumHigh GainLevel = "medium-high"
GainHigh GainLevel = "high"
)

// MothMetadata represents a row from the moth_metadata table
type MothMetadata struct {
FileID string `json:"file_id"`
Timestamp time.Time `json:"timestamp"`
RecorderID *string `json:"recorder_id"` // nullable
Gain *GainLevel `json:"gain"` // nullable
BatteryV *float64 `json:"battery_v"` // nullable
TempC *float64 `json:"temp_c"` // nullable
CreatedAt time.Time `json:"created_at"`
LastModified time.Time `json:"last_modified"`
Active bool `json:"active"`
}

// FileDataset represents a row from the file_dataset junction table
type FileDataset struct {
FileID string `json:"file_id"`
DatasetID string `json:"dataset_id"`
CreatedAt time.Time `json:"created_at"`
LastModified time.Time `json:"last_modified"`
}
file addition: tx_logger_test.go (----------)

[0.790921]

package db

import (
"bytes"
"context"
"database/sql"
"encoding/json"
"os"
"path/filepath"
"reflect"
"strings"
"testing"
"time"
)

// =============================================================================
// Test Helpers
// =============================================================================

// resetGlobalState resets package-level variables for test isolation.
func resetGlobalState() {
eventLogMu.Lock()
defer eventLogMu.Unlock()

if eventLogFile != nil {
eventLogFile.Close()
eventLogFile = nil
eventLogEnc = nil
}
eventLogConfig = EventLogConfig{}
}

// setupTestDB creates an in-memory DuckDB with a test table.
func setupTestDB(t *testing.T) *sql.DB {
t.Helper()
db, err := sql.Open("duckdb", "")
if err != nil {
t.Fatalf("Failed to open in-memory DuckDB: %v", err)
}

_, err = db.Exec("CREATE TABLE test_table (id VARCHAR PRIMARY KEY, name VARCHAR, value INTEGER)")
if err != nil {
db.Close()
t.Fatalf("Failed to create test table: %v", err)
}

return db
}

// readEventsFile reads all events from a JSONL file.
func readEventsFile(path string) ([]TransactionEvent, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, err
}

var events []TransactionEvent
for line := range bytes.SplitSeq(data, []byte("\n")) {
if len(line) == 0 {
continue
}
var event TransactionEvent
if err := json.Unmarshal(line, &event); err != nil {
return nil, err
}
events = append(events, event)
}
return events, nil
}

// Assertion helpers using standard library

func assertEqual(t *testing.T, expected, actual any, msg ...string) {
t.Helper()
if !reflect.DeepEqual(expected, actual) {
if len(msg) > 0 {
t.Errorf("%s: expected %v, got %v", msg[0], expected, actual)
} else {
t.Errorf("expected %v, got %v", expected, actual)
}
}
}

func assertNil(t *testing.T, value any, msg ...string) {
t.Helper()
if value != nil && !isTypedNil(value) {
if len(msg) > 0 {
t.Errorf("%s: expected nil, got %v", msg[0], value)
} else {
t.Errorf("expected nil, got %v", value)
}
}
}

// isTypedNil checks if a value is a typed nil (e.g., *os.File(nil))
func isTypedNil(v any) bool {
if v == nil {
return true
}
// Use reflection to check for typed nil
rv := reflect.ValueOf(v)
switch rv.Kind() {
case reflect.Chan, reflect.Func, reflect.Map, reflect.Pointer, reflect.Slice:
return rv.IsNil()
}
return false
}

func assertNotNil(t *testing.T, value any, msg ...string) {
t.Helper()
if value == nil {
if len(msg) > 0 {
t.Errorf("%s: expected non-nil value", msg[0])
} else {
t.Errorf("expected non-nil value")
}
}
}

func assertTrue(t *testing.T, value bool, msg ...string) {
t.Helper()
if !value {
if len(msg) > 0 {
t.Errorf("%s: expected true, got false", msg[0])
} else {
t.Errorf("expected true, got false")
}
}
}

func assertFalse(t *testing.T, value bool, msg ...string) {
t.Helper()
if value {
if len(msg) > 0 {
t.Errorf("%s: expected false, got true", msg[0])
} else {
t.Errorf("expected false, got true")
}
}
}

func assertError(t *testing.T, err error, msg ...string) {
t.Helper()
if err == nil {
if len(msg) > 0 {
t.Errorf("%s: expected error, got nil", msg[0])
} else {
t.Errorf("expected error, got nil")
}
}
}

func assertNoError(t *testing.T, err error, msg ...string) {
t.Helper()
if err != nil {
if len(msg) > 0 {
t.Errorf("%s: expected no error, got %v", msg[0], err)
} else {
t.Errorf("expected no error, got %v", err)
}
}
}

func assertLen(t *testing.T, expected, actual int, msg ...string) {
t.Helper()
if expected != actual {
if len(msg) > 0 {
t.Errorf("%s: expected length %d, got %d", msg[0], expected, actual)
} else {
t.Errorf("expected length %d, got %d", expected, actual)
}
}
}

func assertContains(t *testing.T, s, substr string, msg ...string) {
t.Helper()
if !strings.Contains(s, substr) {
if len(msg) > 0 {
t.Errorf("%s: expected %q to contain %q", msg[0], s, substr)
} else {
t.Errorf("expected %q to contain %q", s, substr)
}
}
}

func assertGreater(t *testing.T, a, b int64, msg ...string) {
t.Helper()
if a <= b {
if len(msg) > 0 {
t.Errorf("%s: expected %d > %d", msg[0], a, b)
} else {
t.Errorf("expected %d > %d", a, b)
}
}
}

// =============================================================================
// Category 1: Pure Function Tests
// =============================================================================

func TestIsMutation(t *testing.T) {
tests := []struct {
name string
sql string
expected bool
}{
// INSERT variations
{"INSERT uppercase", "INSERT INTO test VALUES (1)", true},
{"INSERT lowercase", "insert into test values (1)", true},
{"INSERT with leading space", " INSERT INTO test VALUES (1)", true},
{"INSERT with leading newline", "\n\tINSERT INTO test VALUES (1)", true},
// Note: SQL with leading comment is not detected as mutation
// because isMutation checks HasPrefix after TrimSpace, and "--" is not INSERT/UPDATE/DELETE

// UPDATE variations
{"UPDATE uppercase", "UPDATE test SET x = 1", true},
{"UPDATE lowercase", "update test set x = 1", true},
{"UPDATE with WHERE", "UPDATE test SET x = 1 WHERE id = 1", true},

// DELETE variations
{"DELETE uppercase", "DELETE FROM test WHERE x = 1", true},
{"DELETE lowercase", "delete from test where x = 1", true},

// SELECT (not mutation)
{"SELECT uppercase", "SELECT * FROM test", false},
{"SELECT lowercase", "select * from test", false},
{"SELECT with WHERE", "SELECT * FROM test WHERE id = 1", false},

// WITH clause (CTE) with mutation
{"CTE with INSERT", "WITH cte AS (SELECT 1) INSERT INTO test SELECT * FROM cte", true},
{"CTE with UPDATE", "WITH cte AS (SELECT 1) UPDATE test SET x = 1", true},
{"CTE with DELETE", "WITH cte AS (SELECT 1) DELETE FROM test", true},
{"CTE lowercase with insert", "with cte as (select 1) insert into test select * from cte", true},

// WITH clause (CTE) without mutation
{"CTE with SELECT only", "WITH cte AS (SELECT 1) SELECT * FROM cte", false},
{"CTE lowercase with select", "with cte as (select 1) select * from cte", false},

// Edge cases
{"empty string", "", false},
{"whitespace only", " ", false},
{"just SELECT keyword", "SELECT", false},
{"just INSERT keyword", "INSERT", true},
{"just UPDATE keyword", "UPDATE", true},
{"just DELETE keyword", "DELETE", true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := isMutation(tt.sql)
assertEqual(t, tt.expected, result, "isMutation(%q)", tt.sql)
})
}
}

func TestMarshalParam(t *testing.T) {
t.Run("nil", func(t *testing.T) {
result := marshalParam(nil)
assertNil(t, result)
})

t.Run("time.Time", func(t *testing.T) {
tm := time.Date(2026, 2, 18, 14, 30, 0, 0, time.UTC)
result := marshalParam(tm)
assertEqual(t, "2026-02-18T14:30:00Z", result)
})

t.Run("*time.Time nil", func(t *testing.T) {
var tm *time.Time
result := marshalParam(tm)
assertNil(t, result)
})

t.Run("*time.Time with value", func(t *testing.T) {
tm := time.Date(2026, 2, 18, 14, 30, 0, 123456789, time.UTC)
result := marshalParam(&tm)
assertEqual(t, "2026-02-18T14:30:00.123456789Z", result)
})

t.Run("time.Time with nanoseconds", func(t *testing.T) {
tm := time.Date(2026, 2, 18, 14, 30, 0, 999999999, time.UTC)
result := marshalParam(tm)
assertEqual(t, "2026-02-18T14:30:00.999999999Z", result)
})

t.Run("time.Time with timezone", func(t *testing.T) {
loc, _ := time.LoadLocation("Pacific/Auckland")
tm := time.Date(2026, 2, 19, 10, 30, 0, 0, loc)
result := marshalParam(tm)
// Should contain timezone offset
assertContains(t, result.(string), "+13:00")
})

t.Run("string", func(t *testing.T) {
result := marshalParam("hello world")
assertEqual(t, "hello world", result)
})

t.Run("*string nil", func(t *testing.T) {
var s *string
result := marshalParam(s)
assertNil(t, result)
})

t.Run("*string with value", func(t *testing.T) {
s := "hello"
result := marshalParam(&s)
assertEqual(t, "hello", result)
})

t.Run("int types", func(t *testing.T) {
assertEqual(t, int(42), marshalParam(int(42)))
assertEqual(t, int8(42), marshalParam(int8(42)))
assertEqual(t, int16(42), marshalParam(int16(42)))
assertEqual(t, int32(42), marshalParam(int32(42)))
assertEqual(t, int64(42), marshalParam(int64(42)))
assertEqual(t, uint(42), marshalParam(uint(42)))
assertEqual(t, uint8(42), marshalParam(uint8(42)))
assertEqual(t, uint16(42), marshalParam(uint16(42)))
assertEqual(t, uint32(42), marshalParam(uint32(42)))
assertEqual(t, uint64(42), marshalParam(uint64(42)))
})

t.Run("*int nil", func(t *testing.T) {
var p *int
result := marshalParam(p)
assertNil(t, result)
})

t.Run("*int with value", func(t *testing.T) {
v := 42
result := marshalParam(&v)
assertEqual(t, 42, result)
})

t.Run("*int64 nil", func(t *testing.T) {
var p *int64
result := marshalParam(p)
assertNil(t, result)
})

t.Run("*int64 with value", func(t *testing.T) {
v := int64(1234567890123)
result := marshalParam(&v)
assertEqual(t, int64(1234567890123), result)
})

t.Run("negative int", func(t *testing.T) {
assertEqual(t, int(-42), marshalParam(int(-42)))
assertEqual(t, int64(-42), marshalParam(int64(-42)))
})

t.Run("float types", func(t *testing.T) {
assertEqual(t, float32(3.14), marshalParam(float32(3.14)))
assertEqual(t, float64(3.14), marshalParam(float64(3.14)))
})

t.Run("*float64 nil", func(t *testing.T) {
var p *float64
result := marshalParam(p)
assertNil(t, result)
})

t.Run("*float64 with value", func(t *testing.T) {
v := 3.14159
result := marshalParam(&v)
assertEqual(t, 3.14159, result)
})

t.Run("*float32 nil", func(t *testing.T) {
var p *float32
result := marshalParam(p)
assertNil(t, result)
})

t.Run("*float32 with value", func(t *testing.T) {
v := float32(2.71)
result := marshalParam(&v)
assertEqual(t, float32(2.71), result)
})

t.Run("bool", func(t *testing.T) {
assertEqual(t, true, marshalParam(true))
assertEqual(t, false, marshalParam(false))
})

t.Run("*bool nil", func(t *testing.T) {
var p *bool
result := marshalParam(p)
assertNil(t, result)
})

t.Run("*bool with true", func(t *testing.T) {
v := true
result := marshalParam(&v)
assertEqual(t, true, result)
})

t.Run("*bool with false", func(t *testing.T) {
v := false
result := marshalParam(&v)
assertEqual(t, false, result)
})

t.Run("[]byte", func(t *testing.T) {
b := []byte("hello")
result := marshalParam(b)
assertEqual(t, b, result)
})

t.Run("unknown type", func(t *testing.T) {
type MyType struct{ X int }
result := marshalParam(MyType{X: 42})
// fmt.Sprintf("%v", MyType{X: 42}) produces "{42}"
assertContains(t, result.(string), "42")
})

t.Run("named type alias (like GainLevel)", func(t *testing.T) {
type GainLevel string
g := GainLevel("medium")
result := marshalParam(g)
// Named type aliases fall through to default case
assertEqual(t, "medium", result)
})

t.Run("pointer to named type alias", func(t *testing.T) {
type GainLevel string
g := GainLevel("high")
// Pointer to named type also falls through to default
result := marshalParam(&g)
// Should serialize the value, not the pointer address
assertEqual(t, "high", result)
})

t.Run("slice", func(t *testing.T) {
s := []string{"a", "b", "c"}
result := marshalParam(s)
assertEqual(t, "[a b c]", result)
})

t.Run("map", func(t *testing.T) {
m := map[string]int{"a": 1}
result := marshalParam(m)
assertContains(t, result.(string), "a")
})
}

func TestQueryRecordMarshalJSON(t *testing.T) {
t.Run("basic types", func(t *testing.T) {
qr := QueryRecord{
SQL: "INSERT INTO test VALUES (?, ?)",
Parameters: []any{"id123", 42},
}

data, err := json.Marshal(qr)
assertNoError(t, err)

var result map[string]any
err = json.Unmarshal(data, &result)
assertNoError(t, err)

assertEqual(t, "INSERT INTO test VALUES (?, ?)", result["sql"])
params := result["parameters"].([]any)
assertEqual(t, "id123", params[0])
assertEqual(t, 42.0, params[1]) // JSON numbers are floats
})

t.Run("with time.Time", func(t *testing.T) {
tm := time.Date(2026, 2, 18, 14, 30, 0, 0, time.UTC)
qr := QueryRecord{
SQL: "INSERT INTO test VALUES (?)",
Parameters: []any{tm},
}

data, err := json.Marshal(qr)
assertNoError(t, err)

var result map[string]any
err = json.Unmarshal(data, &result)
assertNoError(t, err)

params := result["parameters"].([]any)
assertEqual(t, "2026-02-18T14:30:00Z", params[0])
})

t.Run("with nil parameter", func(t *testing.T) {
qr := QueryRecord{
SQL: "INSERT INTO test VALUES (?)",
Parameters: []any{nil},
}

data, err := json.Marshal(qr)
assertNoError(t, err)

var result map[string]any
err = json.Unmarshal(data, &result)
assertNoError(t, err)

params := result["parameters"].([]any)
assertNil(t, params[0])
})

t.Run("empty parameters", func(t *testing.T) {
qr := QueryRecord{
SQL: "SELECT 1",
Parameters: []any{},
}

data, err := json.Marshal(qr)
assertNoError(t, err)

var result map[string]any
err = json.Unmarshal(data, &result)
assertNoError(t, err)

params := result["parameters"].([]any)
assertLen(t, 0, len(params))
})

t.Run("multiple param types", func(t *testing.T) {
qr := QueryRecord{
SQL: "INSERT INTO test VALUES (?, ?, ?, ?, ?)",
Parameters: []any{"string", 42, true, nil, 3.14},
}

data, err := json.Marshal(qr)
assertNoError(t, err)

var result map[string]any
err = json.Unmarshal(data, &result)
assertNoError(t, err)

params := result["parameters"].([]any)
assertLen(t, 5, len(params))
assertEqual(t, "string", params[0])
assertEqual(t, 42.0, params[1])
assertEqual(t, true, params[2])
assertNil(t, params[3])
assertEqual(t, 3.14, params[4])
})

t.Run("special characters in SQL", func(t *testing.T) {
qr := QueryRecord{
SQL: "INSERT INTO test VALUES ('O''Brien', \"test\")",
Parameters: []any{},
}

data, err := json.Marshal(qr)
assertNoError(t, err)

// Verify JSON is valid
var result map[string]any
err = json.Unmarshal(data, &result)
assertNoError(t, err)

assertContains(t, result["sql"].(string), "O''Brien")
})

t.Run("unicode in parameters", func(t *testing.T) {
qr := QueryRecord{
SQL: "INSERT INTO test VALUES (?)",
Parameters: []any{"日本語 🎵"},
}

data, err := json.Marshal(qr)
assertNoError(t, err)

var result map[string]any
err = json.Unmarshal(data, &result)
assertNoError(t, err)

params := result["parameters"].([]any)
assertEqual(t, "日本語 🎵", params[0])
})
}

// =============================================================================
// Category 2: Global State Tests
// =============================================================================

func TestSetEventLogConfig(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("set enabled with path", func(t *testing.T) {
resetGlobalState()
cfg := EventLogConfig{
Enabled: true,
Path: "/tmp/test.jsonl",
}
SetEventLogConfig(cfg)

got := GetEventLogConfig()
assertTrue(t, got.Enabled)
assertEqual(t, "/tmp/test.jsonl", got.Path)
})

t.Run("set disabled", func(t *testing.T) {
resetGlobalState()
cfg := EventLogConfig{
Enabled: false,
Path: "/tmp/test.jsonl",
}
SetEventLogConfig(cfg)

got := GetEventLogConfig()
assertFalse(t, got.Enabled)
})

t.Run("change path while file open", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
path1 := filepath.Join(tmpDir, "events1.jsonl")
path2 := filepath.Join(tmpDir, "events2.jsonl")

// Set first config and open file
SetEventLogConfig(EventLogConfig{Enabled: true, Path: path1})
ensureEventLogFile()
assertNotNil(t, eventLogFile)

// Change path - should close first file
SetEventLogConfig(EventLogConfig{Enabled: true, Path: path2})

// File handle should be nil (will reopen on next ensure)
// Note: SetEventLogConfig closes the file, sets eventLogFile = nil
assertNil(t, eventLogFile)
})
}

func TestGetEventLogConfig(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("default state", func(t *testing.T) {
resetGlobalState()
got := GetEventLogConfig()
assertFalse(t, got.Enabled)
assertEqual(t, "", got.Path)
})

t.Run("after set", func(t *testing.T) {
resetGlobalState()
SetEventLogConfig(EventLogConfig{Enabled: true, Path: "/test/path.jsonl"})
got := GetEventLogConfig()
assertTrue(t, got.Enabled)
assertEqual(t, "/test/path.jsonl", got.Path)
})
}

func TestCloseEventLog(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("close with no file", func(t *testing.T) {
resetGlobalState()
err := CloseEventLog()
assertNoError(t, err)
})

t.Run("close with open file", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})
ensureEventLogFile()
assertNotNil(t, eventLogFile)

err := CloseEventLog()
assertNoError(t, err)

// Verify state is reset
assertFalse(t, eventLogConfig.Enabled)
assertNil(t, eventLogFile)
assertNil(t, eventLogEnc)
})

t.Run("double close", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})
ensureEventLogFile()

err := CloseEventLog()
assertNoError(t, err)

// Second close should not panic
err = CloseEventLog()
assertNoError(t, err)
})
}

// =============================================================================
// Category 3: Integration Tests
// =============================================================================

func TestBeginLoggedTx(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("creates transaction", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, err := BeginLoggedTx(context.Background(), db, "test_tool")
assertNoError(t, err)
assertNotNil(t, tx)

assertEqual(t, "test_tool", tx.toolName)
assertNotNil(t, tx.queries)
assertLen(t, 0, len(tx.queries))
assertFalse(t, tx.startTime.IsZero())

tx.Rollback()
})

t.Run("empty tool name is allowed", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, err := BeginLoggedTx(context.Background(), db, "")
assertNoError(t, err)
assertNotNil(t, tx)
assertEqual(t, "", tx.toolName)

tx.Rollback()
})

t.Run("initial state is clean", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
assertLen(t, 0, len(tx.queries))
assertFalse(t, tx.startTime.IsZero())

// Verify startTime is recent (within last second)
elapsed := time.Since(tx.startTime)
assertTrue(t, elapsed < time.Second, "startTime should be recent")

tx.Rollback()
})
}

func TestLoggedTx_ExecContext(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("records INSERT", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

_, err := tx.ExecContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)", "id1", "name1", 42)
assertNoError(t, err)

assertLen(t, 1, len(tx.queries))
assertContains(t, tx.queries[0].SQL, "INSERT")
assertLen(t, 3, len(tx.queries[0].Parameters))
assertEqual(t, "id1", tx.queries[0].Parameters[0])
})

t.Run("records UPDATE", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id2", "name2", 1)

_, err := tx.ExecContext(context.Background(),
"UPDATE test_table SET value = ? WHERE id = ?", 100, "id2")
assertNoError(t, err)

assertLen(t, 2, len(tx.queries))
assertContains(t, tx.queries[1].SQL, "UPDATE")

tx.Rollback()
})

t.Run("records DELETE", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id3", "name3", 1)

_, err := tx.ExecContext(context.Background(),
"DELETE FROM test_table WHERE id = ?", "id3")
assertNoError(t, err)

assertLen(t, 2, len(tx.queries))
assertContains(t, tx.queries[1].SQL, "DELETE")

tx.Rollback()
})

t.Run("does not record SELECT", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id4", "name4", 1)

// SELECT should not be recorded
tx.QueryRowContext(context.Background(), "SELECT * FROM test_table WHERE id = ?", "id4")

assertLen(t, 1, len(tx.queries)) // Only the INSERT

tx.Rollback()
})

t.Run("does not record failed execution", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

// This will fail (table doesn't exist)
_, err := tx.ExecContext(context.Background(),
"INSERT INTO nonexistent_table VALUES (?)", "x")
assertError(t, err)

assertLen(t, 0, len(tx.queries)) // Failed query not recorded
})

t.Run("multiple executions recorded in order", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id1", "name1", 1)
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id2", "name2", 2)
tx.ExecContext(context.Background(), "UPDATE test_table SET value = ? WHERE id = ?", 99, "id1")

assertLen(t, 3, len(tx.queries))
assertContains(t, tx.queries[0].SQL, "INSERT")
assertContains(t, tx.queries[1].SQL, "INSERT")
assertContains(t, tx.queries[2].SQL, "UPDATE")
})

t.Run("parameters stored correctly", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

tx.ExecContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)", "param_id", "param_name", 123)

assertLen(t, 3, len(tx.queries[0].Parameters))
assertEqual(t, "param_id", tx.queries[0].Parameters[0])
assertEqual(t, "param_name", tx.queries[0].Parameters[1])
assertEqual(t, 123, tx.queries[0].Parameters[2])
})
}

func TestLoggedTx_Exec(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("INSERT without context", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

_, err := tx.Exec("INSERT INTO test_table VALUES (?, ?, ?)", "id1", "name1", 42)
assertNoError(t, err)

assertLen(t, 1, len(tx.queries))
assertContains(t, tx.queries[0].SQL, "INSERT")
})
}

func TestLoggedTx_Commit(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("writes event to file on commit", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test_tool")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id1", "name1", 42)

err := tx.Commit()
assertNoError(t, err)

// Verify event was written
events, err := readEventsFile(logPath)
assertNoError(t, err)
assertLen(t, 1, len(events))

assertNotNil(t, events[0].ID)
assertLen(t, 21, len(events[0].ID))
assertEqual(t, "test_tool", events[0].Tool)
assertLen(t, 1, len(events[0].Queries))
assertTrue(t, events[0].Success)
// Duration may be 0 for very fast transactions
assertTrue(t, events[0].Duration >= 0)
})

t.Run("does not write when logging disabled", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: false, Path: logPath})

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test_tool")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id2", "name2", 1)

err := tx.Commit()
assertNoError(t, err)

// No file should be created
_, err = os.Stat(logPath)
assertTrue(t, os.IsNotExist(err), "file should not exist")
})

t.Run("does not write when no mutations", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test_tool")
// No mutations, just reads
tx.QueryRowContext(context.Background(), "SELECT 1")

err := tx.Commit()
assertNoError(t, err)

// No file should be created
_, err = os.Stat(logPath)
assertTrue(t, os.IsNotExist(err), "file should not exist")
})

t.Run("multiple mutations in single event", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "multi_test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "m1", "name1", 1)
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "m2", "name2", 2)
tx.ExecContext(context.Background(), "UPDATE test_table SET value = ? WHERE id = ?", 99, "m1")

err := tx.Commit()
assertNoError(t, err)

events, err := readEventsFile(logPath)
assertNoError(t, err)
assertLen(t, 1, len(events))
assertLen(t, 3, len(events[0].Queries))
})

t.Run("data persisted after commit", func(t *testing.T) {
resetGlobalState()

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "persist_test", "name", 42)
tx.Commit()

var count int
err := db.QueryRow("SELECT COUNT(*) FROM test_table WHERE id = ?", "persist_test").Scan(&count)
assertNoError(t, err)
assertEqual(t, 1, count)
})

t.Run("event has valid timestamp", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "ts_test", "name", 1)
tx.Commit()

events, _ := readEventsFile(logPath)

// Timestamp should be recent (within last 5 seconds)
elapsed := time.Since(events[0].Timestamp)
assertTrue(t, elapsed < 5*time.Second, "timestamp should be recent")
})
}

func TestLoggedTx_Rollback(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("discards recorded queries", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id1", "name1", 42)

assertLen(t, 1, len(tx.queries))

err := tx.Rollback()
assertNoError(t, err)

// Queries should be nil after rollback
tx.mu.Lock()
queries := tx.queries
tx.mu.Unlock()

assertNil(t, queries)
})

t.Run("does not write event to file", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test_tool")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id1", "name1", 42)

err := tx.Rollback()
assertNoError(t, err)

// No file should be created
_, err = os.Stat(logPath)
assertTrue(t, os.IsNotExist(err), "file should not exist")
})

t.Run("data not persisted", func(t *testing.T) {
resetGlobalState()

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "rb_test", "name", 42)
tx.Rollback()

var count int
err := db.QueryRow("SELECT COUNT(*) FROM test_table WHERE id = ?", "rb_test").Scan(&count)
assertNoError(t, err)
assertEqual(t, 0, count)
})

t.Run("rollback returns nil on success", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "x", "y", 1)

err := tx.Rollback()
assertNoError(t, err)
})
}

func TestLoggedTx_QueryMethods(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

db := setupTestDB(t)
defer db.Close()

// Setup: insert a row
tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "q1", "name1", 42)
tx.Commit()

t.Run("QueryRowContext returns row", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

var name string
err := tx.QueryRowContext(context.Background(), "SELECT name FROM test_table WHERE id = ?", "q1").Scan(&name)
assertNoError(t, err)
assertEqual(t, "name1", name)
})

t.Run("QueryRow returns row", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

var value int
err := tx.QueryRow("SELECT value FROM test_table WHERE id = ?", "q1").Scan(&value)
assertNoError(t, err)
assertEqual(t, 42, value)
})

t.Run("QueryContext returns rows", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

rows, err := tx.QueryContext(context.Background(), "SELECT * FROM test_table")
assertNoError(t, err)
defer rows.Close()

count := 0
for rows.Next() {
count++
}
assertGreater(t, int64(count), 0)
})

t.Run("Query returns rows", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

rows, err := tx.Query("SELECT * FROM test_table")
assertNoError(t, err)
defer rows.Close()

assertTrue(t, rows.Next(), "should have at least one row")
})

t.Run("query methods not recorded", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

tx.QueryRowContext(context.Background(), "SELECT * FROM test_table")
tx.QueryContext(context.Background(), "SELECT * FROM test_table")

assertLen(t, 0, len(tx.queries))
})
}

func TestLoggedTx_Prepare(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("valid prepare", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, err := tx.PrepareContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)")
assertNoError(t, err)
assertNotNil(t, stmt)
assertEqual(t, "INSERT INTO test_table VALUES (?, ?, ?)", stmt.sql)

stmt.Close()
})

t.Run("prepare without context", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, err := tx.Prepare("INSERT INTO test_table VALUES (?, ?, ?)")
assertNoError(t, err)
assertNotNil(t, stmt)

stmt.Close()
})

t.Run("invalid SQL returns error", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, err := tx.Prepare("INVALID SQL SYNTAX !!!")
assertError(t, err)
assertNil(t, stmt)
})
}

func TestLoggedStmt_ExecContext(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("INSERT with prepared stmt", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)")
defer stmt.Close()

_, err := stmt.ExecContext(context.Background(), "ps1", "name1", 42)
assertNoError(t, err)

assertLen(t, 1, len(tx.queries))
assertContains(t, tx.queries[0].SQL, "INSERT")
})

t.Run("multiple executions recorded separately", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)")
defer stmt.Close()

stmt.ExecContext(context.Background(), "ps1", "name1", 1)
stmt.ExecContext(context.Background(), "ps2", "name2", 2)
stmt.ExecContext(context.Background(), "ps3", "name3", 3)

assertLen(t, 3, len(tx.queries))
})

t.Run("parameters captured correctly", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)")
defer stmt.Close()

stmt.ExecContext(context.Background(), "captured_id", "captured_name", 999)

assertLen(t, 3, len(tx.queries[0].Parameters))
assertEqual(t, "captured_id", tx.queries[0].Parameters[0])
assertEqual(t, "captured_name", tx.queries[0].Parameters[1])
assertEqual(t, 999, tx.queries[0].Parameters[2])
})

t.Run("SELECT prepared stmt not recorded", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

// First insert some data
tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "sel_test", "name", 1)
tx.Commit()

// Now test SELECT prepared statement
tx, _ = BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"SELECT name FROM test_table WHERE id = ?")
defer stmt.Close()

var name string
err := stmt.QueryRowContext(context.Background(), "sel_test").Scan(&name)
assertNoError(t, err)
assertEqual(t, "name", name)

assertLen(t, 0, len(tx.queries))
})

t.Run("failed execution not recorded", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

// Insert one row
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "dup_id", "name", 1)

// Try to insert duplicate (will fail due to primary key)
stmt, _ := tx.PrepareContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)")
defer stmt.Close()

_, err := stmt.ExecContext(context.Background(), "dup_id", "name2", 2)
assertError(t, err)

// Only first INSERT should be recorded
assertLen(t, 1, len(tx.queries))
})

t.Run("commit writes all prepared stmt queries", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "prep_commit_test")

stmt, _ := tx.PrepareContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)")

stmt.ExecContext(context.Background(), "pc1", "name1", 1)
stmt.ExecContext(context.Background(), "pc2", "name2", 2)
stmt.Close()

tx.Commit()

events, err := readEventsFile(logPath)
assertNoError(t, err)
assertLen(t, 1, len(events))
assertLen(t, 2, len(events[0].Queries))
})
t.Run("Exec without context", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)")
defer stmt.Close()

_, err := stmt.Exec("exec_id", "name", 42)
assertNoError(t, err)

assertLen(t, 1, len(tx.queries))
})
}

func TestLoggedStmt_QueryMethods(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

db := setupTestDB(t)
defer db.Close()

// Setup: insert data
tx, _ := BeginLoggedTx(context.Background(), db, "test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "qry1", "name1", 42)
tx.Commit()

t.Run("QueryRowContext returns row", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"SELECT name FROM test_table WHERE id = ?")
defer stmt.Close()

var name string
err := stmt.QueryRowContext(context.Background(), "qry1").Scan(&name)
assertNoError(t, err)
assertEqual(t, "name1", name)
})

t.Run("QueryRow returns row", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"SELECT value FROM test_table WHERE id = ?")
defer stmt.Close()

var value int
err := stmt.QueryRow("qry1").Scan(&value)
assertNoError(t, err)
assertEqual(t, 42, value)
})

t.Run("QueryContext returns rows", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"SELECT * FROM test_table WHERE id = ?")
defer stmt.Close()

rows, err := stmt.QueryContext(context.Background(), "qry1")
assertNoError(t, err)
defer rows.Close()

assertTrue(t, rows.Next(), "should have one row")
})

t.Run("Query returns rows", func(t *testing.T) {
tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"SELECT * FROM test_table")
defer stmt.Close()

rows, err := stmt.Query()
assertNoError(t, err)
defer rows.Close()

assertTrue(t, rows.Next(), "should have at least one row")
})
}

func TestLoggedStmt_Close(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("close returns nil on success", func(t *testing.T) {
db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "test")
defer tx.Rollback()

stmt, _ := tx.PrepareContext(context.Background(),
"INSERT INTO test_table VALUES (?, ?, ?)")

err := stmt.Close()
assertNoError(t, err)
})
}

func TestEnsureEventLogFile(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("creates file if doesn't exist", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

err := ensureEventLogFile()
assertNoError(t, err)
assertNotNil(t, eventLogFile)

// File should exist
_, err = os.Stat(logPath)
assertNoError(t, err)
})

t.Run("appends to existing file", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

// Create file with content
os.WriteFile(logPath, []byte("existing content\n"), 0644)

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

err := ensureEventLogFile()
assertNoError(t, err)

// File should still have content
data, _ := os.ReadFile(logPath)
assertContains(t, string(data), "existing content")
})

t.Run("creates directory if doesn't exist", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "subdir", "deep", "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

err := ensureEventLogFile()
assertNoError(t, err)

// Directory should exist
dir := filepath.Dir(logPath)
_, err = os.Stat(dir)
assertNoError(t, err)
})

t.Run("returns nil if file already open", func(t *testing.T) {
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

ensureEventLogFile()
firstFile := eventLogFile

err := ensureEventLogFile()
assertNoError(t, err)

// Should reuse same file handle
assertEqual(t, firstFile, eventLogFile)
})
}

func TestTransactionEventJSON(t *testing.T) {
resetGlobalState()
defer resetGlobalState()

t.Run("complete event serializes correctly", func(t *testing.T) {
event := TransactionEvent{
ID: "test-id-12345",
Timestamp: time.Date(2026, 2, 18, 14, 30, 0, 0, time.UTC),
Tool: "test_tool",
Queries: []QueryRecord{
{SQL: "INSERT INTO test VALUES (?)", Parameters: []any{"a"}},
{SQL: "UPDATE test SET x = ?", Parameters: []any{1}},
},
Success: true,
Duration: 42,
}

data, err := json.Marshal(event)
assertNoError(t, err)

var result map[string]any
err = json.Unmarshal(data, &result)
assertNoError(t, err)

assertEqual(t, "test-id-12345", result["id"])
assertEqual(t, "test_tool", result["tool"])
assertEqual(t, true, result["success"])
assertEqual(t, 42.0, result["duration_ms"])
})

t.Run("timestamp in RFC3339Nano format", func(t *testing.T) {
event := TransactionEvent{
ID: "ts-test",
Timestamp: time.Date(2026, 2, 18, 14, 30, 0, 123456789, time.UTC),
Success: true,
}

data, err := json.Marshal(event)
assertNoError(t, err)

var result map[string]any
json.Unmarshal(data, &result)

assertContains(t, result["timestamp"].(string), "2026-02-18T14:30:00.123456789Z")
})

t.Run("duration positive", func(t *testing.T) {
event := TransactionEvent{
ID: "dur-test",
Timestamp: time.Now(),
Success: true,
Duration: 123,
}

data, _ := json.Marshal(event)
var result map[string]any
json.Unmarshal(data, &result)

assertGreater(t, int64(result["duration_ms"].(float64)), 0)
})

t.Run("ID is 21 characters in real usage", func(t *testing.T) {
// Verify by creating an actual event
resetGlobalState()

tmpDir := t.TempDir()
logPath := filepath.Join(tmpDir, "events.jsonl")

SetEventLogConfig(EventLogConfig{Enabled: true, Path: logPath})

db := setupTestDB(t)
defer db.Close()

tx, _ := BeginLoggedTx(context.Background(), db, "id_test")
tx.ExecContext(context.Background(), "INSERT INTO test_table VALUES (?, ?, ?)", "id_test", "name", 1)
tx.Commit()

events, _ := readEventsFile(logPath)
assertLen(t, 21, len(events[0].ID))
})
}

func GetEventLogConfig() EventLogConfig {
eventLogMu.Lock()
defer eventLogMu.Unlock()
return eventLogConfig
}
file addition: tx_logger.go (----------)

[0.790921]

package db

import (
"context"
"database/sql"
"encoding/json"
"fmt"
"os"
"path/filepath"
"reflect"
"strings"
"sync"
"time"

gonanoid "github.com/matoous/go-nanoid/v2"
)

// LoggedTx wraps *sql.Tx and records all Exec/ExecContext calls for mutation logging
type LoggedTx struct {
tx *sql.Tx
queries []QueryRecord
mu sync.Mutex
toolName string
startTime time.Time
}

// QueryRecord represents a single SQL statement with parameters
type QueryRecord struct {
SQL string `json:"sql"`
Parameters []any `json:"parameters"`
}

// TransactionEvent represents a complete transaction for the event log
type TransactionEvent struct {
ID string `json:"id"`
Timestamp time.Time `json:"timestamp"`
Tool string `json:"tool,omitempty"`
Queries []QueryRecord `json:"queries"`
Success bool `json:"success"`
Duration int64 `json:"duration_ms"`
}

// LoggedStmt wraps *sql.Stmt to intercept Exec calls on prepared statements
type LoggedStmt struct {
stmt *sql.Stmt
tx *LoggedTx
sql string
}

// EventLogConfig holds configuration for event logging
type EventLogConfig struct {
Enabled bool
Path string
}

var (
eventLogConfig EventLogConfig
eventLogMu sync.Mutex
eventLogFile *os.File
eventLogEnc *json.Encoder
)

// SetEventLogConfig configures event logging globally
func SetEventLogConfig(cfg EventLogConfig) {
eventLogMu.Lock()
defer eventLogMu.Unlock()

// Close existing file if path changed
if eventLogFile != nil && eventLogConfig.Path != cfg.Path {
_ = eventLogFile.Close()
eventLogFile = nil
eventLogEnc = nil
}

eventLogConfig = cfg
}

// BeginLoggedTx starts a new transaction that logs all mutations
// toolName is optional and identifies which tool initiated the transaction
func BeginLoggedTx(ctx context.Context, db *sql.DB, toolName string) (*LoggedTx, error) {
tx, err := db.BeginTx(ctx, nil)
if err != nil {
return nil, err
}

return &LoggedTx{
tx: tx,
queries: make([]QueryRecord, 0),
toolName: toolName,
startTime: time.Now(),
}, nil
}

// ExecContext executes and records the SQL statement if it's a mutation
func (l *LoggedTx) ExecContext(ctx context.Context, query string, args ...any) (sql.Result, error) {
result, err := l.tx.ExecContext(ctx, query, args...)
if err == nil && isMutation(query) {
l.mu.Lock()
l.queries = append(l.queries, QueryRecord{
SQL: query,
Parameters: args,
})
l.mu.Unlock()
}
return result, err
}

// Exec executes and records the SQL statement if it's a mutation
func (l *LoggedTx) Exec(query string, args ...any) (sql.Result, error) {
return l.ExecContext(context.Background(), query, args...)
}

// QueryRowContext delegates to underlying tx (not logged - read operation)
func (l *LoggedTx) QueryRowContext(ctx context.Context, query string, args ...any) *sql.Row {
return l.tx.QueryRowContext(ctx, query, args...)
}

// QueryRow delegates to underlying tx (not logged - read operation)
func (l *LoggedTx) QueryRow(query string, args ...any) *sql.Row {
return l.tx.QueryRow(query, args...)
}

// QueryContext delegates to underlying tx (not logged - read operation)
func (l *LoggedTx) QueryContext(ctx context.Context, query string, args ...any) (*sql.Rows, error) {
return l.tx.QueryContext(ctx, query, args...)
}

// Query delegates to underlying tx (not logged - read operation)
func (l *LoggedTx) Query(query string, args ...any) (*sql.Rows, error) {
return l.tx.Query(query, args...)
}

// PrepareContext creates a logged prepared statement
func (l *LoggedTx) PrepareContext(ctx context.Context, query string) (*LoggedStmt, error) {
stmt, err := l.tx.PrepareContext(ctx, query)
if err != nil {
return nil, err
}
return &LoggedStmt{stmt: stmt, tx: l, sql: query}, nil
}

// Prepare creates a logged prepared statement
func (l *LoggedTx) Prepare(query string) (*LoggedStmt, error) {
return l.PrepareContext(context.Background(), query)
}

// Rollback rolls back the transaction (discards recorded queries)
func (l *LoggedTx) Rollback() error {
l.mu.Lock()
l.queries = nil // Discard recorded queries
l.mu.Unlock()
return l.tx.Rollback()
}

// Commit commits the transaction and logs all recorded queries on success
func (l *LoggedTx) Commit() error {
err := l.tx.Commit()
if err != nil {
return err
}

// Log on success only
l.mu.Lock()
queries := l.queries
l.mu.Unlock()

if len(queries) > 0 && eventLogConfig.Enabled {
l.writeEvent(queries)
}

return nil
}

// writeEvent writes the transaction to the event log
func (l *LoggedTx) writeEvent(queries []QueryRecord) {
eventLogMu.Lock()
defer eventLogMu.Unlock()

if !eventLogConfig.Enabled {
return
}

// Ensure file is open
if err := ensureEventLogFile(); err != nil {
// Log to stderr but don't fail the commit
fmt.Fprintf(os.Stderr, "Warning: failed to open event log: %v\n", err)
return
}

id, err := gonanoid.New(21)
if err != nil {
fmt.Fprintf(os.Stderr, "Warning: failed to generate event ID: %v\n", err)
return
}

event := TransactionEvent{
ID: id,
Timestamp: time.Now(),
Tool: l.toolName,
Queries: queries,
Success: true,
Duration: time.Since(l.startTime).Milliseconds(),
}

if err := eventLogEnc.Encode(event); err != nil {
fmt.Fprintf(os.Stderr, "Warning: failed to write event log: %v\n", err)
}
}

// LoggedStmt methods

// ExecContext executes the prepared statement and logs if it's a mutation
func (s *LoggedStmt) ExecContext(ctx context.Context, args ...any) (sql.Result, error) {
result, err := s.stmt.ExecContext(ctx, args...)
if err == nil && isMutation(s.sql) {
s.tx.mu.Lock()
s.tx.queries = append(s.tx.queries, QueryRecord{
SQL: s.sql,
Parameters: args,
})
s.tx.mu.Unlock()
}
return result, err
}

// Exec executes the prepared statement and logs if it's a mutation
func (s *LoggedStmt) Exec(args ...any) (sql.Result, error) {
return s.ExecContext(context.Background(), args...)
}

// QueryRowContext delegates to underlying statement
func (s *LoggedStmt) QueryRowContext(ctx context.Context, args ...any) *sql.Row {
return s.stmt.QueryRowContext(ctx, args...)
}

// QueryRow delegates to underlying statement
func (s *LoggedStmt) QueryRow(args ...any) *sql.Row {
return s.stmt.QueryRow(args...)
}

// QueryContext delegates to underlying statement
func (s *LoggedStmt) QueryContext(ctx context.Context, args ...any) (*sql.Rows, error) {
return s.stmt.QueryContext(ctx, args...)
}

// Query delegates to underlying statement
func (s *LoggedStmt) Query(args ...any) (*sql.Rows, error) {
return s.stmt.Query(args...)
}

// Close closes the prepared statement
func (s *LoggedStmt) Close() error {
return s.stmt.Close()
}

// isMutation returns true if the SQL is a mutation (INSERT, UPDATE, DELETE)
func isMutation(sqlStr string) bool {
upper := strings.ToUpper(strings.TrimSpace(sqlStr))
// Handle WITH clauses (CTEs) that may contain mutations
if strings.HasPrefix(upper, "WITH") {
// Check for INSERT/UPDATE/DELETE within the query
return strings.Contains(upper, "INSERT") ||
strings.Contains(upper, "UPDATE") ||
strings.Contains(upper, "DELETE")
}
return strings.HasPrefix(upper, "INSERT") ||
strings.HasPrefix(upper, "UPDATE") ||
strings.HasPrefix(upper, "DELETE")
}

// ensureEventLogFile opens the event log file if not already open
func ensureEventLogFile() error {
if eventLogFile != nil {
return nil
}

dir := filepath.Dir(eventLogConfig.Path)
if err := os.MkdirAll(dir, 0755); err != nil {
return fmt.Errorf("failed to create event log directory: %w", err)
}

f, err := os.OpenFile(eventLogConfig.Path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
return fmt.Errorf("failed to open event log file: %w", err)
}

eventLogFile = f
eventLogEnc = json.NewEncoder(f)
eventLogEnc.SetEscapeHTML(false)

return nil
}

// CloseEventLog closes the event log file
func CloseEventLog() error {
eventLogMu.Lock()
defer eventLogMu.Unlock()

// Disable logging before closing
eventLogConfig.Enabled = false

if eventLogFile != nil {
err := eventLogFile.Close()
eventLogFile = nil
eventLogEnc = nil
return err
}
return nil
}

// MarshalJSON implements json.Marshaler for QueryRecord
// Handles special types like time.Time, nil, and nullable types
func (q QueryRecord) MarshalJSON() ([]byte, error) {
// Create a helper struct with string parameters
type QueryRecordJSON struct {
SQL string `json:"sql"`
Parameters []any `json:"parameters"`
}

result := QueryRecordJSON{
SQL: q.SQL,
Parameters: make([]any, len(q.Parameters)),
}

for i, param := range q.Parameters {
result.Parameters[i] = marshalParam(param)
}

return json.Marshal(result)
}

// marshalParam converts a parameter to a JSON-serializable value
func marshalParam(param any) any {
if param == nil {
return nil
}

switch v := param.(type) {
case time.Time:
return v.Format(time.RFC3339Nano)
case *time.Time:
if v == nil {
return nil
}
return v.Format(time.RFC3339Nano)
case string:
return v
case *string:
if v == nil {
return nil
}
return *v
case int:
return v
case *int:
if v == nil {
return nil
}
return *v
case int8:
return v
case *int8:
if v == nil {
return nil
}
return *v
case int16:
return v
case *int16:
if v == nil {
return nil
}
return *v
case int32:
return v
case *int32:
if v == nil {
return nil
}
return *v
case int64:
return v
case *int64:
if v == nil {
return nil
}
return *v
case uint:
return v
case *uint:
if v == nil {
return nil
}
return *v
case uint8:
return v
case *uint8:
if v == nil {
return nil
}
return *v
case uint16:
return v
case *uint16:
if v == nil {
return nil
}
return *v
case uint32:
return v
case *uint32:
if v == nil {
return nil
}
return *v
case uint64:
return v
case *uint64:
if v == nil {
return nil
}
return *v
case float32:
return v
case *float32:
if v == nil {
return nil
}
return *v
case float64:
return v
case *float64:
if v == nil {
return nil
}
return *v
case bool:
return v
case *bool:
if v == nil {
return nil
}
return *v
case []byte:
return v
default:
// Handle pointer types via reflection (e.g., *GainLevel, *CustomType)
rv := reflect.ValueOf(param)
if rv.Kind() == reflect.Pointer {
if rv.IsNil() {
return nil
}
// Dereference and recursively marshal the underlying value
return marshalParam(rv.Elem().Interface())
}
// For other types, try to convert to string via fmt.Sprintf
return fmt.Sprintf("%v", v)
}
}
file addition: schema_test.go (----------)

[0.790921]

package db

import (
"database/sql"
"fmt"
"strings"
"testing"

_ "github.com/duckdb/duckdb-go/v2"
)

func GetTableRowCount(db *sql.DB, table string) (int64, error) {
var count int64
err := db.QueryRow(fmt.Sprintf("SELECT COUNT(*) FROM %s", table)).Scan(&count)
if err != nil {
return 0, fmt.Errorf("failed to count rows in %s: %w", table, err)
}
return count, nil
}

func TestReadSchemaSQL(t *testing.T) {
schema, err := ReadSchemaSQL()
if err != nil {
t.Fatalf("ReadSchemaSQL() error = %v", err)
}

// Verify schema contains expected elements
if !strings.Contains(schema, "CREATE TABLE dataset") {
t.Error("schema missing CREATE TABLE dataset")
}
if !strings.Contains(schema, "CREATE TYPE dataset_type") {
t.Error("schema missing CREATE TYPE dataset_type")
}
if !strings.Contains(schema, "CREATE INDEX") {
t.Error("schema missing CREATE INDEX")
}
}

func TestExtractDDLStatements(t *testing.T) {
schema, err := ReadSchemaSQL()
if err != nil {
t.Fatalf("ReadSchemaSQL() error = %v", err)
}

statements := ExtractDDLStatements(schema)
if len(statements) == 0 {
t.Fatal("ExtractDDLStatements returned no statements")
}

// Count statement types
typeCounts := make(map[string]int)
tableNames := make(map[string]bool)

for _, stmt := range statements {
typeCounts[stmt.Type]++
if stmt.TableName != "" {
tableNames[stmt.TableName] = true
}
t.Logf("Statement type=%s table=%s sql=%s", stmt.Type, stmt.TableName, stmt.SQL[:min(50, len(stmt.SQL))])
}

// Verify we have all expected types
if typeCounts["CREATE_TYPE"] < 2 {
t.Errorf("expected at least 2 CREATE_TYPE statements, got %d", typeCounts["CREATE_TYPE"])
}
if typeCounts["CREATE_TABLE"] < 10 {
t.Errorf("expected at least 10 CREATE_TABLE statements, got %d", typeCounts["CREATE_TABLE"])
}
if typeCounts["CREATE_INDEX"] < 5 {
t.Errorf("expected at least 5 CREATE_INDEX statements, got %d", typeCounts["CREATE_INDEX"])
}
// CREATE_TABLE_AS might be 0 if the extraction logic changes - that's OK
// as long as we handle it correctly in the export code

// Verify key tables are found
expectedTables := []string{"dataset", "location", "cluster", "file", "segment", "label"}
for _, expected := range expectedTables {
if !tableNames[expected] {
t.Errorf("missing table %s in extracted statements", expected)
}
}
}

func TestExtractDDLStatement_Types(t *testing.T) {
tests := []struct {
name string
sql string
wantType string
wantTable string
}{
{
name: "CREATE TYPE",
sql: "CREATE TYPE dataset_type AS ENUM ('structured', 'unstructured');",
wantType: "CREATE_TYPE",
wantTable: "",
},
{
name: "CREATE TABLE simple",
sql: "CREATE TABLE dataset (id VARCHAR(12) PRIMARY KEY);",
wantType: "CREATE_TABLE",
wantTable: "dataset",
},
{
name: "CREATE TABLE with newlines",
sql: "CREATE TABLE location\n(\n id VARCHAR(12) PRIMARY KEY\n);",
wantType: "CREATE_TABLE",
wantTable: "location",
},
{
name: "CREATE INDEX",
sql: "CREATE INDEX idx_file_location ON file(location_id);",
wantType: "CREATE_INDEX",
wantTable: "idx_file_location",
},
{
name: "CREATE UNIQUE INDEX",
sql: "CREATE UNIQUE INDEX idx_species_label ON species(label);",
wantType: "CREATE_INDEX",
wantTable: "idx_species_label",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
stmt := parseDDLStatement(tt.sql)
if stmt.Type != tt.wantType {
t.Errorf("parseDDLStatement().Type = %v, want %v", stmt.Type, tt.wantType)
}
if stmt.TableName != tt.wantTable {
t.Errorf("parseDDLStatement().TableName = %v, want %v", stmt.TableName, tt.wantTable)
}
})
}
}

func TestExtractTableName(t *testing.T) {
tests := []struct {
name string
sql string
want string
}{
{
name: "simple table",
sql: "CREATE TABLE dataset (id VARCHAR(12) PRIMARY KEY",
want: "dataset",
},
{
name: "table with space before paren",
sql: "CREATE TABLE location (id VARCHAR(12)",
want: "location",
},
{
name: "table with newline",
sql: "CREATE TABLE cluster\n(\n id VARCHAR(12)",
want: "cluster",
},
{
name: "table with no space",
sql: "CREATE TABLE file(id VARCHAR(21)",
want: "file",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := extractTableName(tt.sql)
if got != tt.want {
t.Errorf("extractTableName() = %v, want %v", got, tt.want)
}
})
}
}

func TestExtractIndexName(t *testing.T) {
tests := []struct {
name string
sql string
want string
}{
{
name: "CREATE INDEX",
sql: "CREATE INDEX idx_file_location ON file(location_id)",
want: "idx_file_location",
},
{
name: "CREATE UNIQUE INDEX",
sql: "CREATE UNIQUE INDEX idx_species_label ON species(label)",
want: "idx_species_label",
},
{
name: "index with spaces",
sql: "CREATE INDEX idx_test ON table_name (column)",
want: "idx_test",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := extractIndexName(tt.sql)
if got != tt.want {
t.Errorf("extractIndexName() = %v, want %v", got, tt.want)
}
})
}
}

func TestExtractDDLStatements_SkipsComments(t *testing.T) {
schema := `-- This is a comment
CREATE TABLE test (id INT);
-- Another comment
CREATE INDEX idx_test ON test(id);
`
statements := ExtractDDLStatements(schema)

// Should have 2 statements, not 4
if len(statements) != 2 {
t.Errorf("expected 2 statements, got %d", len(statements))
}

for _, stmt := range statements {
if strings.Contains(stmt.SQL, "--") {
t.Errorf("statement should not contain comments: %s", stmt.SQL)
}
}
}

func TestGetFKOrder(t *testing.T) {
// Use in-memory database
db, err := sql.Open("duckdb", ":memory:")
if err != nil {
t.Fatalf("failed to open database: %v", err)
}
defer db.Close()

// Create tables with FK relationships
schema := `
CREATE TABLE parent (id VARCHAR(12) PRIMARY KEY);
CREATE TABLE child (id VARCHAR(12) PRIMARY KEY, parent_id VARCHAR(12), FOREIGN KEY (parent_id) REFERENCES parent(id));
CREATE TABLE grandchild (id VARCHAR(12) PRIMARY KEY, child_id VARCHAR(12), FOREIGN KEY (child_id) REFERENCES child(id));
CREATE TABLE independent (id VARCHAR(12) PRIMARY KEY);
`
_, err = db.Exec(schema)
if err != nil {
t.Fatalf("failed to create schema: %v", err)
}

order, err := GetFKOrder(db)
if err != nil {
t.Fatalf("GetFKOrder() error = %v", err)
}

// Build a map for quick lookup
orderMap := make(map[string]int)
for i, table := range order {
orderMap[table] = i
}

// Verify order: parent must come before child, child before grandchild
if orderMap["parent"] >= orderMap["child"] {
t.Error("parent should come before child")
}
if orderMap["child"] >= orderMap["grandchild"] {
t.Error("child should come before grandchild")
}

// Independent table can be anywhere
if _, ok := orderMap["independent"]; !ok {
t.Error("independent table missing from order")
}
}

func TestGetTableRowCount(t *testing.T) {
// Use in-memory database
db, err := sql.Open("duckdb", ":memory:")
if err != nil {
t.Fatalf("failed to open database: %v", err)
}
defer db.Close()

// Create and populate table
_, err = db.Exec("CREATE TABLE test (id INT)")
if err != nil {
t.Fatalf("failed to create table: %v", err)
}

_, err = db.Exec("INSERT INTO test VALUES (1), (2), (3)")
if err != nil {
t.Fatalf("failed to insert: %v", err)
}

count, err := GetTableRowCount(db, "test")
if err != nil {
t.Fatalf("GetTableRowCount() error = %v", err)
}

if count != 3 {
t.Errorf("GetTableRowCount() = %d, want 3", count)
}
}
file addition: schema.svg (---r------)

[0.790921]

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">


<svg width="6217pt" height="3993pt"
viewBox="0.00 0.00 6216.67 3993.04" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 3989.04)">
<title>dbml</title>

<g id="dataset_type" class="node">
<title>dataset_type</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="1019.59" cy="-214.96" rx="235.43" ry="214.92"/>
<polygon fill="#29235c" stroke="transparent" points="855.59,-304.96 855.59,-364.96 1184.59,-364.96 1184.59,-304.96 855.59,-304.96"/>
<polygon fill="none" stroke="#29235c" points="855.59,-304.96 855.59,-364.96 1184.59,-364.96 1184.59,-304.96 855.59,-304.96"/>
<text text-anchor="start" x="866.24" y="-326.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       dataset_type       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="855.59,-244.96 855.59,-304.96 1184.59,-304.96 1184.59,-244.96 855.59,-244.96"/>
<polygon fill="none" stroke="#29235c" points="855.59,-244.96 855.59,-304.96 1184.59,-304.96 1184.59,-244.96 855.59,-244.96"/>
<text text-anchor="start" x="913.39" y="-266.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    structured    </text>
<polygon fill="#e7e2dd" stroke="transparent" points="855.59,-184.96 855.59,-244.96 1184.59,-244.96 1184.59,-184.96 855.59,-184.96"/>
<polygon fill="none" stroke="#29235c" points="855.59,-184.96 855.59,-244.96 1184.59,-244.96 1184.59,-184.96 855.59,-184.96"/>
<text text-anchor="start" x="895.6" y="-206.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    unstructured    </text>
<polygon fill="#e7e2dd" stroke="transparent" points="855.59,-124.96 855.59,-184.96 1184.59,-184.96 1184.59,-124.96 855.59,-124.96"/>
<polygon fill="none" stroke="#29235c" points="855.59,-124.96 855.59,-184.96 1184.59,-184.96 1184.59,-124.96 855.59,-124.96"/>
<text text-anchor="start" x="958.73" y="-146.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    test    </text>
<polygon fill="#e7e2dd" stroke="transparent" points="855.59,-64.96 855.59,-124.96 1184.59,-124.96 1184.59,-64.96 855.59,-64.96"/>
<polygon fill="none" stroke="#29235c" points="855.59,-64.96 855.59,-124.96 1184.59,-124.96 1184.59,-64.96 855.59,-64.96"/>
<text text-anchor="start" x="953.4" y="-86.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    train    </text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="854.09,-63.96 854.09,-365.96 1185.09,-365.96 1185.09,-63.96 854.09,-63.96"/>
</g>

<g id="gain_level" class="node">
<title>gain_level</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="4428.3" cy="-1280.96" rx="207.78" ry="257.27"/>
<polygon fill="#29235c" stroke="transparent" points="4283.3,-1400.96 4283.3,-1460.96 4573.3,-1460.96 4573.3,-1400.96 4283.3,-1400.96"/>
<polygon fill="none" stroke="#29235c" points="4283.3,-1400.96 4283.3,-1460.96 4573.3,-1460.96 4573.3,-1400.96 4283.3,-1400.96"/>
<text text-anchor="start" x="4294.03" y="-1422.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       gain_level       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4283.3,-1340.96 4283.3,-1400.96 4573.3,-1400.96 4573.3,-1340.96 4283.3,-1340.96"/>
<polygon fill="none" stroke="#29235c" points="4283.3,-1340.96 4283.3,-1400.96 4573.3,-1400.96 4573.3,-1340.96 4283.3,-1340.96"/>
<text text-anchor="start" x="4368.73" y="-1362.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    low    </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4283.3,-1280.96 4283.3,-1340.96 4573.3,-1340.96 4573.3,-1280.96 4283.3,-1280.96"/>
<polygon fill="none" stroke="#29235c" points="4283.3,-1280.96 4283.3,-1340.96 4573.3,-1340.96 4573.3,-1280.96 4283.3,-1280.96"/>
<text text-anchor="start" x="4306.52" y="-1302.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    low-medium    </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4283.3,-1220.96 4283.3,-1280.96 4573.3,-1280.96 4573.3,-1220.96 4283.3,-1220.96"/>
<polygon fill="none" stroke="#29235c" points="4283.3,-1220.96 4283.3,-1280.96 4573.3,-1280.96 4573.3,-1220.96 4283.3,-1220.96"/>
<text text-anchor="start" x="4335.84" y="-1242.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    medium    </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4283.3,-1160.96 4283.3,-1220.96 4573.3,-1220.96 4573.3,-1160.96 4283.3,-1160.96"/>
<polygon fill="none" stroke="#29235c" points="4283.3,-1160.96 4283.3,-1220.96 4573.3,-1220.96 4573.3,-1160.96 4283.3,-1160.96"/>
<text text-anchor="start" x="4300.28" y="-1182.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    medium-high    </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4283.3,-1100.96 4283.3,-1160.96 4573.3,-1160.96 4573.3,-1100.96 4283.3,-1100.96"/>
<polygon fill="none" stroke="#29235c" points="4283.3,-1100.96 4283.3,-1160.96 4573.3,-1160.96 4573.3,-1100.96 4283.3,-1100.96"/>
<text text-anchor="start" x="4362.49" y="-1122.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    high    </text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="4282.3,-1099.96 4282.3,-1461.96 4574.3,-1461.96 4574.3,-1099.96 4282.3,-1099.96"/>
</g>

<g id="dataset" class="node">
<title>dataset</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="316.08" cy="-1927.96" rx="316.15" ry="342.48"/>
<polygon fill="#1d71b8" stroke="transparent" points="95.08,-2107.96 95.08,-2167.96 538.08,-2167.96 538.08,-2107.96 95.08,-2107.96"/>
<polygon fill="none" stroke="#29235c" points="95.08,-2107.96 95.08,-2167.96 538.08,-2167.96 538.08,-2107.96 95.08,-2107.96"/>
<text text-anchor="start" x="201.86" y="-2129.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       dataset       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="95.08,-2047.96 95.08,-2107.96 538.08,-2107.96 538.08,-2047.96 95.08,-2047.96"/>
<polygon fill="none" stroke="#29235c" points="95.08,-2047.96 95.08,-2107.96 538.08,-2107.96 538.08,-2047.96 95.08,-2047.96"/>
<text text-anchor="start" x="106.08" y="-2069.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="130.97" y="-2069.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="313.77" y="-2069.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="95.08,-1987.96 95.08,-2047.96 538.08,-2047.96 538.08,-1987.96 95.08,-1987.96"/>
<polygon fill="none" stroke="#29235c" points="95.08,-1987.96 95.08,-2047.96 538.08,-2047.96 538.08,-1987.96 95.08,-1987.96"/>
<text text-anchor="start" x="106.08" y="-2008.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">name    </text>
<text text-anchor="start" x="256.89" y="-2009.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<text text-anchor="start" x="487.99" y="-2009.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="496.88" y="-2009.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="95.08,-1927.96 95.08,-1987.96 538.08,-1987.96 538.08,-1927.96 95.08,-1927.96"/>
<polygon fill="none" stroke="#29235c" points="95.08,-1927.96 95.08,-1987.96 538.08,-1987.96 538.08,-1927.96 95.08,-1927.96"/>
<text text-anchor="start" x="105.95" y="-1948.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">description    </text>
<text text-anchor="start" x="296.03" y="-1949.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="95.08,-1867.96 95.08,-1927.96 538.08,-1927.96 538.08,-1867.96 95.08,-1867.96"/>
<polygon fill="none" stroke="#29235c" points="95.08,-1867.96 95.08,-1927.96 538.08,-1927.96 538.08,-1867.96 95.08,-1867.96"/>
<text text-anchor="start" x="106.08" y="-1888.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="340.42" y="-1889.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="95.08,-1807.96 95.08,-1867.96 538.08,-1867.96 538.08,-1807.96 95.08,-1807.96"/>
<polygon fill="none" stroke="#29235c" points="95.08,-1807.96 95.08,-1867.96 538.08,-1867.96 538.08,-1807.96 95.08,-1807.96"/>
<text text-anchor="start" x="106.08" y="-1828.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="340.42" y="-1829.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="95.08,-1747.96 95.08,-1807.96 538.08,-1807.96 538.08,-1747.96 95.08,-1747.96"/>
<polygon fill="none" stroke="#29235c" points="95.08,-1747.96 95.08,-1807.96 538.08,-1807.96 538.08,-1747.96 95.08,-1747.96"/>
<text text-anchor="start" x="106.08" y="-1768.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="372.38" y="-1769.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="#e7e2dd" stroke="transparent" points="95.08,-1687.96 95.08,-1747.96 538.08,-1747.96 538.08,-1687.96 95.08,-1687.96"/>
<polygon fill="none" stroke="#29235c" points="95.08,-1687.96 95.08,-1747.96 538.08,-1747.96 538.08,-1687.96 95.08,-1687.96"/>
<text text-anchor="start" x="106.08" y="-1708.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">type    </text>
<text text-anchor="start" x="304.79" y="-1709.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">dataset_type</text>
<text text-anchor="start" x="487.99" y="-1709.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="496.88" y="-1709.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="93.58,-1686.96 93.58,-2168.96 538.58,-2168.96 538.58,-1686.96 93.58,-1686.96"/>
</g>

<g id="edge45" class="edge">
<title>dataset:e->dataset_type:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M539.08,-1717.96C823.64,-1717.96 514.27,-683.32 668.15,-443.96 720.06,-363.22 758.6,-334.96 854.59,-334.96"/>
</g>

<g id="location" class="node">
<title>location</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="1019.59" cy="-1837.96" rx="343.81" ry="469.54"/>
<polygon fill="#1d71b8" stroke="transparent" points="778.59,-2107.96 778.59,-2167.96 1260.59,-2167.96 1260.59,-2107.96 778.59,-2107.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-2107.96 778.59,-2167.96 1260.59,-2167.96 1260.59,-2107.96 778.59,-2107.96"/>
<text text-anchor="start" x="902.21" y="-2129.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       location       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-2047.96 778.59,-2107.96 1260.59,-2107.96 1260.59,-2047.96 778.59,-2047.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-2047.96 778.59,-2107.96 1260.59,-2107.96 1260.59,-2047.96 778.59,-2047.96"/>
<text text-anchor="start" x="789.59" y="-2069.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="814.48" y="-2069.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="1036.28" y="-2069.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1987.96 778.59,-2047.96 1260.59,-2047.96 1260.59,-1987.96 778.59,-1987.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1987.96 778.59,-2047.96 1260.59,-2047.96 1260.59,-1987.96 778.59,-1987.96"/>
<text text-anchor="start" x="789.59" y="-2008.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">dataset_id    </text>
<text text-anchor="start" x="997.19" y="-2009.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="1210.49" y="-2009.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="1219.39" y="-2009.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1927.96 778.59,-1987.96 1260.59,-1987.96 1260.59,-1927.96 778.59,-1927.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1927.96 778.59,-1987.96 1260.59,-1987.96 1260.59,-1927.96 778.59,-1927.96"/>
<text text-anchor="start" x="789.59" y="-1948.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">name    </text>
<text text-anchor="start" x="979.4" y="-1949.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(140)</text>
<text text-anchor="start" x="1210.49" y="-1949.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="1219.39" y="-1949.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1867.96 778.59,-1927.96 1260.59,-1927.96 1260.59,-1867.96 778.59,-1867.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1867.96 778.59,-1927.96 1260.59,-1927.96 1260.59,-1867.96 778.59,-1867.96"/>
<text text-anchor="start" x="789.59" y="-1888.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">latitude    </text>
<text text-anchor="start" x="984.71" y="-1889.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(10,7)</text>
<text text-anchor="start" x="1210.49" y="-1889.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="1219.39" y="-1889.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1807.96 778.59,-1867.96 1260.59,-1867.96 1260.59,-1807.96 778.59,-1807.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1807.96 778.59,-1867.96 1260.59,-1867.96 1260.59,-1807.96 778.59,-1807.96"/>
<text text-anchor="start" x="789.59" y="-1828.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">longitude    </text>
<text text-anchor="start" x="984.71" y="-1829.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(10,7)</text>
<text text-anchor="start" x="1210.49" y="-1829.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="1219.39" y="-1829.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1747.96 778.59,-1807.96 1260.59,-1807.96 1260.59,-1747.96 778.59,-1747.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1747.96 778.59,-1807.96 1260.59,-1807.96 1260.59,-1747.96 778.59,-1747.96"/>
<text text-anchor="start" x="789.59" y="-1768.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">description    </text>
<text text-anchor="start" x="1018.49" y="-1769.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1687.96 778.59,-1747.96 1260.59,-1747.96 1260.59,-1687.96 778.59,-1687.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1687.96 778.59,-1747.96 1260.59,-1747.96 1260.59,-1687.96 778.59,-1687.96"/>
<text text-anchor="start" x="789.59" y="-1708.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="1062.93" y="-1709.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1627.96 778.59,-1687.96 1260.59,-1687.96 1260.59,-1627.96 778.59,-1627.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1627.96 778.59,-1687.96 1260.59,-1687.96 1260.59,-1627.96 778.59,-1627.96"/>
<text text-anchor="start" x="789.59" y="-1648.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="1062.93" y="-1649.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1567.96 778.59,-1627.96 1260.59,-1627.96 1260.59,-1567.96 778.59,-1567.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1567.96 778.59,-1627.96 1260.59,-1627.96 1260.59,-1567.96 778.59,-1567.96"/>
<text text-anchor="start" x="789.59" y="-1588.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="1094.89" y="-1589.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="#e7e2dd" stroke="transparent" points="778.59,-1507.96 778.59,-1567.96 1260.59,-1567.96 1260.59,-1507.96 778.59,-1507.96"/>
<polygon fill="none" stroke="#29235c" points="778.59,-1507.96 778.59,-1567.96 1260.59,-1567.96 1260.59,-1507.96 778.59,-1507.96"/>
<text text-anchor="start" x="789.56" y="-1528.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">timezone_id    </text>
<text text-anchor="start" x="997.39" y="-1529.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(40)</text>
<text text-anchor="start" x="1210.69" y="-1529.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="1219.59" y="-1529.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="777.59,-1506.96 777.59,-2168.96 1261.59,-2168.96 1261.59,-1506.96 777.59,-1506.96"/>
</g>


<g id="edge2" class="edge">
<title>dataset:e->location:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M539.08,-2077.96C644.86,-2077.96 666.9,-2021.77 767.29,-2018.14"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="767.65,-2021.64 777.59,-2017.96 767.52,-2014.64 767.65,-2021.64"/>
<text text-anchor="middle" x="771.36" y="-2027.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="530.18" y="-2087.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="cluster" class="node">
<title>cluster</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="1875.83" cy="-1293.96" rx="468.62" ry="511.89"/>
<polygon fill="#1d71b8" stroke="transparent" points="1546.83,-1593.96 1546.83,-1653.96 2205.83,-1653.96 2205.83,-1593.96 1546.83,-1593.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1593.96 1546.83,-1653.96 2205.83,-1653.96 2205.83,-1593.96 1546.83,-1593.96"/>
<text text-anchor="start" x="1766.97" y="-1615.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       cluster       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1533.96 1546.83,-1593.96 2205.83,-1593.96 2205.83,-1533.96 1546.83,-1533.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1533.96 1546.83,-1593.96 2205.83,-1593.96 2205.83,-1533.96 1546.83,-1533.96"/>
<text text-anchor="start" x="1557.83" y="-1555.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="1582.72" y="-1555.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="1981.52" y="-1555.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1473.96 1546.83,-1533.96 2205.83,-1533.96 2205.83,-1473.96 1546.83,-1473.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1473.96 1546.83,-1533.96 2205.83,-1533.96 2205.83,-1473.96 1546.83,-1473.96"/>
<text text-anchor="start" x="1557.83" y="-1494.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">dataset_id    </text>
<text text-anchor="start" x="1942.43" y="-1495.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="2155.74" y="-1495.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2164.63" y="-1495.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1413.96 1546.83,-1473.96 2205.83,-1473.96 2205.83,-1413.96 1546.83,-1413.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1413.96 1546.83,-1473.96 2205.83,-1473.96 2205.83,-1413.96 1546.83,-1413.96"/>
<text text-anchor="start" x="1557.83" y="-1434.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">location_id    </text>
<text text-anchor="start" x="1942.43" y="-1435.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="2155.74" y="-1435.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2164.63" y="-1435.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1353.96 1546.83,-1413.96 2205.83,-1413.96 2205.83,-1353.96 1546.83,-1353.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1353.96 1546.83,-1413.96 2205.83,-1413.96 2205.83,-1353.96 1546.83,-1353.96"/>
<text text-anchor="start" x="1557.83" y="-1374.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">name    </text>
<text text-anchor="start" x="1924.64" y="-1375.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(140)</text>
<text text-anchor="start" x="2155.74" y="-1375.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2164.63" y="-1375.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1293.96 1546.83,-1353.96 2205.83,-1353.96 2205.83,-1293.96 1546.83,-1293.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1293.96 1546.83,-1353.96 2205.83,-1353.96 2205.83,-1293.96 1546.83,-1293.96"/>
<text text-anchor="start" x="1557.83" y="-1314.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">description    </text>
<text text-anchor="start" x="1963.73" y="-1315.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1233.96 1546.83,-1293.96 2205.83,-1293.96 2205.83,-1233.96 1546.83,-1233.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1233.96 1546.83,-1293.96 2205.83,-1293.96 2205.83,-1233.96 1546.83,-1233.96"/>
<text text-anchor="start" x="1557.83" y="-1254.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="2008.17" y="-1255.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1173.96 1546.83,-1233.96 2205.83,-1233.96 2205.83,-1173.96 1546.83,-1173.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1173.96 1546.83,-1233.96 2205.83,-1233.96 2205.83,-1173.96 1546.83,-1173.96"/>
<text text-anchor="start" x="1557.83" y="-1194.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="2008.17" y="-1195.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1113.96 1546.83,-1173.96 2205.83,-1173.96 2205.83,-1113.96 1546.83,-1113.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1113.96 1546.83,-1173.96 2205.83,-1173.96 2205.83,-1113.96 1546.83,-1113.96"/>
<text text-anchor="start" x="1557.83" y="-1134.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="2040.13" y="-1135.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-1053.96 1546.83,-1113.96 2205.83,-1113.96 2205.83,-1053.96 1546.83,-1053.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-1053.96 1546.83,-1113.96 2205.83,-1113.96 2205.83,-1053.96 1546.83,-1053.96"/>
<text text-anchor="start" x="1557.34" y="-1074.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">cyclic_recording_pattern_id    </text>
<text text-anchor="start" x="1981.67" y="-1075.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-993.96 1546.83,-1053.96 2205.83,-1053.96 2205.83,-993.96 1546.83,-993.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-993.96 1546.83,-1053.96 2205.83,-1053.96 2205.83,-993.96 1546.83,-993.96"/>
<text text-anchor="start" x="1557.83" y="-1014.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">sample_rate    </text>
<text text-anchor="start" x="2013.52" y="-1015.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">INTEGER</text>
<text text-anchor="start" x="2155.74" y="-1015.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2164.63" y="-1015.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="1546.83,-933.96 1546.83,-993.96 2205.83,-993.96 2205.83,-933.96 1546.83,-933.96"/>
<polygon fill="none" stroke="#29235c" points="1546.83,-933.96 1546.83,-993.96 2205.83,-993.96 2205.83,-933.96 1546.83,-933.96"/>
<text text-anchor="start" x="1557.83" y="-954.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">path    </text>
<text text-anchor="start" x="1963.73" y="-955.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="1545.33,-932.96 1545.33,-1654.96 2206.33,-1654.96 2206.33,-932.96 1545.33,-932.96"/>
</g>


<g id="edge4" class="edge">
<title>dataset:e->cluster:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M539.08,-2077.96C845.98,-2077.96 441.23,-909.58 668.15,-702.96 725.9,-650.38 1310.19,-653.98 1371.02,-702.96 1652.11,-929.33 1190.4,-1493.09 1535.65,-1503.81"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="1535.78,-1507.31 1545.83,-1503.96 1535.88,-1500.31 1535.78,-1507.31"/>
<text text-anchor="middle" x="1552.05" y="-1513.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="547.97" y="-2087.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="file_dataset" class="node">
<title>file_dataset</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="3581.95" cy="-2185.96" rx="325.95" ry="257.27"/>
<polygon fill="#1d71b8" stroke="transparent" points="3353.95,-2305.96 3353.95,-2365.96 3810.95,-2365.96 3810.95,-2305.96 3353.95,-2305.96"/>
<polygon fill="none" stroke="#29235c" points="3353.95,-2305.96 3353.95,-2365.96 3810.95,-2365.96 3810.95,-2305.96 3353.95,-2305.96"/>
<text text-anchor="start" x="3438.4" y="-2327.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       file_dataset       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="3353.95,-2245.96 3353.95,-2305.96 3810.95,-2305.96 3810.95,-2245.96 3353.95,-2245.96"/>
<polygon fill="none" stroke="#29235c" points="3353.95,-2245.96 3353.95,-2305.96 3810.95,-2305.96 3810.95,-2245.96 3353.95,-2245.96"/>
<text text-anchor="start" x="3364.95" y="-2267.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">file_id</text>
<text text-anchor="start" x="3448.51" y="-2267.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="3547.55" y="-2267.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<text text-anchor="start" x="3760.86" y="-2267.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3769.75" y="-2267.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3353.95,-2185.96 3353.95,-2245.96 3810.95,-2245.96 3810.95,-2185.96 3353.95,-2185.96"/>
<polygon fill="none" stroke="#29235c" points="3353.95,-2185.96 3353.95,-2245.96 3810.95,-2245.96 3810.95,-2185.96 3353.95,-2185.96"/>
<text text-anchor="start" x="3364.86" y="-2207.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">dataset_id</text>
<text text-anchor="start" x="3512.48" y="-2207.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="3547.75" y="-2207.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="3761.06" y="-2207.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3769.95" y="-2207.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3353.95,-2125.96 3353.95,-2185.96 3810.95,-2185.96 3810.95,-2125.96 3353.95,-2125.96"/>
<polygon fill="none" stroke="#29235c" points="3353.95,-2125.96 3353.95,-2185.96 3810.95,-2185.96 3810.95,-2125.96 3353.95,-2125.96"/>
<text text-anchor="start" x="3364.95" y="-2146.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="3613.29" y="-2147.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3353.95,-2065.96 3353.95,-2125.96 3810.95,-2125.96 3810.95,-2065.96 3353.95,-2065.96"/>
<polygon fill="none" stroke="#29235c" points="3353.95,-2065.96 3353.95,-2125.96 3810.95,-2125.96 3810.95,-2065.96 3353.95,-2065.96"/>
<text text-anchor="start" x="3364.95" y="-2086.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="3613.29" y="-2087.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3353.95,-2005.96 3353.95,-2065.96 3810.95,-2065.96 3810.95,-2005.96 3353.95,-2005.96"/>
<polygon fill="none" stroke="#29235c" points="3353.95,-2005.96 3353.95,-2065.96 3810.95,-2065.96 3810.95,-2005.96 3353.95,-2005.96"/>
<text text-anchor="start" x="3422.4" y="-2027.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    file_id, dataset_id    </text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="3352.45,-2004.96 3352.45,-2366.96 3811.45,-2366.96 3811.45,-2004.96 3352.45,-2004.96"/>
</g>


<g id="edge20" class="edge">
<title>dataset:e->file_dataset:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M539.08,-2077.96C855.27,-2077.96 434.65,-874.16 668.15,-660.96 847.82,-496.92 2753.17,-361.56 3111.79,-721.96 3158.72,-769.12 3127.03,-1855.75 3147.79,-1918.96 3196.77,-2068.11 3192.72,-2209.74 3342.84,-2215.76"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="3342.89,-2219.26 3352.95,-2215.96 3343.02,-2212.26 3342.89,-2219.26"/>
<text text-anchor="middle" x="3359.18" y="-2225.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="530.18" y="-2049.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="segment" class="node">
<title>segment</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="4428.3" cy="-2110.96" rx="325.95" ry="554.24"/>
<polygon fill="#1d71b8" stroke="transparent" points="4200.3,-2440.96 4200.3,-2500.96 4657.3,-2500.96 4657.3,-2440.96 4200.3,-2440.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-2440.96 4200.3,-2500.96 4657.3,-2500.96 4657.3,-2440.96 4200.3,-2440.96"/>
<text text-anchor="start" x="4305.2" y="-2462.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       segment       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-2380.96 4200.3,-2440.96 4657.3,-2440.96 4657.3,-2380.96 4200.3,-2380.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-2380.96 4200.3,-2440.96 4657.3,-2440.96 4657.3,-2380.96 4200.3,-2380.96"/>
<text text-anchor="start" x="4211.3" y="-2402.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="4236.19" y="-2402.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="4432.99" y="-2402.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-2320.96 4200.3,-2380.96 4657.3,-2380.96 4657.3,-2320.96 4200.3,-2320.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-2320.96 4200.3,-2380.96 4657.3,-2380.96 4657.3,-2320.96 4200.3,-2320.96"/>
<text text-anchor="start" x="4211.3" y="-2341.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">file_id    </text>
<text text-anchor="start" x="4393.9" y="-2342.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<text text-anchor="start" x="4607.21" y="-2342.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="4616.1" y="-2342.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-2260.96 4200.3,-2320.96 4657.3,-2320.96 4657.3,-2260.96 4200.3,-2260.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-2260.96 4200.3,-2320.96 4657.3,-2320.96 4657.3,-2260.96 4200.3,-2260.96"/>
<text text-anchor="start" x="4211.2" y="-2281.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">dataset_id    </text>
<text text-anchor="start" x="4394.1" y="-2282.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="4607.41" y="-2282.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="4616.3" y="-2282.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-2200.96 4200.3,-2260.96 4657.3,-2260.96 4657.3,-2200.96 4200.3,-2200.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-2200.96 4200.3,-2260.96 4657.3,-2260.96 4657.3,-2200.96 4200.3,-2200.96"/>
<text text-anchor="start" x="4211.3" y="-2221.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">start_time    </text>
<text text-anchor="start" x="4399.21" y="-2222.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(7,3)</text>
<text text-anchor="start" x="4607.21" y="-2222.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="4616.1" y="-2222.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-2140.96 4200.3,-2200.96 4657.3,-2200.96 4657.3,-2140.96 4200.3,-2140.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-2140.96 4200.3,-2200.96 4657.3,-2200.96 4657.3,-2140.96 4200.3,-2140.96"/>
<text text-anchor="start" x="4211.3" y="-2161.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">end_time    </text>
<text text-anchor="start" x="4399.21" y="-2162.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(7,3)</text>
<text text-anchor="start" x="4607.21" y="-2162.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="4616.1" y="-2162.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-2080.96 4200.3,-2140.96 4657.3,-2140.96 4657.3,-2080.96 4200.3,-2080.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-2080.96 4200.3,-2140.96 4657.3,-2140.96 4657.3,-2080.96 4200.3,-2080.96"/>
<text text-anchor="start" x="4211.3" y="-2101.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">freq_low    </text>
<text text-anchor="start" x="4438.3" y="-2102.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(9,3)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-2020.96 4200.3,-2080.96 4657.3,-2080.96 4657.3,-2020.96 4200.3,-2020.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-2020.96 4200.3,-2080.96 4657.3,-2080.96 4657.3,-2020.96 4200.3,-2020.96"/>
<text text-anchor="start" x="4211.3" y="-2041.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">freq_high    </text>
<text text-anchor="start" x="4438.3" y="-2042.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(9,3)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-1960.96 4200.3,-2020.96 4657.3,-2020.96 4657.3,-1960.96 4200.3,-1960.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-1960.96 4200.3,-2020.96 4657.3,-2020.96 4657.3,-1960.96 4200.3,-1960.96"/>
<text text-anchor="start" x="4211.3" y="-1981.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">description    </text>
<text text-anchor="start" x="4415.2" y="-1982.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-1900.96 4200.3,-1960.96 4657.3,-1960.96 4657.3,-1900.96 4200.3,-1900.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-1900.96 4200.3,-1960.96 4657.3,-1960.96 4657.3,-1900.96 4200.3,-1900.96"/>
<text text-anchor="start" x="4211.3" y="-1921.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="4459.64" y="-1922.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-1840.96 4200.3,-1900.96 4657.3,-1900.96 4657.3,-1840.96 4200.3,-1840.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-1840.96 4200.3,-1900.96 4657.3,-1900.96 4657.3,-1840.96 4200.3,-1840.96"/>
<text text-anchor="start" x="4211.3" y="-1861.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="4459.64" y="-1862.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-1780.96 4200.3,-1840.96 4657.3,-1840.96 4657.3,-1780.96 4200.3,-1780.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-1780.96 4200.3,-1840.96 4657.3,-1840.96 4657.3,-1780.96 4200.3,-1780.96"/>
<text text-anchor="start" x="4211.3" y="-1801.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="4491.6" y="-1802.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4200.3,-1720.96 4200.3,-1780.96 4657.3,-1780.96 4657.3,-1720.96 4200.3,-1720.96"/>
<polygon fill="none" stroke="#29235c" points="4200.3,-1720.96 4200.3,-1780.96 4657.3,-1780.96 4657.3,-1720.96 4200.3,-1720.96"/>
<text text-anchor="start" x="4268.74" y="-1742.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    file_id, dataset_id    </text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="4198.8,-1719.96 4198.8,-2501.96 4657.8,-2501.96 4657.8,-1719.96 4198.8,-1719.96"/>
</g>


<g id="edge24" class="edge">
<title>dataset:e->segment:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M539.08,-2077.96C713.45,-2077.96 542.66,-635.02 668.15,-513.96 764.75,-420.78 1740.61,-457.96 1874.83,-457.96 1874.83,-457.96 1874.83,-457.96 2747.22,-457.96 3311.32,-457.96 3615.03,-90.3 4016.12,-486.96 4058.01,-528.39 4045.38,-1488.42 4052.12,-1546.96 4071,-1710.96 4032.4,-2267.48 4189.19,-2290.24"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="4189.07,-2293.74 4199.3,-2290.96 4189.57,-2286.76 4189.07,-2293.74"/>
<text text-anchor="middle" x="4193.08" y="-2300.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="547.97" y="-2049.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>


<g id="edge6" class="edge">
<title>location:e->cluster:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M1261.59,-2077.96C1566.92,-2077.96 1244.43,-1458.12 1535.67,-1444.2"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="1535.91,-1447.69 1545.83,-1443.96 1535.75,-1440.7 1535.91,-1447.69"/>
<text text-anchor="middle" x="1552.05" y="-1453.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="1252.69" y="-2087.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="file" class="node">
<title>file</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="2746.22" cy="-1412.96" rx="365.65" ry="681.8"/>
<polygon fill="#1d71b8" stroke="transparent" points="2490.22,-1832.96 2490.22,-1892.96 3003.22,-1892.96 3003.22,-1832.96 2490.22,-1832.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1832.96 2490.22,-1892.96 3003.22,-1892.96 3003.22,-1832.96 2490.22,-1832.96"/>
<text text-anchor="start" x="2664.02" y="-1854.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       file       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1772.96 2490.22,-1832.96 3003.22,-1832.96 3003.22,-1772.96 2490.22,-1772.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1772.96 2490.22,-1832.96 3003.22,-1832.96 3003.22,-1772.96 2490.22,-1772.96"/>
<text text-anchor="start" x="2501.22" y="-1794.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="2526.11" y="-1794.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="2778.91" y="-1794.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1712.96 2490.22,-1772.96 3003.22,-1772.96 3003.22,-1712.96 2490.22,-1712.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1712.96 2490.22,-1772.96 3003.22,-1772.96 3003.22,-1712.96 2490.22,-1712.96"/>
<text text-anchor="start" x="2501.22" y="-1733.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">file_name    </text>
<text text-anchor="start" x="2722.03" y="-1734.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<text text-anchor="start" x="2953.12" y="-1734.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2962.02" y="-1734.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1652.96 2490.22,-1712.96 3003.22,-1712.96 3003.22,-1652.96 2490.22,-1652.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1652.96 2490.22,-1712.96 3003.22,-1712.96 3003.22,-1652.96 2490.22,-1652.96"/>
<text text-anchor="start" x="2501.22" y="-1673.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">xxh64_hash    </text>
<text text-anchor="start" x="2739.82" y="-1674.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(16)</text>
<text text-anchor="start" x="2953.12" y="-1674.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2962.02" y="-1674.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1592.96 2490.22,-1652.96 3003.22,-1652.96 3003.22,-1592.96 2490.22,-1592.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1592.96 2490.22,-1652.96 3003.22,-1652.96 3003.22,-1592.96 2490.22,-1592.96"/>
<text text-anchor="start" x="2501.22" y="-1613.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">location_id    </text>
<text text-anchor="start" x="2778.91" y="-1614.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1532.96 2490.22,-1592.96 3003.22,-1592.96 3003.22,-1532.96 2490.22,-1532.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1532.96 2490.22,-1592.96 3003.22,-1592.96 3003.22,-1532.96 2490.22,-1532.96"/>
<text text-anchor="start" x="2500.87" y="-1553.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">timestamp_local    </text>
<text text-anchor="start" x="2766.84" y="-1554.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<text text-anchor="start" x="2953.5" y="-1554.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2962.39" y="-1554.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1472.96 2490.22,-1532.96 3003.22,-1532.96 3003.22,-1472.96 2490.22,-1472.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1472.96 2490.22,-1532.96 3003.22,-1532.96 3003.22,-1472.96 2490.22,-1472.96"/>
<text text-anchor="start" x="2501.22" y="-1493.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">cluster_id    </text>
<text text-anchor="start" x="2778.91" y="-1494.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1412.96 2490.22,-1472.96 3003.22,-1472.96 3003.22,-1412.96 2490.22,-1412.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1412.96 2490.22,-1472.96 3003.22,-1472.96 3003.22,-1412.96 2490.22,-1412.96"/>
<text text-anchor="start" x="2501.22" y="-1433.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">duration    </text>
<text text-anchor="start" x="2745.13" y="-1434.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(7,3)</text>
<text text-anchor="start" x="2953.12" y="-1434.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2962.02" y="-1434.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1352.96 2490.22,-1412.96 3003.22,-1412.96 3003.22,-1352.96 2490.22,-1352.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1352.96 2490.22,-1412.96 3003.22,-1412.96 3003.22,-1352.96 2490.22,-1352.96"/>
<text text-anchor="start" x="2501.22" y="-1373.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">sample_rate    </text>
<text text-anchor="start" x="2810.91" y="-1374.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">INTEGER</text>
<text text-anchor="start" x="2953.12" y="-1374.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="2962.02" y="-1374.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1292.96 2490.22,-1352.96 3003.22,-1352.96 3003.22,-1292.96 2490.22,-1292.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1292.96 2490.22,-1352.96 3003.22,-1352.96 3003.22,-1292.96 2490.22,-1292.96"/>
<text text-anchor="start" x="2501.22" y="-1313.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">description    </text>
<text text-anchor="start" x="2761.12" y="-1314.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1232.96 2490.22,-1292.96 3003.22,-1292.96 3003.22,-1232.96 2490.22,-1232.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1232.96 2490.22,-1292.96 3003.22,-1292.96 3003.22,-1232.96 2490.22,-1232.96"/>
<text text-anchor="start" x="2501.22" y="-1253.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">maybe_solar_night    </text>
<text text-anchor="start" x="2837.52" y="-1254.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1172.96 2490.22,-1232.96 3003.22,-1232.96 3003.22,-1172.96 2490.22,-1172.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1172.96 2490.22,-1232.96 3003.22,-1232.96 3003.22,-1172.96 2490.22,-1172.96"/>
<text text-anchor="start" x="2501.22" y="-1193.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">maybe_civil_night    </text>
<text text-anchor="start" x="2837.52" y="-1194.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1112.96 2490.22,-1172.96 3003.22,-1172.96 3003.22,-1112.96 2490.22,-1112.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1112.96 2490.22,-1172.96 3003.22,-1172.96 3003.22,-1112.96 2490.22,-1112.96"/>
<text text-anchor="start" x="2501.22" y="-1133.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">moon_phase    </text>
<text text-anchor="start" x="2784.22" y="-1134.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(3,2)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-1052.96 2490.22,-1112.96 3003.22,-1112.96 3003.22,-1052.96 2490.22,-1052.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-1052.96 2490.22,-1112.96 3003.22,-1112.96 3003.22,-1052.96 2490.22,-1052.96"/>
<text text-anchor="start" x="2501.22" y="-1073.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="2805.56" y="-1074.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-992.96 2490.22,-1052.96 3003.22,-1052.96 3003.22,-992.96 2490.22,-992.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-992.96 2490.22,-1052.96 3003.22,-1052.96 3003.22,-992.96 2490.22,-992.96"/>
<text text-anchor="start" x="2501.22" y="-1013.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="2805.56" y="-1014.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="2490.22,-932.96 2490.22,-992.96 3003.22,-992.96 3003.22,-932.96 2490.22,-932.96"/>
<polygon fill="none" stroke="#29235c" points="2490.22,-932.96 2490.22,-992.96 3003.22,-992.96 3003.22,-932.96 2490.22,-932.96"/>
<text text-anchor="start" x="2501.22" y="-953.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="2837.52" y="-954.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="2488.72,-931.96 2488.72,-1893.96 3003.72,-1893.96 3003.72,-931.96 2488.72,-931.96"/>
</g>


<g id="edge10" class="edge">
<title>location:e->file:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M1261.59,-2077.96C1371.85,-2077.96 1310.53,-1930.33 1407.02,-1876.96 1589.74,-1775.89 2160.96,-1914.28 2344.64,-1814.96 2435.48,-1765.84 2385.96,-1632.24 2479.1,-1623.42"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="2479.38,-1626.91 2489.22,-1622.96 2479.07,-1619.92 2479.38,-1626.91"/>
<text text-anchor="middle" x="2482.99" y="-1632.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="1252.69" y="-2049.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="cyclic_recording_pattern" class="node">
<title>cyclic_recording_pattern</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="1019.59" cy="-1011.96" rx="351.36" ry="299.63"/>
<polygon fill="#1d71b8" stroke="transparent" points="773.59,-1161.96 773.59,-1221.96 1266.59,-1221.96 1266.59,-1161.96 773.59,-1161.96"/>
<polygon fill="none" stroke="#29235c" points="773.59,-1161.96 773.59,-1221.96 1266.59,-1221.96 1266.59,-1161.96 773.59,-1161.96"/>
<text text-anchor="start" x="784.47" y="-1183.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       cyclic_recording_pattern       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="773.59,-1101.96 773.59,-1161.96 1266.59,-1161.96 1266.59,-1101.96 773.59,-1101.96"/>
<polygon fill="none" stroke="#29235c" points="773.59,-1101.96 773.59,-1161.96 1266.59,-1161.96 1266.59,-1101.96 773.59,-1101.96"/>
<text text-anchor="start" x="784.59" y="-1123.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="809.48" y="-1123.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="1042.28" y="-1123.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="773.59,-1041.96 773.59,-1101.96 1266.59,-1101.96 1266.59,-1041.96 773.59,-1041.96"/>
<polygon fill="none" stroke="#29235c" points="773.59,-1041.96 773.59,-1101.96 1266.59,-1101.96 1266.59,-1041.96 773.59,-1041.96"/>
<text text-anchor="start" x="784.59" y="-1062.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">record_s    </text>
<text text-anchor="start" x="1074.28" y="-1063.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">INTEGER</text>
<text text-anchor="start" x="1216.49" y="-1063.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="1225.39" y="-1063.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="773.59,-981.96 773.59,-1041.96 1266.59,-1041.96 1266.59,-981.96 773.59,-981.96"/>
<polygon fill="none" stroke="#29235c" points="773.59,-981.96 773.59,-1041.96 1266.59,-1041.96 1266.59,-981.96 773.59,-981.96"/>
<text text-anchor="start" x="784.59" y="-1002.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">sleep_s    </text>
<text text-anchor="start" x="1074.28" y="-1003.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">INTEGER</text>
<text text-anchor="start" x="1216.49" y="-1003.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="1225.39" y="-1003.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="773.59,-921.96 773.59,-981.96 1266.59,-981.96 1266.59,-921.96 773.59,-921.96"/>
<polygon fill="none" stroke="#29235c" points="773.59,-921.96 773.59,-981.96 1266.59,-981.96 1266.59,-921.96 773.59,-921.96"/>
<text text-anchor="start" x="784.59" y="-942.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="1068.93" y="-943.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="773.59,-861.96 773.59,-921.96 1266.59,-921.96 1266.59,-861.96 773.59,-861.96"/>
<polygon fill="none" stroke="#29235c" points="773.59,-861.96 773.59,-921.96 1266.59,-921.96 1266.59,-861.96 773.59,-861.96"/>
<text text-anchor="start" x="784.59" y="-882.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="1068.93" y="-883.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="773.59,-801.96 773.59,-861.96 1266.59,-861.96 1266.59,-801.96 773.59,-801.96"/>
<polygon fill="none" stroke="#29235c" points="773.59,-801.96 773.59,-861.96 1266.59,-861.96 1266.59,-801.96 773.59,-801.96"/>
<text text-anchor="start" x="784.59" y="-822.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="1100.89" y="-823.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="772.09,-800.96 772.09,-1222.96 1267.09,-1222.96 1267.09,-800.96 772.09,-800.96"/>
</g>


<g id="edge8" class="edge">
<title>cyclic_recording_pattern:e->cluster:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M1267.59,-1131.96C1389.64,-1131.96 1418.77,-1086.55 1535.75,-1084.07"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="1535.87,-1087.57 1545.83,-1083.96 1535.79,-1080.57 1535.87,-1087.57"/>
<text text-anchor="middle" x="1552.05" y="-1093.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="1258.69" y="-1141.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>


<g id="edge12" class="edge">
<title>cluster:e->file:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M2206.83,-1563.96C2331.72,-1563.96 2359.34,-1506.25 2478.91,-1503.09"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="2479.26,-1506.59 2489.22,-1502.96 2479.17,-1499.59 2479.26,-1506.59"/>
<text text-anchor="middle" x="2482.99" y="-1512.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="2215.72" y="-1573.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="moth_metadata" class="node">
<title>moth_metadata</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="3581.95" cy="-950.96" rx="308.1" ry="427.19"/>
<polygon fill="#1d71b8" stroke="transparent" points="3365.95,-1190.96 3365.95,-1250.96 3797.95,-1250.96 3797.95,-1190.96 3365.95,-1190.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1190.96 3365.95,-1250.96 3797.95,-1250.96 3797.95,-1190.96 3365.95,-1190.96"/>
<text text-anchor="start" x="3408.56" y="-1212.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       moth_metadata       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-1130.96 3365.95,-1190.96 3797.95,-1190.96 3797.95,-1130.96 3365.95,-1130.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1130.96 3365.95,-1190.96 3797.95,-1190.96 3797.95,-1130.96 3365.95,-1130.96"/>
<text text-anchor="start" x="3376.95" y="-1152.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">file_id</text>
<text text-anchor="start" x="3460.51" y="-1152.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="3573.64" y="-1152.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-1070.96 3365.95,-1130.96 3797.95,-1130.96 3797.95,-1070.96 3365.95,-1070.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1070.96 3365.95,-1130.96 3797.95,-1130.96 3797.95,-1070.96 3365.95,-1070.96"/>
<text text-anchor="start" x="3376.95" y="-1091.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">timestamp    </text>
<text text-anchor="start" x="3561.2" y="-1092.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<text text-anchor="start" x="3747.86" y="-1092.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3756.75" y="-1092.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-1010.96 3365.95,-1070.96 3797.95,-1070.96 3797.95,-1010.96 3365.95,-1010.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1010.96 3365.95,-1070.96 3797.95,-1070.96 3797.95,-1010.96 3365.95,-1010.96"/>
<text text-anchor="start" x="3376.77" y="-1031.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">recorder_id    </text>
<text text-anchor="start" x="3573.8" y="-1032.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(16)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-950.96 3365.95,-1010.96 3797.95,-1010.96 3797.95,-950.96 3365.95,-950.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-950.96 3365.95,-1010.96 3797.95,-1010.96 3797.95,-950.96 3365.95,-950.96"/>
<text text-anchor="start" x="3376.95" y="-971.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">gain    </text>
<text text-anchor="start" x="3642.92" y="-972.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">gain_level</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-890.96 3365.95,-950.96 3797.95,-950.96 3797.95,-890.96 3365.95,-890.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-890.96 3365.95,-950.96 3797.95,-950.96 3797.95,-890.96 3365.95,-890.96"/>
<text text-anchor="start" x="3376.95" y="-911.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">battery_v    </text>
<text text-anchor="start" x="3578.96" y="-912.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(2,1)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-830.96 3365.95,-890.96 3797.95,-890.96 3797.95,-830.96 3365.95,-830.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-830.96 3365.95,-890.96 3797.95,-890.96 3797.95,-830.96 3365.95,-830.96"/>
<text text-anchor="start" x="3376.95" y="-851.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">temp_c    </text>
<text text-anchor="start" x="3578.96" y="-852.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(3,1)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-770.96 3365.95,-830.96 3797.95,-830.96 3797.95,-770.96 3365.95,-770.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-770.96 3365.95,-830.96 3797.95,-830.96 3797.95,-770.96 3365.95,-770.96"/>
<text text-anchor="start" x="3376.95" y="-791.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="3600.29" y="-792.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-710.96 3365.95,-770.96 3797.95,-770.96 3797.95,-710.96 3365.95,-710.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-710.96 3365.95,-770.96 3797.95,-770.96 3797.95,-710.96 3365.95,-710.96"/>
<text text-anchor="start" x="3376.93" y="-731.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="3600.62" y="-732.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-650.96 3365.95,-710.96 3797.95,-710.96 3797.95,-650.96 3365.95,-650.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-650.96 3365.95,-710.96 3797.95,-710.96 3797.95,-650.96 3365.95,-650.96"/>
<text text-anchor="start" x="3376.95" y="-671.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="3632.26" y="-672.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="3364.95,-649.96 3364.95,-1251.96 3798.95,-1251.96 3798.95,-649.96 3364.95,-649.96"/>
</g>


<g id="edge14" class="edge">
<title>file:e->moth_metadata:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M3004.22,-1802.96C3199.81,-1802.96 3045.74,-1553.82 3147.79,-1386.96 3218.62,-1271.14 3225.35,-1166.41 3354.6,-1161.17"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="3355.02,-1164.66 3364.95,-1160.96 3354.89,-1157.66 3355.02,-1164.66"/>
<text text-anchor="middle" x="3371.18" y="-1170.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="2995.32" y="-1812.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="file_metadata" class="node">
<title>file_metadata</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="3581.95" cy="-1652.96" rx="308.1" ry="257.27"/>
<polygon fill="#1d71b8" stroke="transparent" points="3365.95,-1772.96 3365.95,-1832.96 3797.95,-1832.96 3797.95,-1772.96 3365.95,-1772.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1772.96 3365.95,-1832.96 3797.95,-1832.96 3797.95,-1772.96 3365.95,-1772.96"/>
<text text-anchor="start" x="3423.68" y="-1794.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       file_metadata       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-1712.96 3365.95,-1772.96 3797.95,-1772.96 3797.95,-1712.96 3365.95,-1712.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1712.96 3365.95,-1772.96 3797.95,-1772.96 3797.95,-1712.96 3365.95,-1712.96"/>
<text text-anchor="start" x="3376.95" y="-1734.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">file_id</text>
<text text-anchor="start" x="3460.51" y="-1734.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="3573.64" y="-1734.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-1652.96 3365.95,-1712.96 3797.95,-1712.96 3797.95,-1652.96 3365.95,-1652.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1652.96 3365.95,-1712.96 3797.95,-1712.96 3797.95,-1652.96 3365.95,-1652.96"/>
<text text-anchor="start" x="3376.95" y="-1673.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">json    </text>
<text text-anchor="start" x="3701.62" y="-1674.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">JSON</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-1592.96 3365.95,-1652.96 3797.95,-1652.96 3797.95,-1592.96 3365.95,-1592.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1592.96 3365.95,-1652.96 3797.95,-1652.96 3797.95,-1592.96 3365.95,-1592.96"/>
<text text-anchor="start" x="3376.95" y="-1613.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="3600.29" y="-1614.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-1532.96 3365.95,-1592.96 3797.95,-1592.96 3797.95,-1532.96 3365.95,-1532.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1532.96 3365.95,-1592.96 3797.95,-1592.96 3797.95,-1532.96 3365.95,-1532.96"/>
<text text-anchor="start" x="3376.93" y="-1553.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="3600.62" y="-1554.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3365.95,-1472.96 3365.95,-1532.96 3797.95,-1532.96 3797.95,-1472.96 3365.95,-1472.96"/>
<polygon fill="none" stroke="#29235c" points="3365.95,-1472.96 3365.95,-1532.96 3797.95,-1532.96 3797.95,-1472.96 3365.95,-1472.96"/>
<text text-anchor="start" x="3376.95" y="-1493.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="3632.26" y="-1494.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="3364.95,-1471.96 3364.95,-1833.96 3798.95,-1833.96 3798.95,-1471.96 3364.95,-1471.96"/>
</g>


<g id="edge16" class="edge">
<title>file:e->file_metadata:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M3004.22,-1802.96C3163.25,-1802.96 3200.83,-1745.51 3354.65,-1743.04"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="3354.98,-1746.54 3364.95,-1742.96 3354.93,-1739.54 3354.98,-1746.54"/>
<text text-anchor="middle" x="3371.18" y="-1752.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="3013.11" y="-1812.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>


<g id="edge18" class="edge">
<title>file:e->file_dataset:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M3004.22,-1802.96C3131.36,-1802.96 3217.71,-2251.35 3342.71,-2274.99"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="3342.67,-2278.5 3352.95,-2275.96 3343.33,-2271.53 3342.67,-2278.5"/>
<text text-anchor="middle" x="3359.18" y="-2285.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="2995.32" y="-1774.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>


<g id="edge22" class="edge">
<title>file:e->segment:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M3004.22,-1802.96C3292.21,-1802.96 2933.16,-706.99 3147.79,-514.96 3183.74,-482.79 3981.42,-481.45 4016.12,-514.96 4057.38,-554.81 4045.94,-1489.93 4052.12,-1546.96 4071.32,-1724.1 4019.69,-2327.38 4189.18,-2350.29"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="4189.09,-2353.79 4199.3,-2350.96 4189.55,-2346.81 4189.09,-2353.79"/>
<text text-anchor="middle" x="4193.08" y="-2360.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="3013.11" y="-1774.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="edge46" class="edge">
<title>moth_metadata:e->gain_level:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M3798.95,-980.96C4092.46,-980.96 3988.79,-1430.96 4282.3,-1430.96"/>
</g>


<g id="edge26" class="edge">
<title>file_dataset:e->segment:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M3811.95,-2035.96C3916.54,-2035.96 3931.69,-1980.68 4016.12,-1918.96 4102.42,-1855.86 4089.78,-1757.55 4189.01,-1751.28"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="4189.41,-1754.77 4199.3,-1750.96 4189.19,-1747.77 4189.41,-1754.77"/>
<text text-anchor="middle" x="4193.08" y="-1760.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="3803.06" y="-2045.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="label" class="node">
<title>label</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="5178.48" cy="-2230.96" rx="337.99" ry="384.83"/>
<polygon fill="#1d71b8" stroke="transparent" points="4941.48,-2440.96 4941.48,-2500.96 5415.48,-2500.96 5415.48,-2440.96 4941.48,-2440.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-2440.96 4941.48,-2500.96 5415.48,-2500.96 5415.48,-2440.96 4941.48,-2440.96"/>
<text text-anchor="start" x="5082.44" y="-2462.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       label       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4941.48,-2380.96 4941.48,-2440.96 5415.48,-2440.96 5415.48,-2380.96 4941.48,-2380.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-2380.96 4941.48,-2440.96 5415.48,-2440.96 5415.48,-2380.96 4941.48,-2380.96"/>
<text text-anchor="start" x="4952.48" y="-2402.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="4977.36" y="-2402.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="5191.17" y="-2402.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4941.48,-2320.96 4941.48,-2380.96 5415.48,-2380.96 5415.48,-2320.96 4941.48,-2320.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-2320.96 4941.48,-2380.96 5415.48,-2380.96 5415.48,-2320.96 4941.48,-2320.96"/>
<text text-anchor="start" x="4952" y="-2341.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">segment_id    </text>
<text text-anchor="start" x="5152.28" y="-2342.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<text text-anchor="start" x="5365.58" y="-2342.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="5374.48" y="-2342.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4941.48,-2260.96 4941.48,-2320.96 5415.48,-2320.96 5415.48,-2260.96 4941.48,-2260.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-2260.96 4941.48,-2320.96 5415.48,-2320.96 5415.48,-2260.96 4941.48,-2260.96"/>
<text text-anchor="start" x="4952.48" y="-2281.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">species_id    </text>
<text text-anchor="start" x="5152.08" y="-2282.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="5365.38" y="-2282.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="5374.28" y="-2282.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4941.48,-2200.96 4941.48,-2260.96 5415.48,-2260.96 5415.48,-2200.96 4941.48,-2200.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-2200.96 4941.48,-2260.96 5415.48,-2260.96 5415.48,-2200.96 4941.48,-2200.96"/>
<text text-anchor="start" x="4952.48" y="-2221.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">filter_id    </text>
<text text-anchor="start" x="5152.08" y="-2222.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="5365.38" y="-2222.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="5374.28" y="-2222.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4941.48,-2140.96 4941.48,-2200.96 5415.48,-2200.96 5415.48,-2140.96 4941.48,-2140.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-2140.96 4941.48,-2200.96 5415.48,-2200.96 5415.48,-2140.96 4941.48,-2140.96"/>
<text text-anchor="start" x="4952.48" y="-2161.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">certainty    </text>
<text text-anchor="start" x="5196.48" y="-2162.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(5,2)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4941.48,-2080.96 4941.48,-2140.96 5415.48,-2140.96 5415.48,-2080.96 4941.48,-2080.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-2080.96 4941.48,-2140.96 5415.48,-2140.96 5415.48,-2080.96 4941.48,-2080.96"/>
<text text-anchor="start" x="4952.48" y="-2101.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="5217.82" y="-2102.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4941.48,-2020.96 4941.48,-2080.96 5415.48,-2080.96 5415.48,-2020.96 4941.48,-2020.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-2020.96 4941.48,-2080.96 5415.48,-2080.96 5415.48,-2020.96 4941.48,-2020.96"/>
<text text-anchor="start" x="4952.48" y="-2041.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="5217.82" y="-2042.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4941.48,-1960.96 4941.48,-2020.96 5415.48,-2020.96 5415.48,-1960.96 4941.48,-1960.96"/>
<polygon fill="none" stroke="#29235c" points="4941.48,-1960.96 4941.48,-2020.96 5415.48,-2020.96 5415.48,-1960.96 4941.48,-1960.96"/>
<text text-anchor="start" x="4952.48" y="-1981.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="5210.69" y="-1982.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<text text-anchor="start" x="5365.38" y="-1982.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="5374.28" y="-1982.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="4940.48,-1959.96 4940.48,-2501.96 5416.48,-2501.96 5416.48,-1959.96 4940.48,-1959.96"/>
</g>


<g id="edge32" class="edge">
<title>segment:e->label:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M4658.3,-2410.96C4783.01,-2410.96 4810.77,-2354.2 4930.18,-2351.09"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="4930.52,-2354.59 4940.48,-2350.96 4930.43,-2347.59 4930.52,-2354.59"/>
<text text-anchor="middle" x="4934.25" y="-2360.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="4649.4" y="-2420.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="ebird_taxonomy" class="node">
<title>ebird_taxonomy</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="3581.95" cy="-3260.96" rx="434.33" ry="724.15"/>
<polygon fill="#1d71b8" stroke="transparent" points="3276.95,-3710.96 3276.95,-3770.96 3886.95,-3770.96 3886.95,-3710.96 3276.95,-3710.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3710.96 3276.95,-3770.96 3886.95,-3770.96 3886.95,-3710.96 3276.95,-3710.96"/>
<text text-anchor="start" x="3405.9" y="-3732.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       ebird_taxonomy       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3650.96 3276.95,-3710.96 3886.95,-3710.96 3886.95,-3650.96 3276.95,-3650.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3650.96 3276.95,-3710.96 3886.95,-3710.96 3886.95,-3650.96 3276.95,-3650.96"/>
<text text-anchor="start" x="3287.95" y="-3672.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="3312.84" y="-3672.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="3662.64" y="-3672.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3590.96 3276.95,-3650.96 3886.95,-3650.96 3886.95,-3590.96 3276.95,-3590.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3590.96 3276.95,-3650.96 3886.95,-3650.96 3886.95,-3590.96 3276.95,-3590.96"/>
<text text-anchor="start" x="3287.95" y="-3611.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">taxonomy_version    </text>
<text text-anchor="start" x="3641.34" y="-3612.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(4)</text>
<text text-anchor="start" x="3836.86" y="-3612.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3845.75" y="-3612.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3530.96 3276.95,-3590.96 3886.95,-3590.96 3886.95,-3530.96 3276.95,-3530.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3530.96 3276.95,-3590.96 3886.95,-3590.96 3886.95,-3530.96 3276.95,-3530.96"/>
<text text-anchor="start" x="3287.95" y="-3551.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">taxon_order    </text>
<text text-anchor="start" x="3694.64" y="-3552.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">INTEGER</text>
<text text-anchor="start" x="3836.86" y="-3552.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3845.75" y="-3552.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3470.96 3276.95,-3530.96 3886.95,-3530.96 3886.95,-3470.96 3276.95,-3470.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3470.96 3276.95,-3530.96 3886.95,-3530.96 3886.95,-3470.96 3276.95,-3470.96"/>
<text text-anchor="start" x="3287.95" y="-3491.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">category    </text>
<text text-anchor="start" x="3623.55" y="-3492.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(15)</text>
<text text-anchor="start" x="3836.86" y="-3492.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3845.75" y="-3492.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3410.96 3276.95,-3470.96 3886.95,-3470.96 3886.95,-3410.96 3276.95,-3410.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3410.96 3276.95,-3470.96 3886.95,-3470.96 3886.95,-3410.96 3276.95,-3410.96"/>
<text text-anchor="start" x="3287.95" y="-3431.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">species_code    </text>
<text text-anchor="start" x="3623.55" y="-3432.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(15)</text>
<text text-anchor="start" x="3836.86" y="-3432.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3845.75" y="-3432.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3350.96 3276.95,-3410.96 3886.95,-3410.96 3886.95,-3350.96 3276.95,-3350.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3350.96 3276.95,-3410.96 3886.95,-3410.96 3886.95,-3350.96 3276.95,-3350.96"/>
<text text-anchor="start" x="3287.95" y="-3371.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">taxon_concept_id    </text>
<text text-anchor="start" x="3662.64" y="-3372.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(15)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3290.96 3276.95,-3350.96 3886.95,-3350.96 3886.95,-3290.96 3276.95,-3290.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3290.96 3276.95,-3350.96 3886.95,-3350.96 3886.95,-3290.96 3276.95,-3290.96"/>
<text text-anchor="start" x="3287.83" y="-3311.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">primary_com_name    </text>
<text text-anchor="start" x="3605.86" y="-3312.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(100)</text>
<text text-anchor="start" x="3836.96" y="-3312.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3845.85" y="-3312.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3230.96 3276.95,-3290.96 3886.95,-3290.96 3886.95,-3230.96 3276.95,-3230.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3230.96 3276.95,-3290.96 3886.95,-3290.96 3886.95,-3230.96 3276.95,-3230.96"/>
<text text-anchor="start" x="3287.95" y="-3251.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">sci_name    </text>
<text text-anchor="start" x="3605.76" y="-3252.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(100)</text>
<text text-anchor="start" x="3836.86" y="-3252.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3845.75" y="-3252.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3170.96 3276.95,-3230.96 3886.95,-3230.96 3886.95,-3170.96 3276.95,-3170.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3170.96 3276.95,-3230.96 3886.95,-3230.96 3886.95,-3170.96 3276.95,-3170.96"/>
<text text-anchor="start" x="3287.95" y="-3191.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">bird_order    </text>
<text text-anchor="start" x="3662.64" y="-3192.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(30)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3110.96 3276.95,-3170.96 3886.95,-3170.96 3886.95,-3110.96 3276.95,-3110.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3110.96 3276.95,-3170.96 3886.95,-3170.96 3886.95,-3110.96 3276.95,-3110.96"/>
<text text-anchor="start" x="3287.95" y="-3131.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">family    </text>
<text text-anchor="start" x="3644.86" y="-3132.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(100)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-3050.96 3276.95,-3110.96 3886.95,-3110.96 3886.95,-3050.96 3276.95,-3050.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-3050.96 3276.95,-3110.96 3886.95,-3110.96 3886.95,-3050.96 3276.95,-3050.96"/>
<text text-anchor="start" x="3287.95" y="-3071.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">species_group    </text>
<text text-anchor="start" x="3644.86" y="-3072.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(100)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-2990.96 3276.95,-3050.96 3886.95,-3050.96 3886.95,-2990.96 3276.95,-2990.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-2990.96 3276.95,-3050.96 3886.95,-3050.96 3886.95,-2990.96 3276.95,-2990.96"/>
<text text-anchor="start" x="3287.95" y="-3011.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">report_as    </text>
<text text-anchor="start" x="3662.64" y="-3012.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(15)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-2930.96 3276.95,-2990.96 3886.95,-2990.96 3886.95,-2930.96 3276.95,-2930.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-2930.96 3276.95,-2990.96 3886.95,-2990.96 3886.95,-2930.96 3276.95,-2930.96"/>
<text text-anchor="start" x="3287.95" y="-2951.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">valid_from    </text>
<text text-anchor="start" x="3751.53" y="-2952.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DATE</text>
<text text-anchor="start" x="3836.86" y="-2952.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="3845.75" y="-2952.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-2870.96 3276.95,-2930.96 3886.95,-2930.96 3886.95,-2870.96 3276.95,-2870.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-2870.96 3276.95,-2930.96 3886.95,-2930.96 3886.95,-2870.96 3276.95,-2870.96"/>
<text text-anchor="start" x="3287.95" y="-2891.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">valid_to    </text>
<text text-anchor="start" x="3790.62" y="-2892.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DATE</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-2810.96 3276.95,-2870.96 3886.95,-2870.96 3886.95,-2810.96 3276.95,-2810.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-2810.96 3276.95,-2870.96 3886.95,-2870.96 3886.95,-2810.96 3276.95,-2810.96"/>
<text text-anchor="start" x="3287.95" y="-2831.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="3721.26" y="-2832.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="#e7e2dd" stroke="transparent" points="3276.95,-2750.96 3276.95,-2810.96 3886.95,-2810.96 3886.95,-2750.96 3276.95,-2750.96"/>
<polygon fill="none" stroke="#29235c" points="3276.95,-2750.96 3276.95,-2810.96 3886.95,-2810.96 3886.95,-2750.96 3276.95,-2750.96"/>
<text text-anchor="start" x="3309.87" y="-2772.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    taxonomy_version, species_code    </text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="3275.95,-2749.96 3275.95,-3771.96 3887.95,-3771.96 3887.95,-2749.96 3275.95,-2749.96"/>
</g>

<g id="species" class="node">
<title>species</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="4428.3" cy="-3109.96" rx="376.36" ry="427.19"/>
<polygon fill="#1d71b8" stroke="transparent" points="4164.3,-3349.96 4164.3,-3409.96 4692.3,-3409.96 4692.3,-3349.96 4164.3,-3349.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-3349.96 4164.3,-3409.96 4692.3,-3409.96 4692.3,-3349.96 4164.3,-3349.96"/>
<text text-anchor="start" x="4311.81" y="-3371.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       species       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-3289.96 4164.3,-3349.96 4692.3,-3349.96 4692.3,-3289.96 4164.3,-3289.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-3289.96 4164.3,-3349.96 4692.3,-3349.96 4692.3,-3289.96 4164.3,-3289.96"/>
<text text-anchor="start" x="4175.3" y="-3311.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="4200.19" y="-3311.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="4467.99" y="-3311.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-3229.96 4164.3,-3289.96 4692.3,-3289.96 4692.3,-3229.96 4164.3,-3229.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-3229.96 4164.3,-3289.96 4692.3,-3289.96 4692.3,-3229.96 4164.3,-3229.96"/>
<text text-anchor="start" x="4175.3" y="-3250.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">label    </text>
<text text-anchor="start" x="4411.11" y="-3251.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(100)</text>
<text text-anchor="start" x="4642.21" y="-3251.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="4651.1" y="-3251.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-3169.96 4164.3,-3229.96 4692.3,-3229.96 4692.3,-3169.96 4164.3,-3169.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-3169.96 4164.3,-3229.96 4692.3,-3229.96 4692.3,-3169.96 4164.3,-3169.96"/>
<text text-anchor="start" x="4175.3" y="-3190.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">ebird_code    </text>
<text text-anchor="start" x="4467.99" y="-3191.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-3109.96 4164.3,-3169.96 4692.3,-3169.96 4692.3,-3109.96 4164.3,-3109.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-3109.96 4164.3,-3169.96 4692.3,-3169.96 4692.3,-3109.96 4164.3,-3109.96"/>
<text text-anchor="start" x="4175.3" y="-3130.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">taxonomy_version    </text>
<text text-anchor="start" x="4485.78" y="-3131.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(4)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-3049.96 4164.3,-3109.96 4692.3,-3109.96 4692.3,-3049.96 4164.3,-3049.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-3049.96 4164.3,-3109.96 4692.3,-3109.96 4692.3,-3049.96 4164.3,-3049.96"/>
<text text-anchor="start" x="4175.3" y="-3070.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">description    </text>
<text text-anchor="start" x="4450.2" y="-3071.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-2989.96 4164.3,-3049.96 4692.3,-3049.96 4692.3,-2989.96 4164.3,-2989.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-2989.96 4164.3,-3049.96 4692.3,-3049.96 4692.3,-2989.96 4164.3,-2989.96"/>
<text text-anchor="start" x="4175.3" y="-3010.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="4494.64" y="-3011.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-2929.96 4164.3,-2989.96 4692.3,-2989.96 4692.3,-2929.96 4164.3,-2929.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-2929.96 4164.3,-2989.96 4692.3,-2989.96 4692.3,-2929.96 4164.3,-2929.96"/>
<text text-anchor="start" x="4175.3" y="-2950.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="4494.64" y="-2951.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-2869.96 4164.3,-2929.96 4692.3,-2929.96 4692.3,-2869.96 4164.3,-2869.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-2869.96 4164.3,-2929.96 4692.3,-2929.96 4692.3,-2869.96 4164.3,-2869.96"/>
<text text-anchor="start" x="4175.3" y="-2890.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="4526.6" y="-2891.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4164.3,-2809.96 4164.3,-2869.96 4692.3,-2869.96 4692.3,-2809.96 4164.3,-2809.96"/>
<polygon fill="none" stroke="#29235c" points="4164.3,-2809.96 4164.3,-2869.96 4692.3,-2869.96 4692.3,-2809.96 4164.3,-2809.96"/>
<text text-anchor="start" x="4174.89" y="-2831.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#1d71b8">    ebird_code, taxonomy_version    </text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="4163.3,-2808.96 4163.3,-3410.96 4693.3,-3410.96 4693.3,-2808.96 4163.3,-2808.96"/>
</g>


<g id="edge28" class="edge">
<title>ebird_taxonomy:e->species:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M3887.95,-2780.96C4009.68,-2780.96 4036.7,-2836.78 4153.25,-2839.83"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="4153.25,-2843.33 4163.3,-2839.96 4153.34,-2836.33 4153.25,-2843.33"/>
<text text-anchor="middle" x="4157.08" y="-2849.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="3879.06" y="-2790.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="call_type" class="node">
<title>call_type</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="5178.48" cy="-3111.96" rx="328.2" ry="299.63"/>
<polygon fill="#1d71b8" stroke="transparent" points="4948.48,-3261.96 4948.48,-3321.96 5408.48,-3321.96 5408.48,-3261.96 4948.48,-3261.96"/>
<polygon fill="none" stroke="#29235c" points="4948.48,-3261.96 4948.48,-3321.96 5408.48,-3321.96 5408.48,-3261.96 4948.48,-3261.96"/>
<text text-anchor="start" x="5053.1" y="-3283.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       call_type       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4948.48,-3201.96 4948.48,-3261.96 5408.48,-3261.96 5408.48,-3201.96 4948.48,-3201.96"/>
<polygon fill="none" stroke="#29235c" points="4948.48,-3201.96 4948.48,-3261.96 5408.48,-3261.96 5408.48,-3201.96 4948.48,-3201.96"/>
<text text-anchor="start" x="4959.48" y="-3223.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="4984.36" y="-3223.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="5184.17" y="-3223.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4948.48,-3141.96 4948.48,-3201.96 5408.48,-3201.96 5408.48,-3141.96 4948.48,-3141.96"/>
<polygon fill="none" stroke="#29235c" points="4948.48,-3141.96 4948.48,-3201.96 5408.48,-3201.96 5408.48,-3141.96 4948.48,-3141.96"/>
<text text-anchor="start" x="4959.12" y="-3162.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">species_id    </text>
<text text-anchor="start" x="5145.28" y="-3163.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="5358.58" y="-3163.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="5367.48" y="-3163.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4948.48,-3081.96 4948.48,-3141.96 5408.48,-3141.96 5408.48,-3081.96 4948.48,-3081.96"/>
<polygon fill="none" stroke="#29235c" points="4948.48,-3081.96 4948.48,-3141.96 5408.48,-3141.96 5408.48,-3081.96 4948.48,-3081.96"/>
<text text-anchor="start" x="4959.48" y="-3102.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">label    </text>
<text text-anchor="start" x="5127.29" y="-3103.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(100)</text>
<text text-anchor="start" x="5358.38" y="-3103.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="5367.28" y="-3103.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4948.48,-3021.96 4948.48,-3081.96 5408.48,-3081.96 5408.48,-3021.96 4948.48,-3021.96"/>
<polygon fill="none" stroke="#29235c" points="4948.48,-3021.96 4948.48,-3081.96 5408.48,-3081.96 5408.48,-3021.96 4948.48,-3021.96"/>
<text text-anchor="start" x="4959.48" y="-3042.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="5210.82" y="-3043.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4948.48,-2961.96 4948.48,-3021.96 5408.48,-3021.96 5408.48,-2961.96 4948.48,-2961.96"/>
<polygon fill="none" stroke="#29235c" points="4948.48,-2961.96 4948.48,-3021.96 5408.48,-3021.96 5408.48,-2961.96 4948.48,-2961.96"/>
<text text-anchor="start" x="4959.48" y="-2982.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="5210.82" y="-2983.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4948.48,-2901.96 4948.48,-2961.96 5408.48,-2961.96 5408.48,-2901.96 4948.48,-2901.96"/>
<polygon fill="none" stroke="#29235c" points="4948.48,-2901.96 4948.48,-2961.96 5408.48,-2961.96 5408.48,-2901.96 4948.48,-2901.96"/>
<text text-anchor="start" x="4959.48" y="-2922.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="5242.78" y="-2923.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="4947.48,-2900.96 4947.48,-3322.96 5409.48,-3322.96 5409.48,-2900.96 4947.48,-2900.96"/>
</g>


<g id="edge30" class="edge">
<title>species:e->call_type:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M4693.3,-3319.96C4820.57,-3319.96 4817.04,-3179.66 4937.42,-3172.26"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="4937.59,-3175.76 4947.48,-3171.96 4937.37,-3168.76 4937.59,-3175.76"/>
<text text-anchor="middle" x="4941.25" y="-3181.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="4684.4" y="-3329.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>


<g id="edge34" class="edge">
<title>species:e->label:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M4693.3,-3319.96C4925.02,-3319.96 4711.74,-2320.89 4930.43,-2291.62"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="4930.72,-2295.1 4940.48,-2290.96 4930.27,-2288.12 4930.72,-2295.1"/>
<text text-anchor="middle" x="4934.25" y="-2300.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="4702.19" y="-3329.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="label_subtype" class="node">
<title>label_subtype</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="5880.57" cy="-2963.96" rx="328.2" ry="384.83"/>
<polygon fill="#1d71b8" stroke="transparent" points="5650.57,-3173.96 5650.57,-3233.96 6110.57,-3233.96 6110.57,-3173.96 5650.57,-3173.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-3173.96 5650.57,-3233.96 6110.57,-3233.96 6110.57,-3173.96 5650.57,-3173.96"/>
<text text-anchor="start" x="5719.62" y="-3195.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       label_subtype       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="5650.57,-3113.96 5650.57,-3173.96 6110.57,-3173.96 6110.57,-3113.96 5650.57,-3113.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-3113.96 5650.57,-3173.96 6110.57,-3173.96 6110.57,-3113.96 5650.57,-3113.96"/>
<text text-anchor="start" x="5661.57" y="-3135.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="5686.46" y="-3135.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="5886.26" y="-3135.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5650.57,-3053.96 5650.57,-3113.96 6110.57,-3113.96 6110.57,-3053.96 5650.57,-3053.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-3053.96 5650.57,-3113.96 6110.57,-3113.96 6110.57,-3053.96 5650.57,-3053.96"/>
<text text-anchor="start" x="5661.57" y="-3074.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">label_id    </text>
<text text-anchor="start" x="5847.17" y="-3075.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<text text-anchor="start" x="6060.48" y="-3075.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="6069.37" y="-3075.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5650.57,-2993.96 5650.57,-3053.96 6110.57,-3053.96 6110.57,-2993.96 5650.57,-2993.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-2993.96 5650.57,-3053.96 6110.57,-3053.96 6110.57,-2993.96 5650.57,-2993.96"/>
<text text-anchor="start" x="5661.21" y="-3014.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">calltype_id    </text>
<text text-anchor="start" x="5847.37" y="-3015.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<text text-anchor="start" x="6060.68" y="-3015.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="6069.57" y="-3015.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5650.57,-2933.96 5650.57,-2993.96 6110.57,-2993.96 6110.57,-2933.96 5650.57,-2933.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-2933.96 5650.57,-2993.96 6110.57,-2993.96 6110.57,-2933.96 5650.57,-2933.96"/>
<text text-anchor="start" x="5661.57" y="-2954.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">filter_id    </text>
<text text-anchor="start" x="5886.26" y="-2955.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5650.57,-2873.96 5650.57,-2933.96 6110.57,-2933.96 6110.57,-2873.96 5650.57,-2873.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-2873.96 5650.57,-2933.96 6110.57,-2933.96 6110.57,-2873.96 5650.57,-2873.96"/>
<text text-anchor="start" x="5661.57" y="-2894.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">certainty    </text>
<text text-anchor="start" x="5891.58" y="-2895.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">DECIMAL(5,2)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5650.57,-2813.96 5650.57,-2873.96 6110.57,-2873.96 6110.57,-2813.96 5650.57,-2813.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-2813.96 5650.57,-2873.96 6110.57,-2873.96 6110.57,-2813.96 5650.57,-2813.96"/>
<text text-anchor="start" x="5661.57" y="-2834.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="5912.91" y="-2835.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5650.57,-2753.96 5650.57,-2813.96 6110.57,-2813.96 6110.57,-2753.96 5650.57,-2753.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-2753.96 5650.57,-2813.96 6110.57,-2813.96 6110.57,-2753.96 5650.57,-2753.96"/>
<text text-anchor="start" x="5661.57" y="-2774.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="5912.91" y="-2775.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5650.57,-2693.96 5650.57,-2753.96 6110.57,-2753.96 6110.57,-2693.96 5650.57,-2693.96"/>
<polygon fill="none" stroke="#29235c" points="5650.57,-2693.96 5650.57,-2753.96 6110.57,-2753.96 6110.57,-2693.96 5650.57,-2693.96"/>
<text text-anchor="start" x="5661.57" y="-2714.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="5905.78" y="-2715.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<text text-anchor="start" x="6060.48" y="-2715.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="6069.37" y="-2715.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="5649.57,-2692.96 5649.57,-3234.96 6111.57,-3234.96 6111.57,-2692.96 5649.57,-2692.96"/>
</g>


<g id="edge42" class="edge">
<title>call_type:e->label_subtype:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M5409.48,-3231.96C5547.21,-3231.96 5510.34,-3033.99 5639.55,-3024.33"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="5639.7,-3027.82 5649.57,-3023.96 5639.45,-3020.83 5639.7,-3027.82"/>
<text text-anchor="middle" x="5643.35" y="-3033.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="5400.58" y="-3241.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="filter" class="node">
<title>filter</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="4428.3" cy="-705.96" rx="316.15" ry="299.63"/>
<polygon fill="#1d71b8" stroke="transparent" points="4207.3,-855.96 4207.3,-915.96 4650.3,-915.96 4650.3,-855.96 4207.3,-855.96"/>
<polygon fill="none" stroke="#29235c" points="4207.3,-855.96 4207.3,-915.96 4650.3,-915.96 4650.3,-855.96 4207.3,-855.96"/>
<text text-anchor="start" x="4336.33" y="-877.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       filter       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="4207.3,-795.96 4207.3,-855.96 4650.3,-855.96 4650.3,-795.96 4207.3,-795.96"/>
<polygon fill="none" stroke="#29235c" points="4207.3,-795.96 4207.3,-855.96 4650.3,-855.96 4650.3,-795.96 4207.3,-795.96"/>
<text text-anchor="start" x="4218.3" y="-817.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">id</text>
<text text-anchor="start" x="4243.19" y="-817.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="4425.99" y="-817.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(12)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4207.3,-735.96 4207.3,-795.96 4650.3,-795.96 4650.3,-735.96 4207.3,-735.96"/>
<polygon fill="none" stroke="#29235c" points="4207.3,-735.96 4207.3,-795.96 4650.3,-795.96 4650.3,-735.96 4207.3,-735.96"/>
<text text-anchor="start" x="4218.3" y="-756.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">name    </text>
<text text-anchor="start" x="4369.11" y="-757.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(140)</text>
<text text-anchor="start" x="4600.21" y="-757.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="4609.1" y="-757.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4207.3,-675.96 4207.3,-735.96 4650.3,-735.96 4650.3,-675.96 4207.3,-675.96"/>
<polygon fill="none" stroke="#29235c" points="4207.3,-675.96 4207.3,-735.96 4650.3,-735.96 4650.3,-675.96 4207.3,-675.96"/>
<text text-anchor="start" x="4218.17" y="-696.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">description    </text>
<text text-anchor="start" x="4408.25" y="-697.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(255)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4207.3,-615.96 4207.3,-675.96 4650.3,-675.96 4650.3,-615.96 4207.3,-615.96"/>
<polygon fill="none" stroke="#29235c" points="4207.3,-615.96 4207.3,-675.96 4650.3,-675.96 4650.3,-615.96 4207.3,-615.96"/>
<text text-anchor="start" x="4218.3" y="-636.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="4452.64" y="-637.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4207.3,-555.96 4207.3,-615.96 4650.3,-615.96 4650.3,-555.96 4207.3,-555.96"/>
<polygon fill="none" stroke="#29235c" points="4207.3,-555.96 4207.3,-615.96 4650.3,-615.96 4650.3,-555.96 4207.3,-555.96"/>
<text text-anchor="start" x="4218.3" y="-576.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="4452.64" y="-577.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="4207.3,-495.96 4207.3,-555.96 4650.3,-555.96 4650.3,-495.96 4207.3,-495.96"/>
<polygon fill="none" stroke="#29235c" points="4207.3,-495.96 4207.3,-555.96 4650.3,-555.96 4650.3,-495.96 4207.3,-495.96"/>
<text text-anchor="start" x="4218.3" y="-516.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="4445.51" y="-517.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<text text-anchor="start" x="4600.21" y="-517.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c"> </text>
<text text-anchor="start" x="4609.1" y="-517.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">(!)</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="4205.8,-494.96 4205.8,-916.96 4650.8,-916.96 4650.8,-494.96 4205.8,-494.96"/>
</g>


<g id="edge36" class="edge">
<title>filter:e->label:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M4651.3,-825.96C4759.42,-825.96 4762.75,-915.21 4804.48,-1014.96 4908.09,-1262.62 4674.63,-2206.57 4930.37,-2230.5"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="4930.33,-2234 4940.48,-2230.96 4930.65,-2227.01 4930.33,-2234"/>
<text text-anchor="middle" x="4934.25" y="-2240.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="4642.4" y="-797.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>


<g id="edge44" class="edge">
<title>filter:e->label_subtype:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M4651.3,-825.96C5242.7,-825.96 5297.96,-1287.41 5516.47,-1836.96 5608.38,-2068.1 5403.13,-2939.67 5639.48,-2963.46"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="5639.41,-2966.96 5649.57,-2963.96 5639.75,-2959.97 5639.41,-2966.96"/>
<text text-anchor="middle" x="5643.35" y="-2973.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="4660.19" y="-797.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>

<g id="label_metadata" class="node">
<title>label_metadata</title>
<ellipse fill="none" stroke="black" stroke-width="0" cx="5880.57" cy="-2303.96" rx="308.1" ry="257.27"/>
<polygon fill="#1d71b8" stroke="transparent" points="5664.57,-2423.96 5664.57,-2483.96 6096.57,-2483.96 6096.57,-2423.96 5664.57,-2423.96"/>
<polygon fill="none" stroke="#29235c" points="5664.57,-2423.96 5664.57,-2483.96 6096.57,-2483.96 6096.57,-2423.96 5664.57,-2423.96"/>
<text text-anchor="start" x="5708.95" y="-2445.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#ffffff">       label_metadata       </text>
<polygon fill="#e7e2dd" stroke="transparent" points="5664.57,-2363.96 5664.57,-2423.96 6096.57,-2423.96 6096.57,-2363.96 5664.57,-2363.96"/>
<polygon fill="none" stroke="#29235c" points="5664.57,-2363.96 5664.57,-2423.96 6096.57,-2423.96 6096.57,-2363.96 5664.57,-2363.96"/>
<text text-anchor="start" x="5675.57" y="-2385.16" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="32.00" fill="#29235c">label_id</text>
<text text-anchor="start" x="5785.82" y="-2385.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">    </text>
<text text-anchor="start" x="5872.26" y="-2385.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">VARCHAR(21)</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5664.57,-2303.96 5664.57,-2363.96 6096.57,-2363.96 6096.57,-2303.96 5664.57,-2303.96"/>
<polygon fill="none" stroke="#29235c" points="5664.57,-2303.96 5664.57,-2363.96 6096.57,-2363.96 6096.57,-2303.96 5664.57,-2303.96"/>
<text text-anchor="start" x="5675.57" y="-2324.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">json    </text>
<text text-anchor="start" x="6000.24" y="-2325.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">JSON</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5664.57,-2243.96 5664.57,-2303.96 6096.57,-2303.96 6096.57,-2243.96 5664.57,-2243.96"/>
<polygon fill="none" stroke="#29235c" points="5664.57,-2243.96 5664.57,-2303.96 6096.57,-2303.96 6096.57,-2243.96 5664.57,-2243.96"/>
<text text-anchor="start" x="5675.57" y="-2264.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">created_at    </text>
<text text-anchor="start" x="5898.91" y="-2265.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5664.57,-2183.96 5664.57,-2243.96 6096.57,-2243.96 6096.57,-2183.96 5664.57,-2183.96"/>
<polygon fill="none" stroke="#29235c" points="5664.57,-2183.96 5664.57,-2243.96 6096.57,-2243.96 6096.57,-2183.96 5664.57,-2183.96"/>
<text text-anchor="start" x="5675.55" y="-2204.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">last_modified    </text>
<text text-anchor="start" x="5899.24" y="-2205.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">TIMESTAMP</text>
<polygon fill="#e7e2dd" stroke="transparent" points="5664.57,-2123.96 5664.57,-2183.96 6096.57,-2183.96 6096.57,-2123.96 5664.57,-2123.96"/>
<polygon fill="none" stroke="#29235c" points="5664.57,-2123.96 5664.57,-2183.96 6096.57,-2183.96 6096.57,-2123.96 5664.57,-2123.96"/>
<text text-anchor="start" x="5675.57" y="-2144.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">active    </text>
<text text-anchor="start" x="5930.87" y="-2145.16" font-family="Helvetica,sans-Serif" font-style="italic" font-size="32.00" fill="#29235c">BOOLEAN</text>
<polygon fill="none" stroke="#29235c" stroke-width="2" points="5663.57,-2122.96 5663.57,-2484.96 6097.57,-2484.96 6097.57,-2122.96 5663.57,-2122.96"/>
</g>


<g id="edge38" class="edge">
<title>label:e->label_metadata:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M5416.48,-2410.96C5523.11,-2410.96 5551.73,-2395.01 5653.49,-2394.01"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="5653.59,-2397.51 5663.57,-2393.96 5653.55,-2390.51 5653.59,-2397.51"/>
<text text-anchor="middle" x="5657.35" y="-2403.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="5407.58" y="-2420.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>


<g id="edge40" class="edge">
<title>label:e->label_subtype:w</title>
<path fill="none" stroke="#29235c" stroke-width="3" d="M5416.48,-2410.96C5729.46,-2410.96 5341.92,-3068.93 5639.17,-3083.71"/>
<polygon fill="#29235c" stroke="#29235c" stroke-width="3" points="5639.49,-3087.22 5649.57,-3083.96 5639.66,-3080.22 5639.49,-3087.22"/>
<text text-anchor="middle" x="5643.35" y="-3093.56" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">*</text>
<text text-anchor="middle" x="5407.58" y="-2382.16" font-family="Helvetica,sans-Serif" font-size="32.00" fill="#29235c">1</text>
</g>
</g>
</svg>
file addition: schema.sql (----------)

[0.790921]

-- NOTE: DBML does not like functions and materialised views
-- from this: $npm install -g @dbml/cli
-- sql2dbml schema.sql --postgres -o schema.dbml
-- from this: $npm install -g @softwaretechnik/dbml-renderer
-- dbml-renderer -i schema.dbml -o schema.svg

CREATE TYPE dataset_type AS ENUM ('structured', 'unstructured', 'test', 'train');

CREATE TABLE dataset (
id VARCHAR(12) PRIMARY KEY,
name VARCHAR(255) UNIQUE NOT NULL,
description VARCHAR(255),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
type dataset_type NOT NULL DEFAULT 'structured'
);

CREATE TABLE location (
id VARCHAR(12) PRIMARY KEY,
dataset_id VARCHAR(12) NOT NULL,
name VARCHAR(140) NOT NULL,
latitude DECIMAL(10, 7) NOT NULL CHECK (latitude BETWEEN -90.0 AND 90.0),
longitude DECIMAL(10, 7) NOT NULL CHECK (longitude BETWEEN -180.0 AND 180.0),
description VARCHAR(255),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
timezone_id VARCHAR(40) NOT NULL,
FOREIGN KEY (dataset_id) REFERENCES dataset(id),
UNIQUE (dataset_id, name)
);

CREATE TABLE cyclic_recording_pattern (
id VARCHAR(12) PRIMARY KEY,
record_s INTEGER NOT NULL,
sleep_s INTEGER NOT NULL,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
UNIQUE (record_s, sleep_s)
);

CREATE TABLE cluster (
id VARCHAR(12) PRIMARY KEY,
dataset_id VARCHAR(12) NOT NULL,
location_id VARCHAR(12) NOT NULL,
name VARCHAR(140) NOT NULL,
description VARCHAR(255),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
cyclic_recording_pattern_id VARCHAR(12),
sample_rate INTEGER NOT NULL,
path VARCHAR(255) NULL,
FOREIGN KEY (dataset_id) REFERENCES dataset(id),
FOREIGN KEY (location_id) REFERENCES location(id),
FOREIGN KEY (cyclic_recording_pattern_id) REFERENCES cyclic_recording_pattern(id),
UNIQUE (location_id, name)
);

CREATE TYPE gain_level AS ENUM ('low', 'low-medium', 'medium', 'medium-high', 'high');

CREATE TABLE file (
id VARCHAR(21) PRIMARY KEY,
file_name VARCHAR(255) NOT NULL,
xxh64_hash VARCHAR(16) UNIQUE NOT NULL,
location_id VARCHAR(12),
timestamp_local TIMESTAMP WITH TIME ZONE NOT NULL,
cluster_id VARCHAR(12),
duration DECIMAL(7, 3) NOT NULL CHECK (duration > 0),
sample_rate INTEGER NOT NULL,
description VARCHAR(255),
maybe_solar_night BOOLEAN,
maybe_civil_night BOOLEAN,
moon_phase DECIMAL(3,2) CHECK (moon_phase BETWEEN 0.00 AND 1.00),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (location_id) REFERENCES location(id),
FOREIGN KEY (cluster_id) REFERENCES cluster(id)
);

CREATE TABLE moth_metadata (
file_id VARCHAR(21) PRIMARY KEY,
timestamp TIMESTAMP WITH TIME ZONE NOT NULL,
recorder_id VARCHAR(16),
gain gain_level NULL,
battery_v DECIMAL(2, 1) CHECK (battery_v >= 0),
temp_c DECIMAL(3, 1),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (file_id) REFERENCES file(id)
);

CREATE TABLE file_metadata (
file_id VARCHAR(21) PRIMARY KEY,
json JSON,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (file_id) REFERENCES file(id)
);

CREATE TABLE file_dataset (
file_id VARCHAR(21) NOT NULL,
dataset_id VARCHAR(12) NOT NULL,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (file_id, dataset_id),
FOREIGN KEY (file_id) REFERENCES file(id),
FOREIGN KEY (dataset_id) REFERENCES dataset(id)
);

CREATE TABLE segment(
id VARCHAR(21) PRIMARY KEY,
file_id VARCHAR(21) NOT NULL,
dataset_id VARCHAR(12) NOT NULL,
start_time DECIMAL(7,3) NOT NULL,
end_time DECIMAL(7,3) NOT NULL,
freq_low DECIMAL(9,3) CHECK (freq_low < 300000),
freq_high DECIMAL(9,3) CHECK (freq_high < 300000),
description VARCHAR(255),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (file_id) REFERENCES file(id),
FOREIGN KEY (dataset_id) REFERENCES dataset(id),
FOREIGN KEY (file_id, dataset_id) REFERENCES file_dataset(file_id, dataset_id)
);

CREATE TABLE ebird_taxonomy (
id VARCHAR(12) PRIMARY KEY,
taxonomy_version VARCHAR(4) NOT NULL,
taxon_order INTEGER NOT NULL,
category VARCHAR(15) NOT NULL,
species_code VARCHAR(15) NOT NULL,
taxon_concept_id VARCHAR(15),
primary_com_name VARCHAR(100) NOT NULL,
sci_name VARCHAR(100) NOT NULL,
bird_order VARCHAR(30),
family VARCHAR(100),
species_group VARCHAR(100),
report_as VARCHAR(15),
valid_from DATE NOT NULL, -- Need to drop
valid_to DATE, -- Need to drop
active BOOLEAN DEFAULT TRUE,
UNIQUE (species_code, taxonomy_version)
);

CREATE TABLE species (
id VARCHAR(12) PRIMARY KEY,
label VARCHAR(100) UNIQUE NOT NULL,
ebird_code VARCHAR(12),
taxonomy_version VARCHAR(4),
description VARCHAR(255),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (ebird_code, taxonomy_version) REFERENCES ebird_taxonomy(species_code, taxonomy_version)
);

CREATE TABLE call_type (
id VARCHAR(12) PRIMARY KEY,
species_id VARCHAR(12) NOT NULL,
label VARCHAR(100) NOT NULL,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (species_id) REFERENCES species(id)
);

CREATE TABLE filter (
id VARCHAR(12) PRIMARY KEY,
name VARCHAR(140) NOT NULL,
description VARCHAR(255),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN NOT NULL DEFAULT true
);

CREATE TABLE label (
id VARCHAR(21) PRIMARY KEY,
segment_id VARCHAR(21) NOT NULL,
species_id VARCHAR(12) NOT NULL,
filter_id VARCHAR(12) NOT NULL,
certainty DECIMAL(5,2) CHECK (certainty <= 100 AND certainty >= 0),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN NOT NULL DEFAULT true,
FOREIGN KEY (segment_id) REFERENCES segment(id),
FOREIGN KEY (species_id) REFERENCES species(id),
FOREIGN KEY (filter_id) REFERENCES filter(id)
);

CREATE TABLE label_metadata (
label_id VARCHAR(21) PRIMARY KEY,
json JSON,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (label_id) REFERENCES label(id)
);

CREATE TABLE label_subtype (
id VARCHAR(21) PRIMARY KEY,
label_id VARCHAR(21) NOT NULL,
calltype_id VARCHAR(12) NOT NULL,
filter_id VARCHAR(12),
certainty DECIMAL(5,2) CHECK (certainty <= 100 AND certainty >= 0),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
active BOOLEAN NOT NULL DEFAULT true,
FOREIGN KEY (label_id) REFERENCES label(id),
FOREIGN KEY (calltype_id) REFERENCES call_type(id),
FOREIGN KEY (filter_id) REFERENCES filter(id)
);

-- FK indexes on file table (1.26M rows)
CREATE INDEX idx_file_location ON file(location_id);
CREATE INDEX idx_file_cluster ON file(cluster_id);
-- Performance index on file for time-based queries
CREATE INDEX idx_file_timestamp_local ON file(timestamp_local);
-- FK indexes on segment table (201K rows)
CREATE INDEX idx_segment_file ON segment(file_id);
CREATE INDEX idx_segment_dataset ON segment(dataset_id);
-- FK indexes on label table (200K rows)
CREATE INDEX idx_label_segment_id ON label(segment_id);
CREATE INDEX idx_label_species_id ON label(species_id);
-- FK indexes on label_subtype table (114K rows)
CREATE INDEX idx_label_subtype_label_id ON label_subtype(label_id);
CREATE INDEX idx_label_subtype_calltype_id ON label_subtype(calltype_id);
CREATE INDEX idx_label_subtype_filter_id ON label_subtype(filter_id);
-- FK lookup for ebird taxonomy (used by species table FK)
CREATE INDEX idx_ebird_taxonomy_species_code ON ebird_taxonomy(species_code, taxonomy_version);
-- Junction table reverse lookups
CREATE INDEX idx_file_dataset_dataset ON file_dataset(dataset_id);
file addition: schema.go (----------)

[0.790921]

package db

import (
"database/sql"
"embed"
"fmt"
"slices"
"strings"
)

//go:embed schema.sql
var schemaFS embed.FS

// ReadSchemaSQL reads the schema.sql file
// Uses embedded file for distributed binaries
func ReadSchemaSQL() (string, error) {
data, err := schemaFS.ReadFile("schema.sql")
if err != nil {
return "", fmt.Errorf("failed to read schema.sql: %w", err)
}
return string(data), nil
}

// DDLStatement represents a parsed DDL statement with metadata
type DDLStatement struct {
SQL string
Type string // "CREATE_TYPE", "CREATE_TABLE", "CREATE_INDEX", "CREATE_TABLE_AS"
TableName string // for CREATE TABLE and CREATE INDEX
}

// ExtractDDLStatements splits schema SQL into executable DDL statements
// Returns statements in order: types, tables, indexes
// Handles CREATE TABLE ... AS SELECT specially (marked but included)
func ExtractDDLStatements(schemaSQL string) []DDLStatement {
var statements []DDLStatement

// Split by semicolon, but handle the CREATE TABLE AS SELECT case
lines := strings.Split(schemaSQL, "\n")
var currentStmt strings.Builder

for _, line := range lines {
trimmed := strings.TrimSpace(line)

// Skip empty lines and comments
if trimmed == "" || strings.HasPrefix(trimmed, "--") {
continue
}

currentStmt.WriteString(line)
currentStmt.WriteString("\n")

// Statement ends at semicolon
if strings.HasSuffix(trimmed, ";") {
sql := strings.TrimSpace(currentStmt.String())
if sql != "" {
stmt := parseDDLStatement(sql)
statements = append(statements, stmt)
}
currentStmt.Reset()
}
}

// Handle any remaining statement without trailing semicolon
if currentStmt.Len() > 0 {
sql := strings.TrimSpace(currentStmt.String())
if sql != "" && strings.HasSuffix(sql, ";") {
stmt := parseDDLStatement(sql)
statements = append(statements, stmt)
}
}

return statements
}

// parseDDLStatement determines the type and table name of a DDL statement
func parseDDLStatement(sql string) DDLStatement {
upper := strings.ToUpper(sql)

switch {
case strings.HasPrefix(upper, "CREATE TYPE"):
return DDLStatement{SQL: sql, Type: "CREATE_TYPE", TableName: ""}

case strings.HasPrefix(upper, "CREATE TABLE"):
tableName := extractTableName(sql)
return DDLStatement{SQL: sql, Type: "CREATE_TABLE", TableName: tableName}

case strings.HasPrefix(upper, "CREATE INDEX") || strings.HasPrefix(upper, "CREATE UNIQUE INDEX"):
indexName := extractIndexName(sql)
return DDLStatement{SQL: sql, Type: "CREATE_INDEX", TableName: indexName}

default:
return DDLStatement{SQL: sql, Type: "UNKNOWN", TableName: ""}
}
}

// extractTableName extracts table name from CREATE TABLE statement
func extractTableName(sql string) string {
// CREATE TABLE name (
// or CREATE TABLE name(
upper := strings.ToUpper(sql)

// Find "CREATE TABLE"
idx := strings.Index(upper, "CREATE TABLE")
if idx == -1 {
return ""
}

// Move past "CREATE TABLE"
rest := sql[idx+12:]
rest = strings.TrimSpace(rest)

// Find opening parenthesis or end
endIdx := strings.Index(rest, "(")
if endIdx == -1 {
endIdx = len(rest)
}

name := strings.TrimSpace(rest[:endIdx])
return name
}

// extractIndexName extracts index name from CREATE INDEX statement
func extractIndexName(sql string) string {
upper := strings.ToUpper(sql)

// Handle "CREATE UNIQUE INDEX" or "CREATE INDEX"
var rest string
if strings.HasPrefix(upper, "CREATE UNIQUE INDEX") {
rest = sql[19:]
} else if strings.HasPrefix(upper, "CREATE INDEX") {
rest = sql[12:]
} else {
return ""
}

rest = strings.TrimSpace(rest)

// Find " ON "
onIdx := strings.Index(strings.ToUpper(rest), " ON ")
if onIdx == -1 {
return ""
}

name := strings.TrimSpace(rest[:onIdx])
return name
}

// FKRelation represents a foreign key relationship between tables
type FKRelation struct {
Table string // table that has the FK
Column string // FK column
ForeignTable string // referenced table
}

// GetFKOrder computes the order tables should be copied based on FK dependencies
// Tables with no FKs come first, then dependent tables in topological order
func GetFKOrder(db *sql.DB) ([]string, error) {
// Use DuckDB's duckdb_constraints() function for accurate FK info
query := `
SELECT table_name, referenced_table
FROM duckdb_constraints()
WHERE constraint_type = 'FOREIGN KEY'
AND referenced_table IS NOT NULL
`

rows, err := db.Query(query)
if err != nil {
return nil, fmt.Errorf("failed to query FK relationships: %w", err)
}
defer rows.Close()

// Build reverse dependency graph: table -> tables that depend on it
// dependsOnMe[A] = [B, C] means B and C have FKs to A
dependsOnMe := make(map[string][]string)
tables := make(map[string]bool)

for rows.Next() {
var table, foreignTable string
if err := rows.Scan(&table, &foreignTable); err != nil {
return nil, fmt.Errorf("failed to scan FK row: %w", err)
}

tables[table] = true
tables[foreignTable] = true

// foreignTable is referenced by table
dependsOnMe[foreignTable] = append(dependsOnMe[foreignTable], table)
}

if err := rows.Err(); err != nil {
return nil, fmt.Errorf("error iterating FK rows: %w", err)
}

// Get all tables from the database
tableRows, err := db.Query(`
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'main'
AND table_type = 'BASE TABLE'
`)
if err != nil {
return nil, fmt.Errorf("failed to query tables: %w", err)
}
defer tableRows.Close()

for tableRows.Next() {
var name string
if err := tableRows.Scan(&name); err != nil {
return nil, fmt.Errorf("failed to scan table name: %w", err)
}
tables[name] = true
}

// Count how many FKs each table has (tables it depends on)
fkCount := make(map[string]int)
for table := range tables {
fkCount[table] = 0
}
for _, dependents := range dependsOnMe {
for _, dependent := range dependents {
fkCount[dependent]++
}
}

// Topological sort (Kahn's algorithm)
// 1. Start with tables that have no FKs (fkCount = 0)
var queue []string
for table := range tables {
if fkCount[table] == 0 {
queue = append(queue, table)
}
}

// 2. Process queue
var result []string
for len(queue) > 0 {
// Pop first element
current := queue[0]
queue = queue[1:]
result = append(result, current)

// For each table that depends on current, decrease its FK count
for _, dependent := range dependsOnMe[current] {
fkCount[dependent]--
if fkCount[dependent] == 0 {
queue = append(queue, dependent)
}
}
}

// If result doesn't contain all tables, there's a cycle
if len(result) != len(tables) {
// Add remaining tables (cycle handling)
for table := range tables {
found := slices.Contains(result, table)
if !found {
result = append(result, table)
}
}
}

return result, nil
}
file addition: schema.dbml (---r------)

[0.790921]

Enum "dataset_type" {
"structured"
"unstructured"
"test"
"train"
}

Enum "gain_level" {
"low"
"low-medium"
"medium"
"medium-high"
"high"
}

Table "dataset" {
"id" VARCHAR(12) [pk]
"name" VARCHAR(255) [unique, not null]
"description" VARCHAR(255)
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]
"type" dataset_type [not null, default: 'structured']
}

Table "location" {
"id" VARCHAR(12) [pk]
"dataset_id" VARCHAR(12) [not null]
"name" VARCHAR(140) [not null]
"latitude" DECIMAL(10,7) [not null, check: `latitude BETWEEN -90.0 AND 90.0`]
"longitude" DECIMAL(10,7) [not null, check: `longitude BETWEEN -180.0 AND 180.0`]
"description" VARCHAR(255)
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]
"timezone_id" VARCHAR(40) [not null]

Indexes {
(dataset_id, name) [unique]
}
}

Table "cyclic_recording_pattern" {
"id" VARCHAR(12) [pk]
"record_s" INTEGER [not null]
"sleep_s" INTEGER [not null]
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]

Indexes {
(record_s, sleep_s) [unique]
}
}

Table "cluster" {
"id" VARCHAR(12) [pk]
"dataset_id" VARCHAR(12) [not null]
"location_id" VARCHAR(12) [not null]
"name" VARCHAR(140) [not null]
"description" VARCHAR(255)
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]
"cyclic_recording_pattern_id" VARCHAR(12)
"sample_rate" INTEGER [not null]
"path" VARCHAR(255)

Indexes {
(location_id, name) [unique]
}
}

Table "file" {
"id" VARCHAR(21) [pk]
"file_name" VARCHAR(255) [not null]
"xxh64_hash" VARCHAR(16) [unique, not null]
"location_id" VARCHAR(12)
"timestamp_local" TIMESTAMP [not null]
"cluster_id" VARCHAR(12)
"duration" DECIMAL(7,3) [not null, check: `duration > 0`]
"sample_rate" INTEGER [not null]
"description" VARCHAR(255)
"maybe_solar_night" BOOLEAN
"maybe_civil_night" BOOLEAN
"moon_phase" DECIMAL(3,2) [check: `moon_phase BETWEEN 0.00 AND 1.00`]
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]

Indexes {
location_id [name: "idx_file_location"]
cluster_id [name: "idx_file_cluster"]
timestamp_local [name: "idx_file_timestamp_local"]
}
}

Table "moth_metadata" {
"file_id" VARCHAR(21) [pk]
"timestamp" TIMESTAMP [not null]
"recorder_id" VARCHAR(16)
"gain" gain_level
"battery_v" DECIMAL(2,1) [check: `battery_v >= 0`]
"temp_c" DECIMAL(3,1)
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]
}

Table "file_metadata" {
"file_id" VARCHAR(21) [pk]
"json" JSON
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]
}

Table "file_dataset" {
"file_id" VARCHAR(21) [not null]
"dataset_id" VARCHAR(12) [not null]
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]

Indexes {
(file_id, dataset_id) [pk]
dataset_id [name: "idx_file_dataset_dataset"]
}
}

Table "segment" {
"id" VARCHAR(21) [pk]
"file_id" VARCHAR(21) [not null]
"dataset_id" VARCHAR(12) [not null]
"start_time" DECIMAL(7,3) [not null]
"end_time" DECIMAL(7,3) [not null]
"freq_low" DECIMAL(9,3) [check: `freq_low < 300000`]
"freq_high" DECIMAL(9,3) [check: `freq_high < 300000`]
"description" VARCHAR(255)
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]

Indexes {
file_id [name: "idx_segment_file"]
dataset_id [name: "idx_segment_dataset"]
}
}

Table "ebird_taxonomy" {
"id" VARCHAR(12) [pk]
"taxonomy_version" VARCHAR(4) [not null]
"taxon_order" INTEGER [not null]
"category" VARCHAR(15) [not null]
"species_code" VARCHAR(15) [not null]
"taxon_concept_id" VARCHAR(15)
"primary_com_name" VARCHAR(100) [not null]
"sci_name" VARCHAR(100) [not null]
"bird_order" VARCHAR(30)
"family" VARCHAR(100)
"species_group" VARCHAR(100)
"report_as" VARCHAR(15)
"valid_from" DATE [not null]
"valid_to" DATE
"active" BOOLEAN [default: TRUE]

Indexes {
(species_code, taxonomy_version) [unique]
(species_code, taxonomy_version) [name: "idx_ebird_taxonomy_species_code"]
}
}

Table "species" {
"id" VARCHAR(12) [pk]
"label" VARCHAR(100) [unique, not null]
"ebird_code" VARCHAR(12)
"taxonomy_version" VARCHAR(4)
"description" VARCHAR(255)
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]
}

Table "call_type" {
"id" VARCHAR(12) [pk]
"species_id" VARCHAR(12) [not null]
"label" VARCHAR(100) [not null]
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]
}

Table "filter" {
"id" VARCHAR(12) [pk]
"name" VARCHAR(140) [not null]
"description" VARCHAR(255)
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [not null, default: true]
}

Table "label" {
"id" VARCHAR(21) [pk]
"segment_id" VARCHAR(21) [not null]
"species_id" VARCHAR(12) [not null]
"filter_id" VARCHAR(12) [not null]
"certainty" DECIMAL(5,2) [check: `certainty <= 100 AND certainty >= 0`]
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [not null, default: true]

Indexes {
segment_id [name: "idx_label_segment_id"]
species_id [name: "idx_label_species_id"]
}
}

Table "label_metadata" {
"label_id" VARCHAR(21) [pk]
"json" JSON
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [default: TRUE]
}

Table "label_subtype" {
"id" VARCHAR(21) [pk]
"label_id" VARCHAR(21) [not null]
"calltype_id" VARCHAR(12) [not null]
"filter_id" VARCHAR(12)
"certainty" DECIMAL(5,2) [check: `certainty <= 100 AND certainty >= 0`]
"created_at" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"last_modified" TIMESTAMP [default: `CURRENT_TIMESTAMP`]
"active" BOOLEAN [not null, default: true]

Indexes {
label_id [name: "idx_label_subtype_label_id"]
calltype_id [name: "idx_label_subtype_calltype_id"]
filter_id [name: "idx_label_subtype_filter_id"]
}
}

Ref:"dataset"."id" < "location"."dataset_id"

Ref:"dataset"."id" < "cluster"."dataset_id"

Ref:"location"."id" < "cluster"."location_id"

Ref:"cyclic_recording_pattern"."id" < "cluster"."cyclic_recording_pattern_id"

Ref:"location"."id" < "file"."location_id"

Ref:"cluster"."id" < "file"."cluster_id"

Ref:"file"."id" < "moth_metadata"."file_id"

Ref:"file"."id" < "file_metadata"."file_id"

Ref:"file"."id" < "file_dataset"."file_id"

Ref:"dataset"."id" < "file_dataset"."dataset_id"

Ref:"file"."id" < "segment"."file_id"

Ref:"dataset"."id" < "segment"."dataset_id"

Ref:"file_dataset".("file_id", "dataset_id") < "segment".("file_id", "dataset_id")

Ref:"ebird_taxonomy".("species_code", "taxonomy_version") < "species".("ebird_code", "taxonomy_version")

Ref:"species"."id" < "call_type"."species_id"

Ref:"segment"."id" < "label"."segment_id"

Ref:"species"."id" < "label"."species_id"

Ref:"filter"."id" < "label"."filter_id"

Ref:"label"."id" < "label_metadata"."label_id"

Ref:"label"."id" < "label_subtype"."label_id"

Ref:"call_type"."id" < "label_subtype"."calltype_id"

Ref:"filter"."id" < "label_subtype"."filter_id"
file addition: invariants_test.go (----------)

[0.790921]

package db

import (
"database/sql"
"testing"

_ "github.com/duckdb/duckdb-go/v2"
)

// setupInvariantsTestDB creates an in-memory database with the full schema
func setupInvariantsTestDB(t *testing.T) *sql.DB {
t.Helper()

db, err := sql.Open("duckdb", ":memory:")
if err != nil {
t.Fatalf("failed to open database: %v", err)
}

schema, err := ReadSchemaSQL()
if err != nil {
t.Fatalf("failed to read schema: %v", err)
}

_, err = db.Exec(schema)
if err != nil {
t.Fatalf("failed to create schema: %v", err)
}

return db
}

// insertDataset creates a test dataset and returns its ID
func insertDataset(t *testing.T, db *sql.DB, id, name string) {
t.Helper()
_, err := db.Exec(
"INSERT INTO dataset (id, name, type, active) VALUES (?, ?, 'structured', true)",
id, name,
)
if err != nil {
t.Fatalf("failed to insert dataset: %v", err)
}
}

// insertLocation creates a test location and returns its ID
func insertLocation(t *testing.T, db *sql.DB, id, datasetID, name string) {
t.Helper()
_, err := db.Exec(
`INSERT INTO location (id, dataset_id, name, latitude, longitude, timezone_id, active)
VALUES (?, ?, ?, -36.8485, 174.7633, 'Pacific/Auckland', true)`,
id, datasetID, name,
)
if err != nil {
t.Fatalf("failed to insert location: %v", err)
}
}

// insertCluster creates a test cluster
func insertCluster(t *testing.T, db *sql.DB, id, datasetID, locationID, name string) {
t.Helper()
_, err := db.Exec(
`INSERT INTO cluster (id, dataset_id, location_id, name, sample_rate, active)
VALUES (?, ?, ?, ?, 48000, true)`,
id, datasetID, locationID, name,
)
if err != nil {
t.Fatalf("failed to insert cluster: %v", err)
}
}

// insertFile creates a test file
func insertFile(t *testing.T, db *sql.DB, id, hash, locationID string) {
t.Helper()
_, err := db.Exec(
`INSERT INTO file (id, file_name, xxh64_hash, location_id, timestamp_local, duration, sample_rate, active)
VALUES (?, 'test.wav', ?, ?, CURRENT_TIMESTAMP, 1.0, 48000, true)`,
id, hash, locationID,
)
if err != nil {
t.Fatalf("failed to insert file: %v", err)
}
}

// ============================================================================
// Phase 1, Test 1: UniqueFileHash invariant
// Spec: validation.allium - UniqueFileHash
// "for f1 in Files: for f2 in Files: f1 != f2 implies f1.xxh64_hash != f2.xxh64_hash"
// ============================================================================

func TestInvariant_UniqueFileHash(t *testing.T) {
db := setupInvariantsTestDB(t)
defer db.Close()

// Setup: create dataset → location → cluster → file
insertDataset(t, db, "ds_test12345", "Test Dataset")
insertLocation(t, db, "loc_test1234", "ds_test12345", "Test Location")
insertCluster(t, db, "clustest1234", "ds_test12345", "loc_test1234", "Test Cluster")

// Insert first file with a specific hash
insertFile(t, db, "filetest1234567890123", "abcd1234efgh5678", "loc_test1234")

// Test: Attempting to insert a second file with the same hash should fail
t.Run("duplicate hash rejected", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO file (id, file_name, xxh64_hash, location_id, timestamp_local, duration, sample_rate, active)
VALUES ('filetest_diffhash01', 'test2.wav', 'abcd1234efgh5678', 'loc_test1234', CURRENT_TIMESTAMP, 1.0, 48000, true)`,
)
if err == nil {
t.Error("expected error for duplicate xxh64_hash, got nil")
}
})

// Test: Different hash should succeed
t.Run("different hash accepted", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO file (id, file_name, xxh64_hash, location_id, timestamp_local, duration, sample_rate, active)
VALUES ('filetest_diffhash02', 'test3.wav', '9876zyxw5432vuts', 'loc_test1234', CURRENT_TIMESTAMP, 1.0, 48000, true)`,
)
if err != nil {
t.Errorf("unexpected error for different hash: %v", err)
}
})

// Test: Same hash with inactive file should still fail (constraint applies to all rows)
t.Run("inactive file still blocks duplicate", func(t *testing.T) {
// Mark first file as inactive
_, err := db.Exec("UPDATE file SET active = false WHERE id = 'filetest1234567890123'")
if err != nil {
t.Fatalf("failed to deactivate file: %v", err)
}

// Attempt duplicate hash with new file
_, err = db.Exec(
`INSERT INTO file (id, file_name, xxh64_hash, location_id, timestamp_local, duration, sample_rate, active)
VALUES ('filetest_inactblk01', 'test4.wav', 'abcd1234efgh5678', 'loc_test1234', CURRENT_TIMESTAMP, 1.0, 48000, true)`,
)
if err == nil {
t.Error("expected error for duplicate xxh64_hash even with inactive file, got nil")
}
})
}

// ============================================================================
// Phase 1, Test 2: LocationBelongsToDataset invariant
// Spec: validation.allium - LocationBelongsToDataset
// "for l in Locations: l.dataset exists and is valid"
// ============================================================================

func TestInvariant_LocationBelongsToDataset(t *testing.T) {
db := setupInvariantsTestDB(t)
defer db.Close()

// Setup: create dataset
insertDataset(t, db, "ds_valid123456", "Valid Dataset")

t.Run("location with valid dataset accepted", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO location (id, dataset_id, name, latitude, longitude, timezone_id, active)
VALUES ('loc_valid12345', 'ds_valid123456', 'Valid Location', -36.8485, 174.7633, 'Pacific/Auckland', true)`,
)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
})

t.Run("location with nonexistent dataset rejected", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO location (id, dataset_id, name, latitude, longitude, timezone_id, active)
VALUES ('loc_bad_ds_001', 'ds_nonexistent', 'Bad Location', -36.8485, 174.7633, 'Pacific/Auckland', true)`,
)
if err == nil {
t.Error("expected error for nonexistent dataset_id, got nil")
}
})

t.Run("location with deleted dataset rejected", func(t *testing.T) {
// Create and then soft-delete a dataset
insertDataset(t, db, "ds_del_temp_01", "To Be Deleted")
_, err := db.Exec("UPDATE dataset SET active = false WHERE id = 'ds_del_temp_01'")
if err != nil {
t.Fatalf("failed to deactivate dataset: %v", err)
}

// Try to create location pointing to inactive dataset
_, err = db.Exec(
`INSERT INTO location (id, dataset_id, name, latitude, longitude, timezone_id, active)
VALUES ('loc_inact_ds01', 'ds_del_temp_01', 'Inactive DS Location', -36.8485, 174.7633, 'Pacific/Auckland', true)`,
)
// Note: FK constraint may still allow this depending on implementation
// This test documents the current behavior
t.Logf("Insert location to inactive dataset: err=%v", err)
})

t.Run("duplicate location name in same dataset rejected", func(t *testing.T) {
// Try to insert location with same name in same dataset
_, err := db.Exec(
`INSERT INTO location (id, dataset_id, name, latitude, longitude, timezone_id, active)
VALUES ('loc_dup_name01', 'ds_valid123456', 'Valid Location', -40.9006, 174.8860, 'Pacific/Auckland', true)`,
)
if err == nil {
t.Error("expected error for duplicate location name in same dataset, got nil")
}
})

t.Run("same location name in different datasets accepted", func(t *testing.T) {
// Create second dataset
insertDataset(t, db, "ds_second_1234", "Second Dataset")

// Same name as in first dataset should work
_, err := db.Exec(
`INSERT INTO location (id, dataset_id, name, latitude, longitude, timezone_id, active)
VALUES ('loc_same_name2', 'ds_second_1234', 'Valid Location', -36.8485, 174.7633, 'Pacific/Auckland', true)`,
)
if err != nil {
t.Errorf("unexpected error for same name in different dataset: %v", err)
}
})
}

// ============================================================================
// Phase 1, Test 3: ClusterBelongsToLocation invariant
// Spec: validation.allium - ClusterBelongsToLocation, LocationBelongsToDataset (cross-check)
// "for c in Clusters: c.location exists AND c.location.dataset = c.dataset"
// ============================================================================

func TestInvariant_ClusterBelongsToLocation(t *testing.T) {
db := setupInvariantsTestDB(t)
defer db.Close()

// Setup: create two separate dataset hierarchies
insertDataset(t, db, "ds_cluster_t01", "Cluster Test Dataset 1")
insertDataset(t, db, "ds_cluster_t02", "Cluster Test Dataset 2")
insertLocation(t, db, "loc_clust_t001", "ds_cluster_t01", "Location in DS1")
insertLocation(t, db, "loc_clust_t002", "ds_cluster_t02", "Location in DS2")

t.Run("cluster with valid location accepted", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO cluster (id, dataset_id, location_id, name, sample_rate, active)
VALUES ('cl_valid123456', 'ds_cluster_t01', 'loc_clust_t001', 'Valid Cluster', 48000, true)`,
)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
})

t.Run("cluster with nonexistent location rejected", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO cluster (id, dataset_id, location_id, name, sample_rate, active)
VALUES ('cl_badloc12345', 'ds_cluster_t01', 'loc_nonexistent', 'Bad Location Cluster', 48000, true)`,
)
if err == nil {
t.Error("expected error for nonexistent location_id, got nil")
}
})

t.Run("cluster with mismatched dataset and location rejected", func(t *testing.T) {
// Attempt: cluster.dataset_id = ds1, but cluster.location_id = location from ds2
_, err := db.Exec(
`INSERT INTO cluster (id, dataset_id, location_id, name, sample_rate, active)
VALUES ('cl_mismatch001', 'ds_cluster_t01', 'loc_clust_t002', 'Mismatched Cluster', 48000, true)`,
)
// This tests the business logic invariant from the spec
// The schema allows this via FKs, but the application should reject it
// If the schema doesn't prevent this, the test documents the gap
t.Logf("Mismatched dataset/location: err=%v", err)
})

t.Run("duplicate cluster name in same location rejected", func(t *testing.T) {
// Try to insert cluster with same name in same location
_, err := db.Exec(
`INSERT INTO cluster (id, dataset_id, location_id, name, sample_rate, active)
VALUES ('cl_dup_name_01', 'ds_cluster_t01', 'loc_clust_t001', 'Valid Cluster', 48000, true)`,
)
if err == nil {
t.Error("expected error for duplicate cluster name in same location, got nil")
}
})

t.Run("same cluster name in different locations accepted", func(t *testing.T) {
// Same name but different location should work
_, err := db.Exec(
`INSERT INTO cluster (id, dataset_id, location_id, name, sample_rate, active)
VALUES ('cl_same_nam_02', 'ds_cluster_t02', 'loc_clust_t002', 'Valid Cluster', 48000, true)`,
)
if err != nil {
t.Errorf("unexpected error for same name in different location: %v", err)
}
})
}

// ============================================================================
// Cross-invariant: Hierarchical integrity
// Tests that the full hierarchy chain is enforced
// ============================================================================

func TestInvariant_HierarchicalIntegrity(t *testing.T) {
db := setupInvariantsTestDB(t)
defer db.Close()

// Build complete hierarchy
insertDataset(t, db, "ds_hier_test01", "Hierarchy Test")
insertLocation(t, db, "loc_hier_test1", "ds_hier_test01", "Hier Location")
insertCluster(t, db, "cl_hier_test01", "ds_hier_test01", "loc_hier_test1", "Hier Cluster")

t.Run("file must have valid location", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO file (id, file_name, xxh64_hash, location_id, timestamp_local, duration, sample_rate, active)
VALUES ('file_badloc001', 'test.wav', '1111111111111111', 'loc_nonexistent', CURRENT_TIMESTAMP, 1.0, 48000, true)`,
)
if err == nil {
t.Error("expected error for file with invalid location, got nil")
}
})

t.Run("file with valid location but invalid cluster rejected", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO file (id, file_name, xxh64_hash, location_id, cluster_id, timestamp_local, duration, sample_rate, active)
VALUES ('file_badcl_001', 'test.wav', '2222222222222222', 'loc_hier_test1', 'cl_nonexistent', CURRENT_TIMESTAMP, 1.0, 48000, true)`,
)
if err == nil {
t.Error("expected error for file with invalid cluster, got nil")
}
})

t.Run("valid file through full hierarchy accepted", func(t *testing.T) {
_, err := db.Exec(
`INSERT INTO file (id, file_name, xxh64_hash, location_id, cluster_id, timestamp_local, duration, sample_rate, active)
VALUES ('file_valid0001', 'test.wav', '3333333333333333', 'loc_hier_test1', 'cl_hier_test01', CURRENT_TIMESTAMP, 1.0, 48000, true)`,
)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
})
}
file addition: dbml-error.log (---r------)

[0.790921]

2026-01-20T07:41:23.093Z
undefined
file addition: db.go (----------)

[0.790921]

package db

import (
"database/sql"
"fmt"

_ "github.com/duckdb/duckdb-go/v2" // DuckDB driver
)

// OpenReadOnlyDB opens a DuckDB connection in read-only mode
// Provides additional security layer for query-only operations
// Caller must close the connection when done
func OpenReadOnlyDB(dbPath string) (*sql.DB, error) {
connStr := dbPath + "?access_mode=read_only"
db, err := sql.Open("duckdb", connStr)
if err != nil {
return nil, fmt.Errorf("failed to open database: %w", err)
}

if err = db.Ping(); err != nil {
closeErr := db.Close()
if closeErr != nil {
return nil, fmt.Errorf("failed to ping database: %w (close error: %v)", err, closeErr)
}
return nil, fmt.Errorf("failed to ping database: %w", err)
}

return db, nil
}

// OpenWriteableDB opens a DuckDB connection in read-write mode
// Used for write operations (insert, update, delete)
// Caller must close the connection when done
func OpenWriteableDB(dbPath string) (*sql.DB, error) {
connStr := dbPath + "?access_mode=read_write"
db, err := sql.Open("duckdb", connStr)
if err != nil {
return nil, fmt.Errorf("failed to open database: %w", err)
}

if err = db.Ping(); err != nil {
closeErr := db.Close()
if closeErr != nil {
return nil, fmt.Errorf("failed to ping database: %w (close error: %v)", err, closeErr)
}
return nil, fmt.Errorf("failed to ping database: %w", err)
}

return db, nil
}
file addition: avianz_file_format_specification.md (----------)

[0.790921]

# Specification of file formats used by AviaNZ

AviaNZ annotations and filter definitions are stored in JSON format to allow easy parsing and manual inspection by text editors.

## Annotation files (.data)
A JSON array where the first (optional, but recommended) element stores metadata about the corresponding audio file, and each remaining element corresponds to a segment:

[ Meta, seg, seg, seg, seg ... ]

`Meta`: a JSON object (key-value pairs) containing any metadata. Required fields:
`Operator` - string
`Reviewer` - string
`Duration` - numeric, audio file length, in seconds
...

Each true segment `seg` is a JSON array containing five elements, all required:

[ starttime, endtime, freq.low, freq.high, labels ]

`startime, endtime` - segment start and end positions, in seconds, relative to start of file as 0.
`freq.low, freq.high` - for annotation boxes, frequency band in Hz. For segments (full-band annotations), both `0`. If both `0<freq<1`, old format is assumed, and treated as full-band segment (`0,0`).
`labels` - a JSON array of labels for each type of sound detected:

[ label, label, label... ]

where each `label` is a JSON object, having some of the following fields:

{ "species": "Kiwi (Little spotted)", "certainty": 0, "filter": "kiwi-best", "calltype": "f1", ... }

`species` - string, either `"genus (species)"` or just plain `"species"`. May be `"Don't Know"` or any other label (`"Bellbird/Tui"`, `"Fantail (spp)"`...), except for the internal genus separator `>`. Required.
`certainty` - numeric between 0 and 100. Currently, for `"species": "Don't Know"` only `0` allowed, `100` corresponds to green segments, and `50` corresponds to question marks in earlier formats. `(species, certainty)` defines a unique key for labels. Required.
`filter` - string, name of the filter file that created this label, or `"M"` for manual annotations.
`calltype` - string, to identify the call type. Call types can be annotated manually, or will be automatically generated from clusters during filter training. Required for automatic filters (i.e. if `filter` is not empty or `"M"`).
Any additional attributes defined for this call (male/female, subjective loudness...) are optional and can be passed as key-value pairs.

Thus, a full .data file may look like this:

[ {"Operator": Alice, "Reviewer": Bob, "Duration": 60.0, "Noise": "windy"}, // metadata
// a manually marked box
[1.0, 19.0, 1200, 2500,
[
{ "species": "Kiwi (Little spotted)", "certainty": 100, "filter": "M", "loudness": 3 }
]
],
// box from a "trill" filter
[21.0, 23.0, 800, 6000,
[
{ "species": "Morepork", "certainty": 50, "filter": "ruru-90-10", "calltype": "trill" }
]
],
// a manually marked segment with morepork and something else
[35, 45, 0, 0,
[
{ "species": "Morepork", "certainty": 100, "filter": "M" },
{ "species": "Don't Know", "certainty": 0, "filter": "M" }
]
]
]

## Filter files (.txt)

A JSON array:

{ "species": "Kiwi (Little spotted)", "SampleRate": 16000, "Filters": [], "NN": {}, ...}

Main filter ID is the file name because this automatically ensures that no duplicate IDs are present at any installation of AviaNZ. This name can be any string permitted by the OS, and no further information is gathered from it.
`species` - string. This label will be assigned as the `species` in segments generated by this filter. Can follow `"genus (species)"` format as described above. Required.
`SampleRate` - integer. All analyses will be done after down-(up-)sampling to this rate. Required.
`method` - string, `"wv"` or `"chp"`. Empty defaults to `"wv"`.
Any extra parameters to be applied for all subfilters may be provided (such as `"wind"`).

`Filters` - JSON array of filters corresponding to each type of call (at least one element). Each is a JSON object:

{ "calltype": "clust1", "TimeRange": [min call length, max call length, avg syllable length, max gap between syllables], "WaveletParams": {"thr": 0.5, "M": 1.5, "nodes": [35, 37, 40]}, "FreqRange": [1000, 3000], ... }

`calltype` - either user-defined call type, or automatically generated cluster ID. String. Required.

`TimeRange` - JSON array of length 4: `[minlen, maxlen, avgsyl, maxgap]`, respectively min and max lengths of a call, average syllable length, and maximum gap between parts of same call. Required.
`WaveletParams` - JSON object of parameters needed for wavelet filtering. Required. Currently uses:
* `thr` - numeric, threshold for detecting calls. Required.
* `nodes` - JSON array of wavelet nodes used in this filter. Required.
* `M` - numeric, energy curve window in seconds. Required for `method="wv"`.
* `win` - numeric, window for energy averaging in seconds. Required for `method="chp"`.

`FreqRange` - frequency band for analysis. Identified calls will be marked as boxes with these limits, or as full-band segments if not provided.
Any extra subfilter parameters may follow, such as `"F0"`.

`PostResolution` - numeric. If present, detections will be merged and resplit into pieces of this many seconds (i.e. this parameter is both the merging gap and split piece length).

`NN` - JSON object. Meta information about the Convolution Neural Network (NN) model for this species:

"NN": {"NN_name": "Kiwi (Nth Is Brown)", "loss": "binary_crossentropy", "optimizer": "adam", "win": 0.25, "inputdim": [128, 30], "output": {"0": "Male", "1": "Female", "2": "Noise"}}

If present, all the following are required:
* `NN_name` - File name of the model, e.g. `Kiwi (Nth Is Brown).json` and `Kiwi (Nth Is Brown).h5` or `Kiwi (Nth Is Brown).weights.h5`.
* `loss` - loss function.
* `optimizer` - optimisation algorithm.
* `win` - input image width in seconds.
* `inputdim` - input dimension in pixels.
* `output` - the output classes/labels.
* `windowInc` - window width and increment.
* `thr`- threshold for each call type.

Thus, a full filter file may look like this:

{ "species": "Kiwi (Little spotted)", "SampleRate": 16000, "Rain": false, "Wind": true,
"Filters": [
{ "calltype": "M", "TimeRange": [5, 60, 1, 3], "WaveletParams": {"nodes": [44, 45, 46], "thr": 0.5, "M": 1.5}, "F0": true, "FreqRange": [1500, 5000] },
{ "calltype": "F", "TimeRange": [10.0, 30.0, 0.8, 1.0], "WaveletParams": {"nodes": [41, 44], "thr": 0.8, "M": 2}, "FreqRange": [1000, 2500] }
],
"NN": {"NN_name": "Kiwi (Little spotted)", "loss": "binary_crossentropy", "optimizer": "adam", "win": 0.25, "inputdim": [128, 30], "output": {"0": "M", "1": "F", "2": "Noise", "3": "Silence"}, "windowInc":[256, 128], "thr":[0.5, 0.3]}
}

## NN files (.JSON/.h5/.hdf5)

A NN model has two files: model architecture is stored in a JSON file and the weights are stored in a Hierarchical Data Format 5 file (.h5 or .hdf5).
All the NN models are stored in the user configdir/Filters and referred in the corresponding Filter files.

## Correction files (.corrections/ .corrections_species)

All Species Review mode generates .corrections:

A JSON array where the first element stores metadata, and each remaining element corresponds to a segment changed by reviewer:

[ Meta, [seg, newlabel], [seg, newlabel], [seg, newlabel] ... ]

`Meta`: a JSON object (key-value pairs) containing any metadata, same as in .data.
`seg`: Each segment seg is a JSON array containing five elements, same as in .data.
`newlabel`: New label/s assigned to the segment by the reviewer.

Single Species Review mode generates .corrections_species:

A JSON array where the first element stores metadata, and each remaining element corresponds to a segment deleted by reviewer:

[ Meta, seg, seg, seg ... ]

`Meta`: a JSON object (key-value pairs) containing any metadata, same as in .data.
`seg`: Each segment seg is a JSON array containing five elements, same as in .data.
file addition: cmd (d--r------)

[2.1]
file addition: xxhash.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"flag"
"fmt"
"os"

"skraak/utils"
)

// RunXXHash handles the "xxhash" subcommand
//
// JSON output schema:
//
// {
// "file": string, // Path to the hashed file
// "hash": string // XXH64 hash (hex string)
// }
func RunXXHash(args []string) {
fs := flag.NewFlagSet("xxhash", flag.ExitOnError)
filePath := fs.String("file", "", "Path to file (required)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak xxhash --file <path>\n\n")
fmt.Fprintf(os.Stderr, "Compute XXH64 hash of a file (same format stored in database).\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak xxhash --file recording.wav\n")
fmt.Fprintf(os.Stderr, " skraak xxhash --file /path/to/audio.wav | jq '.hash'\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

if *filePath == "" {
fmt.Fprintf(os.Stderr, "Error: --file is required\n\n")
fs.Usage()
os.Exit(1)
}

// Compute hash
hash, err := utils.ComputeXXH64(*filePath)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

// Output as JSON
output := map[string]string{
"file": *filePath,
"hash": hash,
}

enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: update.go (----------)

[0.1037540]

package cmd

import (
"fmt"
"os"
)

// RunUpdate handles the "update" command
func RunUpdate(args []string) {
if len(args) < 1 {
printUpdateUsage()
os.Exit(1)
}

switch args[0] {
case "dataset":
RunDatasetUpdate(args[1:])
case "location":
RunLocationUpdate(args[1:])
case "cluster":
RunClusterUpdate(args[1:])
case "pattern":
RunPatternUpdate(args[1:])
default:
fmt.Fprintf(os.Stderr, "Unknown resource to update: %s\n", args[0])
printUpdateUsage()
os.Exit(1)
}
}

func printUpdateUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak update <resource> [options]\n\n")
fmt.Fprintf(os.Stderr, "Resources:\n")
fmt.Fprintf(os.Stderr, " dataset Update an existing dataset\n")
fmt.Fprintf(os.Stderr, " location Update an existing location\n")
fmt.Fprintf(os.Stderr, " cluster Update an existing cluster\n")
fmt.Fprintf(os.Stderr, " pattern Update an existing pattern\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak update dataset --db ./db/skraak.duckdb --id abc123 --name \"Updated Name\"\n")
fmt.Fprintf(os.Stderr, " skraak update location --db ./db/skraak.duckdb --id loc123 --name \"New Name\" --lat -36.85 --lon 174.76\n")
fmt.Fprintf(os.Stderr, " skraak update cluster --db ./db/skraak.duckdb --id clust123 --name \"New Name\" --sample-rate 192000\n")
fmt.Fprintf(os.Stderr, " skraak update pattern --db ./db/skraak.duckdb --id pattern123 --name \"New Name\" --start-time 19:00 --end-time 05:00\n")
}
file addition: time.go (----------)

[0.1037540]

package cmd

import (
"context"
"encoding/json"
"flag"
"fmt"
"os"

"skraak/tools"
)

// RunTime handles the "time" subcommand
//
// JSON output schema:
//
// {
// "time": string, // Current system time in RFC3339 format
// "timezone": string, // System timezone
// "unix": int // Unix timestamp in seconds
// }
func RunTime(args []string) {
fs := flag.NewFlagSet("time", flag.ExitOnError)

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak time\n\n")
fmt.Fprintf(os.Stderr, "Get the current system time with timezone information.\n\n")
fmt.Fprintf(os.Stderr, "Examples:\n")
fmt.Fprintf(os.Stderr, " skraak time\n")
fmt.Fprintf(os.Stderr, " skraak time | jq '.iso'\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Get current time
output, err := tools.GetCurrentTime(context.Background(), tools.GetCurrentTimeInput{})
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

// Output as JSON
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: sql.go (----------)

[0.1037540]

package cmd

import (
"context"
"encoding/json"
"flag"
"fmt"
"os"
"strings"

"skraak/tools"
)

// RunSQL handles the "sql" subcommand
// RunSQL handles CLI SQL query execution with direct database access
//
// JSON output schema:
//
// {
// "rows": [{"column_name": value, ...}, ...], // Query result rows
// "row_count": int, // Number of rows returned
// "columns": [ // Column metadata
// {"name": string, "type": string}
// ],
// "limited": bool, // Whether results were truncated due to row limit
// "query_executed": string // The actual query executed (with LIMIT applied)
// }
func RunSQL(args []string) {
fs := flag.NewFlagSet("sql", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
limit := fs.Int("limit", 0, "Maximum rows to return (default 1000, max 10000)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak sql --db <path> [options] <query>\n\n")
fmt.Fprintf(os.Stderr, "Execute a SQL SELECT query against the database.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak sql --db ./db/skraak.duckdb \"SELECT COUNT(*) FROM file WHERE active = true\"\n")
fmt.Fprintf(os.Stderr, " skraak sql --db ./db/skraak.duckdb --limit 10 \"SELECT * FROM dataset\"\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

if *dbPath == "" {
fmt.Fprintf(os.Stderr, "Error: --db is required\n\n")
fs.Usage()
os.Exit(1)
}

// Remaining args are the query
remaining := fs.Args()
if len(remaining) == 0 {
fmt.Fprintf(os.Stderr, "Error: query is required\n\n")
fs.Usage()
os.Exit(1)
}
query := strings.Join(remaining, " ")

tools.SetDBPath(*dbPath)

input := tools.ExecuteSQLInput{
Query: query,
}
if *limit > 0 {
input.Limit = limit
}

output, err := tools.ExecuteSQL(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: replay.go (----------)

[0.1037540]

package cmd

import (
"bufio"
"context"
"database/sql"
"encoding/json"
"flag"
"fmt"
"os"
"strings"

"skraak/db"
)

// RunReplay handles the "replay" subcommand
func RunReplay(args []string) {
if len(args) < 1 {
printReplayUsage()
os.Exit(1)
}

switch args[0] {
case "events":
runReplayEvents(args[1:])
default:
fmt.Fprintf(os.Stderr, "Unknown replay subcommand: %s\n\n", args[0])
printReplayUsage()
os.Exit(1)
}
}

func printReplayUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak replay <subcommand> [options]\n\n")
fmt.Fprintf(os.Stderr, "Subcommands:\n")
fmt.Fprintf(os.Stderr, " events Replay event log into database\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak replay events --db ./backup.duckdb --log ./skraak.duckdb.events.jsonl\n")
fmt.Fprintf(os.Stderr, " skraak replay events --db ./backup.duckdb --log ./events.jsonl --dry-run\n")
fmt.Fprintf(os.Stderr, " skraak replay events --db ./backup.duckdb --log ./events.jsonl --last 10\n")
}

func runReplayEvents(args []string) {
fs := flag.NewFlagSet("replay events", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to target database (required)")
logPath := fs.String("log", "", "Path to event log file (required)")
dryRun := fs.Bool("dry-run", false, "Print events without executing")
fromID := fs.String("from", "", "Start from event ID (inclusive)")
toID := fs.String("to", "", "Stop at event ID (inclusive)")
lastN := fs.Int("last", 0, "Replay last N events (0 = all)")
continueOnError := fs.Bool("continue", false, "Continue past errors")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak replay events [options]\n\n")
fmt.Fprintf(os.Stderr, "Replay event log into database.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak replay events --db ./backup.duckdb --log ./events.jsonl\n")
fmt.Fprintf(os.Stderr, " skraak replay events --db ./backup.duckdb --log ./events.jsonl --dry-run\n")
fmt.Fprintf(os.Stderr, " skraak replay events --db ./backup.duckdb --log ./events.jsonl --last 10\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *logPath == "" {
missing = append(missing, "--log")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

// Read events
events, err := readEvents(*logPath)
if err != nil {
fmt.Fprintf(os.Stderr, "Error reading events: %v\n", err)
os.Exit(1)
}

// Filter events
events = filterEvents(events, *fromID, *toID, *lastN)

fmt.Fprintf(os.Stderr, "Found %d events to replay\n", len(events))

if *dryRun {
for i, event := range events {
fmt.Printf("\n[%d/%d] Event %s (%s)\n", i+1, len(events), event.ID, event.Tool)
for _, q := range event.Queries {
fmt.Printf(" SQL: %s\n", truncateSQL(q.SQL, 80))
fmt.Printf(" Params: %v\n", q.Parameters)
}
}
return
}

// Open database
database, err := db.OpenWriteableDB(*dbPath)
if err != nil {
fmt.Fprintf(os.Stderr, "Error opening database: %v\n", err)
os.Exit(1)
}
defer database.Close()

// Disable event logging for replay
db.SetEventLogConfig(db.EventLogConfig{Enabled: false})

// Replay each event
successCount := 0
failCount := 0

for i, event := range events {
fmt.Fprintf(os.Stderr, "\n[%d/%d] Replaying event %s (%s)...\n", i+1, len(events), event.ID, event.Tool)

err := replayEvent(database, event)
if err != nil {
failCount++
fmt.Fprintf(os.Stderr, " ERROR: %v\n", err)
if !*continueOnError {
fmt.Fprintf(os.Stderr, "Stopping due to error. Use --continue to skip errors.\n")
os.Exit(1)
}
} else {
successCount++
fmt.Fprintf(os.Stderr, " OK (%d queries)\n", len(event.Queries))
}
}

fmt.Fprintf(os.Stderr, "\nReplay complete: %d succeeded, %d failed\n", successCount, failCount)
}

// TransactionEvent represents a transaction event from the log
type TransactionEvent struct {
ID string `json:"id"`
Timestamp string `json:"timestamp"`
Tool string `json:"tool,omitempty"`
Queries []QueryRecord `json:"queries"`
Success bool `json:"success"`
Duration int64 `json:"duration_ms"`
}

// QueryRecord represents a single SQL statement with parameters
type QueryRecord struct {
SQL string `json:"sql"`
Parameters []any `json:"parameters"`
}

// readEvents reads all events from a JSONL file
func readEvents(path string) ([]TransactionEvent, error) {
file, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("failed to open event log: %w", err)
}
defer func() { _ = file.Close() }()

var events []TransactionEvent
scanner := bufio.NewScanner(file)
scanner.Buffer(make([]byte, 20*1024*1024), 20*1024*1024) // 20MB max line size
lineNum := 0

for scanner.Scan() {
lineNum++
line := scanner.Bytes()
if len(line) == 0 {
continue
}

var event TransactionEvent
if err := json.Unmarshal(line, &event); err != nil {
fmt.Fprintf(os.Stderr, "Warning: failed to parse line %d: %v\n", lineNum, err)
continue
}

events = append(events, event)
}

if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("error reading event log: %w", err)
}

return events, nil
}

// filterEvents filters events based on criteria
func filterEvents(events []TransactionEvent, fromID, toID string, lastN int) []TransactionEvent {
// Filter by fromID
if fromID != "" {
startIdx := 0
for i, e := range events {
if e.ID == fromID {
startIdx = i
break
}
}
events = events[startIdx:]
}

// Filter by toID
if toID != "" {
endIdx := len(events)
for i, e := range events {
if e.ID == toID {
endIdx = i + 1
break
}
}
events = events[:endIdx]
}

// Filter by lastN
if lastN > 0 && len(events) > lastN {
events = events[len(events)-lastN:]
}

// Only replay successful events
var filtered []TransactionEvent
for _, e := range events {
if e.Success {
filtered = append(filtered, e)
}
}

return filtered
}

// replayEvent replays a single transaction event
func replayEvent(database *sql.DB, event TransactionEvent) error {
ctx := context.Background()
tx, err := database.BeginTx(ctx, nil)
if err != nil {
return fmt.Errorf("failed to begin transaction: %w", err)
}

for _, q := range event.Queries {
// Convert parameters to []interface{} for Exec
_, err := tx.ExecContext(ctx, q.SQL, q.Parameters...)
if err != nil {
tx.Rollback()
return fmt.Errorf("query failed: %w (SQL: %s)", err, truncateSQL(q.SQL, 50))
}
}

if err := tx.Commit(); err != nil {
return fmt.Errorf("failed to commit transaction: %w", err)
}

return nil
}

// truncateSQL truncates a SQL string for display
func truncateSQL(sql string, maxLen int) string {
sql = strings.Join(strings.Fields(sql), " ") // Normalize whitespace
if len(sql) <= maxLen {
return sql
}
return sql[:maxLen] + "..."
}
file addition: prepend.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"flag"
"fmt"
"os"

"skraak/tools"
)

// RunPrepend handles the "prepend" subcommand
//
// JSON output schema:
//
// {
// "folder": string, // Target folder path
// "prefix": string, // Prefix that was prepended
// "recursive": bool, // Whether subfolders were included
// "dry_run": bool, // Whether this was a dry run
// "renamed": [ // Successfully renamed files
// {"old": string, "new": string}
// ],
// "skipped": [ // Skipped files
// {"file": string, "reason": string}
// ],
// "errors": [ // Failed renames
// {"file": string, "error": string}
// ]
// }
func RunPrepend(args []string) {
fs := flag.NewFlagSet("prepend", flag.ExitOnError)
folder := fs.String("folder", "", "Target folder path (required)")
prefix := fs.String("prefix", "", "String to prepend to filenames (required)")
recursive := fs.Bool("recursive", false, "Include 1 level of subfolders")
dryRun := fs.Bool("dry-run", false, "Show what would be renamed without doing it")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak prepend --folder <path> --prefix <string> [--recursive] [--dry-run]\n\n")
fmt.Fprintf(os.Stderr, "Rename files by prepending a prefix.\n\n")
fmt.Fprintf(os.Stderr, "Target files:\n")
fmt.Fprintf(os.Stderr, " - *.wav, *.WAV (must start with datestring YYYYMMDD_HHMMSS)\n")
fmt.Fprintf(os.Stderr, " - *.wav.data, *.WAV.data (must start with datestring YYYYMMDD_HHMMSS)\n")
fmt.Fprintf(os.Stderr, " - log.txt (exact name, always renamed)\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak prepend --folder ./recordings --prefix LOC001\n")
fmt.Fprintf(os.Stderr, " skraak prepend --folder ./data --prefix SITE_A --recursive\n")
fmt.Fprintf(os.Stderr, " skraak prepend --folder ./test --prefix TEST --dry-run\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

if *folder == "" {
fmt.Fprintf(os.Stderr, "Error: --folder is required\n\n")
fs.Usage()
os.Exit(1)
}

if *prefix == "" {
fmt.Fprintf(os.Stderr, "Error: --prefix is required\n\n")
fs.Usage()
os.Exit(1)
}

// Run the prepend operation
output, err := tools.Prepend(tools.PrependInput{
Folder: *folder,
Prefix: *prefix,
Recursive: *recursive,
DryRun: *dryRun,
})
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

// Output as JSON
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: pattern.go (----------)

[0.1037540]

package cmd

import (
"context"
"flag"
"fmt"
"os"
"strconv"

"skraak/tools"
)

// RunPatternCreate creates a new cyclic recording pattern.
//
// JSON output schema:
//
// {
// "pattern": {
// "id": string, // Pattern ID (12 characters)
// "record_s": int, // Record duration in seconds
// "sleep_s": int, // Sleep duration in seconds
// "created_at": string, // Creation timestamp (RFC3339)
// "last_modified": string, // Last modification timestamp (RFC3339)
// "active": bool // Whether the pattern is active
// },
// "message": string // Success message
// }
func RunPatternCreate(args []string) {
fs := flag.NewFlagSet("pattern create", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
record := fs.Int("record", 0, "Record duration in seconds (required, must be positive)")
sleep := fs.Int("sleep", 0, "Sleep duration in seconds (required, must be positive)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak pattern create [options]\n\n")
fmt.Fprintf(os.Stderr, "Create a new cyclic recording pattern.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak pattern create --db ./db/skraak.duckdb --record 60 --sleep 1740\n")
fmt.Fprintf(os.Stderr, " # Creates 60s record / 1740s sleep = 30 min cycle\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *record == 0 {
missing = append(missing, "--record")
}
if *sleep == 0 {
missing = append(missing, "--sleep")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.PatternInput{
RecordSeconds: record,
SleepSeconds: sleep,
}

output, err := tools.CreateOrUpdatePattern(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}

// RunPatternUpdate updates an existing recording pattern.
//
// JSON output schema: same as RunPatternCreate
func RunPatternUpdate(args []string) {
fs := flag.NewFlagSet("pattern update", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
id := fs.String("id", "", "Pattern ID (required)")
recordStr := fs.String("record", "", "New record duration in seconds (optional)")
sleepStr := fs.String("sleep", "", "New sleep duration in seconds (optional)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak pattern update [options]\n\n")
fmt.Fprintf(os.Stderr, "Update an existing recording pattern. Only provided fields are updated.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak pattern update --db ./db/skraak.duckdb --id pattern123 --record 30\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *id == "" {
missing = append(missing, "--id")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

// Parse optional integers
var record, sleep *int
if *recordStr != "" {
r, err := strconv.Atoi(*recordStr)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: invalid record: %v\n", err)
os.Exit(1)
}
record = &r
}
if *sleepStr != "" {
s, err := strconv.Atoi(*sleepStr)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: invalid sleep: %v\n", err)
os.Exit(1)
}
sleep = &s
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

// Build input - only set fields that were provided
input := tools.PatternInput{
ID: id,
}
if record != nil {
input.RecordSeconds = record
}
if sleep != nil {
input.SleepSeconds = sleep
}

output, err := tools.CreateOrUpdatePattern(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}
file addition: metadata.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"flag"
"fmt"
"os"

"skraak/utils"
)

// RunMetadata handles the "metadata" subcommand
//
// JSON output schema:
//
// {
// "file": string, // Path to the WAV file
// "duration_seconds": float, // Duration in seconds
// "sample_rate": int, // Sample rate in Hz
// "channels": int, // Number of audio channels
// "bits_per_sample": int, // Bits per sample
// "comment": string, // WAV comment (omitted if empty)
// "artist": string, // WAV artist (omitted if empty)
// "file_mod_time": string // File modification time RFC3339 (omitted if zero)
// }
func RunMetadata(args []string) {
fs := flag.NewFlagSet("metadata", flag.ExitOnError)
filePath := fs.String("file", "", "Path to WAV file (required)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak metadata --file <path>\n\n")
fmt.Fprintf(os.Stderr, "Extract metadata from a WAV file header.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak metadata --file recording.wav\n")
fmt.Fprintf(os.Stderr, " skraak metadata --file /path/to/audio.wav | jq '.duration_seconds'\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

if *filePath == "" {
fmt.Fprintf(os.Stderr, "Error: --file is required\n\n")
fs.Usage()
os.Exit(1)
}

// Parse WAV header
metadata, err := utils.ParseWAVHeader(*filePath)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

// Output as JSON
output := map[string]any{
"file": *filePath,
"duration_seconds": metadata.Duration,
"sample_rate": metadata.SampleRate,
"channels": metadata.Channels,
"bits_per_sample": metadata.BitsPerSample,
}

// Add optional fields if present
if metadata.Comment != "" {
output["comment"] = metadata.Comment
}
if metadata.Artist != "" {
output["artist"] = metadata.Artist
}
if !metadata.FileModTime.IsZero() {
output["file_mod_time"] = metadata.FileModTime.Format("2006-01-02T15:04:05Z07:00")
}

enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: location.go (----------)

[0.1037540]

package cmd

import (
"context"
"flag"
"fmt"
"os"
"strconv"

"skraak/tools"
)

// RunLocationCreate creates a new location with GPS coordinates.
//
// JSON output schema:
//
// {
// "location": {
// "id": string, // Location ID (12 characters)
// "dataset_id": string, // Parent dataset ID
// "name": string, // Location name
// "latitude": float, // Latitude in decimal degrees
// "longitude": float, // Longitude in decimal degrees
// "description": string, // Optional description (nullable)
// "created_at": string, // Creation timestamp (RFC3339)
// "last_modified": string, // Last modification timestamp (RFC3339)
// "active": bool, // Whether the location is active
// "timezone_id": string // IANA timezone ID
// },
// "message": string // Success message
// }
func RunLocationCreate(args []string) {
fs := flag.NewFlagSet("location create", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
datasetID := fs.String("dataset", "", "Dataset ID (required)")
name := fs.String("name", "", "Location name (required)")
lat := fs.String("lat", "", "Latitude in decimal degrees (required)")
lon := fs.String("lon", "", "Longitude in decimal degrees (required)")
tz := fs.String("timezone", "", "IANA timezone ID (required, e.g. Pacific/Auckland)")
description := fs.String("description", "", "Location description (optional)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak location create [options]\n\n")
fmt.Fprintf(os.Stderr, "Create a new location with GPS coordinates.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak location create --db ./db/skraak.duckdb --dataset abc123 --name \"Site A\" --lat -36.85 --lon 174.76 --timezone Pacific/Auckland\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *datasetID == "" {
missing = append(missing, "--dataset")
}
if *name == "" {
missing = append(missing, "--name")
}
if *lat == "" {
missing = append(missing, "--lat")
}
if *lon == "" {
missing = append(missing, "--lon")
}
if *tz == "" {
missing = append(missing, "--timezone")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

// Parse floats
latitude, err := strconv.ParseFloat(*lat, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: invalid latitude: %v\n", err)
os.Exit(1)
}
longitude, err := strconv.ParseFloat(*lon, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: invalid longitude: %v\n", err)
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.LocationInput{
DatasetID: datasetID,
Name: name,
Latitude: &latitude,
Longitude: &longitude,
TimezoneID: tz,
Description: description,
}

output, err := tools.CreateOrUpdateLocation(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}

// RunLocationUpdate updates an existing location.
//
// JSON output schema: same as RunLocationCreate
func RunLocationUpdate(args []string) {
fs := flag.NewFlagSet("location update", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
id := fs.String("id", "", "Location ID (required)")
name := fs.String("name", "", "New location name (optional)")
lat := fs.String("lat", "", "New latitude (optional)")
lon := fs.String("lon", "", "New longitude (optional)")
tz := fs.String("timezone", "", "New IANA timezone ID (optional)")
description := fs.String("description", "", "New location description (optional)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak location update [options]\n\n")
fmt.Fprintf(os.Stderr, "Update an existing location. Only provided fields are updated.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak location update --db ./db/skraak.duckdb --id loc123 --name \"New Name\"\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *id == "" {
missing = append(missing, "--id")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

// Parse optional floats
var latitude, longitude *float64
if *lat != "" {
latVal, err := strconv.ParseFloat(*lat, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: invalid latitude: %v\n", err)
os.Exit(1)
}
latitude = &latVal
}
if *lon != "" {
lonVal, err := strconv.ParseFloat(*lon, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: invalid longitude: %v\n", err)
os.Exit(1)
}
longitude = &lonVal
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

// Build input - only set fields that were provided (non-empty)
input := tools.LocationInput{
ID: id,
}
if *name != "" {
input.Name = name
}
if latitude != nil {
input.Latitude = latitude
}
if longitude != nil {
input.Longitude = longitude
}
if *tz != "" {
input.TimezoneID = tz
}
if *description != "" {
input.Description = description
}

output, err := tools.CreateOrUpdateLocation(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}
file addition: isnight.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"flag"
"fmt"
"os"

"skraak/tools"
)

// RunIsNight handles the "isnight" subcommand
//
// JSON output schema (full):
//
// {
// "file_path": string, // Path to the WAV file
// "timestamp_utc": string, // Recording start timestamp (UTC)
// "solar_night": bool, // True if recorded during solar night
// "civil_night": bool, // True if recorded during civil night
// "diurnal_active": bool, // True if during diurnal active period
// "moon_phase": float, // Moon phase (0.0=new, 1.0=full)
// "duration_seconds": float, // Recording duration in seconds
// "timestamp_source": string, // How timestamp was derived (comment/filename/mtime)
// "midpoint_utc": string, // Recording midpoint timestamp (UTC)
// "sunrise_utc": string, // Sunrise time (UTC), omitted if not applicable
// "sunset_utc": string, // Sunset time (UTC), omitted if not applicable
// "dawn_utc": string, // Civil dawn time (UTC), omitted if not applicable
// "dusk_utc": string // Civil dusk time (UTC), omitted if not applicable
// }
//
// JSON output schema (--brief):
//
// {
// "file_path": string, // Path to the WAV file
// "solar_night": bool // True if recorded during solar night
// }
func RunIsNight(args []string) {
fs := flag.NewFlagSet("isnight", flag.ExitOnError)
filePath := fs.String("file", "", "Path to WAV file (required)")
lat := fs.Float64("lat", 0, "Latitude in decimal degrees (required)")
lng := fs.Float64("lng", 0, "Longitude in decimal degrees (required)")
timezone := fs.String("timezone", "UTC", "IANA timezone ID for filename timestamps (e.g. Pacific/Auckland)")
brief := fs.Bool("brief", false, "Output only file_path and solar_night (saves tokens for batch use)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak isnight --file <path> --lat <lat> --lng <lng> [--timezone <tz>] [--brief]\n\n")
fmt.Fprintf(os.Stderr, "Determine if a WAV file was recorded at night based on file metadata and GPS coordinates.\n\n")
fmt.Fprintf(os.Stderr, "Uses the recording midpoint (not start time) for astronomical calculations.\n")
fmt.Fprintf(os.Stderr, "Timestamp resolution: AudioMoth comment → filename → file modification time.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak isnight --file recording.wav --lat -36.85 --lng 174.76\n")
fmt.Fprintf(os.Stderr, " skraak isnight --file recording.wav --lat -36.85 --lng 174.76 --timezone Pacific/Auckland\n")
fmt.Fprintf(os.Stderr, " skraak isnight --file recording.wav --lat 51.51 --lng -0.13 | jq '.solar_night'\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

if *filePath == "" {
fmt.Fprintf(os.Stderr, "Error: --file is required\n\n")
fs.Usage()
os.Exit(1)
}
if *lat == 0 && *lng == 0 {
fmt.Fprintf(os.Stderr, "Error: --lat and --lng are required\n\n")
fs.Usage()
os.Exit(1)
}

output, err := tools.IsNight(tools.IsNightInput{
FilePath: *filePath,
Lat: *lat,
Lng: *lng,
Timezone: *timezone,
})
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

var encErr error
if *brief {
enc := json.NewEncoder(os.Stdout)
encErr = enc.Encode(map[string]any{
"file_path": output.FilePath,
"solar_night": output.SolarNight,
})
} else {
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
encErr = enc.Encode(output)
}
if encErr != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", encErr)
os.Exit(1)
}
}
file addition: import.go (----------)

[0.1037540]

package cmd

import (
"context"
"encoding/json"
"flag"
"fmt"
"os"

"skraak/tools"
)

// RunImport handles the "import" subcommand
func RunImport(args []string) {
if len(args) < 1 {
printImportUsage()
os.Exit(1)
}

switch args[0] {
case "bulk":
runImportBulk(args[1:])
case "file":
runImportFile(args[1:])
case "folder":
runImportFolder(args[1:])
case "segments":
runImportSegments(args[1:])
case "unstructured":
runImportUnstructured(args[1:])
default:
fmt.Fprintf(os.Stderr, "Unknown import subcommand: %s\n\n", args[0])
printImportUsage()
os.Exit(1)
}
}

func printImportUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak import <subcommand> [options]\n\n")
fmt.Fprintf(os.Stderr, "Subcommands:\n")
fmt.Fprintf(os.Stderr, " file Import a single WAV file (structured datasets)\n")
fmt.Fprintf(os.Stderr, " folder Import all WAV files from a folder (structured datasets)\n")
fmt.Fprintf(os.Stderr, " bulk Bulk import WAV files from CSV (structured datasets)\n")
fmt.Fprintf(os.Stderr, " unstructured Import WAV files into unstructured dataset (no location/cluster)\n")
fmt.Fprintf(os.Stderr, " segments Import segments from AviaNZ .data files (structured datasets)\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak import bulk --db ./db/skraak.duckdb --dataset abc123 --csv import.csv --log progress.log\n")
fmt.Fprintf(os.Stderr, " skraak import file --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --file /path/to/file.wav\n")
fmt.Fprintf(os.Stderr, " skraak import folder --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --folder /path/to/folder\n")
fmt.Fprintf(os.Stderr, " skraak import segments --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --folder /path/to/folder --mapping mapping.json\n")
fmt.Fprintf(os.Stderr, " skraak import unstructured --db ./db/skraak.duckdb --dataset abc123 --folder /path/to/folder\n")
}

// runImportBulk bulk imports WAV files across multiple locations/clusters using a CSV file.
//
// JSON output schema:
//
// {
// "total_locations": int, // Total locations in CSV
// "clusters_created": int, // New clusters created
// "clusters_existing": int, // Existing clusters reused
// "total_files_scanned": int, // Total WAV files found
// "files_imported": int, // Successfully imported files
// "files_duplicate": int, // Duplicate files skipped
// "files_error": int, // Files that failed to import
// "processing_time": string, // Human-readable duration
// "errors": [string] // Error messages (omitted if empty)
// }
func runImportBulk(args []string) {
fs := flag.NewFlagSet("import bulk", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
datasetID := fs.String("dataset", "", "Dataset ID (required)")
csvPath := fs.String("csv", "", "Path to CSV file (required)")
logPath := fs.String("log", "", "Path to progress log file (required)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak import bulk [options]\n\n")
fmt.Fprintf(os.Stderr, "Bulk import WAV files across multiple locations/clusters using a CSV file.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nCSV format: location_name,location_id,directory_path,date_range,sample_rate,file_count\n")
fmt.Fprintf(os.Stderr, "\nMonitor progress: tail -f <log-file>\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *datasetID == "" {
missing = append(missing, "--dataset")
}
if *csvPath == "" {
missing = append(missing, "--csv")
}
if *logPath == "" {
missing = append(missing, "--log")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

// Set DB path and run
tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.BulkFileImportInput{
DatasetID: *datasetID,
CSVPath: *csvPath,
LogFilePath: *logPath,
}

fmt.Fprintf(os.Stderr, "Starting bulk import...\n")
fmt.Fprintf(os.Stderr, " Database: %s\n", *dbPath)
fmt.Fprintf(os.Stderr, " Dataset: %s\n", *datasetID)
fmt.Fprintf(os.Stderr, " CSV: %s\n", *csvPath)
fmt.Fprintf(os.Stderr, " Log: %s\n", *logPath)
fmt.Fprintf(os.Stderr, "\nMonitor progress: tail -f %s\n\n", *logPath)

output, err := tools.BulkFileImport(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
// Still print partial output if available
if output.TotalLocations > 0 || output.FilesImported > 0 {
printJSON(output)
}
os.Exit(1)
}

printJSON(output)
}

// runImportFile imports a single WAV file into the database.
//
// JSON output schema:
//
// {
// "file_id": string, // Generated 21-character nanoid
// "file_name": string, // Base filename
// "hash": string, // XXH64 hash (16-character hex)
// "duration_seconds": float, // File duration in seconds
// "sample_rate": int, // Sample rate in Hz
// "timestamp_local": string, // Local timestamp (RFC3339)
// "is_audiomoth": bool, // AudioMoth detection
// "is_duplicate": bool, // Skipped as duplicate
// "processing_time": string, // Duration string
// "error": string // Error message if failed (omitted if nil)
// }
func runImportFile(args []string) {
fs := flag.NewFlagSet("import file", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
datasetID := fs.String("dataset", "", "Dataset ID (required)")
locationID := fs.String("location", "", "Location ID (required)")
clusterID := fs.String("cluster", "", "Cluster ID (required)")
filePath := fs.String("file", "", "Path to WAV file (required)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak import file [options]\n\n")
fmt.Fprintf(os.Stderr, "Import a single WAV file into the database.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak import file --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --file /path/to/file.wav\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *datasetID == "" {
missing = append(missing, "--dataset")
}
if *locationID == "" {
missing = append(missing, "--location")
}
if *clusterID == "" {
missing = append(missing, "--cluster")
}
if *filePath == "" {
missing = append(missing, "--file")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.ImportFileInput{
FilePath: *filePath,
DatasetID: *datasetID,
LocationID: *locationID,
ClusterID: *clusterID,
}

fmt.Fprintf(os.Stderr, "Importing file: %s\n", *filePath)

output, err := tools.ImportFile(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}

// runImportFolder imports all WAV files from a folder into the database.
//
// JSON output schema:
//
// {
// "summary": {
// "total_files": int, // Total WAV files found
// "imported_files": int, // Successfully imported
// "skipped_files": int, // Duplicates skipped
// "failed_files": int, // Failed imports
// "audiomoth_files": int, // AudioMoth files detected
// "total_duration_seconds": float, // Total duration imported
// "processing_time": string // Human-readable duration
// },
// "file_ids": [string], // List of successfully imported file IDs
// "errors": [ // Import errors (omitted if empty)
// {"file_name": string, "error": string, "stage": string}
// ]
// }
func runImportFolder(args []string) {
fs := flag.NewFlagSet("import folder", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
datasetID := fs.String("dataset", "", "Dataset ID (required)")
locationID := fs.String("location", "", "Location ID (required)")
clusterID := fs.String("cluster", "", "Cluster ID (required)")
folderPath := fs.String("folder", "", "Path to folder containing WAV files (required)")
recursive := fs.Bool("recursive", true, "Scan subfolders recursively (default: true)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak import folder [options]\n\n")
fmt.Fprintf(os.Stderr, "Import all WAV files from a folder into the database.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak import folder --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --folder /path/to/folder\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *datasetID == "" {
missing = append(missing, "--dataset")
}
if *locationID == "" {
missing = append(missing, "--location")
}
if *clusterID == "" {
missing = append(missing, "--cluster")
}
if *folderPath == "" {
missing = append(missing, "--folder")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.ImportAudioFilesInput{
FolderPath: *folderPath,
DatasetID: *datasetID,
LocationID: *locationID,
ClusterID: *clusterID,
Recursive: recursive,
}

fmt.Fprintf(os.Stderr, "Importing from folder: %s\n", *folderPath)
if *recursive {
fmt.Fprintf(os.Stderr, "Scanning recursively...\n")
}

output, err := tools.ImportAudioFiles(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
// Still print partial results if available
if len(output.FileIDs) > 0 {
printJSON(output)
}
os.Exit(1)
}

printJSON(output)
}

// runImportSegments imports segments from AviaNZ .data files into the database.
//
// JSON output schema:
//
// {
// "summary": {
// "data_files_found": int, // .data files found
// "data_files_processed": int, // .data files processed
// "total_segments": int, // Total segments in .data files
// "imported_segments": int, // Successfully imported segments
// "imported_labels": int, // Successfully imported labels
// "imported_subtypes": int, // Successfully imported subtypes
// "processing_time_ms": int // Processing time in milliseconds
// },
// "segments": [
// {
// "segment_id": string, // Generated segment ID
// "file_name": string, // Source WAV filename
// "start_time": float, // Segment start time in seconds
// "end_time": float, // Segment end time in seconds
// "freq_low": float, // Low frequency bound
// "freq_high": float, // High frequency bound
// "labels": [
// {
// "label_id": string, // Generated label ID
// "species": string, // Species name
// "calltype": string, // Call type (omitted if empty)
// "filter": string, // Filter name
// "certainty": int, // Certainty level
// "comment": string // Comment (omitted if empty)
// }
// ]
// }
// ],
// "errors": [ // Import errors (omitted if empty)
// {"file": string, "stage": string, "message": string}
// ]
// }
func runImportSegments(args []string) {
fs := flag.NewFlagSet("import segments", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
datasetID := fs.String("dataset", "", "Dataset ID (required)")
locationID := fs.String("location", "", "Location ID (required)")
clusterID := fs.String("cluster", "", "Cluster ID (required)")
folderPath := fs.String("folder", "", "Path to folder containing .data files (required)")
mappingPath := fs.String("mapping", "", "Path to mapping JSON file (required)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak import segments [options]\n\n")
fmt.Fprintf(os.Stderr, "Import segments from AviaNZ .data files into the database.\n")
fmt.Fprintf(os.Stderr, "Applies species/calltype mapping from JSON file.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nMapping file format:\n")
fmt.Fprintf(os.Stderr, " {\n")
fmt.Fprintf(os.Stderr, " \"GSK\": {\"species\": \"Roroa\", \"calltypes\": {\"Male\": \"Male - Solo\"}},\n")
fmt.Fprintf(os.Stderr, " \"Don't Know\": {\"species\": \"Don't Know\"}\n")
fmt.Fprintf(os.Stderr, " }\n")
fmt.Fprintf(os.Stderr, "\nInvariants:\n")
fmt.Fprintf(os.Stderr, " - All file hashes must already exist in database for the cluster\n")
fmt.Fprintf(os.Stderr, " - All files must have no existing labels (fresh imports only)\n")
fmt.Fprintf(os.Stderr, " - All filters, species, and calltypes must exist in database\n")
fmt.Fprintf(os.Stderr, " - Bookmark flags are ignored (not stored in database)\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak import segments --db ./db/skraak.duckdb --dataset dset_id123 --location loc_id456 --cluster clust_id789 --folder /path/to/data --mapping mapping.json\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *datasetID == "" {
missing = append(missing, "--dataset")
}
if *locationID == "" {
missing = append(missing, "--location")
}
if *clusterID == "" {
missing = append(missing, "--cluster")
}
if *folderPath == "" {
missing = append(missing, "--folder")
}
if *mappingPath == "" {
missing = append(missing, "--mapping")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.ImportSegmentsInput{
Folder: *folderPath,
Mapping: *mappingPath,
DatasetID: *datasetID,
LocationID: *locationID,
ClusterID: *clusterID,
ProgressHandler: func(processed, total int, message string) {
if total > 0 {
percent := float64(processed) / float64(total) * 100
fmt.Fprintf(os.Stderr, "\rProcessing .data files: %d/%d (%.0f%%) - %s", processed, total, percent, message)
if processed == total {
fmt.Fprintf(os.Stderr, "\n")
}
}
},
}

fmt.Fprintf(os.Stderr, "Importing segments from: %s\n", *folderPath)
fmt.Fprintf(os.Stderr, "Using mapping: %s\n", *mappingPath)

output, err := tools.ImportSegments(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "\nError: %v\n", err)
// Still print partial results if available
if len(output.Segments) > 0 || len(output.Errors) > 0 {
printJSON(output)
}
os.Exit(1)
}

fmt.Fprintf(os.Stderr, "\nImport complete:\n")
fmt.Fprintf(os.Stderr, " Data files processed: %d\n", output.Summary.DataFilesProcessed)
fmt.Fprintf(os.Stderr, " Segments imported: %d\n", output.Summary.ImportedSegments)
fmt.Fprintf(os.Stderr, " Labels imported: %d\n", output.Summary.ImportedLabels)
fmt.Fprintf(os.Stderr, " Subtypes imported: %d\n", output.Summary.ImportedSubtypes)

printJSON(output)
}

// runImportUnstructured imports WAV files into an unstructured dataset.
//
// JSON output schema:
//
// {
// "total_files": int, // Total WAV files found
// "imported_files": int, // Successfully imported
// "skipped_files": int, // Duplicates skipped
// "failed_files": int, // Failed imports
// "total_duration_seconds": float, // Total duration imported
// "processing_time": string, // Human-readable duration
// "errors": [ // Import errors (omitted if empty)
// {"file_name": string, "error": string, "stage": string}
// ]
// }
func runImportUnstructured(args []string) {
fs := flag.NewFlagSet("import unstructured", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
datasetID := fs.String("dataset", "", "Dataset ID (required - must be 'unstructured' type)")
folderPath := fs.String("folder", "", "Path to folder containing WAV files (required)")
recursive := fs.Bool("recursive", true, "Scan subfolders recursively (default: true)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak import unstructured [options]\n\n")
fmt.Fprintf(os.Stderr, "Import WAV files into an unstructured dataset.\n")
fmt.Fprintf(os.Stderr, "Files are stored with minimal metadata (hash, duration, sample_rate, file modification time).\n")
fmt.Fprintf(os.Stderr, "No location/cluster hierarchy required.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak import unstructured --db ./db/skraak.duckdb --dataset abc123 --folder /path/to/folder\n")
fmt.Fprintf(os.Stderr, " skraak import unstructured --db ./db/skraak.duckdb --dataset abc123 --folder /path/to/folder --recursive=false\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *datasetID == "" {
missing = append(missing, "--dataset")
}
if *folderPath == "" {
missing = append(missing, "--folder")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.ImportUnstructuredInput{
DatasetID: *datasetID,
FolderPath: *folderPath,
Recursive: recursive,
}

fmt.Fprintf(os.Stderr, "Importing into unstructured dataset: %s\n", *datasetID)
fmt.Fprintf(os.Stderr, "Scanning folder: %s\n", *folderPath)
if *recursive {
fmt.Fprintf(os.Stderr, "Scanning recursively...\n")
}

output, err := tools.ImportUnstructured(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}

func printJSON(v any) {
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(v); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: export.go (----------)

[0.1037540]

package cmd

import (
"context"
"encoding/json"
"flag"
"fmt"
"os"

"skraak/tools"
)

// RunExport handles the "export" subcommand
//
// export dataset JSON output schema:
//
// {
// "dataset_id": string, // ID of the exported dataset
// "dataset_name": string, // Name of the exported dataset
// "output_path": string, // Path to the output database
// "row_counts": {string: int}, // Row counts per table (table_name -> count)
// "file_size_mb": float, // Output file size in MB (omitted if dry run)
// "dry_run": bool, // Whether this was a dry run
// "message": string // Summary message
// }
func RunExport(args []string) {
if len(args) < 1 {
printExportUsage()
os.Exit(1)
}

switch args[0] {
case "dataset":
runExportDataset(args[1:])
default:
fmt.Fprintf(os.Stderr, "Unknown export subcommand: %s\n\n", args[0])
printExportUsage()
os.Exit(1)
}
}

func printExportUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak export <subcommand> [options]\n\n")
fmt.Fprintf(os.Stderr, "Subcommands:\n")
fmt.Fprintf(os.Stderr, " dataset Export a dataset with all related data\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb\n")
fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --dry-run\n")
}

func runExportDataset(args []string) {
fs := flag.NewFlagSet("export dataset", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to source DuckDB database (required)")
datasetID := fs.String("id", "", "Dataset ID to export (required)")
output := fs.String("output", "", "Output database path (required)")
dryRun := fs.Bool("dry-run", false, "Show what would be exported without creating file")
force := fs.Bool("force", false, "Overwrite existing output file")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak export dataset --db <path> --id <dataset_id> --output <path> [options]\n\n")
fmt.Fprintf(os.Stderr, "Export a dataset with all related data to a new DuckDB database.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb\n")
fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --dry-run\n")
fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --force\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *datasetID == "" {
missing = append(missing, "--id")
}
if *output == "" {
missing = append(missing, "--output")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

tools.SetDBPath(*dbPath)

input := tools.ExportDatasetInput{
DatasetID: *datasetID,
Output: *output,
DryRun: *dryRun,
Force: *force,
}

outputResult, err := tools.ExportDataset(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(outputResult); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: dataset.go (----------)

[0.1037540]

package cmd

import (
"context"
"flag"
"fmt"
"os"

"skraak/tools"
)

// RunDatasetCreate creates a new dataset.
//
// JSON output schema:
//
// {
// "dataset": {
// "id": string, // Dataset ID (12 characters)
// "name": string, // Dataset name
// "description": string, // Optional description (nullable)
// "created_at": string, // Creation timestamp (RFC3339)
// "last_modified": string, // Last modification timestamp (RFC3339)
// "active": bool, // Whether the dataset is active
// "type": string // Dataset type: "structured"/"unstructured"/"test"/"train"
// },
// "message": string // Success message
// }
func RunDatasetCreate(args []string) {
fs := flag.NewFlagSet("create dataset", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
name := fs.String("name", "", "Dataset name (required)")
dsType := fs.String("type", "structured", "Dataset type: structured (default), unstructured, test, train")
description := fs.String("description", "", "Dataset description (optional)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak create dataset [options]\n\n")
fmt.Fprintf(os.Stderr, "Create a new dataset.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak create dataset --db ./db/skraak.duckdb --name \"My Dataset\"\n")
fmt.Fprintf(os.Stderr, " skraak create dataset --db ./db/skraak.duckdb --name \"Training Data\" --type train --description \"For ML training\"\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *name == "" {
missing = append(missing, "--name")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.DatasetInput{
Name: name,
Type: dsType,
Description: description,
}

output, err := tools.CreateOrUpdateDataset(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}

// RunDatasetUpdate updates an existing dataset.
//
// JSON output schema: same as RunDatasetCreate
func RunDatasetUpdate(args []string) {
fs := flag.NewFlagSet("update dataset", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
id := fs.String("id", "", "Dataset ID (required)")
name := fs.String("name", "", "New dataset name")
dsType := fs.String("type", "", "New dataset type: structured, unstructured, test, train")
description := fs.String("description", "", "New dataset description")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak update dataset [options]\n\n")
fmt.Fprintf(os.Stderr, "Update an existing dataset. Only provided fields are updated.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak update dataset --db ./db/skraak.duckdb --id abc123 --name \"Updated Name\"\n")
fmt.Fprintf(os.Stderr, " skraak update dataset --db ./db/skraak.duckdb --id abc123 --type train\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *id == "" {
missing = append(missing, "--id")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

// Build input - only set fields that were provided (non-empty)
input := tools.DatasetInput{
ID: id,
}
if *name != "" {
input.Name = name
}
if *dsType != "" {
input.Type = dsType
}
if *description != "" {
input.Description = description
}

output, err := tools.CreateOrUpdateDataset(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}
file addition: create.go (----------)

[0.1037540]

package cmd

import (
"fmt"
"os"
)

// RunCreate handles the "create" command
func RunCreate(args []string) {
if len(args) < 1 {
printCreateUsage()
os.Exit(1)
}

switch args[0] {
case "dataset":
RunDatasetCreate(args[1:])
case "location":
RunLocationCreate(args[1:])
case "cluster":
RunClusterCreate(args[1:])
case "pattern":
RunPatternCreate(args[1:])
default:
fmt.Fprintf(os.Stderr, "Unknown resource to create: %s\n", args[0])
printCreateUsage()
os.Exit(1)
}
}

func printCreateUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak create <resource> [options]\n\n")
fmt.Fprintf(os.Stderr, "Resources:\n")
fmt.Fprintf(os.Stderr, " dataset Create a new dataset\n")
fmt.Fprintf(os.Stderr, " location Create a new location\n")
fmt.Fprintf(os.Stderr, " cluster Create a new cluster\n")
fmt.Fprintf(os.Stderr, " pattern Create a new pattern\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak create dataset --db ./db/skraak.duckdb --name \"Test Dataset\"\n")
fmt.Fprintf(os.Stderr, " skraak create location --db ./db/skraak.duckdb --dataset abc123 --name \"Site A\" --lat -36.85 --lon 174.76 --timezone Pacific/Auckland\n")
fmt.Fprintf(os.Stderr, " skraak create cluster --db ./db/skraak.duckdb --dataset abc123 --location loc456 --name \"2024-01\" --sample-rate 250000\n")
fmt.Fprintf(os.Stderr, " skraak create pattern --db ./db/skraak.duckdb --dataset abc123 --name \"Recording Schedule\" --type daily --start-time 18:00 --end-time 06:00\n")
}
file addition: common.go (----------)

[0.1037540]

package cmd

import (
"fmt"
"os"

"skraak/db"
)

// initEventLog configures transaction event logging for the given database path.
// Returns a cleanup function that should be deferred by the caller.
func initEventLog(dbPath string) func() {
db.SetEventLogConfig(db.EventLogConfig{
Enabled: true,
Path: dbPath + ".events.jsonl",
})
return func() {
if err := db.CloseEventLog(); err != nil {
fmt.Fprintf(os.Stderr, "Warning: failed to close event log: %v\n", err)
}
}
}
file addition: cluster.go (----------)

[0.1037540]

package cmd

import (
"context"
"flag"
"fmt"
"os"
"strconv"

"skraak/tools"
)

// RunClusterCreate creates a new cluster for grouping recordings.
//
// JSON output schema:
//
// {
// "cluster": {
// "id": string, // Cluster ID (12 characters)
// "dataset_id": string, // Parent dataset ID
// "location_id": string, // Parent location ID
// "name": string, // Cluster name
// "description": string, // Optional description (nullable)
// "created_at": string, // Creation timestamp (RFC3339)
// "last_modified": string, // Last modification timestamp (RFC3339)
// "active": bool, // Whether the cluster is active
// "cyclic_recording_pattern_id": string, // Optional pattern ID (nullable)
// "sample_rate": int // Sample rate in Hz
// },
// "message": string // Success message
// }
func RunClusterCreate(args []string) {
fs := flag.NewFlagSet("cluster create", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
datasetID := fs.String("dataset", "", "Dataset ID (required)")
locationID := fs.String("location", "", "Location ID (required)")
name := fs.String("name", "", "Cluster name (required)")
sampleRate := fs.String("sample-rate", "", "Sample rate in Hz (required)")
description := fs.String("description", "", "Cluster description (optional)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak cluster create [options]\n\n")
fmt.Fprintf(os.Stderr, "Create a new cluster for grouping recordings.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak cluster create --db ./db/skraak.duckdb --dataset abc123 --location loc456 --name \"2024-01\" --sample-rate 250000\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *datasetID == "" {
missing = append(missing, "--dataset")
}
if *locationID == "" {
missing = append(missing, "--location")
}
if *name == "" {
missing = append(missing, "--name")
}
if *sampleRate == "" {
missing = append(missing, "--sample-rate")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

// Parse sample rate
sr, err := strconv.Atoi(*sampleRate)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: invalid sample rate: %v\n", err)
os.Exit(1)
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

input := tools.ClusterInput{
DatasetID: datasetID,
LocationID: locationID,
Name: name,
SampleRate: &sr,
Description: description,
}

output, err := tools.CreateOrUpdateCluster(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}

// RunClusterUpdate updates an existing cluster.
//
// JSON output schema: same as RunClusterCreate
func RunClusterUpdate(args []string) {
fs := flag.NewFlagSet("cluster update", flag.ExitOnError)
dbPath := fs.String("db", "", "Path to DuckDB database (required)")
id := fs.String("id", "", "Cluster ID (required)")
name := fs.String("name", "", "New cluster name (optional)")
sampleRate := fs.String("sample-rate", "", "New sample rate in Hz (optional)")
description := fs.String("description", "", "New cluster description (optional)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak cluster update [options]\n\n")
fmt.Fprintf(os.Stderr, "Update an existing cluster. Only provided fields are updated.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak cluster update --db ./db/skraak.duckdb --id clust123 --name \"New Name\"\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
missing := []string{}
if *dbPath == "" {
missing = append(missing, "--db")
}
if *id == "" {
missing = append(missing, "--id")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

// Parse optional sample rate
var sr *int
if *sampleRate != "" {
srVal, err := strconv.Atoi(*sampleRate)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: invalid sample rate: %v\n", err)
os.Exit(1)
}
sr = &srVal
}

tools.SetDBPath(*dbPath)

defer initEventLog(*dbPath)()

// Build input - only set fields that were provided (non-empty)
input := tools.ClusterInput{
ID: id,
}
if *name != "" {
input.Name = name
}
if sr != nil {
input.SampleRate = sr
}
if *description != "" {
input.Description = description
}

output, err := tools.CreateOrUpdateCluster(context.Background(), input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

printJSON(output)
}
file addition: calls_push_certainty.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"fmt"
"os"
"strconv"

"skraak/tools"
"skraak/utils"
)

func printPushCertaintyUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls push-certainty [options]\n\n")
fmt.Fprintf(os.Stderr, "Promote certainty=90 segments to certainty=100 for a filtered set.\n")
fmt.Fprintf(os.Stderr, "Filtering logic matches 'calls classify' exactly. Reviewer is set from config.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fmt.Fprintf(os.Stderr, " --folder <path> Path to folder containing .data files (required, or --file)\n")
fmt.Fprintf(os.Stderr, " --file <path> Path to a single .data file (required, or --folder)\n")
fmt.Fprintf(os.Stderr, " --filter <name> Scope to filter name (optional)\n")
fmt.Fprintf(os.Stderr, " --species <name> Scope to species, optionally with calltype (e.g. Kiwi, Kiwi+Duet)\n")
fmt.Fprintf(os.Stderr, " --night Only act on solar-night recordings (requires --lat and --lng)\n")
fmt.Fprintf(os.Stderr, " --day Only act on solar-day recordings (requires --lat and --lng)\n")
fmt.Fprintf(os.Stderr, " --lat <float> Latitude in decimal degrees (required with --night or --day)\n")
fmt.Fprintf(os.Stderr, " --lng <float> Longitude in decimal degrees (required with --night or --day)\n")
fmt.Fprintf(os.Stderr, " --timezone <zone> IANA timezone ID (e.g. Pacific/Auckland)\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls push-certainty --folder ./data --species Kiwi\n")
fmt.Fprintf(os.Stderr, " skraak calls push-certainty --folder ./data --species Kiwi --night --lat -45.5 --lng 167.4\n")
}

// runCallsPushCertainty promotes certainty=90 segments to certainty=100 for a filtered set.
//
// JSON output schema:
//
// {
// "segments_updated": int, // Number of segments promoted from 90→100
// "files_updated": int, // Number of .data files modified
// "time_filtered_count": int // Files skipped by --night/--day filter
// }
func runCallsPushCertainty(args []string) {
var folder, file, filter, species, timezone string
var night, day bool
var lat, lng float64
var latSet, lngSet bool

i := 0
for i < len(args) {
arg := args[i]
switch arg {
case "--folder":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --folder requires a value\n")
os.Exit(1)
}
folder = args[i+1]
i += 2

case "--file":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --file requires a value\n")
os.Exit(1)
}
file = args[i+1]
i += 2

case "--filter":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --filter requires a value\n")
os.Exit(1)
}
filter = args[i+1]
i += 2

case "--species":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --species requires a value\n")
os.Exit(1)
}
species = args[i+1]
i += 2

case "--night":
night = true
i++

case "--day":
day = true
i++

case "--lat":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --lat requires a value\n")
os.Exit(1)
}
v, err := strconv.ParseFloat(args[i+1], 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --lat must be a number\n")
os.Exit(1)
}
lat = v
latSet = true
i += 2

case "--lng":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --lng requires a value\n")
os.Exit(1)
}
v, err := strconv.ParseFloat(args[i+1], 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --lng must be a number\n")
os.Exit(1)
}
lng = v
lngSet = true
i += 2

case "--timezone":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --timezone requires a value\n")
os.Exit(1)
}
timezone = args[i+1]
i += 2

case "--help", "-h":
printPushCertaintyUsage()
os.Exit(0)

default:
fmt.Fprintf(os.Stderr, "Error: unknown flag: %s\n\n", arg)
printPushCertaintyUsage()
os.Exit(1)
}
}

if folder == "" && file == "" {
fmt.Fprintf(os.Stderr, "Error: missing required flag: --folder or --file\n\n")
printPushCertaintyUsage()
os.Exit(1)
}
if night && day {
fmt.Fprintf(os.Stderr, "Error: --night and --day are mutually exclusive\n\n")
printPushCertaintyUsage()
os.Exit(1)
}
if (night || day) && (!latSet || !lngSet) {
fmt.Fprintf(os.Stderr, "Error: --night/--day requires both --lat and --lng\n\n")
printPushCertaintyUsage()
os.Exit(1)
}

cfg, cfgPath, err := utils.LoadConfig()
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
fmt.Fprintf(os.Stderr, "Create %s with a \"classify\" section; run `skraak calls classify --help` for an example.\n", cfgPath)
os.Exit(1)
}
if cfg.Classify.Reviewer == "" {
fmt.Fprintf(os.Stderr, "Error: %s is missing \"classify.reviewer\"\n", cfgPath)
os.Exit(1)
}

speciesName, callType := utils.ParseSpeciesCallType(species)

config := tools.PushCertaintyConfig{
Folder: folder,
File: file,
Filter: filter,
Species: speciesName,
CallType: callType,
Night: night,
Day: day,
Lat: lat,
Lng: lng,
Timezone: timezone,
Reviewer: cfg.Classify.Reviewer,
}

result, err := tools.PushCertainty(config)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

if result.TimeFilteredCount > 0 {
label := "daytime"
if config.Day {
label = "nighttime"
}
fmt.Fprintf(os.Stderr, "Skipped %d %s files\n", result.TimeFilteredCount, label)
}
fmt.Fprintf(os.Stderr, "Updated %d segments across %d files\n",
result.SegmentsUpdated, result.FilesUpdated)

enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(result); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: calls_propagate.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"flag"
"fmt"
"os"

"skraak/tools"
)

// runCallsPropagate propagates verified classifications between filters in .data files.
//
// JSON output schema (--file mode):
//
// {
// "file": string, // .data file path
// "from_filter": string, // Source filter name
// "to_filter": string, // Target filter name
// "species": string, // Species propagated
// "filters_missing": bool, // True if file lacks one or both filters (omitted if false)
// "targets_examined": int, // Target labels examined
// "propagated": int, // Target labels updated
// "skipped_no_overlap": int, // Targets with no overlapping source
// "skipped_conflict": int, // Targets with conflicting sources
// "conflicts": [ // Conflict details (omitted if empty)
// {
// "file": string, // .data filename (omitted in single-file mode)
// "target_start": float, // Target segment start (seconds)
// "target_end": float, // Target segment end (seconds)
// "target_calltype": string, // Target call type (omitted if empty)
// "source_choices": [
// {
// "start": float, // Source segment start
// "end": float, // Source segment end
// "species": string, // Source species
// "calltype": string // Source call type (omitted if empty)
// }
// ]
// }
// ],
// "changes": [ // Change details (omitted if empty)
// {
// "target_start": float, // Target segment start
// "target_end": float, // Target segment end
// "prev_species": string, // Previous species
// "prev_calltype": string, // Previous call type (omitted if empty)
// "prev_certainty": int, // Previous certainty
// "new_species": string, // New species
// "new_calltype": string, // New call type (omitted if empty)
// "new_certainty": int // New certainty
// }
// ],
// "error": string // Error message (omitted if empty)
// }
//
// JSON output schema (--folder mode):
//
// {
// "folder": string, // Folder path
// "from_filter": string, // Source filter name
// "to_filter": string, // Target filter name
// "species": string, // Species propagated
// "files_total": int, // Total .data files scanned
// "files_with_both_filters": int, // Files containing both filters
// "files_skipped_no_filter": int, // Files missing a filter
// "files_changed": int, // Files with at least one propagation
// "files_errored": int, // Files with errors
// "targets_examined": int, // Total target labels examined
// "propagated": int, // Total target labels updated
// "skipped_no_overlap": int, // Targets with no overlapping source
// "skipped_conflict": int, // Targets with conflicting sources
// "conflicts": [PropagateConflict], // See --file mode conflict schema
// "errors": [CallsPropagateOutput], // Per-file error outputs (omitted if empty)
// "error": string // Top-level error (omitted if empty)
// }
func runCallsPropagate(args []string) {
fs := flag.NewFlagSet("calls propagate", flag.ExitOnError)
file := fs.String("file", "", "Path to a single .data file (mutually exclusive with --folder)")
folder := fs.String("folder", "", "Path to folder containing .data files (mutually exclusive with --file)")
from := fs.String("from", "", "Source filter name (required)")
to := fs.String("to", "", "Target filter name (required)")
species := fs.String("species", "", "Species to propagate (required, e.g. Kiwi)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls propagate [options]\n\n")
fmt.Fprintf(os.Stderr, "Propagate verified classifications from one filter to another within a .data file\n")
fmt.Fprintf(os.Stderr, "or across every .data file in a folder.\n\n")
fmt.Fprintf(os.Stderr, "Only source labels with certainty=100 and matching --species are considered.\n")
fmt.Fprintf(os.Stderr, "Target labels (filter=--to) are updated when their certainty is 70 or 0.\n")
fmt.Fprintf(os.Stderr, "Updated target labels are set to certainty=90; file reviewer is set to \"Skraak\".\n")
fmt.Fprintf(os.Stderr, "Targets already at certainty=100 or 90 are left alone.\n")
fmt.Fprintf(os.Stderr, "Files that do not contain both --from and --to filter labels are skipped.\n\n")
fmt.Fprintf(os.Stderr, "Exactly one of --file or --folder is required.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls propagate --file rec.wav.data \\\n")
fmt.Fprintf(os.Stderr, " --from opensoundscape-kiwi-1.2 --to opensoundscape-kiwi-1.5 --species Kiwi\n\n")
fmt.Fprintf(os.Stderr, " skraak calls propagate --folder ./recordings \\\n")
fmt.Fprintf(os.Stderr, " --from opensoundscape-kiwi-1.2 --to opensoundscape-kiwi-1.5 --species Kiwi\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

if (*file == "") == (*folder == "") {
fmt.Fprintf(os.Stderr, "Error: exactly one of --file or --folder is required\n\n")
fs.Usage()
os.Exit(1)
}

missing := []string{}
if *from == "" {
missing = append(missing, "--from")
}
if *to == "" {
missing = append(missing, "--to")
}
if *species == "" {
missing = append(missing, "--species")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
fs.Usage()
os.Exit(1)
}

enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")

if *file != "" {
result, err := tools.CallsPropagate(tools.CallsPropagateInput{
File: *file,
FromFilter: *from,
ToFilter: *to,
Species: *species,
})
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %s\n", result.Error)
os.Exit(1)
}
if err := enc.Encode(result); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
return
}

result, err := tools.CallsPropagateFolder(tools.CallsPropagateFolderInput{
Folder: *folder,
FromFilter: *from,
ToFilter: *to,
Species: *species,
})
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %s\n", result.Error)
os.Exit(1)
}
fmt.Fprintf(os.Stderr,
"Files: %d total, %d with both filters, %d skipped (missing filter), %d changed, %d errored\n",
result.FilesTotal, result.FilesWithBothFilters, result.FilesSkippedNoFilter,
result.FilesChanged, result.FilesErrored)
fmt.Fprintf(os.Stderr,
"Targets: %d examined, %d propagated, %d no-overlap, %d conflicts\n",
result.TargetsExamined, result.Propagated, result.SkippedNoOverlap, result.SkippedConflict)
if err := enc.Encode(result); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: calls_modify.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"fmt"
"os"
"strconv"
"strings"

"skraak/tools"
)

func printModifyUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls modify [options]\n\n")
fmt.Fprintf(os.Stderr, "Modify a label in a .data file.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fmt.Fprintf(os.Stderr, " --file <path> Path to .data file (required)\n")
fmt.Fprintf(os.Stderr, " --reviewer <name> Reviewer name (required)\n")
fmt.Fprintf(os.Stderr, " --filter <name> Filter name to match labels (required)\n")
fmt.Fprintf(os.Stderr, " --segment <start-end> Segment time range in integer seconds (required, e.g., 12-15)\n")
fmt.Fprintf(os.Stderr, " --certainty <int> Certainty value 0-100 (required)\n")
fmt.Fprintf(os.Stderr, " --species <name> Species to set (e.g., Kiwi, Kiwi+Male, Noise)\n")
fmt.Fprintf(os.Stderr, " --bookmark Mark segment as bookmarked for navigation\n")
fmt.Fprintf(os.Stderr, " --comment <text> User comment (max 140 chars, ASCII only)\n")
fmt.Fprintf(os.Stderr, "\nSegment matching:\n")
fmt.Fprintf(os.Stderr, " Segments are matched by floor(start) and ceil(end) times.\n")
fmt.Fprintf(os.Stderr, " For example, a segment from 12.3s to 14.5s matches --segment 12-15.\n")
fmt.Fprintf(os.Stderr, "\nBehavior:\n")
fmt.Fprintf(os.Stderr, " Always updates reviewer on file metadata.\n")
fmt.Fprintf(os.Stderr, " If all specified values match current values, no modification is made.\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " # Change species and certainty (incorrect classification)\n")
fmt.Fprintf(os.Stderr, " skraak calls modify --file recording.data --reviewer GLM-5 \\\n")
fmt.Fprintf(os.Stderr, " --filter mymodel --segment 12-15 --species Kiwi+Male --certainty 100\n\n")
fmt.Fprintf(os.Stderr, " # Change certainty only (correct classification)\n")
fmt.Fprintf(os.Stderr, " skraak calls modify --file recording.data --reviewer GLM-5 \\\n")
fmt.Fprintf(os.Stderr, " --filter mymodel --segment 12-15 --certainty 100\n\n")
fmt.Fprintf(os.Stderr, " # Change to Noise (clears calltype)\n")
fmt.Fprintf(os.Stderr, " skraak calls modify --file recording.data --reviewer GLM-5 \\\n")
fmt.Fprintf(os.Stderr, " --filter mymodel --segment 67-88 --species Noise --certainty 100\n\n")
fmt.Fprintf(os.Stderr, " # Bookmark a segment for later review\n")
fmt.Fprintf(os.Stderr, " skraak calls modify --file recording.data --reviewer GLM-5 \\\n")
fmt.Fprintf(os.Stderr, " --filter mymodel --segment 12-15 --certainty 100 --bookmark\n\n")
fmt.Fprintf(os.Stderr, " # Add a comment to a segment\n")
fmt.Fprintf(os.Stderr, " skraak calls modify --file recording.data --reviewer GLM-5 \\\n")
fmt.Fprintf(os.Stderr, " --filter mymodel --segment 12-15 --certainty 100 --comment \"Good example of duet\"\n")
}

// RunCallsModify handles the "calls modify" subcommand
//
// JSON output schema:
//
// {
// "file": string, // .data file path
// "segment_start": int, // Matched segment start (seconds, floored)
// "segment_end": int, // Matched segment end (seconds, ceiled)
// "species": string, // Updated species (omitted if unchanged)
// "calltype": string, // Updated call type (omitted if empty)
// "certainty": int, // Updated certainty (omitted if unchanged)
// "bookmark": bool, // Bookmark flag (omitted if not set)
// "comment": string, // Comment (omitted if empty)
// "previous_value": string, // Description of previous label value (omitted if unchanged)
// "error": string // Error message (omitted if no error)
// }
func RunCallsModify(args []string) {
var file, reviewer, filter, segment, species, comment string
var certainty int
var certaintySet, bookmark bool

// Parse arguments
i := 0
for i < len(args) {
arg := args[i]

switch arg {
case "--file":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --file requires a value\n")
os.Exit(1)
}
file = args[i+1]
i += 2

case "--reviewer":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --reviewer requires a value\n")
os.Exit(1)
}
reviewer = args[i+1]
i += 2

case "--filter":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --filter requires a value\n")
os.Exit(1)
}
filter = args[i+1]
i += 2

case "--segment":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --segment requires a value\n")
os.Exit(1)
}
segment = args[i+1]
i += 2

case "--species":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --species requires a value\n")
os.Exit(1)
}
species = args[i+1]
i += 2

case "--certainty":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --certainty requires a value\n")
os.Exit(1)
}
v, err := strconv.Atoi(args[i+1])
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --certainty must be an integer\n")
os.Exit(1)
}
certainty = v
certaintySet = true
i += 2

case "--bookmark":
bookmark = true
i++

case "--comment":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --comment requires a value\n")
os.Exit(1)
}
comment = args[i+1]
i += 2

case "-h", "--help":
printModifyUsage()
os.Exit(0)

default:
// Check for unknown flags
if strings.HasPrefix(arg, "--") {
fmt.Fprintf(os.Stderr, "Error: unknown flag: %s\n\n", arg)
printModifyUsage()
os.Exit(1)
}
i++
}
}

// Validate required flags
missing := []string{}
if file == "" {
missing = append(missing, "--file")
}
if reviewer == "" {
missing = append(missing, "--reviewer")
}
if filter == "" {
missing = append(missing, "--filter")
}
if segment == "" {
missing = append(missing, "--segment")
}
if !certaintySet {
missing = append(missing, "--certainty")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
printModifyUsage()
os.Exit(1)
}

// Validate certainty range
if certainty < 0 || certainty > 100 {
fmt.Fprintf(os.Stderr, "Error: --certainty must be between 0 and 100\n")
os.Exit(1)
}

// Build input
input := tools.CallsModifyInput{
File: file,
Reviewer: reviewer,
Filter: filter,
Segment: segment,
Species: species,
Certainty: certainty,
Comment: comment,
}
if bookmark {
input.Bookmark = &bookmark
}

// Execute
result, err := tools.CallsModify(input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %s\n", result.Error)
os.Exit(1)
}

// Output JSON
data, _ := json.Marshal(result)
fmt.Println(string(data))
}
file addition: calls_detect_anomalies.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"fmt"
"os"

"skraak/tools"
)

func printDetectAnomaliesUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls detect-anomalies [options]\n\n")
fmt.Fprintf(os.Stderr, "Compare corresponding segments across ML model filters and flag disagreements.\n")
fmt.Fprintf(os.Stderr, "Segments are matched by time overlap. Lonely segments (no overlap in all models) are skipped.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fmt.Fprintf(os.Stderr, " --folder <path> Folder containing .data files (required)\n")
fmt.Fprintf(os.Stderr, " --model <name> Filter name to compare (required, repeat for each model, min 2)\n")
fmt.Fprintf(os.Stderr, " --species <name> Scope to species or species+calltype (optional, repeat to add more)\n")
fmt.Fprintf(os.Stderr, "\nAnomaly types:\n")
fmt.Fprintf(os.Stderr, " label_mismatch Species or calltype disagrees across models\n")
fmt.Fprintf(os.Stderr, " certainty_mismatch Labels agree but certainty values differ\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls detect-anomalies --folder ./data \\\n")
fmt.Fprintf(os.Stderr, " --model opensoundscape-kiwi-1.0 --model opensoundscape-kiwi-1.2\n")
fmt.Fprintf(os.Stderr, " skraak calls detect-anomalies --folder ./data \\\n")
fmt.Fprintf(os.Stderr, " --model opensoundscape-kiwi-1.0 --model opensoundscape-kiwi-1.2 --model opensoundscape-kiwi-1.5 \\\n")
fmt.Fprintf(os.Stderr, " --species Kiwi+Duet --species Kiwi+Male\n")
}

// runCallsDetectAnomalies compares segments across ML model filters and flags disagreements.
//
// JSON output schema:
//
// {
// "folder": string, // Folder path
// "models": [string], // Model filter names compared
// "files_examined": int, // Total .data files examined
// "files_with_all_models": int, // Files containing all specified models
// "anomalies_total": int, // Total anomalies found
// "label_mismatches": int, // Species/calltype disagreements
// "certainty_mismatches": int, // Certainty disagreements
// "anomalies": [ // Anomaly details (omitted if empty)
// {
// "file": string, // .data filename
// "type": string, // "label_mismatch" | "certainty_mismatch"
// "segments": [
// {
// "model": string, // Filter name
// "start": float, // Segment start (seconds)
// "end": float, // Segment end (seconds)
// "species": string, // Species name
// "calltype": string, // Call type (omitted if empty)
// "certainty": int // Certainty level (0-100)
// }
// ]
// }
// ],
// "error": string // Error message (omitted if empty)
// }
func runCallsDetectAnomalies(args []string) {
var folder string
var models []string
var species []string

i := 0
for i < len(args) {
arg := args[i]
switch arg {
case "--folder":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --folder requires a value\n")
os.Exit(1)
}
folder = args[i+1]
i += 2

case "--model":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --model requires a value\n")
os.Exit(1)
}
models = append(models, args[i+1])
i += 2

case "--species":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --species requires a value\n")
os.Exit(1)
}
species = append(species, args[i+1])
i += 2

case "--help", "-h":
printDetectAnomaliesUsage()
os.Exit(0)

default:
fmt.Fprintf(os.Stderr, "Error: unknown flag: %s\n\n", arg)
printDetectAnomaliesUsage()
os.Exit(1)
}
}

if folder == "" {
fmt.Fprintf(os.Stderr, "Error: --folder is required\n\n")
printDetectAnomaliesUsage()
os.Exit(1)
}
if len(models) < 2 {
fmt.Fprintf(os.Stderr, "Error: at least 2 --model values required\n\n")
printDetectAnomaliesUsage()
os.Exit(1)
}

output, err := tools.DetectAnomalies(tools.DetectAnomaliesInput{
Folder: folder,
Models: models,
Species: species,
})
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

fmt.Fprintf(os.Stderr, "Examined %d files, %d had all models\n",
output.FilesExamined, output.FilesWithAllModels)
fmt.Fprintf(os.Stderr, "Anomalies: %d total (%d label, %d certainty)\n",
output.AnomaliesTotal, output.LabelMismatches, output.CertaintyMismatches)

enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: calls_clip_labels.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"flag"
"fmt"
"os"
"sort"

"skraak/tools"
)

// runCallsClipLabels handles the "calls clip-labels" subcommand.
func runCallsClipLabels(args []string) {
fs := flag.NewFlagSet("calls clip-labels", flag.ExitOnError)
folder := fs.String("folder", "", "Folder containing .data files (required)")
mapping := fs.String("mapping", "", "Path to mapping.json (required)")
filter := fs.String("filter", "", "Restrict to a single filter name (default: all filters)")
output := fs.String("output", "./clip_labels.csv", "Output CSV path")
clipDuration := fs.Float64("clip-duration", 4.0, "Clip duration in seconds")
clipOverlap := fs.Float64("clip-overlap", 0.5, "Clip overlap in seconds")
minLabelOverlap := fs.Float64("min-label-overlap", 0.25, "Minimum overlap (s) for an annotation to label a clip")
finalClip := fs.String("final-clip", "full", "Trailing-clip behaviour: full | remainder | extend | none")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls clip-labels [options]\n\n")
fmt.Fprintf(os.Stderr, "Generate an OpenSoundScape clip_labels-format CSV from .data files.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nSegment policy:\n")
fmt.Fprintf(os.Stderr, " - Real species → contributes mapped class to overlapping clips.\n")
fmt.Fprintf(os.Stderr, " - Mapped to __NEGATIVE__ → clip emitted, all class columns False;\n")
fmt.Fprintf(os.Stderr, " overrides positives in the same clip.\n")
fmt.Fprintf(os.Stderr, " - Mapped to __IGNORE__ → segment contributes no labels to clips.\n")
fmt.Fprintf(os.Stderr, " - Gaps → clip emitted with all class columns False.\n")
fmt.Fprintf(os.Stderr, "\nIf --output exists: append. Column-set mismatch → hard error.\n")
fmt.Fprintf(os.Stderr, "Duplicate (file, start_time, end_time) row → hard error on first.\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls clip-labels --folder ./recordings --mapping ./mapping.json\n")
fmt.Fprintf(os.Stderr, " skraak calls clip-labels --folder ./recordings --mapping ./mapping.json \\\n")
fmt.Fprintf(os.Stderr, " --filter opensoundscape-multi-1.0\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

if *folder == "" {
fmt.Fprintf(os.Stderr, "Error: --folder is required\n\n")
fs.Usage()
os.Exit(1)
}
if *mapping == "" {
fmt.Fprintf(os.Stderr, "Error: --mapping is required\n\n")
fs.Usage()
os.Exit(1)
}

input := tools.CallsClipLabelsInput{
Folder: *folder,
MappingPath: *mapping,
Filter: *filter,
OutputPath: *output,
ClipDuration: *clipDuration,
ClipOverlap: *clipOverlap,
MinLabelOverlap: *minLabelOverlap,
FinalClip: *finalClip,
}

fmt.Fprintf(os.Stderr, "Folder: %s\n", *folder)
fmt.Fprintf(os.Stderr, "Mapping: %s\n", *mapping)
fmt.Fprintf(os.Stderr, "Output: %s\n", *output)
fmt.Fprintf(os.Stderr, "Clip: duration=%.3fs overlap=%.3fs final=%s min-label-overlap=%.3fs\n",
*clipDuration, *clipOverlap, *finalClip, *minLabelOverlap)
if *filter != "" {
fmt.Fprintf(os.Stderr, "Filter: %s\n", *filter)
}

out, err := tools.CallsClipLabels(input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

fmt.Fprintf(os.Stderr, "\nResults\n")
fmt.Fprintf(os.Stderr, " .data files parsed: %d\n", out.DataFilesParsed)
fmt.Fprintf(os.Stderr, " Segments ignored (__IGNORE__): %d\n", out.SegmentsIgnored)
fmt.Fprintf(os.Stderr, " Clips excluded (__IGNORE__): %d\n", out.ClipsIgnored)
fmt.Fprintf(os.Stderr, " Clips emitted: %d\n", out.RowsWritten)
fmt.Fprintf(os.Stderr, " negative (__NEGATIVE__): %d\n", out.ClipsNegative)
fmt.Fprintf(os.Stderr, " all-False (gap): %d\n", out.ClipsAllFalseGap)
if out.AppendedToFile {
fmt.Fprintf(os.Stderr, " Appended to file: yes (%d existing rows)\n", out.ExistingRowsFound)
}
fmt.Fprintf(os.Stderr, "\nPer-class True counts:\n")
keys := make([]string, 0, len(out.PerClassTrueCount))
for k := range out.PerClassTrueCount {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
fmt.Fprintf(os.Stderr, " %-30s %d\n", k+":", out.PerClassTrueCount[k])
}

enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(out); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: calls_clip.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"fmt"
"os"
"strconv"
"strings"

"skraak/tools"
)

func printClipUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls clip [options]\n\n")
fmt.Fprintf(os.Stderr, "Generate audio clips and spectrogram images from .data file segments.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fmt.Fprintf(os.Stderr, " --file <path> Path to .data file (required if no --folder)\n")
fmt.Fprintf(os.Stderr, " --folder <path> Path to folder containing .data files (required if no --file)\n")
fmt.Fprintf(os.Stderr, " --output <path> Output folder for generated clips (required)\n")
fmt.Fprintf(os.Stderr, " --prefix <name> Prefix for output filenames (required)\n")
fmt.Fprintf(os.Stderr, " --filter <name> Filter by ML model name (optional)\n")
fmt.Fprintf(os.Stderr, " --species <name> Filter by species, optionally with calltype (e.g. Kiwi, Kiwi+Duet)\n")
fmt.Fprintf(os.Stderr, " --certainty <int> Filter by certainty value (0-100, optional)\n")
fmt.Fprintf(os.Stderr, " --size <int> Spectrogram image size in pixels (224-896, default 224)\n")
fmt.Fprintf(os.Stderr, " --color Apply L4 colormap to spectrogram (default: grayscale)\n")
fmt.Fprintf(os.Stderr, " --wav-only Generate only WAV clips, skip spectrogram PNG generation\n")
fmt.Fprintf(os.Stderr, " --night Only clip recordings made during solar night (requires --lat and --lng)\n")
fmt.Fprintf(os.Stderr, " --day Only clip recordings made during solar day (requires --lat and --lng)\n")
fmt.Fprintf(os.Stderr, " --lat <float> Latitude in decimal degrees (required with --night or --day)\n")
fmt.Fprintf(os.Stderr, " --lng <float> Longitude in decimal degrees (required with --night or --day)\n")
fmt.Fprintf(os.Stderr, " --timezone <zone> IANA timezone ID (e.g. Pacific/Auckland). Required for non-AudioMoth\n")
fmt.Fprintf(os.Stderr, " recorders whose filenames embed local time (e.g. DOC AR4).\n")
fmt.Fprintf(os.Stderr, " AudioMoth files embed a UTC timestamp in the WAV comment, so\n")
fmt.Fprintf(os.Stderr, " --timezone is not needed for AudioMoth data.\n")
fmt.Fprintf(os.Stderr, "\nOutput files:\n")
fmt.Fprintf(os.Stderr, " <prefix>_<basename>_<start>_<end>.png # spectrogram image\n")
fmt.Fprintf(os.Stderr, " <prefix>_<basename>_<start>_<end>.wav # audio clip (16kHz if downsampled)\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " # Clip all segments from a single file\n")
fmt.Fprintf(os.Stderr, " skraak calls clip --file recording.data --output ./clips --prefix train\n\n")
fmt.Fprintf(os.Stderr, " # Clip only Kiwi segments with color spectrograms at 448px\n")
fmt.Fprintf(os.Stderr, " skraak calls clip --folder ./data --output ./clips --prefix kiwi \\\n")
fmt.Fprintf(os.Stderr, " --filter opensoundscape-kiwi-1.2 --species Kiwi --size 448 --color\n\n")
fmt.Fprintf(os.Stderr, " # Clip Kiwi Duet calls\n")
fmt.Fprintf(os.Stderr, " skraak calls clip --folder ./data --output ./clips --prefix duet \\\n")
fmt.Fprintf(os.Stderr, " --filter opensoundscape-kiwi-1.2 --species Kiwi+Duet\n")
}

// RunCallsClip handles the "calls clip" subcommand
//
// JSON output schema:
//
// {
// "files_processed": int, // .data files processed
// "segments_clipped": int, // Segments that generated clips
// "night_skipped": int, // Segments skipped (--night, omitted if 0)
// "day_skipped": int, // Segments skipped (--day, omitted if 0)
// "output_files": [string], // Paths to generated clip files (.wav/.png)
// "errors": [string] // Error messages (omitted if empty)
// }
func RunCallsClip(args []string) {
var file, folder, output, prefix, filter, species, timezone string
var size, certainty int
var color, wavOnly, night, day bool
var lat, lng float64
var latSet, lngSet bool

// Default to -1 (no certainty filter)
certainty = -1

// Parse arguments
i := 0
for i < len(args) {
arg := args[i]

switch arg {
case "--file":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --file requires a value\n")
os.Exit(1)
}
file = args[i+1]
i += 2

case "--folder":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --folder requires a value\n")
os.Exit(1)
}
folder = args[i+1]
i += 2

case "--output":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --output requires a value\n")
os.Exit(1)
}
output = args[i+1]
i += 2

case "--prefix":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --prefix requires a value\n")
os.Exit(1)
}
prefix = args[i+1]
i += 2

case "--filter":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --filter requires a value\n")
os.Exit(1)
}
if filter != "" {
fmt.Fprintf(os.Stderr, "Error: --filter can only be specified once\n")
os.Exit(1)
}
filter = args[i+1]
i += 2

case "--species":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --species requires a value\n")
os.Exit(1)
}
if species != "" {
fmt.Fprintf(os.Stderr, "Error: --species can only be specified once\n")
os.Exit(1)
}
species = args[i+1]
i += 2

case "--certainty":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --certainty requires a value\n")
os.Exit(1)
}
v, err := strconv.Atoi(args[i+1])
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --certainty must be an integer\n")
os.Exit(1)
}
if v < 0 || v > 100 {
fmt.Fprintf(os.Stderr, "Error: --certainty must be between 0 and 100\n")
os.Exit(1)
}
certainty = v
i += 2

case "--size":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --size requires a value\n")
os.Exit(1)
}
v, err := strconv.Atoi(args[i+1])
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --size must be an integer\n")
os.Exit(1)
}
size = v
i += 2

case "--color":
color = true
i++

case "--wav-only":
wavOnly = true
i++

case "--night":
night = true
i++

case "--day":
day = true
i++

case "--lat":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --lat requires a value\n")
os.Exit(1)
}
v, err := strconv.ParseFloat(args[i+1], 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --lat must be a number\n")
os.Exit(1)
}
lat = v
latSet = true
i += 2

case "--lng":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --lng requires a value\n")
os.Exit(1)
}
v, err := strconv.ParseFloat(args[i+1], 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --lng must be a number\n")
os.Exit(1)
}
lng = v
lngSet = true
i += 2

case "--timezone":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --timezone requires a value\n")
os.Exit(1)
}
timezone = args[i+1]
i += 2

case "-h", "--help":
printClipUsage()
os.Exit(0)

default:
// Check for unknown flags
if strings.HasPrefix(arg, "--") {
fmt.Fprintf(os.Stderr, "Error: unknown flag: %s\n\n", arg)
printClipUsage()
os.Exit(1)
}
i++
}
}

// Validate required flags
missing := []string{}
if file == "" && folder == "" {
missing = append(missing, "--file or --folder")
}
if output == "" {
missing = append(missing, "--output")
}
if prefix == "" {
missing = append(missing, "--prefix")
}
if len(missing) > 0 {
fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
printClipUsage()
os.Exit(1)
}

if night && day {
fmt.Fprintf(os.Stderr, "Error: --night and --day are mutually exclusive\n\n")
printClipUsage()
os.Exit(1)
}
if (night || day) && (!latSet || !lngSet) {
fmt.Fprintf(os.Stderr, "Error: --night/--day requires both --lat and --lng\n\n")
printClipUsage()
os.Exit(1)
}

// Build input
input := tools.CallsClipInput{
File: file,
Folder: folder,
Output: output,
Prefix: prefix,
Filter: filter,
Species: species,
Certainty: certainty,
Size: size,
Color: color,
WavOnly: wavOnly,
Night: night,
Day: day,
Lat: lat,
Lng: lng,
Timezone: timezone,
}

// Execute
result, err := tools.CallsClip(input)
if err != nil {
// Print partial result as JSON (may contain useful info)
data, _ := json.Marshal(result)
fmt.Println(string(data))
os.Exit(1)
}

// Output JSON
data, _ := json.Marshal(result)
fmt.Println(string(data))
}
file addition: calls_classify.go (----------)

[0.1037540]

package cmd

import (
"fmt"
"os"
"strconv"
"strings"

tea "charm.land/bubbletea/v2"

"skraak/tools"
"skraak/tui"
"skraak/utils"
)

// reservedClassifyKeys are single-character keys the classify TUI handles
// itself (see tui/classify.go). User bindings to these keys would be silently
// overridden by the TUI, so we reject them at config-load time.
var reservedClassifyKeys = map[string]string{
",": "previous segment",
".": "next segment",
"0": "confirm label at certainty 100",
" ": "open comment dialog",
}

func printClassifyUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls classify [options]\n\n")
fmt.Fprintf(os.Stderr, "Interactive TUI for reviewing and classifying bird call segments.\n")
fmt.Fprintf(os.Stderr, "Reads .data files (AviaNZ format) and presents segments for labelling\n")
fmt.Fprintf(os.Stderr, "with spectrogram display and audio playback.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fmt.Fprintf(os.Stderr, " --folder <path> Path to folder containing .data files (required, or --file)\n")
fmt.Fprintf(os.Stderr, " --file <path> Path to a single .data file (required, or --folder)\n")
fmt.Fprintf(os.Stderr, " --filter <name> Filter name to scope which segments to review (optional)\n")
fmt.Fprintf(os.Stderr, " --species <name> Scope to species, optionally with calltype (e.g. Kiwi, Kiwi+Duet)\n")
fmt.Fprintf(os.Stderr, " --certainty <int> Scope to certainty value (0-100, optional)\n")
fmt.Fprintf(os.Stderr, " --sample <1-100> Randomly sample N%% of filtered calls (requires --certainty; 100 = no-op)\n")
fmt.Fprintf(os.Stderr, " --goto <filename> Start at this .data file (basename match, optional)\n")
fmt.Fprintf(os.Stderr, " --night Only review solar-night recordings (requires --lat and --lng)\n")
fmt.Fprintf(os.Stderr, " --day Only review solar-day recordings (requires --lat and --lng)\n")
fmt.Fprintf(os.Stderr, " --lat <float> Latitude in decimal degrees (required with --night or --day)\n")
fmt.Fprintf(os.Stderr, " --lng <float> Longitude in decimal degrees (required with --night or --day)\n")
fmt.Fprintf(os.Stderr, " --timezone <zone> IANA timezone ID (e.g. Pacific/Auckland). Required for non-AudioMoth\n")
fmt.Fprintf(os.Stderr, " recorders whose filenames embed local time (e.g. DOC AR4).\n")
fmt.Fprintf(os.Stderr, "\nConfig (required): ~/.skraak/config.json\n")
fmt.Fprintf(os.Stderr, " Provides reviewer, keybindings, and display flags (color/sixel/iterm/img_dims).\n")
fmt.Fprintf(os.Stderr, " Example:\n")
fmt.Fprintf(os.Stderr, " {\n")
fmt.Fprintf(os.Stderr, " \"classify\": {\n")
fmt.Fprintf(os.Stderr, " \"reviewer\": \"David\",\n")
fmt.Fprintf(os.Stderr, " \"color\": true,\n")
fmt.Fprintf(os.Stderr, " \"bindings\": {\n")
fmt.Fprintf(os.Stderr, " \"k\": \"Kiwi\",\n")
fmt.Fprintf(os.Stderr, " \"1\": \"Kiwi+Duet\",\n")
fmt.Fprintf(os.Stderr, " \"x\": \"Noise\"\n")
fmt.Fprintf(os.Stderr, " }\n")
fmt.Fprintf(os.Stderr, " }\n")
fmt.Fprintf(os.Stderr, " }\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls classify --folder /path/to/data\n")
fmt.Fprintf(os.Stderr, " skraak calls classify --file /path/to/file.data --filter opensoundscape-kiwi-1.2\n")
fmt.Fprintf(os.Stderr, " skraak calls classify --folder /path/to/data --species Kiwi+Duet\n")
}

// RunCallsClassify handles the "calls classify" subcommand
func RunCallsClassify(args []string) {
var folder, file, filter, species, gotoFile, timezone string
var certainty, sample int
var night, day bool
var lat, lng float64
var latSet, lngSet bool

// Default to -1 (no filter / no sampling)
certainty = -1
sample = -1

// Parse arguments
i := 0
for i < len(args) {
arg := args[i]

switch arg {
case "--folder":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --folder requires a value\n")
os.Exit(1)
}
folder = args[i+1]
i += 2

case "--file":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --file requires a value\n")
os.Exit(1)
}
file = args[i+1]
i += 2

case "--filter":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --filter requires a value\n")
os.Exit(1)
}
if filter != "" {
fmt.Fprintf(os.Stderr, "Error: --filter can only be specified once\n")
os.Exit(1)
}
filter = args[i+1]
i += 2

case "--species":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --species requires a value\n")
os.Exit(1)
}
if species != "" {
fmt.Fprintf(os.Stderr, "Error: --species can only be specified once\n")
os.Exit(1)
}
species = args[i+1]
i += 2

case "--certainty":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --certainty requires a value\n")
os.Exit(1)
}
v, err := strconv.Atoi(args[i+1])
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --certainty must be an integer\n")
os.Exit(1)
}
if v < 0 || v > 100 {
fmt.Fprintf(os.Stderr, "Error: --certainty must be between 0 and 100\n")
os.Exit(1)
}
certainty = v
i += 2

case "--night":
night = true
i++

case "--day":
day = true
i++

case "--lat":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --lat requires a value\n")
os.Exit(1)
}
v, err := strconv.ParseFloat(args[i+1], 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --lat must be a number\n")
os.Exit(1)
}
lat = v
latSet = true
i += 2

case "--lng":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --lng requires a value\n")
os.Exit(1)
}
v, err := strconv.ParseFloat(args[i+1], 64)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --lng must be a number\n")
os.Exit(1)
}
lng = v
lngSet = true
i += 2

case "--timezone":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --timezone requires a value\n")
os.Exit(1)
}
timezone = args[i+1]
i += 2

case "--help", "-h":
printClassifyUsage()
os.Exit(0)

case "--sample":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --sample requires a value\n")
os.Exit(1)
}
v, err := strconv.Atoi(args[i+1])
if err != nil {
fmt.Fprintf(os.Stderr, "Error: --sample must be an integer\n")
os.Exit(1)
}
if v <= 0 || v > 100 {
fmt.Fprintf(os.Stderr, "Error: --sample must be between 1 and 100\n")
os.Exit(1)
}
sample = v
i += 2

case "--goto":
if i+1 >= len(args) {
fmt.Fprintf(os.Stderr, "Error: --goto requires a value\n")
os.Exit(1)
}
gotoFile = args[i+1]
i += 2

default:
fmt.Fprintf(os.Stderr, "Error: unknown flag: %s\n\n", arg)
printClassifyUsage()
os.Exit(1)
}
}

// --sample 1-99 requires --certainty; --sample 100 is a no-op
if sample > 0 && sample < 100 && certainty < 0 {
fmt.Fprintf(os.Stderr, "Error: --sample requires --certainty to be set\n")
os.Exit(1)
}

// Validate required flags
if folder == "" && file == "" {
fmt.Fprintf(os.Stderr, "Error: missing required flag: --folder or --file\n\n")
printClassifyUsage()
os.Exit(1)
}

if night && day {
fmt.Fprintf(os.Stderr, "Error: --night and --day are mutually exclusive\n\n")
printClassifyUsage()
os.Exit(1)
}
if (night || day) && (!latSet || !lngSet) {
fmt.Fprintf(os.Stderr, "Error: --night/--day requires both --lat and --lng\n\n")
printClassifyUsage()
os.Exit(1)
}

// Load reviewer, bindings, and display flags from ~/.skraak/config.json.
cfg, cfgPath, err := utils.LoadConfig()
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
fmt.Fprintf(os.Stderr, "Create %s with a \"classify\" section; run `skraak calls classify --help` for an example.\n", cfgPath)
os.Exit(1)
}

// Validate config contents
if cfg.Classify.Reviewer == "" {
fmt.Fprintf(os.Stderr, "Error: %s is missing \"classify.reviewer\"\n", cfgPath)
os.Exit(1)
}
if len(cfg.Classify.Bindings) == 0 {
fmt.Fprintf(os.Stderr, "Error: %s is missing \"classify.bindings\" (need at least one key)\n", cfgPath)
os.Exit(1)
}

// Convert config bindings map -> []tools.KeyBinding via existing parseBind.
bindings := make([]tools.KeyBinding, 0, len(cfg.Classify.Bindings))
for key, value := range cfg.Classify.Bindings {
if len(key) != 1 {
fmt.Fprintf(os.Stderr, "Error: binding key %q in %s must be a single character\n", key, cfgPath)
os.Exit(1)
}
if purpose, reserved := reservedClassifyKeys[key]; reserved {
fmt.Fprintf(os.Stderr,
"Error: binding key %q in %s is reserved by the TUI for %s — pick a different key.\n",
key, cfgPath, purpose)
os.Exit(1)
}
bindings = append(bindings, parseBind(key+"="+value))
}

// Validate secondary_bindings: each outer key must exist in bindings,
// each inner key must be a single non-reserved char, values non-empty.
for primaryKey, inner := range cfg.Classify.SecondaryBindings {
if _, ok := cfg.Classify.Bindings[primaryKey]; !ok {
fmt.Fprintf(os.Stderr,
"Error: secondary_bindings key %q in %s has no matching primary binding\n",
primaryKey, cfgPath)
os.Exit(1)
}
for k, v := range inner {
if len(k) != 1 {
fmt.Fprintf(os.Stderr,
"Error: secondary_bindings[%q] key %q in %s must be a single character\n",
primaryKey, k, cfgPath)
os.Exit(1)
}
if purpose, reserved := reservedClassifyKeys[k]; reserved {
fmt.Fprintf(os.Stderr,
"Error: secondary_bindings[%q] key %q in %s is reserved by the TUI for %s — pick a different key.\n",
primaryKey, k, cfgPath, purpose)
os.Exit(1)
}
if v == "" {
fmt.Fprintf(os.Stderr,
"Error: secondary_bindings[%q][%q] in %s has empty calltype\n",
primaryKey, k, cfgPath)
os.Exit(1)
}
}
}

// Parse species+calltype
speciesName, callType := utils.ParseSpeciesCallType(species)

// Build config
config := tools.ClassifyConfig{
Folder: folder,
File: file,
Filter: filter,
Species: speciesName,
CallType: callType,
Certainty: certainty,
Sample: sample,
Goto: gotoFile,
Reviewer: cfg.Classify.Reviewer,
Color: cfg.Classify.Color,
ImageSize: cfg.Classify.ImgDims,
Sixel: cfg.Classify.Sixel,
ITerm: cfg.Classify.ITerm,
Bindings: bindings,
SecondaryBindings: cfg.Classify.SecondaryBindings,
Night: night,
Day: day,
Lat: lat,
Lng: lng,
Timezone: timezone,
}

// Load data files
state, err := tools.LoadDataFiles(config)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

// Show filtered counts (files with no matching segments are already pruned)
if state.TimeFilteredCount > 0 {
label := "daytime"
if config.Day {
label = "nighttime"
}
fmt.Fprintf(os.Stderr, "Skipped %d %s files\n", state.TimeFilteredCount, label)
}
fmt.Fprintf(os.Stderr, "Loaded %d files with %d matching segments\n",
len(state.DataFiles), state.TotalSegments())

if state.TotalSegments() == 0 {
fmt.Fprintf(os.Stderr, "No segments to review.\n")
os.Exit(0)
}

// Launch TUI (alt screen for clean kitty image rendering)
p := tea.NewProgram(tui.New(state))
if _, err := p.Run(); err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}
}

// parseBind parses "k=Kiwi" or "d=Kiwi+Duet" format
func parseBind(s string) tools.KeyBinding {
parts := strings.SplitN(s, "=", 2)
if len(parts) != 2 {
fmt.Fprintf(os.Stderr, "Error: invalid bind format: %s (expected key=value)\n", s)
os.Exit(1)
}

key := parts[0]
value := parts[1]

// Check for Species+CallType format
if strings.Contains(value, "+") {
valueParts := strings.SplitN(value, "+", 2)
return tools.KeyBinding{
Key: key,
Species: valueParts[0],
CallType: valueParts[1],
}
}

// Species only
return tools.KeyBinding{
Key: key,
Species: value,
}
}
file addition: calls.go (----------)

[0.1037540]

package cmd

import (
"encoding/json"
"flag"
"fmt"
"os"

"skraak/tools"
)

// RunCalls handles the "calls" command
func RunCalls(args []string) {
if len(args) < 1 {
printCallsUsage()
os.Exit(1)
}

switch args[0] {
case "from-preds":
runCallsFromPreds(args[1:])
case "from-birda":
runCallsFromBirda(args[1:])
case "from-raven":
runCallsFromRaven(args[1:])
case "show-images":
runCallsShowImages(args[1:])
case "classify":
RunCallsClassify(args[1:])
case "clip":
RunCallsClip(args[1:])
case "modify":
RunCallsModify(args[1:])
case "push-certainty":
runCallsPushCertainty(args[1:])
case "detect-anomalies":
runCallsDetectAnomalies(args[1:])
case "propagate":
runCallsPropagate(args[1:])
case "summarise":
runCallsSummarise(args[1:])
case "clip-labels":
runCallsClipLabels(args[1:])
default:
fmt.Fprintf(os.Stderr, "Unknown calls subcommand: %s\n\n", args[0])
printCallsUsage()
os.Exit(1)
}
}

func printCallsUsage() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls <subcommand> [options]\n\n")
fmt.Fprintf(os.Stderr, "Subcommands:\n")
fmt.Fprintf(os.Stderr, " from-preds Extract clustered calls from ML predictions CSV\n")
fmt.Fprintf(os.Stderr, " from-birda Import BirdNET results to .data files\n")
fmt.Fprintf(os.Stderr, " from-raven Import Raven selections to .data files\n")
fmt.Fprintf(os.Stderr, " show-images Display spectrogram images from .data file\n")
fmt.Fprintf(os.Stderr, " classify Review and classify segments in .data files\n")
fmt.Fprintf(os.Stderr, " clip Generate audio/image clips from .data files\n")
fmt.Fprintf(os.Stderr, " modify Modify a label in a .data file\n")
fmt.Fprintf(os.Stderr, " push-certainty Promote certainty=90 segments to 100 for a filtered set\n")
fmt.Fprintf(os.Stderr, " detect-anomalies Flag label/certainty disagreements across ML model filters\n")
fmt.Fprintf(os.Stderr, " propagate Propagate verified classifications between filters in a .data file\n")
fmt.Fprintf(os.Stderr, " summarise Summarise all .data files in a folder\n")
fmt.Fprintf(os.Stderr, " clip-labels Export OpenSoundScape clip_labels-format multihot CSV\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls from-preds --csv predictions.csv\n")
fmt.Fprintf(os.Stderr, " skraak calls from-birda --folder ./recordings\n")
fmt.Fprintf(os.Stderr, " skraak calls from-raven --folder ./recordings --delete\n")
fmt.Fprintf(os.Stderr, " skraak calls show-images --file recording.wav.data\n")
fmt.Fprintf(os.Stderr, " skraak calls classify --folder ./data --reviewer David --bind k=Kiwi\n")
fmt.Fprintf(os.Stderr, " skraak calls classify --folder ./data --reviewer David --bind k=Kiwi --filter mymodel --species Kiwi+Duet\n")
fmt.Fprintf(os.Stderr, " skraak calls clip --folder ./data --output ./clips --prefix train --filter mymodel --species Kiwi\n")
fmt.Fprintf(os.Stderr, " skraak calls modify --file recording.data --reviewer GLM-5 --filter mymodel --segment 12-15 --species Kiwi\n")
fmt.Fprintf(os.Stderr, " skraak calls summarise --folder ./recordings > summary.json\n")
}

// runCallsFromPreds handles the "calls from-preds" subcommand
//
// JSON output schema:
//
// {
// "calls": [ // Clustered call groups
// {
// "file": string, // WAV filename
// "start_time": float, // Cluster start time (seconds)
// "end_time": float, // Cluster end time (seconds)
// "ebird_code": string, // eBird species code
// "segments": int // Number of detections in cluster
// }
// ],
// "total_calls": int, // Total clustered calls
// "clip_duration": float, // Clip duration in seconds
// "gap_threshold": float, // Gap threshold used for clustering
// "species_count": {string: int}, // Species ebird code -> detection count
// "data_files_written": int, // .data files successfully written
// "data_files_skipped": int, // .data files skipped (already exist)
// "filter": string, // Filter name used
// "error": string // Error message (omitted if nil)
// }
func runCallsFromPreds(args []string) {
fs := flag.NewFlagSet("calls from-preds", flag.ExitOnError)
csvPath := fs.String("csv", "", "Path to predictions CSV file (required)")
filter := fs.String("filter", "", "Filter name for .data files (default: parse from CSV filename)")
dotData := fs.Bool("dot-data", true, "Write .data files alongside audio files (default: true)")
gapMultiplier := fs.Int("gap-multiplier", 0, "Gap threshold multiplier (default: 2, e.g. 3 for kiwi)")
minDetections := fs.Int("min-detections", -1, "Min detections per cluster, filters out small clusters (default: 0 = no filtering)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls from-preds [options]\n\n")
fmt.Fprintf(os.Stderr, "Extract clustered bird calls from ML predictions CSV.\n")
fmt.Fprintf(os.Stderr, "Reads prediction CSV with columns: file, start_time, end_time, <ebird_codes...>\n")
fmt.Fprintf(os.Stderr, "Each row is a clip with 1=present, 0=absent for each species.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nOutput:\n")
fmt.Fprintf(os.Stderr, " With --dot-data=true (default): Writes .data files alongside audio files, outputs JSON summary\n")
fmt.Fprintf(os.Stderr, " With --dot-data=false: Outputs JSON with clustered calls only (no .data files)\n")
fmt.Fprintf(os.Stderr, "\nFilter name:\n")
fmt.Fprintf(os.Stderr, " If --filter is provided, uses that value.\n")
fmt.Fprintf(os.Stderr, " Otherwise, parses from CSV filename: prefix_filter_date.csv -> filter\n")
fmt.Fprintf(os.Stderr, " Example: predsST_opensoundscape-kiwi-1.2_2025-11-12.csv -> opensoundscape-kiwi-1.2\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " # Write .data files (default)\n")
fmt.Fprintf(os.Stderr, " skraak calls from-preds --csv predictions.csv\n")
fmt.Fprintf(os.Stderr, "\n")
fmt.Fprintf(os.Stderr, " # JSON output only (no .data files)\n")
fmt.Fprintf(os.Stderr, " skraak calls from-preds --csv predictions.csv --dot-data=false > calls.json\n")
fmt.Fprintf(os.Stderr, "\n")
fmt.Fprintf(os.Stderr, " # Override filter name\n")
fmt.Fprintf(os.Stderr, " skraak calls from-preds --csv preds.csv --filter my-custom-filter\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
if *csvPath == "" {
fmt.Fprintf(os.Stderr, "Error: --csv is required\n\n")
fs.Usage()
os.Exit(1)
}

// Determine filter name
filterName := *filter
if filterName == "" {
filterName = tools.ParseFilterFromFilename(*csvPath)
if filterName == "" {
fmt.Fprintf(os.Stderr, "Error: Could not parse filter from filename. Use --filter flag.\n")
fmt.Fprintf(os.Stderr, "Expected format: prefix_filter_date.csv (e.g., predsST_opensoundscape-kiwi-1.2_2025-11-12.csv)\n")
os.Exit(1)
}
}

input := tools.CallsFromPredsInput{
CSVPath: *csvPath,
Filter: filterName,
WriteDotData: *dotData,
GapMultiplier: *gapMultiplier,
MinDetections: *minDetections,
ProgressHandler: func(processed, total int, message string) {
if total > 0 {
percent := float64(processed) / float64(total) * 100
fmt.Fprintf(os.Stderr, "\rProcessing WAV files: %d/%d (%.0f%%)", processed, total, percent)
if processed == total {
fmt.Fprintf(os.Stderr, "\n")
}
}
},
}

if *dotData {
fmt.Fprintf(os.Stderr, "Extracting calls from predictions: %s\n", *csvPath)
fmt.Fprintf(os.Stderr, "Filter: %s\n", filterName)
fmt.Fprintf(os.Stderr, "Writing .data files: enabled\n")
} else {
fmt.Fprintf(os.Stderr, "Extracting calls from predictions: %s\n", *csvPath)
fmt.Fprintf(os.Stderr, "Filter: %s\n", filterName)
fmt.Fprintf(os.Stderr, "Writing .data files: disabled (--dot-data=false)\n")
}

output, err := tools.CallsFromPreds(input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

fmt.Fprintf(os.Stderr, "Found %d clustered calls across %d species\n",
output.TotalCalls, len(output.SpeciesCount))
fmt.Fprintf(os.Stderr, "Clip duration: %.1fs, Gap threshold: %.1fs\n",
output.ClipDuration, output.GapThreshold)

if *dotData {
fmt.Fprintf(os.Stderr, "Data files written: %d, skipped: %d\n",
output.DataFilesWritten, output.DataFilesSkipped)
}

// Output JSON to stdout
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}

// runCallsShowImages handles the "calls show-images" subcommand
func runCallsShowImages(args []string) {
fs := flag.NewFlagSet("calls show-images", flag.ExitOnError)
filePath := fs.String("file", "", "Path to .data file (required)")
color := fs.Bool("color", false, "Apply L4 colormap (default: false, grayscale)")
imgDims := fs.Int("img-dims", 0, "Spectrogram size in pixels (224-448, default 448)")
sixel := fs.Bool("sixel", false, "Use sixel graphics protocol (default: kitty)")
iterm := fs.Bool("iterm", false, "Use iTerm2 inline image protocol")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls show-images [options]\n\n")
fmt.Fprintf(os.Stderr, "Display spectrogram images for each segment in a .data file.\n")
fmt.Fprintf(os.Stderr, "Images are output using the Kitty graphics protocol (or Sixel with --sixel, iTerm2 with --iterm).\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls show-images --file recording.wav.data\n")
fmt.Fprintf(os.Stderr, " skraak calls show-images --file recording.wav.data --color\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
if *filePath == "" {
fmt.Fprintf(os.Stderr, "Error: --file is required\n\n")
fs.Usage()
os.Exit(1)
}

input := tools.CallsShowImagesInput{
DataFilePath: *filePath,
Color: *color,
ImageSize: *imgDims,
Sixel: *sixel,
ITerm: *iterm,
}

fmt.Fprintf(os.Stderr, "Showing spectrogram images for: %s\n", *filePath)
if *color {
fmt.Fprintf(os.Stderr, "Color: L4 colormap (Black-Red-Yellow)\n")
}

output, err := tools.CallsShowImages(input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

fmt.Fprintf(os.Stderr, "Displayed %d segment(s) from %s\n", output.SegmentsShown, output.WavFile)
}

// runCallsFromBirda handles the "calls from-birda" subcommand
//
// JSON output schema:
//
// {
// "calls": [ // Clustered call groups
// {
// "file": string, // WAV filename
// "start_time": float, // Cluster start time (seconds)
// "end_time": float, // Cluster end time (seconds)
// "ebird_code": string, // Species code
// "segments": int // Number of detections in cluster
// }
// ],
// "total_calls": int, // Total clustered calls
// "species_count": {string: int}, // Species -> detection count
// "data_files_written": int, // .data files successfully written
// "data_files_skipped": int, // .data files skipped
// "files_processed": int, // BirdNET files processed
// "files_deleted": int, // BirdNET files deleted (--delete)
// "filter": string, // Always "BirdNET"
// "error": string // Error message (omitted if nil)
// }
func runCallsFromBirda(args []string) {
fs := flag.NewFlagSet("calls from-birda", flag.ExitOnError)
folder := fs.String("folder", "", "Folder containing BirdNET results files")
file := fs.String("file", "", "Single BirdNET results file to process")
delete := fs.Bool("delete", false, "Delete BirdNET files after processing")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls from-birda [options]\n\n")
fmt.Fprintf(os.Stderr, "Import BirdNET results to .data files.\n")
fmt.Fprintf(os.Stderr, "Reads *.BirdNET.results.csv files and creates/merges .data files.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nBehavior:\n")
fmt.Fprintf(os.Stderr, " - Filter is always 'BirdNET' (parsed from filename)\n")
fmt.Fprintf(os.Stderr, " - If .data file exists with BirdNET filter: error (refuses to clobber)\n")
fmt.Fprintf(os.Stderr, " - If .data file exists with different filter: merge segments\n")
fmt.Fprintf(os.Stderr, " - Confidence (0.0-1.0) converted to certainty (0-100)\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls from-birda --folder ./recordings\n")
fmt.Fprintf(os.Stderr, " skraak calls from-birda --file recording.BirdNET.results.csv\n")
fmt.Fprintf(os.Stderr, " skraak calls from-birda --folder ./recordings --delete\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate that either folder or file is specified
if *folder == "" && *file == "" {
fmt.Fprintf(os.Stderr, "Error: Either --folder or --file is required\n\n")
fs.Usage()
os.Exit(1)
}

input := tools.CallsFromBirdaInput{
Folder: *folder,
File: *file,
Delete: *delete,
ProgressHandler: func(processed, total int, message string) {
if total > 0 {
percent := float64(processed) / float64(total) * 100
fmt.Fprintf(os.Stderr, "\rProcessing BirdNET files: %d/%d (%.0f%%)", processed, total, percent)
if processed == total {
fmt.Fprintf(os.Stderr, "\n")
}
}
},
}

fmt.Fprintf(os.Stderr, "Importing BirdNET results\n")
if *folder != "" {
fmt.Fprintf(os.Stderr, "Folder: %s\n", *folder)
} else {
fmt.Fprintf(os.Stderr, "File: %s\n", *file)
}
if *delete {
fmt.Fprintf(os.Stderr, "Delete source files: enabled\n")
}

output, err := tools.CallsFromBirda(input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

fmt.Fprintf(os.Stderr, "Processed %d BirdNET files\n", output.FilesProcessed)
fmt.Fprintf(os.Stderr, "Found %d calls across %d species\n",
output.TotalCalls, len(output.SpeciesCount))
fmt.Fprintf(os.Stderr, "Data files written: %d, skipped: %d\n",
output.DataFilesWritten, output.DataFilesSkipped)
if *delete {
fmt.Fprintf(os.Stderr, "Files deleted: %d\n", output.FilesDeleted)
}

// Output JSON to stdout
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}

// runCallsFromRaven handles the "calls from-raven" subcommand
//
// JSON output schema:
//
// {
// "calls": [ // Clustered call groups
// {
// "file": string, // WAV filename
// "start_time": float, // Cluster start time (seconds)
// "end_time": float, // Cluster end time (seconds)
// "ebird_code": string, // Species code
// "segments": int // Number of detections in cluster
// }
// ],
// "total_calls": int, // Total clustered calls
// "species_count": {string: int}, // Species -> detection count
// "data_files_written": int, // .data files successfully written
// "data_files_skipped": int, // .data files skipped
// "files_processed": int, // Raven files processed
// "files_deleted": int, // Raven files deleted (--delete)
// "filter": string, // Always "Raven"
// "error": string // Error message (omitted if nil)
// }
func runCallsFromRaven(args []string) {
fs := flag.NewFlagSet("calls from-raven", flag.ExitOnError)
folder := fs.String("folder", "", "Folder containing Raven selection files")
file := fs.String("file", "", "Single Raven selection file to process")
delete := fs.Bool("delete", false, "Delete Raven files after processing")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls from-raven [options]\n\n")
fmt.Fprintf(os.Stderr, "Import Raven selections to .data files.\n")
fmt.Fprintf(os.Stderr, "Reads *.selections.txt files and creates/merges .data files.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nBehavior:\n")
fmt.Fprintf(os.Stderr, " - Filter is always 'Raven' (parsed from filename)\n")
fmt.Fprintf(os.Stderr, " - If .data file exists with Raven filter: error (refuses to clobber)\n")
fmt.Fprintf(os.Stderr, " - If .data file exists with different filter: merge segments\n")
fmt.Fprintf(os.Stderr, " - Frequency range preserved from Raven selections\n")
fmt.Fprintf(os.Stderr, " - Certainty defaults to 70 (no confidence metric in Raven)\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls from-raven --folder ./recordings\n")
fmt.Fprintf(os.Stderr, " skraak calls from-raven --file recording.Table.1.selections.txt\n")
fmt.Fprintf(os.Stderr, " skraak calls from-raven --folder ./recordings --delete\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate that either folder or file is specified
if *folder == "" && *file == "" {
fmt.Fprintf(os.Stderr, "Error: Either --folder or --file is required\n\n")
fs.Usage()
os.Exit(1)
}

input := tools.CallsFromRavenInput{
Folder: *folder,
File: *file,
Delete: *delete,
ProgressHandler: func(processed, total int, message string) {
if total > 0 {
percent := float64(processed) / float64(total) * 100
fmt.Fprintf(os.Stderr, "\rProcessing Raven files: %d/%d (%.0f%%)", processed, total, percent)
if processed == total {
fmt.Fprintf(os.Stderr, "\n")
}
}
},
}

fmt.Fprintf(os.Stderr, "Importing Raven selections\n")
if *folder != "" {
fmt.Fprintf(os.Stderr, "Folder: %s\n", *folder)
} else {
fmt.Fprintf(os.Stderr, "File: %s\n", *file)
}
if *delete {
fmt.Fprintf(os.Stderr, "Delete source files: enabled\n")
}

output, err := tools.CallsFromRaven(input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

fmt.Fprintf(os.Stderr, "Processed %d Raven files\n", output.FilesProcessed)
fmt.Fprintf(os.Stderr, "Found %d calls across %d species\n",
output.TotalCalls, len(output.SpeciesCount))
fmt.Fprintf(os.Stderr, "Data files written: %d, skipped: %d\n",
output.DataFilesWritten, output.DataFilesSkipped)
if *delete {
fmt.Fprintf(os.Stderr, "Files deleted: %d\n", output.FilesDeleted)
}

// Output JSON to stdout
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}

// runCallsSummarise handles the "calls summarise" subcommand
//
// JSON output schema:
//
// {
// "segments": [ // All segments (omitted with --brief)
// {
// "file": string, // .data file path
// "start_time": float, // Segment start time (seconds)
// "end_time": float, // Segment end time (seconds)
// "labels": [
// {
// "filter": string, // Filter name
// "certainty": int, // Certainty level (0-100)
// "species": string, // Species name
// "calltype": string, // Call type (omitted if empty)
// "comment": string, // Comment (omitted if empty)
// "bookmark": bool // Bookmark flag (omitted if false)
// }
// ]
// }
// ],
// "folder": string, // Folder path
// "data_files_read": int, // Successfully parsed .data files
// "data_files_skipped": [string], // Files that failed to parse
// "total_segments": int, // Total number of segments
// "filters": { // Per-filter statistics
// string: {
// "segments": int, // Segment count for this filter
// "species": {string: int}, // Species -> count
// "calltypes": {string: {string: int}} // Species -> calltype -> count (omitted if empty)
// }
// },
// "review_status": {
// "unreviewed": int, // certainty < 100
// "confirmed": int, // certainty = 100
// "dont_know": int, // certainty = 0
// "with_calltype": int, // Labels with call type
// "with_comments": int // Labels with comments
// },
// "operators": [string], // Unique operator names
// "reviewers": [string], // Unique reviewer names
// "error": string // Error message (omitted if nil)
// }
func runCallsSummarise(args []string) {
fs := flag.NewFlagSet("calls summarise", flag.ExitOnError)
folder := fs.String("folder", "", "Folder containing .data files (required)")
brief := fs.Bool("brief", false, "Exclude segments array from output (summary stats only)")
filter := fs.String("filter", "", "Restrict output to a single filter name (default: all filters)")

fs.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: skraak calls summarise [options]\n\n")
fmt.Fprintf(os.Stderr, "Summarise all .data files in a folder.\n")
fmt.Fprintf(os.Stderr, "Outputs JSON with segments array and summary statistics.\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
fs.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nOutput includes:\n")
fmt.Fprintf(os.Stderr, " - segments: array of all segments with labels (omitted with --brief)\n")
fmt.Fprintf(os.Stderr, " - data_files_read: count of successfully parsed .data files\n")
fmt.Fprintf(os.Stderr, " - data_files_skipped: list of files that failed to parse\n")
fmt.Fprintf(os.Stderr, " - total_segments: total number of segments\n")
fmt.Fprintf(os.Stderr, " - filters: per-filter statistics (segments, species counts)\n")
fmt.Fprintf(os.Stderr, " - review_status: unreviewed/confirmed/dont_know counts\n")
fmt.Fprintf(os.Stderr, " - operators/reviewers: unique values found\n")
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " skraak calls summarise --folder ./recordings > summary.json\n")
fmt.Fprintf(os.Stderr, " skraak calls summarise --folder ./recordings --brief > summary.json # summary only\n")
fmt.Fprintf(os.Stderr, " skraak calls summarise --folder ./recordings --filter opensoundscape-kiwi-1.2 --brief\n")
}

if err := fs.Parse(args); err != nil {
os.Exit(1)
}

// Validate required flags
if *folder == "" {
fmt.Fprintf(os.Stderr, "Error: --folder is required\n\n")
fs.Usage()
os.Exit(1)
}

input := tools.CallsSummariseInput{
Folder: *folder,
Brief: *brief,
Filter: *filter,
}

fmt.Fprintf(os.Stderr, "Summarising .data files in: %s\n", *folder)
if *filter != "" {
fmt.Fprintf(os.Stderr, "Filter: %s\n", *filter)
}

output, err := tools.CallsSummarise(input)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}

fmt.Fprintf(os.Stderr, "Read %d .data files, skipped %d\n",
output.DataFilesRead, len(output.DataFilesSkipped))
fmt.Fprintf(os.Stderr, "Total segments: %d\n", output.TotalSegments)
fmt.Fprintf(os.Stderr, "Filters: %d\n", len(output.Filters))
fmt.Fprintf(os.Stderr, "Review status: %d unreviewed, %d confirmed, %d don't know\n",
output.ReviewStatus.Unreviewed, output.ReviewStatus.Confirmed, output.ReviewStatus.DontKnow)

// Output JSON to stdout
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
if err := enc.Encode(output); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding output: %v\n", err)
os.Exit(1)
}
}
file addition: README.md (----------)

[2.1]

# Skraak

Acoustic monitoring CLI toolkit in Go.

## CLI Commands

```bash
# Execute SQL query
./skraak sql --db ./db/skraak.duckdb "SELECT COUNT(*) FROM file WHERE active = true"

# Create resources
./skraak create dataset --db ./db/skraak.duckdb --name "My Dataset" --type unstructured
./skraak create location --db ./db/skraak.duckdb --dataset abc123 --name "Site A" --lat -36.85 --lon 174.76 --timezone Pacific/Auckland
./skraak create cluster --db ./db/skraak.duckdb --dataset abc123 --location loc456 --name "2024-01" --sample-rate 250000
./skraak create pattern --db ./db/skraak.duckdb --record 60 --sleep 1740

# Update resources
./skraak update dataset --db ./db/skraak.duckdb --id abc123 --name "Updated Name"
./skraak update location --db ./db/skraak.duckdb --id loc123 --name "Updated Name" --lat -36.85 --lon 174.76
./skraak update cluster --db ./db/skraak.duckdb --id cluster123 --name "Updated Name"
./skraak update pattern --db ./db/skraak.duckdb --id pattern123 --record 30 --sleep 1770

# Import commands
./skraak import file --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --file /path/to/file.wav
./skraak import folder --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --folder /path/to/folder
./skraak import bulk --db ./db/skraak.duckdb --dataset abc123 --csv import.csv --log progress.log
./skraak import unstructured --db ./db/skraak.duckdb --dataset 4Sh8_7p1ocks --folder "/media/david/Misc-2/Manu o Kahurangi kiwi survey (3)/Andrew Digby LSK - sorted files"
./skraak import segments --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --folder /path/to/data --mapping mapping.json

# Export dataset (for collaboration, testing, or archival)
./skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb
./skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --dry-run

# Event log replay (sync backup databases)
./skraak replay events --db ./backup.duckdb --log ./skraak.duckdb.events.jsonl
./skraak replay events --db ./backup.duckdb --log ./events.jsonl --dry-run
./skraak replay events --db ./backup.duckdb --log ./events.jsonl --last 10

# Call analysis (extract from ML predictions, review/classify)
./skraak calls from-preds --csv predictions.csv # Extract calls, write .data files
./skraak calls from-preds --csv preds.csv --dot-data=false > calls.json # JSON output only
./skraak calls show-images --file recording.wav.data # Display spectrograms
./skraak calls classify --folder ./data # Interactive classification (reviewer + bindings from ~/.skraak/config.json)
./skraak calls classify --folder ./data --filter opensoundscape-kiwi-1.0
./skraak calls summarise --folder ./data > summary.json # Summarise .data files
./skraak calls summarise --folder ./data --brief > summary.json # Summary stats only (no segments)

./skraak calls classify --folder . --filter opensoundscape-kiwi-1.2 --species Kiwi+Male
./skraak calls classify --folder . --filter opensoundscape-multi-1.0

./skraak calls clip --file recording.wav.data --prefix B01 --output /tmp/B01/ --species Kiwi+Duet --filter opensoundscape-multi-1.0 --size 224 --color
./skraak calls clip --folder B01/2026-12-11/ --prefix B01 --output /tmp/B01/ --species Kiwi+Duet --filter opensoundscape-multi-1.0 --size 224 --color

./skraak calls modify --file recording.data --reviewer Claude --filter opensoundscape-multi-1.0 --segment 12-15 --species Kiwi+Male --certainty 80
./skraak calls modify --file recording.data --reviewer Claude --filter opensoundscape-multi-1.0 --segment 12-15 --certainty 80 --bookmark
./skraak calls modify --file recording.data --reviewer Claude --filter opensoundscape-multi-1.0 --segment 12-15 --certainty 80 --comment "Clear example of male call"

./skraak calls propagate --file rec.wav.data --from opensoundscape-kiwi-1.2 --to opensoundscape-kiwi-1.5 --species Kiwi
./skraak calls propagate --folder ./recordings --from opensoundscape-kiwi-1.2 --to opensoundscape-kiwi-1.5 --species Kiwi

# Export OpenSoundScape clip_labels-format CSV from .data files
./skraak calls clip-labels --folder ./data --mapping ./mapping.json
./skraak calls clip-labels --folder ./data --mapping ./mapping.json --filter opensoundscape-multi-1.0

# File utilities
./skraak xxhash --file recording.wav # XXH64 hash (same format as DB)
./skraak metadata --file recording.wav # WAV metadata as JSON
# Works for audiomoth which records time metadata as UTC
./skraak isnight --file recording.wav --lat -36.85 --lng 174.76 # Was it night when recorded?
./skraak isnight --file recording.wav --lat -36.85 --lng 174.76 --brief # Just file_path + solar_night
# DOC recorders record local time without timezone, IANA timezone required
./skraak isnight --file recording.wav --lat -36.85 --lng 174.76 --timezone Pacific/Auckland # Non-UTC timezone
./skraak time # Current time as JSON

# Rename files with location prefix
./skraak prepend --folder ./recordings --prefix LOC001 # WAV files with datestring + log.txt
./skraak prepend --folder ./data --prefix SITE_A --recursive # Include 1 level of subfolders
./skraak prepend --folder ./test --prefix TEST --dry-run # Preview changes
```

**`isnight`** — Night detection for bioacoustic recordings. Determines if a WAV file was recorded at night (between sunset and sunrise) at the given GPS coordinates. The recording timestamp is read from the WAV file metadata, not from the filename — this works reliably because bioacoustic recorders (AudioMoth, BAR-LT, Song Meter, etc.) embed an accurate timestamp in the WAV header at the time of recording. AudioMoth comments are parsed automatically including the embedded UTC offset. For non-AudioMoth files without a recognized filename pattern, the timestamp falls back to the file modification time. Use `--brief` for batch/agent use to return only `file_path` and `solar_night`.

## Event Log

All mutating SQL operations (INSERT, UPDATE, DELETE) are automatically logged for backup synchronization.

**Event log location:** `<database>.events.jsonl`

**Features:**
- SQL-level capture for complete fidelity
- Only successful transactions logged (rollbacks discarded)
- Includes tool name, SQL, parameters, timestamp

**Replay on backup database:**
```bash
# Replay all events
./skraak replay events --db ./backup.duckdb --log ./skraak.duckdb.events.jsonl

# Preview without executing
./skraak replay events --db ./backup.duckdb --log ./events.jsonl --dry-run

# Replay last N events
./skraak replay events --db ./backup.duckdb --log ./events.jsonl --last 10
```

**Event format (JSONL):**
```json
{
"id": "V1StGXR8_Z5jdHi6B-myT",
"timestamp": "2026-02-18T14:30:22+13:00",
"tool": "create_or_update_dataset",
"queries": [{"sql": "INSERT INTO ...", "parameters": [...]}],
"success": true,
"duration_ms": 45
}
```

## Dataset Export

Export a dataset with all related data to a new DuckDB database for collaboration, testing, or archival.

**Use cases:**
- **Collaboration:** Export, send to collaborator, they return event log for replay
- **Testing:** Create focused test database from production (100 MB vs 1.5 GB)
- **Archival:** Snapshot a dataset at a point in time

**Export:**
```bash
# Export dataset to new database
./skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb

# Preview without creating file
./skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --dry-run

# Overwrite existing export
./skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --force
```

**What's exported:**
- All rows owned by dataset (via dataset_id foreign key traversal)
- Subset of reference data (species, patterns, filters used)
- Creates empty event log file for changes

**Re-import changes:**
```bash
# After collaborator returns event log, replay on backup
./skraak replay events --db ./backup.duckdb --log export.duckdb.events.jsonl
```

## Call Analysis

Extract and review bird calls from ML predictions.

**Workflow:**

1. **Extract calls from opensoundscape predictions.csv:**
```bash
# Write .data files alongside audio (default)
# filter parsed from preds.csv filename but can be overriden with --filter birdnet-24
./skraak calls from-preds --csv predictions.csv > calls.json
```

2. **Interactive classification:**

Reviewer, keybindings, and display flags (color/sixel/iterm/img_dims) are loaded
from `~/.skraak/config.json` — create it once before first use:

```json
{
"classify": {
"reviewer": "David",
"color": true,
"bindings": {
"a": "eurbla",
"k": "Kiwi",
"d": "Kiwi+Duet",
"n": "Don't Know",
"1": "Kiwi+Duet",
"2": "Kiwi+Female",
"3": "Kiwi+Male",
"4": "Kiwi",
"x": "Noise"
},
"secondary_bindings":
{
"a":
{
"a": "alarm",
"c": "contact",
"s": "song"
}
}
}
}
```

Path resolves to `~/.skraak/config.json` on Linux/macOS and
`C:\Users\<name>\.skraak\config.json` on Windows via `os.UserHomeDir()`.

Secondary bindings for a, eurbla, are accessed by shift-a, a/c/s

```bash
# Launch TUI for reviewing and classifying segments
./skraak calls classify --folder ./data

# Single file mode
./skraak calls classify --file recording.wav.data

# Scope to a specific filter (ML model)
./skraak calls classify --folder ./data --filter opensoundscape-kiwi-1.2

# Scope to species (and optionally calltype) within a filter
./skraak calls classify --folder ./data --filter opensoundscape-kiwi-1.2 --species Kiwi+Duet

# Sample 10% of matching segments (random, requires --certainty; useful for quality-checking large sets)
./skraak calls classify --folder ./data --species Kiwi --certainty 90 --sample 10
```

`--sample <1-99>` randomly selects that percentage of the filtered segment list for review. Files and segments are presented in their original chronological order. `--sample 100` is a no-op. Requires `--certainty` to be set.

3. **Summarise .data files:**
```bash
# Full summary with all segments
./skraak calls summarise --folder ./recordings > summary.json

# Brief summary (stats only, no segment details)
./skraak calls summarise --folder ./recordings --brief > summary.json
```

**Summarise output includes:**
- `segments` - array of all segments with labels (omitted with `--brief`)
- `data_files_read` / `data_files_skipped` - file processing status
- `total_segments` - total count
- `filters` - per-filter statistics (segments, species, calltypes)
- `review_status` - unreviewed/confirmed/dont_know counts
- `operators` / `reviewers` - unique values found

4. **Promote certainty=90 segments to 100:**
```bash
# After reviewing a folder and confirming labels are correct, bulk-promote to certainty=100.
# Filtering flags match calls classify exactly (minus --certainty and --sample).
./skraak calls push-certainty --folder ./data --species Kiwi
./skraak calls push-certainty --folder ./data --species Kiwi --night --lat -45.5 --lng 167.4
```

Sets matching labels from certainty=90 to 100 and updates the reviewer from `~/.skraak/config.json`. Outputs `{"segments_updated": N, "files_updated": M}`.

5. **Propagate verified classifications between filters:**
```bash
# Single file
./skraak calls propagate --file rec.wav.data \
--from opensoundscape-kiwi-1.2 --to opensoundscape-kiwi-1.5 --species Kiwi

# Whole folder
./skraak calls propagate --folder ./recordings \
--from opensoundscape-kiwi-1.2 --to opensoundscape-kiwi-1.5 --species Kiwi
```

Only source labels at certainty=100 matching `--species` are considered. Target labels (filter=`--to`) at certainty 70 or 0 are upgraded to certainty=90 and the file reviewer is set to `Skraak`. Targets already at 100 or 90 are left alone; files missing either filter are skipped.

6. **Export OpenSoundScape clip_labels-format CSV:**
```bash
# Columns = canonical classes from mapping.json
./skraak calls clip-labels --folder ./data --mapping ./mapping.json

# Restrict to a single ML filter
./skraak calls clip-labels --folder ./data --mapping ./mapping.json --filter opensoundscape-multi-1.0
```

Reproduces OpenSoundScape's `BoxedAnnotations.clip_labels()` output
exactly — same row layout, byte-identical CSVs — but in Go, fast, and
without round-tripping through Raven `selections.txt`.

**Algorithm.** For every `.data` file, generate fixed-duration clip
windows from `[0, Duration]` using OPSO's `generate_clip_times_df`
(supports `--final-clip` of `full | remainder | extend | none`). Every
window is emitted as a row; for each output class column, the value is
`True` when at least one cert-100 annotation of that class overlaps the
window by ≥ `--min-label-overlap` seconds, else `False`. Gaps just emit
all-`False` rows.

Only certainty=100 labels participate. `mapping.json` (from the
`/data-mapping` skill) translates `.data` species strings to canonical
class names. Two sentinels with distinct semantics:

- **`"__NEGATIVE__"`** — clip IS emitted, **all class columns False**.
Overrides any positive labels in the same clip's union. Use for
confirmed-negative training examples (e.g. `Noise`, `Not`, rain, wind,
silence, chainsaw, helicopter).
- **`"__IGNORE__"`** — the segment is dropped from output. Any
segment whose species maps to `__IGNORE__` triggers the drop, regardless
of filter. Use for files whose annotation set is incomplete: emitting any
clip from them as confirmed-False would poison the training set with
possibly-wrong negatives.

Override order within a clip: `__NEGATIVE__` beats real classes. (File-level
`__IGNORE__` is checked before any clip is generated.)

**`--filter F`** restricts which ML filter's labels count
(`opensoundscape-multi-1.0`, `BirdNET`, `Raven`, …). The mapping
coverage check also restricts to that filter.

Defaults: `--clip-duration 4 --clip-overlap 0.5 --min-label-overlap 0.25 --final-clip full`.

If `--output` exists, the run **appends**. Column-set mismatch with the
existing header → hard error. Duplicate `(file, start_time, end_time)`
row (within the run, or vs existing rows) → hard error on first
occurrence. Any `.data` parse error, missing `Duration`, or species
missing from `mapping.json` aborts before any row is written.

## Segments Import

Import AviaNZ .data segments into the database with species/calltype mapping.

**Prerequisites:**
1. WAV files must already be imported (hashes must exist in database)
2. No existing labels on files (fresh imports only)
3. All filters, species, and calltypes must exist in database
4. Mapping file must cover all species in .data files
5. Filters / Models must already exist in the database

**Mapping file** (`mapping_2026-03-13.json`):
use claude skill to guide user through creation of species calltype mapping to db
```json
{
"Don't Know": {
"species": "Don't Know"
},
"GSK": {
"species": "Roroa",
"calltypes": {
"Male": "Male - Solo",
"Female": "Female - Solo"
}
}
}
```

**Import Segments:**
```bash
./skraak import segments \
--db ./db/skraak.duckdb \
--dataset dataset_id \
--location location_id \
--cluster cluster_id \
--folder /path/to/data \
--mapping mapping.json
```

**What's imported:**
- `segment` - time ranges with freq_low/freq_high from .data
- `label` - species, filter, certainty for each segment
- `label_subtype` - calltype if present in .data
- `label_metadata` - stores comments (if present)

**Data file updates:**
- `skraak_hash` written to metadata section
- `skraak_label_id` written to each label object

**Bookmarks:** Segments with `bookmark: true` are imported normally; the bookmark flag is ignored (not stored in database).

## Development

```bash
# Build
go build -o skraak

# Run tests
go test ./...

# Run with coverage
go test -cover ./...
```

### Cross-Compile to Windows (from Ubuntu)

DuckDB's Go bindings use CGO with pre-built static libraries. Cross-compiling to Windows requires MinGW and a small ABI compatibility stub.

**Prerequisites:**
```bash
sudo apt install gcc-mingw-w64-x86-64 g++-mingw-w64-x86-64

# Switch to posix threading variant (DuckDB uses pthreads)
sudo update-alternatives --set x86_64-w64-mingw32-gcc /usr/bin/x86_64-w64-mingw32-gcc-posix
sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix
```

**Build:**
```bash
# Create ABI stub (Ubuntu MinGW defines mbstate_t as int, DuckDB expects _Mbstatet)
echo 'extern "C" { void* _ZNSt15basic_streambufIcSt11char_traitsIcEE7seekposESt4fposI9_MbstatetESt13_Ios_Openmode() { return (void*)-1; } }' \
| tee /tmp/stub_seekpos.cpp
x86_64-w64-mingw32-g++ -c /tmp/stub_seekpos.cpp -o /tmp/stub_seekpos.o

# Cross-compile (windows-amd64 only)
CGO_ENABLED=1 \
CC=x86_64-w64-mingw32-gcc \
CXX=x86_64-w64-mingw32-g++ \
GOOS=windows GOARCH=amd64 \
go build -ldflags '-extldflags "/tmp/stub_seekpos.o -lucrt"' -o skraak.exe
```

**Verify:**
```bash
file skraak.exe
# Expected: PE32+ executable (console) x86-64, for MS Windows
```

See `CLAUDE.md` for detailed development notes.
file addition: CLAUDE.md (----------)

[2.1]

# Skraak CLI/MCP Server

## Documentation Policy

**When making code changes, update CHANGELOG.md first, then CLAUDE.md only if architectural concepts change.**

- CHANGELOG.md: Detailed change history with rationale
- CLAUDE.md: Essential patterns, policies, and quick reference
- **keep it concise**

---

## 🚨 Critical Database Safety

### ALWAYS Use Test Database for Testing

**CORRECT:**
```bash
cd shell_scripts
./test_sql.sh ../db/test.duckdb > test.txt 2>&1
```

- `db/skraak.duckdb` = **PRODUCTION** (1.4M files)
- `db/test.duckdb` = **TEST** (safe for testing)
- **Always specify test.duckdb explicitly**

### Testing Best Practices

- **Always pipe to file** (prevents token overflow from large output)
- Navigate to `shell_scripts/` before running tests
- Verify: `rg '"result":' test.txt | wc -l`

---

## Package Organization

**Simple rule:** If called by `cmd/`, it goes in `tools/`. If called by `tools/`, it goes in `utils/`.

- **`utils/`** - Reusable helpers (no MCP types, no `*Input`/`*Output` structs)
- **`tools/`** - MCP/CLI tools (one file per tool, defines input/output types)
- **`cmd/mcp.go`** - MCP adapters (only file importing MCP SDK)
- **`cmd/*.go`** - CLI commands (parse flags, call tools, print JSON)

---

## Architecture

Two-layer architecture: tools are MCP-free, adapters bridge to MCP protocol.

```
main.go → CLI dispatcher (mcp | import | sql | dataset | ...)
cmd/mcp.go → MCP server + thin adapters (ONLY MCP SDK import)
cmd/*.go → CLI commands (flags → tools → JSON output)
tools/*.go → Core logic (plain Go structs, no MCP dependency)
utils/*.go → Reusable helpers
db/ → Database connection + types
```

---

## Directory Structure

```
skraak/
├── main.go # CLI dispatcher
├── cmd/ # MCP adapters + CLI commands
├── db/
├── tools/ # tools (MCP-free)
├── utils/ # Reusable helpers
├── tui/ # TUI specific code
├── resources/schema.go # Schema resources
└── shell_scripts/ # end-to-end test scripts
```

---

## Building & Running

### Build
```bash
go build -o skraak
```

### MCP Server
```bash
./skraak mcp --db ./db/skraak.duckdb
```

### CLI Commands
```bash
# SQL query
./skraak sql --db ./db/test.duckdb "SELECT COUNT(*) FROM file WHERE active = true"
```

**CLI Design:** All tools output JSON for composability with Unix tools (jq, grep). Errors to stderr.

---

## Testing

### Shell Scripts (in shell_scripts/)
All scripts default to `../db/test.duckdb`:

```bash
cd shell_scripts

./test_sql.sh ../db/test.duckdb > test.txt 2>&1 # SQL tool

# Verify
rg '"result":' test.txt | wc -l # Count successes
rg '"isError":true' test.txt | wc -l # Count expected errors
```

### Go Unit Tests
```bash
go test ./... # All tests
go test -v ./utils/ # Verbose
go test -cover ./utils/ # Coverage
go test -coverprofile=coverage.out ./utils/ && go tool cover -html=coverage.out
```
file addition: CHANGELOG.md (----------)

[2.1]

# Changelog

All notable changes to the Skraak project are documented here.

## [2026-04-28] Remove MCP server support

**Breaking change:** Removed the MCP (Model Context Protocol) server entirely.
All functionality remains available via CLI commands.

- Deleted `cmd/mcp.go` (MCP server + adapters)
- Deleted `cmd/mcp_surface_test.go` (MCP integration tests)
- Deleted `resources/` package (only served MCP schema resource)
- Removed `case "mcp"` from `main.go` dispatch
- Removed `jsonschema` struct tags from all `tools/*.go` (126 tags across 24 files)
- Removed `github.com/modelcontextprotocol/go-sdk` dependency and transitive deps
- Fixed stale "Map to MCP output format" comment in `tools/import_files.go`

Rationale: CLI provides full access to all tools with JSON output for Unix
composability. The MCP server was a parallel access path with no unique
capabilities.

## [2026-04-27] Performance: DirCache + worker pool for `from-raven` and `from-birda`

`calls from-raven` and `calls from-birda` were extremely slow on large
folders (57k files ≈ 2 hours). Root cause: `findWAVFile()` performed
`os.ReadDir()` on every file — O(N²) directory scans. Fix:

1. **DirCache**: Scan directory once, build `map[string]string` for
O(1) WAV lookup. Eliminates the dominant bottleneck (57k × 57k = 3.25B
comparisons → 1 scan + 57k map lookups).

2. **Worker pool**: 8 parallel goroutines for I/O-bound processing
(WAV header reads, .data writes). Same pattern as `from-preds`.

3. Both commands auto-select sequential (< 10 files) vs parallel path.

Expected improvement: 2 hours → 2–5 minutes on 57k files.

`DirCache` is also available for `from-preds` but not yet wired in
(that command already uses a worker pool and typically processes fewer
unique directories).

## [2026-04-27] Add `calls clip-labels` subcommand

New `skraak calls clip-labels` exports a CSV in OpenSoundScape's
`clip_labels` format directly from `.data` files — same row layout as
`BoxedAnnotations.clip_labels()`, byte-identical CSVs — but in Go, fast,
and without round-tripping through Raven `selections.txt`.

For every `.data` file in `--folder`, generate clip windows over
`[0, Duration]` using a Go port of OPSO's `generate_clip_times_df`
(`utils/clip_times.go`, supports `final_clip ∈ {full, remainder, extend,
none}`). Every window is emitted as a row. For each output class column,
the value is `True` when at least one certainty=100 annotation of that
class overlaps the window by ≥ `--min-label-overlap` seconds, else
`False`. Gaps emit all-`False` rows. Booleans capitalized to match
pandas' default; times rendered with at least one decimal place.

Only certainty=100 labels participate (cert<100 is ignored).
`mapping.json` (from the `/data-mapping` skill) translates `.data`
species names to canonical class names. Two sentinels with distinct
semantics:
- `__NEGATIVE__` — clip emitted, all class columns False; overrides any
positives in the same clip. Requires certainty=100. For confirmed-negative
training examples (rain, wind, silence, helicopter, etc.).
- `__IGNORE__` — the **entire file** is dropped from output. Any segment
whose species maps to `__IGNORE__` triggers the drop, regardless of
certainty or filter. For files whose annotation set is incomplete (e.g.
`Don't Know` regions): emitting any clip from them as confirmed-False
would poison the training set with possibly-wrong negatives.

`--filter F` restricts which ML filter's labels count
(`opensoundscape-multi-1.0`, `BirdNET`, `Raven`, …); the mapping coverage
check also restricts to that filter.

Fail-fast: any `.data` parse error, missing `Duration`, missing mapping
entry, or duplicate `(file, start_time, end_time)` row aborts the run
before the CSV is written. Existing output files are appended; column-set
mismatch hard-errors.

Adds `MappingNegative`/`MappingIgnore` sentinels, `Classify`,
`ValidateCoversSpecies`, and `Classes` to `utils/mapping.go`. Adds
`utils/clip_times.go` with the OPSO clip-times port and unit tests
covering all four `final_clip` modes. Verified against an OPSO reference
output on a 100-file Raven test folder: byte-identical CSVs.

## [2026-04-26] Drop `schema://table/{name}` resource

Keeps `schema://full` and removes the per-table schema resource template,
along with its line-based extractor (paren counting, view-vs-table branching,
manual index/ALTER append) and the table-name allowlist. The full schema is
241 lines — small enough that splitting it adds parsing surface for no real
benefit, and clients can also introspect via DuckDB
(`information_schema.columns`, `DESCRIBE`, etc.) through `execute_sql`.

Updates `shell_scripts/test_resources.sh` to drop per-table tests and the
resource-template list call.

## [2026-04-26] Remove `prompts` package

Deletes `prompts/examples.go` and the six MCP prompts it registered
(`query_active_datasets`, `explore_database_schema`,
`explore_location_hierarchy`, `query_location_data`, `analyze_cluster_files`,
`system_status_check`). Drops the `skraak/prompts` import and `AddPrompt` calls
from `cmd/mcp.go`.

Motivation: the prompts were never invoked in practice. Models write SQL
fluently from the `schema://*` resources alone, so the canned templates added
maintenance surface without earning their keep. The `system_status_check`
prompt was self-referential (its body listed the prompts being removed) and
duplicated coverage already in `cmd/mcp_surface_test.go`.

Also drops `shell_scripts/test_prompts.sh` and the prompt references in
`shell_scripts/README.md` and `shell_scripts/TESTING.md`.

## [2026-04-22] `calls summarise`: Add --filter flag to restrict output to a single filter

Adds `--filter <name>` to `skraak calls summarise`. When specified, only labels
matching that filter are included in stats, segments, and review counts.
Segments with no matching labels are omitted entirely. Empty filter (default)
behaves as before (all filters included).

Motivation: a folder of .data files may contain multiple ML model filters;
summarising all of them makes it hard to inspect one. `--filter` scopes the
output the same way `classify --filter` scopes the TUI.

## [2026-04-22] `calls classify`: Shift+primary secondary keybindings for calltype editing

Adds a per-species secondary-binding layer to the classify TUI. Primary flow is
unchanged (keypress → label → save → advance). When a primary key has
`secondary_bindings` configured, pressing **Shift+primary-key** labels the
species with an empty calltype, skips the auto-advance, and enters a one-shot
wait state; the next keypress is looked up in the secondary map and sets the
calltype before advancing. Esc exits the wait state without advancing. Any
non-matching key falls through to normal handling.

Motivation: species like common chaffinch have multiple calltypes (alarm,
contact, song) that couldn't be assigned without burning extra keybindings on
every species. Secondary bindings are per-species (not global) to avoid
accidental mislabels, and deliberately unlisted in the help bar — users know
their own config.

Example config:
```json
"classify": {
"bindings": { "c": "comcha" },
"secondary_bindings": {
"c": { "a": "alarm", "s": "song", "n": "contact" }
}
}
```

Shift+primary on a key with no `secondary_bindings` entry falls back to normal
primary behavior, so existing configs are unaffected.

**Files changed:**
- `utils/config.go` — new `SecondaryBindings` field on `ClassifyFileConfig`.
- `cmd/calls_classify.go` — validation (outer key must exist in bindings,
inner keys single-char non-reserved, values non-empty) and passthrough to
`ClassifyConfig`.
- `tools/calls_classify.go` — `SecondaryBindings` field on `ClassifyConfig`,
new `ApplyCallTypeOnly` and `HasSecondary` methods.
- `tui/classify.go` — `awaitingSecondaryFor` model field, wait-mode intercept
at top of `handleKey`, Shift+letter detection in the default branch, `…`
indicator on the segment info line while waiting.

## [2026-04-18] `--day` redefined as civil dawn → solar sunset (includes dawn chorus)

`--day` previously filtered to solar day (sunrise → sunset), excluding the dawn chorus.
Changed to civil dawn → solar sunset so diurnal species active at dawn are included.

`--night` (solar night) is unchanged. The dawn-chorus window (civil dawn → solar sunrise)
is now covered by **both** flags — a recording at that time is `solar_night=true` and
`diurnal_active=true`. Correct: kiwi and diurnal bird-song both overlap at dawn.

`IsNightOutput` gains a new `diurnal_active` field (bool, present in JSON output of
`skraak isnight`) computed as `midpoint >= civil_dawn && midpoint <= solar_sunset`.

**Files changed:** `tools/isnight.go`, `tools/calls_clip.go`, `tools/calls_classify.go`

## [2026-04-18] `calls classify --night` / `--day`: filter TUI to solar-night or solar-day recordings

Adds `--night`, `--day`, `--lat`, `--lng`, and `--timezone` flags to `skraak calls classify`.
Filtering happens at load time (before the TUI launches) inside `LoadDataFiles`, after the
existing segment filter — so `IsNight` is only called for files that have matching segments.
Skipped file count is reported to stderr before the TUI starts.

Same `--timezone` caveat as `calls clip`: required for non-AudioMoth recorders (e.g. DOC AR4)
that embed local time in filenames. AudioMoth files don't need it.

```bash
skraak calls classify --folder F09/2026-04-06/ --species "Don't Know" \
--night --lat -45.50603 --lng 167.47371
```

**Files changed:**
- `tools/calls_classify.go` — `ClassifyConfig` (Night/Day/Lat/Lng/Timezone fields),
`ClassifyState` (TimeFilteredCount), `LoadDataFiles` (day/night filter block).
- `cmd/calls_classify.go` — flag parsing, mutual-exclusivity + lat/lng validation,
config construction, skipped-count summary line, updated usage text.

## [2026-04-18] `calls clip --night`: filter to solar-night recordings only

Adds `--night`, `--lat`, `--lng`, and `--timezone` flags to `skraak calls clip`.
When `--night` is set, each recording is checked against solar sunrise/sunset at
the given coordinates before its audio is loaded — daytime files are skipped
entirely, saving the cost of reading WAV audio for files that would produce no
useful clips.

`--timezone` is not needed for AudioMoth recorders (timestamp comes from the WAV
comment in UTC). It is required for recorders that embed **local time** in the
filename (e.g. DOC AR4) — without it the filename is parsed as UTC and
`solar_night` will be wrong. Pass `--timezone Pacific/Auckland` or the
appropriate IANA zone.

The JSON output gains a `night_skipped` field (omitted when 0) counting how many
files were filtered out. Skipped filenames are logged to stderr.

```bash
skraak calls clip --folder ./data --output ./clips --prefix kiwi \
--species Kiwi --night --lat -40.85 --lng 172.81

# Non-AudioMoth (DOC AR4, filename in local time):
skraak calls clip --folder ./data --output ./clips --prefix kiwi \
--species Kiwi --night --lat -40.85 --lng 172.81 --timezone Pacific/Auckland
```

**Files changed:**
- `tools/calls_clip.go` — `CallsClipInput` (Night/Lat/Lng/Timezone fields),
`CallsClipOutput` (NightSkipped field), `processFile` night-filter block.
- `cmd/calls_clip.go` — flag parsing, `--night` requires lat/lng validation,
updated usage/help text.

## [2026-04-18] `calls classify` reviewer, bindings, and display flags moved to config file

**Breaking CLI change.** `skraak calls classify` no longer accepts `--reviewer`,
`--bind`, `--color`, `--sixel`, `--iterm`, or `--img-dims`. These values are now
loaded from `~/.skraak/config.json`.

Rationale: users (e.g. David) were typing the same ~25 `--bind` flags on every
invocation. Moving stable, personal defaults into a config file eliminates that
repetition. Per-invocation flags (`--folder`, `--file`, `--filter`, `--species`,
`--certainty`, `--goto`) stay on the CLI.

Path works cross-platform via `os.UserHomeDir()` — resolves to
`~/.skraak/config.json` on Linux/macOS and `C:\Users\<name>\.skraak\config.json`
on Windows.

Config shape:
```json
{
"classify": {
"reviewer": "David",
"color": true,
"sixel": false,
"iterm": false,
"img_dims": 0,
"bindings": {
"k": "Kiwi",
"1": "Kiwi+Duet",
"x": "Noise",
"z": "Don't Know"
}
}
}
```

`bindings` values use the same `Species` or `Species+CallType` grammar the old
`--bind key=value` flag accepted — parsing is shared (`cmd/calls_classify.go:parseBind`).

Config-load rejects bindings that collide with keys the TUI reserves for its own
commands (`,` previous segment, `.` next segment, `0` confirm at certainty 100,
space opens the comment dialog). Previously these were silently shadowed by the
TUI hotkey and the user's binding did nothing.

**Files added:**
- `utils/config.go` — `Config`, `ClassifyFileConfig`, `LoadConfig`, `ConfigPath`.
Named `LoadConfig` (not `LoadClassifyConfig`) so future subcommands can add
their own sections to the same file.

**Files changed:**
- `cmd/calls_classify.go` — Removed six flag cases, added config load after arg
parsing (so `--help` still works without a config), added `--help`/`-h` case,
added single-character validation on binding keys.

## [2026-04-17] New `skraak isnight` CLI command

Adds a standalone CLI command to check if a WAV file was recorded at night,
without needing a database connection.

```
skraak isnight --file recording.wav --lat -36.85 --lng 174.76
```

Determines the recording timestamp from WAV metadata (AudioMoth comment →
filename pattern → file modification time), then calculates sunrise/sunset
at the given GPS coordinates using the recording midpoint. Returns JSON with
` solar_night`, `civil_night`, `moon_phase`, and sun event times.

Optional `--timezone` flag (default UTC) is used for filename-based timestamps;
AudioMoth comments embed their own timezone. Use `--brief` for batch/agent
use to return only `file_path` and `solar_night` (compact JSON, saves tokens).

**Files added:**
- `tools/isnight.go` — IsNight tool (MCP-free core logic)
- `cmd/isnight.go` — CLI command (flags → tool → JSON output)

**Files changed:**
- `main.go` — Register `isnight` command and usage text

## [2026-04-17] Numpad-friendly keybinds in classify TUI

Two keyboard tweaks to make the TUI easier to drive from the numeric keypad
while labeling kiwi calls:

- **Numpad Enter plays audio.** The Enter-key handler in `tui/classify.go` now
matches both `tea.KeyEnter` and `tea.KeyKpEnter`, so the keypad's Enter key
plays the current segment like the main Enter (and still respects Shift for
half-speed playback). Previously, terminals that disambiguate keypad keys
(e.g. via Kitty keyboard protocol) delivered numpad Enter as `KeyKpEnter`,
which fell through the handler and did nothing.
- **Arrow keys navigate segments.** Left arrow now does prev-segment (same as
`,`) and right arrow does next-segment (same as `.`), so the user can
navigate without moving their hand off the numpad.

**Files changed:**
- `tui/classify.go` — Enter branch matches `KeyKpEnter`; `,`/`.` switch cases
also match `"left"`/`"right"`

## [2026-04-05] Simplify calls classify TUI

**Static segment list:** Filtered segments are now computed once at startup and cached.
Reclassifying a segment no longer removes it from the navigation list mid-session.
This fixes instability/crashes when working fast with `--species` or other filters.

**Replace goto dialog with `--goto` flag:**
- Removed ctrl+g goto dialog from TUI (and all supporting code)
- Added `--goto <filename>` CLI flag that opens on the first matching segment in the named file
- Removed `GotoFile()` and `TotalFiles()` methods from `ClassifyState`

**Internal:** Added `NewClassifyState()` constructor for tests. All `getFilteredSegments()` calls
replaced with pre-computed `filteredSegs` cache parallel to `DataFiles`.

**Files changed:**
- `tools/calls_classify.go` — cached segments, `--goto` support, removed dynamic filtering
- `tui/classify.go` — removed goto dialog (model fields, handler, renderer, keybind)
- `cmd/calls_classify.go` — added `--goto` flag parsing
- `tools/calls_classify_*_test.go` — updated to use `NewClassifyState()`

## [2026-04-04] New `prepend` command

Rename WAV files, their .data files, and log.txt by prepending a location prefix.

**Usage:**
```bash
skraak prepend --folder <path> --prefix <string> [--recursive] [--dry-run]
```

**Target files:**
- `*.wav`, `*.WAV` — Only if starting with datestring `YYYYMMDD_HHMMSS`
- `*.wav.data`, `*.WAV.data` — Only if starting with datestring `YYYYMMDD_HHMMSS`
- `log.txt` — Always renamed (exact name match)

**Flags:**
- `--folder <path>` — Target folder (required)
- `--prefix <string>` — String to prepend (required)
- `--recursive` — Include 1 level of subfolders
- `--dry-run` — Show what would be renamed without doing it

**Behavior:**
- Files already starting with `<prefix>_` are skipped with reason "already prefixed"
- WAV files without datestring prefix are skipped with reason "no datestring prefix"
- Non-target files are silently ignored
- Idempotent: running twice is safe

**Examples:**
```bash
# Rename files in a folder
skraak prepend --folder ./recordings --prefix LOC001

# Include subfolders (1 level deep)
skraak prepend --folder ./data --prefix SITE_A --recursive

# Preview changes
skraak prepend --folder ./test --prefix TEST --dry-run
```

**Changes:**
- `tools/prepend.go` — Core logic (datestring detection, file renaming)
- `tools/prepend_test.go` — Unit tests
- `cmd/prepend.go` — CLI command with flag parsing
- `main.go` — Added to command dispatcher

## [2026-04-03] Added `--bookmark` and `--comment` flags to `calls modify`

Allow agents and users to bookmark segments and add comments for information preservation in .data files.

**New flags:**
- `--bookmark` — Mark segment as bookmarked for navigation (boolean flag, sets `bookmark=true`)
- `--comment <text>` — Add user comment (max 140 chars, ASCII only)

**Usage:**
```bash
# Bookmark a segment for later review
skraak calls modify --file recording.data --reviewer GLM-5 \
--filter mymodel --segment 12-15 --certainty 100 --bookmark

# Add a comment to a segment
skraak calls modify --file recording.data --reviewer GLM-5 \
--filter mymodel --segment 12-15 --certainty 100 --comment "Good example of duet"
```

**Behavior:**
- `--bookmark` sets `bookmark=true` on the label
- `--comment` stores text in the label's comment field
- Comment validation: max 140 characters, ASCII only
- If all specified values match current values, no modification made (error)

**Changes:**
- `tools/calls_modify.go` — Added `Bookmark` and `Comment` fields to input/output structs, validation logic
- `cmd/calls_modify.go` — Added `--bookmark` and `--comment` flag parsing

## [2026-04-02] New `calls modify` command

Modify a label in a .data file from the command line.

**Usage:**
```bash
skraak calls modify --file recording.data --reviewer GLM-5 \
--filter mymodel --segment 12-15 --certainty 100 --species Kiwi+Male
```

**Required flags:**
- `--file <path>` — Path to .data file
- `--reviewer <name>` — Reviewer name (always set on file metadata)
- `--filter <name>` — Filter name to match labels
- `--segment <start>-<end>` — Segment time range (integer seconds, e.g., `12-15`)
- `--certainty <int>` — Certainty value (0-100)

**Optional flags:**
- `--species <name>` — Species to set (e.g., `Kiwi`, `Kiwi+Male`, `Noise`)

**Segment matching:**
- Segments matched by `floor(start_time)` and `ceil(end_time)`
- A segment from 12.3s to 14.5s matches `--segment 12-15`

**Behavior:**
- Always updates reviewer on file metadata
- If `--species` provided: sets species and calltype (or clears calltype if not specified)
- If species+calltype AND certainty match current values, no modification made (error)
- Error if no matching segment or label found (no-op on error)

**Use cases:**
- Correct classification: `--certainty 100` only (confirms existing species)
- Incorrect classification: `--species NewSpecies --certainty 100` (changes both)

**Changes:**
- `tools/calls_modify.go` — New file, core logic
- `cmd/calls_modify.go` — New file, CLI parsing
- `cmd/calls.go` — Added `modify` subcommand

## [2026-04-02] Clip feature in `calls classify` TUI

Added `ctrl+s` keybinding to save a clip of the current segment directly from
the classification TUI.

**Keybinding:** `ctrl+s` → type prefix → `enter` to save, `esc` to cancel

**Output files:**
- `<prefix>_<basename>_<start>_<end>.png` — 224x224 color spectrogram (L4 colormap)
- `<prefix>_<basename>_<start>_<end>.wav` — audio clip (16kHz if downsampled)

Files are saved to the current working directory where `skraak` was launched.
Error if files already exist (no overwrite).

**Changes:**
- `tui/classify.go` — Added `clipMode` state, `handleClipKey()`, `renderClipDialog()`,
and `saveClip()` function; added `ctrl+s` keybinding; updated help line

## [2026-04-02] New `calls clip` command

Generate audio clips and spectrogram images from .data file segments.
Useful for extracting training data or creating datasets for ML.

**Usage:**
```bash
skraak calls clip --file recording.data --output ./clips --prefix train
skraak calls clip --folder ./data --output ./clips --prefix kiwi \
--filter opensoundscape-kiwi-1.2 --species Kiwi --size 448 --color
```

**Output files:**
- `<prefix>_<basename>_<start>_<end>.png` — spectrogram image (224-896px)
- `<prefix>_<basename>_<start>_<end>.wav` — audio clip (16kHz if downsampled)

where `basename` is the WAV filename without `.wav` extension.

**Features:**
- Single file (`--file`) or batch folder (`--folder`) processing
- Filter by ML model (`--filter`) and/or species (`--species`)
- Species can include calltype: `Kiwi+Duet`
- `--size <int>` — spectrogram image size (224-896px, default 224)
- `--color` — apply L4 colormap (default: grayscale)
- Error if output files already exist (no overwrite)
- WAV files downsampled to 16kHz if input > 16kHz

**New utilities:**
- `utils.WriteWAVFile(path, samples, sampleRate)` — write mono 16-bit PCM WAV
- `utils.WritePNG(img, writer)` — write image as PNG

**Changes:**
- `utils/wav_writer.go` — New file, WAV writer implementation
- `utils/terminal_image.go` — Added `WritePNG()` function
- `tools/calls_clip.go` — New file, core clip logic
- `cmd/calls_clip.go` — New file, CLI parsing
- `cmd/calls.go` — Added `clip` subcommand

## [2026-04-02] Shared spectrogram generation for show-images and classify

Refactored spectrogram image generation into a shared utility function, reducing
duplication between `calls show-images` and `calls classify` TUI.

**New utility:**
- `utils.GenerateSegmentSpectrogram(dataFilePath, startTime, endTime, color, imgSize)` -
generates a spectrogram image from a segment, handling WAV loading, downsampling,
and image creation in one call.

**Changes:**
- `utils/spectrogram.go` — Added `GenerateSegmentSpectrogram()` function
- `tools/calls_show_images.go` — Now uses `utils.ParseDataFile()` (includes labels) and
`GenerateSegmentSpectrogram()`; removed local `Segment` struct and `parseDataFile()`;
segment info now shows labels when present
- `tui/classify.go` — `generateSpectrogramImage()` now delegates to shared function

**Future:** show-images now has access to segment labels, enabling future filtering
by filter/ml model and species+calltype.

## [2026-03-29] Goto file feature for `calls classify` TUI

Added `ctrl+g` keybinding to jump directly to any file by number. The dialog accepts
a file number (1-based) and jumps to the first segment of that file.

**Keybinding:** `ctrl+g` → type number → `enter` to jump, `esc` to cancel

**Changes:**
- `tools/calls_classify.go` — Added `TotalFiles()` and `GotoFile()` methods to `ClassifyState`
- `tui/classify.go` — Added `gotoMode` and `gotoInput` state; `ctrl+g` keybinding;
`handleGotoKey()` for digit/backspace/enter/esc handling; `renderGotoDialog()` for UI display

## [2026-03-29] Clarify segment counts in TUI

Updated progress display to explicitly label the segment count.

**Changes:**
- `tui/classify.go` — Changed title line from `file [progress] 1/40826` to `file [progress] 1/40826 Segments`
- `cmd/calls_classify.go` — Updated startup message to clarify filtered counts
- `tools/calls_classify.go` — Added tests to verify filtering behavior
- Confirmed `TotalSegments()` and `CurrentSegmentNumber()` correctly use `getFilteredSegments()`
- Files with no matching segments are pruned during load (existing behavior)

## [2026-03-29] `--species` flag for `calls classify`

Added `--species` flag to scope classification to a single species (and optionally calltype).
Composable with `--filter` for focused review of specific detections within an ML model's output.

**Examples:**
```bash
# Review only Kiwi Duet calls from a specific filter
skraak calls classify --folder ./data --reviewer dave --bind k=Kiwi \
--filter opensoundscape-kiwi-1.2 --species Kiwi+Duet

# Review all Kiwi calls (any calltype)
skraak calls classify --folder ./data --reviewer dave --bind k=Kiwi --species Kiwi
```

**Changes:**
- `tools/calls_classify.go` — Added `Species` and `CallType` fields to `ClassifyConfig`;
extended `getFilteredSegments()` with `segmentMatchesFilters()` for AND-composable
filter+species+calltype matching; prune data files with no matching segments on load
- `cmd/calls_classify.go` — Parse `--species` flag (rejects duplicates), zero-segment
guard before TUI launch, comprehensive `printClassifyUsage()`

## [2026-03-29] Codebase consistency improvements

**Changes:**
- `tools/import_file.go` — Single DB connection per `ImportFile()` call (was 3), uses
`validateHierarchyIDs()`, passes `ctx` and `*sql.DB` to helpers
- `tools/import_files.go` — Extracted `validateHierarchyIDs()` for reuse
- `tools/bulk_file_import.go` — `bulkCreateCluster` uses `db.BeginLoggedTx()` for
transaction audit logging
- `cmd/common.go` — Extracted `initEventLog()` helper, replacing 14 instances of
6-line event log boilerplate across 7 cmd files
- `tools/export.go` — Documented why `fmt.Sprintf` for table names is safe (hardcoded manifest)
- `tools/location.go` — Fixed `Exec` → `ExecContext` for context propagation consistency
- `utils/cluster_import.go` — Exported `LocationData` and `GetLocationData` for cross-package use
- Removed duplicate godoc comments on several tool functions

## [2026-03-19] NOT NULL Constraint Validation in Bulk Import

Added empty-string validation for CSV fields in `bulkReadCSV()` (`tools/bulk_file_import.go`).

Audited all INSERT/UPDATE paths for NOT NULL constraint enforcement. Found one gap:
`record[3]` (DateRange → cluster name) was not validated for empty strings. Also added
validation for `record[0]` (location_name) and `record[2]` (directory_path) which would
cause downstream failures if empty.

**Changes:**
- `tools/bulk_file_import.go` — validate `location_name`, `directory_path`, and `date_range`
CSV fields are non-empty (with TrimSpace) before building `bulkLocationData` structs

## [2026-03-14] Remove import_ml_selections (Deprecated)

**Breaking Change:** Removed deprecated `import selections` CLI command and `import_ml_selections` MCP tool.

The `import segments` command is the replacement, offering:
- AviaNZ .data file import (industry standard)
- Species/calltype mapping file validation
- Transactional imports with proper error handling
- Simpler, more maintainable codebase

**Removed:**
- `tools/import_ml_selections.go` (1134 lines)
- `cmd/mcp.go` — `import_ml_selections` MCP tool registration
- `cmd/import.go` — `selections` CLI subcommand

**Changes:**
- `utils/mapping.go` — Exported `Placeholders()` function for reuse

## [2026-03-14] Import Segments - Fix Orphaned Segments

**Fix:** Segments with no valid labels are now deleted from the database.

When a segment's labels all fail validation (e.g., missing species in mapping), the segment
was previously left orphaned in the database with no labels. Now the segment is deleted within
the same transaction, maintaining data integrity.

**Changes:**
- `tools/import_segments.go` — Delete orphaned segments when all labels fail validation
- `utils/mapping_test.go` — Unit tests for mapping file loading and validation
- `tools/import_segments_test.go` — Unit tests for input validation and segment counting
- `utils/data_file_test.go` — Added tests for skraak_hash and skraak_label_id round-trip

## [2026-03-14] Import Segments Command

**Feature:** New `skraak import segments` command to import AviaNZ .data segments into the database.

**Changes:**
- `utils/mapping.go` — New utilities for loading and validating species/calltype mapping files
- `tools/import_segments.go` — New tool with `ImportSegments()` function
- `cmd/import.go` — Added `segments` subcommand

**Usage:**
```bash
skraak import segments \
--db ./db/skraak.duckdb \
--dataset gljgxDbfasva \
--location ZEVWGbXzB1bl \
--cluster q7w-iQgyZOYV \
--folder /path/to/data \
--mapping mapping.json
```

**Mapping file format** (`mapping.json`):
```json
{
"Don't Know": {
"species": "Don't Know"
},
"GSK": {
"species": "Roroa",
"calltypes": {
"Male": "Male - Solo",
"Female": "Female - Solo"
}
}
}
```

**Output structure:**
```json
{
"summary": {
"data_files_found": 42,
"data_files_processed": 42,
"total_segments": 342,
"imported_segments": 342,
"imported_labels": 356,
"imported_subtypes": 280,
"processing_time_ms": 1234
},
"segments": [...],
"errors": []
}
```

**Invariants enforced:**
- All file hashes must already exist in database for the cluster
- All files must have no existing labels (fresh imports only)
- All filters, species, and calltypes must exist in database
- Segments with `bookmark: true` labels are skipped
- Mapping must cover all species found in .data files

**Database writes:**
- `segment` table: id, file_id, dataset_id, start_time, end_time, freq_low, freq_high
- `label` table: id, segment_id, species_id, filter_id, certainty
- `label_metadata` table: `{"comment": "..."}` (only if comment present)
- `label_subtype` table: id, label_id, calltype_id, filter_id, certainty (if calltype present)

**Data file updates:**
- `skraak_hash` written to metadata section (first element of .data array)
- `skraak_label_id` written to each label object

**Rationale:**
AviaNZ .data files contain segment annotations from both manual review and ML filters. This command imports those segments into the skraak database with proper species/calltype mapping, enabling integrated analysis across all annotation sources.

## [2026-03-13] Calls Summarise Command

**Feature:** New `skraak calls summarise` command to analyse .data files after classification.

**Changes:**
- `tools/calls_summarise.go` — New tool with `CallsSummarise()` function
- `cmd/calls.go` — Added `summarise` subcommand

**Usage:**
```bash
skraak calls summarise --folder ./recordings > summary.json
skraak calls summarise --folder ./recordings | jq 'del(.segments)' # summary only
```

**Output structure:**
```json
{
"segments": [...],
"data_files_read": 27,
"data_files_skipped": [],
"total_segments": 47,
"filters": {
"opensoundscape-kiwi-1.2": {
"segments": 20,
"species": {"Kiwi": 15, "Don't Know": 5},
"calltypes": {"Kiwi": {"Male": 10, "Duet": 5}}
}
},
"review_status": {
"unreviewed": 30,
"confirmed": 10,
"dont_know": 5,
"with_calltype": 8,
"with_comments": 3,
"bookmarked": 2
},
"operators": ["Auto"],
"reviewers": ["David", "None"]
}
```

**Review status definitions:**
- `unreviewed`: certainty < 100 (default from detection)
- `confirmed`: certainty = 100 (user pressed bind key)
- `dont_know`: certainty = 0

**Calltypes:** Only appears in filters when species have calltypes set, showing per-species calltype counts.

**Rationale:**
After running `skraak classify` on .data files, it's difficult to understand the state of classifications. This command provides a comprehensive summary with both detailed segments array and aggregated statistics.

## [2026-03-10] Spectrogram Sample Rate Limiting

**Feature:** Spectrograms now automatically downsample high sample rate audio to 16kHz.

**Changes:**
- `utils/spectrogram.go` — Added `DefaultMaxSampleRate = 16000` constant
- `utils/resample.go` — Added `ResampleRate()` function for sample rate conversion
- `tools/calls_show_images.go` — Downsample segments before spectrogram generation
- `tui/classify.go` — Downsample segments before spectrogram generation

**Rationale:**
- High sample rates (e.g., 250kHz bat detectors) produce very tall spectrograms
- Birds are typically in 0-8kHz range; 16kHz sample rate (Nyquist = 8kHz) is sufficient
- Audio playback unchanged — plays at original sample rate

**Behavior:**
| Original Rate | Spectrogram Rate | Playback Rate |
|---------------|------------------|---------------|
| 8000 Hz | 8000 Hz | 8000 Hz |
| 16000 Hz | 16000 Hz | 16000 Hz |
| 44100 Hz | 16000 Hz | 44100 Hz |
| 250000 Hz | 16000 Hz | 250000 Hz |

## [2026-03-09] Case-Preserving WAV File Finding

**Fix:** WAV files with lowercase `.wav` extension now produce correct `.wav.data` files.

**Changes:**
- `tools/calls_from_preds.go` — Added `findWAVFile()` helper function
- `tools/calls_from_birda.go` — Updated to use `findWAVFile()`
- `tools/calls_from_raven.go` — Updated to use `findWAVFile()`

**Problem:** Previous code hardcoded `.WAV` extension, causing issues on case-sensitive filesystems:
- `abc.wav` would fail to be found
- Or produce `abc.WAV.data` instead of `abc.wav.data`

**Solution:** `findWAVFile(dir, baseName)` searches for:
1. `.WAV` (most common for main recordings)
2. `.wav` (common for clips)
3. `.Wav` (edge case)
4. Case-insensitive glob fallback

**Result:**
| WAV File | .data File |
|----------|------------|
| `abc.WAV` | `abc.WAV.data` |
| `abc.wav` | `abc.wav.data` |
| `abc.Wav` | `abc.Wav.data` |

## [2026-03-09] Bookmark Navigation in TUI

**New feature:** Bookmark segments for later review.

**Changes:**
- `utils/data_file.go` — Added `Bookmark bool` to Label struct
- `tools/calls_classify.go` — Added bookmark methods
- `tui/classify.go` — Added key handlers and display
- `tui/classify.go` — Header lines now wrap at 80 characters

**Format** (stored in label):
```json
[0, 3, 0, 16000, [{"species": "Kiwi", "certainty": 90, "filter": "BirdNET", "bookmark": true}]]
```

**Key bindings:**
| Key | Action |
|-----|--------|
| `Ctrl+D` | Toggle bookmark on current segment |
| `Ctrl+,` | Previous bookmark (wraps around) |
| `Ctrl+.` | Next bookmark (wraps around) |

**Behavior:**
- Bookmark lives on the filter-matching label
- `--filter BirdNET` shows bookmarks on BirdNET labels only
- No filter shows all bookmarks
- Wrap-around navigation with loop detection
- `[BOOKMARKED]` indicator shown in segment info

## [2026-03-09] Comment Dialog Editing in TUI

**Enhancement:** Full cursor editing support in the comment dialog.

**Changes:**
- `tui/classify.go` — Added cursor position tracking and navigation

**New features:**
| Key | Action |
|-----|--------|
| `←` / `→` | Move cursor left/right |
| `Space` | Insert space at cursor |
| `Backspace` | Delete character before cursor |
| `Delete` | Delete character at cursor |
| `Ctrl+A` | Move cursor to start |
| `Ctrl+E` | Move cursor to end |

**Fixed:**
- Space bar now works in comment dialog
- Backspace deletes at cursor position, not just at end

## [2026-03-09] New Commands: calls from-birda and calls from-raven

**New feature:** Import BirdNET and Raven annotation files to .data files.

**Added:**
- `tools/calls_from_birda.go` — BirdNET results file parser
- `tools/calls_from_raven.go` — Raven selections file parser
- `cmd/calls.go` — New subcommands `from-birda` and `from-raven`
- `tools/calls_from_birda_raven_test.go` — 10 test cases

**Commands:**
```bash
# BirdNET (filter always "BirdNET")
./skraak calls from-birda --folder /path/to/recordings
./skraak calls from-birda --file recording.BirdNET.results.csv [--delete]

# Raven (filter always "Raven")
./skraak calls from-raven --folder /path/to/recordings
./skraak calls from-raven --file recording.Table.1.selections.txt [--delete]
```

**File formats:**
- BirdNET: `*.BirdNET.results.csv` (CSV with BOM, columns: Start, End, Scientific name, Common name, Confidence, File)
- Raven: `*.selections.txt` (Tab-separated, columns: Begin Time, End Time, Low Freq, High Freq, Species)

**Behavior (same as from-preds):**
- Filter is always parsed from filename (no `--filter` option)
- No clobber: if filter already exists, error
- Merge: if different filter exists, append segments
- Confidence (BirdNET) converted from 0.0-1.0 to 0-100
- Frequency range preserved from Raven selections
- `--delete` option removes source files after successful import

**Tests:** 10 new tests covering:
- New .data file creation
- Same filter rejection (no clobber)
- Different filter merge
- Delete option
- Folder mode (BirdNET only)
- Multiple selections (Raven only)

## [2026-03-09] Safe .data File Writing in calls-from-preds

**Breaking change:** Filter must now be non-empty. Previously empty filter was allowed.

**Problem:** `calls-from-preds --write-dot-data` would silently clobber existing `.data` files, potentially destroying manual annotations.

**Solution:** Implemented safe write logic that protects existing data:

1. **No existing file** → Write new file (unchanged behavior)
2. **Existing file, same filter** → Error: "file already contains filter 'X' (refusing to clobber)"
3. **Existing file, different filter** → Merge segments (append new, sort by time)
4. **Existing file, parse error** → Error: "cannot parse existing file (refusing to clobber)"

**Changes:**
- `tools/calls_from_preds.go` — Added `writeDotDataFileSafe()` for safe write/merge logic
- `tools/calls_from_preds.go` — Added filter validation: empty filter now returns error
- `tools/calls_from_preds.go` — Filter defaults to CSV filename parsing if `--filter` not specified
- `tools/calls_from_preds.go` — Added `convertAviaNZSegment()` and `buildAviaNZMetaAndSegments()` helpers

**Filter logic:**
- If `--filter "name"` specified → use that filter
- If `--filter` not specified → parse from CSV filename (e.g., `predsST_opensoundscape-kiwi-1.2_2025-11-12.csv` → `opensoundscape-kiwi-1.2`)
- If filter is empty string → error

**Error handling:** First error stops batch processing (existing behavior preserved).

**Tests added:** `tools/calls_from_preds_test.go` with 7 test cases:
- Empty filter returns error
- New .data file created when none exists
- Existing file with same filter returns error (refuses to clobber)
- Existing file with different filter merges segments
- Existing file with parse error returns error (refuses to clobber)
- Explicit filter via `--filter` flag
- Non-parsable filename without filter returns error

## [2026-03-07] JSON Schema for AviaNZ .data Files

**New feature:** Added JSON Schema (Draft 2020-12) for validating AviaNZ .data annotation files.

**Added:**
- `db/avianz_data_schema.json` — Comprehensive schema for .data file format

**Schema coverage:**
- Root array with metadata object first, then segment arrays
- Meta object with `Operator`, `Reviewer`, `Duration` (optional, allows extra fields)
- Segment array: 5-element tuple `[starttime, endtime, freq_low, freq_high, labels]`
- Label object with required `species` and `certainty` (0-100)
- Optional fields: `filter`, `calltype`, `comment` (max 140 chars)
- Additional properties allowed on all objects (extensibility)
- Pattern constraint: `species` must not contain `>` separator

**Validation tests:**
- Missing required fields caught
- Certainty range (0-100) enforced
- Comment length (max 140) enforced
- Minimal valid files accepted

## [2026-03-07] Comment Feature in Classify TUI

**New feature:** Press spacebar in the classify TUI to add/edit comments on labels.

**Changes:**
- `utils/data_file.go` — Added `Comment` field to `Label` struct, parse/write handling
- `tools/calls_classify.go` — Added `SetComment()` and `GetCurrentComment()` methods, `Comment` field in `BindingResult`
- `tui/classify.go` — Added `commentMode`/`commentText` state, spacebar opens dialog, text input handling, dialog rendering

**AviaNZ spec compliance:** The spec allows "any additional attributes defined for this call" as key-value pairs. Comments are stored as `"comment": "text"` in the label object.

**Usage:**
- `[space]` — Open comment dialog (pre-fills existing comment)
- Type comment (max 140 chars, ASCII only)
- `[enter]` — Save comment
- `[esc]` — Cancel (discard changes)
- `[backspace]` — Delete last character
- `[ctrl+u]` — Clear all

**Help text:** `[esc]quit [,]prev [.]next [space]comment [enter]play [shift+enter]½speed`

## [2026-03-04] Half-Speed Audio Playback in Classify TUI

**New feature:** Press Shift+Enter in the classify TUI to play audio at half speed.

**Changes:**
- `utils/resample.go` — **NEW** Linear interpolation resampling for speed changes
- `utils/audio_player.go` — Added `PlayAtSpeed(samples, sampleRate, speed)` method
- `tools/calls_classify.go` — Added `PlaybackSpeed` field to `ClassifyState`
- `tui/classify.go` — Detect Shift+Enter modifier, display "▶ Playing 0.5x..." in status
- `tui/classify.go` — Changed quit key from `q` to `Escape` (frees `q` for bindings)

**Usage:** `[esc]quit [enter]play [shift+enter]½speed`

## [2026-03-04] Performance Optimizations for calls-from-preds

**Problem:** Processing 7617 WAV files took 16 minutes due to excessive I/O and sequential processing.

**Changes:**
- `utils/wav_metadata.go` — Added `ParseWAVHeaderMinimal()` that reads only 4KB instead of 200KB per file (50× less I/O). Added separate buffer pool for minimal headers.
- `tools/calls_from_preds.go` — Added parallel processing with 8 workers for .data file generation. Small batches (<10 files) use sequential processing to avoid goroutine overhead.
- `tools/calls_from_preds.go` — Added `ProgressHandler` callback type for progress reporting during long operations.
- `cmd/calls.go` — Added progress indicator showing "Processing WAV files: X/Y (Z%)" during .data file writing.

**Expected improvement:** ~8× faster on multi-core systems due to parallel processing + reduced I/O overhead.

## [2026-03-04] Add iTerm2 Inline Image Protocol Support

**New feature:** Added `--iterm` flag for terminals supporting the iTerm2 Inline Image Protocol (WezTerm, iTerm2, VS Code terminal).

- `utils/terminal_image.go` — Added `ProtocolITerm` enum value and `WriteITermImage()` using charm's `x/ansi/iterm2` package; PNG-encodes then base64-encodes for the iTerm2 escape sequence
- `tools/calls_show_images.go` — Added `ITerm` field to `CallsShowImagesInput`, checked before `Sixel` in protocol selection
- `tools/calls_classify.go` — Added `ITerm` field to `ClassifyConfig`
- `cmd/calls.go` — Added `--iterm` flag to `show-images` subcommand
- `cmd/calls_classify.go` — Added `--iterm` flag to `classify` subcommand
- `tui/classify.go` — Renamed `sixelImageCmd` to `inlineImageCmd` with protocol parameter; changed conditionals from `== ProtocolSixel` to `!= ProtocolKitty` so both sixel and iTerm2 use the same inline rendering path
- `utils/terminal_image_test.go` — Tests for `WriteITermImage`, `WriteImage` routing, and `ClearImages` no-op

## [2026-02-28] Fix Kitty Image Rendering at 448px in Classify TUI

**Bug fix:** Spectrogram display upgraded from 224x224 to 448x448 pixels. Old image artifacts persisted between segment navigations at the larger size.

- `utils/kitty_image.go` — Chunked Kitty protocol transmission (4096-byte chunks) per spec; small images still sent as single payload
- `tui/classify.go` — Return `tea.ClearScreen` on navigation keys (`,`, `.`, bindings) to force full redraw and reliable image clearing
- `tui/classify.go` — `ResizeImage` call updated from 224x224 to 448x448
- `utils/kitty_image_test.go` — Tests for single-chunk, multi-chunk, and clear behavior

## [2026-02-28] Audio Playback in Classify TUI

**New feature:** Press Enter to play the current segment's audio during classification.

- Added `utils/audio_player.go` — wraps ebitengine/oto v3 for PCM playback
- Oto context created lazily on first play, reused across segments
- Converts `[]float64` samples → signed int16 LE for oto
- Playback stops automatically on navigation (`,`/`.`), binding keys, and quit
- "▶ Playing..." indicator shown in segment info line
- New dependency: `github.com/ebitengine/oto/v3` (requires `libasound2-dev` on Linux)

## [2026-02-22] New CLI Command: calls-from-preds

**New feature:** Extract clustered bird calls from ML predictions CSV files.

**Usage:**
```bash
./skraak calls-from-preds --csv predictions.csv > calls.json
```

**How it works:**
1. Reads prediction CSV (file, start_time, end_time, ebird_code columns with 1/0 values)
2. Auto-detects clip duration from first row
3. Groups detections by (file, ebird_code) and sorts by start_time
4. Clusters consecutive detections where gap ≤ 3 × clip_duration
5. Filters out single detections (configurable via constant)

**Constants (easily changeable):**
```go
CLUSTER_GAP_MULTIPLIER = 3 // Gap threshold = 3 × clip_duration
MIN_DETECTIONS_PER_CLUSTER = 1 // Filter single detections
```

**Performance:** 400k+ rows processed in ~0.67 seconds

**Output example:**
```json
{
"calls": [
{"file": "path.WAV", "start_time": 0, "end_time": 32, "ebird_code": "tomtit1", "detections": 11}
],
"total_calls": 62593,
"species_count": {"tomtit1": 12636, ...},
"files_count": 14017
}
```

**Files:**
- `tools/calls_from_preds.go` — Core clustering logic
- `cmd/calls_from_preds.go` — CLI handler

---

## [2026-02-21] Remove import_audio_file MCP Tool

**Breaking change:** Removed `import_audio_file` MCP tool. Use CLI command `skraak import file` for single file imports.

**Rationale:** The MCP tool was redundant since:
1. Single file imports are better suited for CLI use (requires file path on local machine)
2. `import_audio_files` handles batch imports efficiently via MCP
3. Reduces MCP tool count from 11 to 10

**Changes:**
- **`cmd/mcp.go`** — Removed `import_audio_file` tool registration and adapter
- **`tools/import_file.go`** — Kept for CLI use only
- **`cmd/import.go`** — CLI command `skraak import file` unchanged

**Migration:** Use CLI command instead:
```bash
./skraak import file --db ./db/skraak.duckdb --dataset abc123 --location loc456 --cluster clust789 --path /path/to/file.wav
```

---

## [2026-02-21] Verb-First CLI Commands

**Breaking change:** Replaced resource-first CLI commands with natural language verb-first structure.

**Before:**
```bash
./skraak dataset create --name "Test"
./skraak location update --id abc123 --name "Updated"
```

**After:**
```bash
./skraak create dataset --name "Test"
./skraak update location --id abc123 --name "Updated"
```

**Changes:**
- **`main.go`** — Removed legacy `dataset`, `location`, `cluster`, `pattern` commands
- **`cmd/create.go`** — New verb-first create handler
- **`cmd/update.go`** — New verb-first update handler
- **`cmd/dataset.go`, `cmd/location.go`, `cmd/cluster.go`, `cmd/pattern.go`** — Exported create/update functions
- **Shell scripts** — Updated `test_bulk_import.sh` and `test_event_log.sh` to use new syntax

**Benefits:**
- Natural language flow: "create dataset" vs "dataset create"
- Consistent with `skraak import file/folder/bulk` pattern
- More intuitive for users
- Maintains clean tool separation in `@tools/` directory

**Migration:** Legacy commands now return "Unknown command" error, forcing adoption of new syntax.

---

## [2026-02-21] Fix Event Log Pointer Serialization

**Bug fix:** Event log contained pointer addresses instead of values for nullable database fields (`*float64`, `*GainLevel`, etc.), causing replay failures.

**Root cause:** `marshalParam()` in `db/tx_logger.go` didn't handle pointer types for numeric values or named type aliases (like `db.GainLevel`). These fell through to `fmt.Sprintf("%v", pointer)` which printed memory addresses like `"0x38a7bfb12078"`.

**Example of corrupted data:**
```json
"parameters": ["file_id", "2025-05-18T18:30:00+13:00", "248AB50053AB1B4A", "0x38a7bfb12078", "0x38a7bfb12088", "0x38a7bfb12090"]
```
The last three values should have been `gain`, `battery_v`, `temp_c` but were pointer addresses.

**Fixed:**
- `db/tx_logger.go` — Added explicit cases for all pointer types (`*int`, `*int64`, `*float64`, `*bool`, etc.)
- `db/tx_logger.go` — Added reflection-based fallback in default case to handle pointer-to-named-type (e.g., `*GainLevel`)
- `cmd/replay.go` — Increased `bufio.Scanner` buffer from 64KB to 20MB to handle large event lines (17,000 files = ~16 MB JSON line)

**Tests added:**
- `db/tx_logger_test.go` — Tests for `*int`, `*int64`, `*float64`, `*float32`, `*bool` with nil and value cases
- `db/tx_logger_test.go` — Tests for named type aliases and pointer-to-named-type

---

## [2026-02-19] Fix Update Commands - Preserve Unset Fields

**Bug fix:** Update commands were overwriting existing values with empty strings when optional flags weren't provided.

**Root cause:** CLI code set pointers to empty strings even when flags weren't provided, causing tools layer to interpret them as intentional empty values.

**Fixed:**
- `cmd/dataset.go` — `runDatasetUpdate()` now only sets pointer fields when flags have non-empty values
- `cmd/location.go` — `runLocationUpdate()` now only sets pointer fields when flags have non-empty values
- `cmd/cluster.go` — Already correct (only sets fields when provided)
- `cmd/pattern.go` — Already correct (only sets fields when provided)

**Tests added:**
- `tools/update_test.go` — Unit tests verifying update preserves unset fields for all entity types

---

## [2026-02-19] Schema Simplification - Remove species_dataset and ebird_taxonomy_v2024

**Database schema changes:**
- Dropped `species_dataset` table — all species now available across all datasets
- Dropped `ebird_taxonomy_v2024` table — use `WHERE taxonomy_version = '2024'` on `ebird_taxonomy` instead

**Rationale:**
- Simplifies species management (no duplicate species names across datasets)
- Reduces schema complexity (one fewer join for species lookups)
- `ebird_taxonomy_v2024` was redundant; filtering `ebird_taxonomy` directly is sufficient

**Code changes:**
- `tools/export.go` — Simplified manifest: `species` and `call_type` now "copy" (full table)
- `tools/export.go` — Removed `buildDerivedTableCreate()`, `populateDerivedTable()`, simplified `buildReferencedQuery()`
- `tools/import_ml_selections.go` — Species lookup no longer joins `species_dataset`
- `resources/schema.go` — Removed tables from list
- `db/schema_test.go` — Removed obsolete test cases
- `prompts/examples.go` — Updated taxonomy schema description

**Export manifest changes:**
- `species_dataset` → removed (no longer exists)
- `ebird_taxonomy_v2024` → removed (no longer exists)
- `species` → changed from "referenced" to "copy"
- `call_type` → changed from "referenced" to "copy"
- `filter` → changed from "referenced" to "copy"
- All "referenced" and "derived" handling code removed

---

## [2026-02-19] Dataset Export for Collaboration and Testing

**New feature: Export a dataset with all related data to a new database**

**Purpose:** Enable dataset-level exports for collaboration (export, modify, replay changes), testing (small focused test DBs), and archival.

**Architecture:**
- Schema read from embedded `db/schema.sql` (DDL statements extracted dynamically)
- Table copy order computed from FK relationships using `duckdb_constraints()`
- ATTACH mechanism for efficient cross-database copying
- Declarative manifest defines table relationships

**Added:**
- `tools/export.go` — `ExportDataset()` with table manifest and copy logic
- `cmd/export.go` — `skraak export dataset` CLI command
- `db/schema.go` — Schema utilities: `ReadSchemaSQL()`, `ExtractDDLStatements()`, `GetFKOrder()`
- `shell_scripts/test_export.sh` — Integration test script

**Command:**
```bash
skraak export dataset --db skraak.duckdb --id abc123 --output export.duckdb
skraak export dataset --db skraak.duckdb --id abc123 --output export.duckdb --dry-run
skraak export dataset --db skraak.duckdb --id abc123 --output export.duckdb --force
```

**What's exported:**
- Dataset row and all owned data (locations, clusters, files, selections, labels)
- Reference tables copied in full (`ebird_taxonomy`, `species`, `call_type`, `cyclic_recording_pattern`, `filter`)
- Empty event log created for capturing changes

**Design decisions:**
- Schema from `schema.sql` ensures schema-resilience (new columns auto-included)
- FK order computed dynamically via `duckdb_constraints()` function
- Close source DB before output DB (DuckDB single-connection limit)
- `SELECT *` copies all columns without hard-coding

**Testing:**
- `db/schema_test.go` — Unit tests for DDL extraction and FK ordering
- Integration tests verify row counts match source
- Error handling tests for missing dataset, existing file

---

## [2026-02-18] Event Log for Database Mutation Replay

**New feature: SQL-level event logging for backup synchronization**

**Purpose:** Capture all mutating SQL operations (INSERT, UPDATE, DELETE) to enable replay on backup databases for synchronization.

**Architecture:**
- Transaction wrapper (`db.LoggedTx`) intercepts all mutations
- Logged only on successful commit (rollback discards recorded queries)
- Events written to JSONL file (`<database>.events.jsonl`)
- Prepared statements fully supported via `LoggedStmt` wrapper

**Added:**
- `db/tx_logger.go` — LoggedTx, LoggedStmt, TransactionEvent types
- `cmd/replay.go` — `skraak replay events` CLI command
- `shell_scripts/test_event_log.sh` — Integration test script

**Modified:**
- All CLI commands initialize event log with defer close
- All tools use `db.BeginLoggedTx()` instead of `database.BeginTx()`
- `utils/cluster_import.go` updated for batch imports

**Event format (JSONL):**
```json
{
"id": "V1StGXR8_Z5jdHi6B-myT",
"timestamp": "2026-02-18T14:30:22+13:00",
"tool": "create_or_update_dataset",
"queries": [
{"sql": "INSERT INTO ...", "parameters": [...]}
],
"success": true,
"duration_ms": 45
}
```

**Replay command:**
```bash
skraak replay events --db backup.duckdb --log skraak.duckdb.events.jsonl
skraak replay events --db backup.duckdb --log events.jsonl --dry-run
skraak replay events --db backup.duckdb --log events.jsonl --last 10
```

**Key design decisions:**
- SQL-level (not tool-level) for complete fidelity including imports
- Tool name included for context/debugging
- Only successful transactions logged
- Failed events skipped during replay
- `--continue` flag to proceed past errors

**Testing:**
- `db/tx_logger_test.go` — 123 unit tests, 75.9% coverage
- Pure function tests (isMutation, marshalParam, JSON marshaling)
- Integration tests with real DuckDB and file system
- Race detector verified

---

## [2026-02-11] CLI Refactoring — Two-Layer Architecture

**Major refactoring: Separated core logic from MCP types, added CLI commands**

**Problem:** All tool functions were tightly coupled to MCP SDK types (`*mcp.CallToolRequest`, `*mcp.CallToolResult`). This meant functionality could only be invoked via MCP protocol — no CLI access for power users.

**Solution:** Two-layer architecture separating core logic from MCP adapters.

**Created:**
- `cmd/mcp.go` — MCP server setup + 10 thin adapter wrappers (~3 lines each)
- `cmd/import.go` — `skraak import bulk` CLI command with flag parsing
- `cmd/sql.go` — `skraak sql` CLI command for ad-hoc queries

**Modified (mechanical, all tools/):**
- Removed `*mcp.CallToolRequest` parameter (was never used — `req` always ignored)
- Removed `*mcp.CallToolResult` from returns (was always empty `&mcp.CallToolResult{}`)
- Removed `import "github.com/modelcontextprotocol/go-sdk/mcp"` from all tool files
- Updated test files (`integration_test.go`, `pattern_test.go`) to match new signatures
- Updated `main.go` to pure dispatcher: `mcp | import | sql`

**Architecture:**
```
main.go → pure dispatcher
cmd/mcp.go → MCP server + adapter wrappers (ONLY file importing mcp SDK)
cmd/import.go → CLI: skraak import bulk --db ... --dataset ... --csv ... --log ...
cmd/sql.go → CLI: skraak sql --db ... "SELECT ..."
tools/*.go → core logic, NO mcp dependency (plain Go structs in/out)
utils/, db/, etc. → unchanged
```

**Benefits:**
- CLI access for power users without MCP
- Token savings (CLI avoids MCP protocol overhead)
- Code sharing between CLI and MCP
- MCP SDK contained to one file
- All tests pass

---

## [2026-02-10] Bulk File Import Cluster Assignment Bug Fix

**Critical Bug Fix: Files now correctly distributed across multiple clusters for same location**

**Problem:** When the same location appeared multiple times in the CSV with different date ranges, all files ended up in the last cluster created instead of being distributed across their respective clusters.

**Root Cause:** The `clusterIDMap` used only `LocationID` as the key, causing each new cluster for the same location to overwrite the previous one in the map.

**Solution:** Changed map key from `LocationID` to composite key `LocationID|DateRange`.

**Modified:**
- `tools/bulk_file_import.go` (lines 125, 171-172, 183-184)

**Impact:**
- Data integrity restored
- Multiple date ranges per location now works correctly
- Simple 3-line fix, backwards compatible

---

## [2026-02-07] File Modification Time Fallback

**Enhancement: Added file modification time as third timestamp fallback**

**Problem:** Small clusters (1-2 files) failed variance-based filename disambiguation because the algorithm needs multiple samples to determine date format (YYYYMMDD vs YYMMDD vs DDMMYY).

**Timestamp Resolution Order:**
```
1. AudioMoth comment → timestamp
2. Filename parsing → timestamp
3. File modification time → timestamp (NEW!)
4. FAIL (skip file with error)
```

**Modified:**
- `utils/cluster_import.go` - Added FileModTime fallback in `batchProcessFiles()`

**Benefits:**
- Fewer failures in small clusters
- No performance impact
- Backwards compatible
- Simple 10-line change

---

## [2026-02-07] Cluster Import Logic Extraction

**Major refactoring: Extracted shared cluster import logic into utils module**

**Key Insight:** A cluster is the atomic unit of import (one SD card / one recording session / one folder).

**Created:**
- `utils/cluster_import.go` (553 lines) - Single source of truth for cluster imports
- `ImportCluster()` - Main entry point
- `scanClusterFiles()` - Recursive WAV file scanning
- `batchProcessFiles()` - Batch processing with variance-based parsing
- `insertClusterFiles()` - Transactional insertion

**Modified:**
- `tools/import_files.go` - 75% code reduction (650 lines → 161 lines)
- `tools/bulk_file_import.go` - Bug fixes:
- **CRITICAL BUG FIXED:** Now inserts into `file_dataset` table (was missing!)
- **CRITICAL BUG FIXED:** Now inserts into `moth_metadata` table (was missing!)

**Benefits:**
- Bug fixed: 68,043 orphaned files found in test database
- ~500 lines of duplicated code eliminated
- Single source of truth for all import logic

---

## [2026-02-06] Tool Consolidation

**Consolidated 8 write/update tools → 4 create_or_update tools**

**Deleted:**
- 8 separate create/update tool files

**Added:**
- `tools/dataset.go` - `create_or_update_dataset`
- `tools/location.go` - `create_or_update_location`
- `tools/cluster.go` - `create_or_update_cluster`
- `tools/pattern.go` - `create_or_update_pattern`

**Design:**
- Omit `id` field → CREATE mode (generates nanoid)
- Provide `id` field → UPDATE mode (verifies exists)

**Benefits:**
- Tool count: 14 → 10
- ~31% less code (~320 lines removed)
- Shared validation logic

---

## [2026-02-06] Test Script Consolidation

**Rationalized and consolidated shell test scripts**

**Removed redundant scripts:**
- 6 incomplete/redundant test scripts

**Current test suite (8 scripts):**
1. `get_time.sh` - Time tool
2. `test_sql.sh` - SQL query tool
3. `test_tools.sh` - All create_or_update tools
4. `test_import_file.sh` - Single file import
5. `test_import_selections.sh` - ML selection import
6. `test_bulk_import.sh` - Bulk CSV import
7. `test_resources_prompts.sh` - Resources/prompts
8. `test_all_prompts.sh` - All 6 prompts

---

## [2026-02-06] Bulk File Import Tool

**New Feature: CSV-based bulk import across multiple locations and clusters**

**Added:**
- `tools/bulk_file_import.go` - CSV-based bulk import (~500 lines)

**Features:**
- CSV-driven import for multiple locations
- Auto-cluster creation
- Progress logging to file
- Summary statistics

**CSV Format:**
```csv
location_name,location_id,directory_path,date_range,sample_rate,file_count
Site A,loc123456789,/path/to/recordings,2024-01,48000,150
```

---

## [2026-02-02] Single File Import Tool

**New Feature: Import individual WAV files**

**Added:**
- `tools/import_file.go` - Single file import implementation (~300 lines)

**Features:**
- Import one WAV file at a time with detailed feedback
- Same processing pipeline as batch import
- Duplicate detection with `is_duplicate` flag
- Atomic operation (succeeds completely or fails)

---

## [2026-01-29] ML Selection Import Tool

**New Feature: Import ML-detected kiwi call selections from folder structure**

**Added:**
- `utils/selection_parser.go` - Selection parsing utilities
- `utils/selection_parser_test.go` - 34 test cases
- `tools/import_ml_selections.go` - MCP tool (~1050 lines)

**Features:**
- Folder structure: `Clips_{filter_name}_{date}/Species/CallType/*.wav+.png`
- Two-pass file matching (exact, then fuzzy)
- Comprehensive validation
- Transactional import

---

## [2026-01-28] Comprehensive Go Unit Testing

**Added comprehensive unit test suite**

**Added:**
- `utils/astronomical_test.go` - 11 test cases
- `utils/audiomoth_parser_test.go` - 36 test cases
- `utils/filename_parser_test.go` - 60 test cases
- `utils/wav_metadata_test.go` - 22 test cases
- `utils/xxh64_test.go` - 6 test cases

**Coverage:**
- 170+ tests total
- 91.5% code coverage

---

## [2026-01-26] Generic SQL Tool + Codebase Rationalization

**Major architectural change: Replaced 6 specialized tools with generic SQL**

**Deleted:**
- 6 specialized query tools (datasets, locations, clusters, files)
- 2 obsolete test scripts

**Added:**
- `tools/sql.go` - Generic `execute_sql` tool (~200 lines)
- `shell_scripts/test_sql.sh` - Comprehensive SQL test suite

**Modified:**
- `prompts/examples.go` - Rewritten to teach SQL patterns

**Benefits:**
- Full SQL expressiveness (JOINs, aggregates, CTEs)
- Infinite query possibilities vs 6 fixed queries
- More aligned with MCP philosophy
- Smaller codebase (2 tools instead of 8)

**Security:**
- Database read-only
- Validation blocks write operations
- Parameterized queries prevent SQL injection
- Row limits prevent overwhelming responses

---

## [2026-01-26] Shell Scripts Organization

**Reorganized all shell scripts into `shell_scripts/` directory**

- Keeps project root clean
- All scripts updated with correct relative paths