package tools

// AviaNZMeta is the metadata element in a .data file
type AviaNZMeta struct {
    Operator string  `json:"Operator"`
    Reviewer *string `json:"Reviewer,omitempty"`
    Duration float64 `json:"Duration"`
}

// AviaNZLabel represents a species label in a segment
type AviaNZLabel struct {
    Species   string `json:"species"`
    Certainty int    `json:"certainty"`
    Filter    string `json:"filter"`
}

// AviaNZSegment represents a detection segment [start, end, freq_low, freq_high, labels]
type AviaNZSegment [5]any
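// exampleDecodeAviaNZSegment is an illustrative sketch, not part of the
// original file: it shows how the positional [5]any layout documented on
// AviaNZSegment can be unpacked after JSON decoding, where encoding/json
// leaves untyped numbers as float64. The function name is hypothetical.
func exampleDecodeAviaNZSegment(seg AviaNZSegment) (start, end, freqLow, freqHigh float64, labels []any) {
    start, _ = seg[0].(float64)
    end, _ = seg[1].(float64)
    freqLow, _ = seg[2].(float64)
    freqHigh, _ = seg[3].(float64)
    labels, _ = seg[4].([]any) // each element carries AviaNZLabel fields
    return
}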
package tools

import (
    "fmt"
    "os"
    "path/filepath"
    "sort"
    "sync/atomic"
)

// parallelResult is the common interface for birda/raven worker results.
type parallelResult interface {
    filePath() string
    getCalls() []ClusteredCall
    wasWritten() bool
    wasSkipped() bool
    getError() error
}

// aggregateStats holds the collected results from a parallel fan-out/fan-in.
type aggregateStats struct {
    calls            []ClusteredCall
    speciesCount     map[string]int
    dataFilesWritten int
    dataFilesSkipped int
    filesProcessed   int
    filesDeleted     int
    firstErr         error
}

// aggregateResults collects results from a channel of parallelResult values,
// handling error tracking, species counting, optional file deletion, and
// progress reporting. Returns the aggregated stats.
func aggregateResults(
    results <-chan parallelResult,
    total int,
    processed *atomic.Int32,
    deleteFiles bool,
    progressHandler func(int, int, string),
) aggregateStats {
    var stats aggregateStats
    stats.speciesCount = make(map[string]int)
    for result := range results {
        if err := result.getError(); err != nil && stats.firstErr == nil {
            stats.firstErr = err
        }
        if result.wasWritten() {
            stats.dataFilesWritten++
        }
        if result.wasSkipped() {
            stats.dataFilesSkipped++
        }
        for _, call := range result.getCalls() {
            stats.calls = append(stats.calls, call)
            stats.speciesCount[call.EbirdCode]++
        }
        stats.filesProcessed++
        stats.maybeDeleteFile(deleteFiles, result)
        if progressHandler != nil {
            current := int(processed.Add(1))
            progressHandler(current, total, filepath.Base(result.filePath()))
        }
    }
    return stats
}

// maybeDeleteFile deletes the source file if requested and it was successfully processed.
func (s *aggregateStats) maybeDeleteFile(deleteFiles bool, result parallelResult) {
    if !deleteFiles || !result.wasWritten() {
        return
    }
    if err := os.Remove(result.filePath()); err != nil {
        if s.firstErr == nil {
            s.firstErr = fmt.Errorf("failed to delete %s: %w", result.filePath(), err)
        }
    } else {
        s.filesDeleted++
    }
}

// sortCallsByFileAndTime sorts calls by filename, then start time.
func sortCallsByFileAndTime(calls []ClusteredCall) {
    sort.Slice(calls, func(i, j int) bool {
        if calls[i].File != calls[j].File {
            return calls[i].File < calls[j].File
        }
        return calls[i].StartTime < calls[j].StartTime
    })
}
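// exampleAggregateUsage is an illustrative sketch, not part of the original
// file: it shows the fan-in contract aggregateResults expects — a results
// channel that will be closed by the producer, a total, and a shared atomic
// counter. callsFromSourceParallel is the production caller; the nil
// progress handler here simply disables progress reporting.
func exampleAggregateUsage(results <-chan parallelResult, total int) (aggregateStats, error) {
    var processed atomic.Int32
    stats := aggregateResults(results, total, &processed, false, nil)
    return stats, stats.firstErr
}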
package tools

import (
    "fmt"
    "os"
    "path/filepath"
    "sort"
    "sync"
    "sync/atomic"
)

// CallsFromSourceInput defines the common input for calls-from-source tools
type CallsFromSourceInput struct {
    Folder          string          `json:"folder"`
    File            string          `json:"file"`
    Delete          bool            `json:"delete"`
    ProgressHandler ProgressHandler `json:"-"` // Optional progress callback
}

// CallsFromSourceOutput defines the common output for calls-from-source tools
type CallsFromSourceOutput struct {
    Calls            []ClusteredCall `json:"calls"`
    TotalCalls       int             `json:"total_calls"`
    SpeciesCount     map[string]int  `json:"species_count"`
    DataFilesWritten int             `json:"data_files_written"`
    DataFilesSkipped int             `json:"data_files_skipped"`
    FilesProcessed   int             `json:"files_processed"`
    FilesDeleted     int             `json:"files_deleted"`
    Filter           string          `json:"filter"`
    Error            *string         `json:"error,omitempty"`
}

// CallSource abstracts a source of bird call data (Raven, BirdNET, etc.)
type CallSource interface {
    // Name returns the display name (e.g. "Raven", "BirdNET")
    Name() string
    // FindFiles discovers source files in the given folder
    FindFiles(folder string) ([]string, error)
    // ProcessFile processes a single source file and returns calls, write/skip status
    ProcessFile(path string, cache *DirCache) (calls []ClusteredCall, written, skipped bool, err error)
}

// callsFromSource is the shared entry point for all call source tools.
func callsFromSource(src CallSource, input CallsFromSourceInput) (CallsFromSourceOutput, error) {
    var output CallsFromSourceOutput
    output.Filter = src.Name()

    // Collect source files to process
    var files []string
    if input.File != "" {
        files = []string{input.File}
    } else if input.Folder != "" {
        var err error
        files, err = src.FindFiles(input.Folder)
        if err != nil {
            errMsg := fmt.Sprintf("Failed to find %s files: %v", src.Name(), err)
            output.Error = &errMsg
            return output, fmt.Errorf("%s", errMsg)
        }
    } else {
        errMsg := "Either --folder or --file must be specified"
        output.Error = &errMsg
        return output, fmt.Errorf("%s", errMsg)
    }
    if len(files) == 0 {
        errMsg := fmt.Sprintf("No %s files found", src.Name())
        output.Error = &errMsg
        return output, fmt.Errorf("%s", errMsg)
    }

    // Single file or small batch: process sequentially (avoid goroutine overhead)
    if len(files) < 10 {
        return callsFromSourceSequential(src, input, files)
    }
    // Large batch: parallel processing with DirCache
    return callsFromSourceParallel(src, input, files)
}

// callsFromSourceSequential processes source files one at a time (for small batches)
func callsFromSourceSequential(src CallSource, input CallsFromSourceInput, files []string) (CallsFromSourceOutput, error) {
    var output CallsFromSourceOutput
    output.Filter = src.Name()

    // Build DirCache once for the folder
    dirCaches := make(map[string]*DirCache)
    if input.Folder != "" {
        dirCaches[input.Folder] = NewDirCache(input.Folder)
    }

    speciesCount := make(map[string]int)
    var allCalls []ClusteredCall
    dataFilesWritten := 0
    dataFilesSkipped := 0
    filesProcessed := 0
    filesDeleted := 0

    for _, file := range files {
        dir := filepath.Dir(file)
        cache := dirCaches[dir]
        if cache == nil {
            cache = NewDirCache(dir)
            dirCaches[dir] = cache
        }

        calls, written, skipped, err := src.ProcessFile(file, cache)
        if err != nil {
            errMsg := fmt.Sprintf("Error processing %s: %v", file, err)
            output.Error = &errMsg
            return output, fmt.Errorf("%s", errMsg)
        }
        if written {
            dataFilesWritten++
        }
        if skipped {
            dataFilesSkipped++
        }
        for _, call := range calls {
            allCalls = append(allCalls, call)
            speciesCount[call.EbirdCode]++
        }
        filesProcessed++

        // Delete if requested and successfully processed
        if input.Delete && written {
            if err := os.Remove(file); err != nil {
                errMsg := fmt.Sprintf("Failed to delete %s: %v", file, err)
                output.Error = &errMsg
                return output, fmt.Errorf("%s", errMsg)
            }
            filesDeleted++
        }

        if input.ProgressHandler != nil {
            input.ProgressHandler(filesProcessed, len(files), filepath.Base(file))
        }
    }

    // Sort all calls by file, then start time
    sort.Slice(allCalls, func(i, j int) bool {
        if allCalls[i].File != allCalls[j].File {
            return allCalls[i].File < allCalls[j].File
        }
        return allCalls[i].StartTime < allCalls[j].StartTime
    })

    output.Calls = allCalls
    output.TotalCalls = len(allCalls)
    output.SpeciesCount = speciesCount
    output.DataFilesWritten = dataFilesWritten
    output.DataFilesSkipped = dataFilesSkipped
    output.FilesProcessed = filesProcessed
    output.FilesDeleted = filesDeleted
    return output, nil
}

// sourceJob represents a single file to process (generic over CallSource)
type sourceJob struct {
    filePath string
}

// sourceResult represents the result of processing a single source file
type sourceResult struct {
    path    string
    calls   []ClusteredCall
    written bool
    skipped bool
    err     error
}

func (r sourceResult) filePath() string          { return r.path }
func (r sourceResult) getCalls() []ClusteredCall { return r.calls }
func (r sourceResult) wasWritten() bool          { return r.written }
func (r sourceResult) wasSkipped() bool          { return r.skipped }
func (r sourceResult) getError() error           { return r.err }

// callsFromSourceParallel processes source files concurrently using a worker pool and DirCache
func callsFromSourceParallel(src CallSource, input CallsFromSourceInput, files []string) (CallsFromSourceOutput, error) {
    var output CallsFromSourceOutput
    output.Filter = src.Name()

    total := len(files)
    var processed atomic.Int32

    // Build DirCache for the folder
    dirCaches := &sync.Map{}
    if input.Folder != "" {
        cache := NewDirCache(input.Folder)
        dirCaches.Store(input.Folder, cache)
    }

    // Create job and result channels
    jobs := make(chan sourceJob, total)
    results := make(chan parallelResult, total)

    // Start workers
    var wg sync.WaitGroup
    for range DOT_DATA_WORKERS {
        wg.Add(1)
        go sourceWorker(src, dirCaches, jobs, results, &wg)
    }

    // Send jobs
    for _, file := range files {
        jobs <- sourceJob{filePath: file}
    }
    close(jobs)

    // Wait for workers to finish, then close results
    go func() {
        wg.Wait()
        close(results)
    }()

    // Collect results with progress reporting
    stats := aggregateResults(results, total, &processed, input.Delete, input.ProgressHandler)
    if stats.firstErr != nil {
        errMsg := stats.firstErr.Error()
        output.Error = &errMsg
        return output, stats.firstErr
    }

    sortCallsByFileAndTime(stats.calls)

    output.Calls = stats.calls
    output.TotalCalls = len(stats.calls)
    output.SpeciesCount = stats.speciesCount
    output.DataFilesWritten = stats.dataFilesWritten
    output.DataFilesSkipped = stats.dataFilesSkipped
    output.FilesProcessed = stats.filesProcessed
    output.FilesDeleted = stats.filesDeleted
    return output, nil
}

// sourceWorker processes source files from the jobs channel
func sourceWorker(src CallSource, dirCaches *sync.Map, jobs <-chan sourceJob, results chan<- parallelResult, wg *sync.WaitGroup) {
    defer wg.Done()
    for job := range jobs {
        dir := filepath.Dir(job.filePath)
        // Get or create DirCache for this directory
        var cache *DirCache
        if cached, ok := dirCaches.Load(dir); ok {
            cache = cached.(*DirCache)
        } else {
            cache = NewDirCache(dir)
            dirCaches.Store(dir, cache)
        }
        calls, written, skipped, err := src.ProcessFile(job.filePath, cache)
        results <- sourceResult{
            path:    job.filePath,
            calls:   calls,
            written: written,
            skipped: skipped,
            err:     err,
        }
    }
}
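// stubCSVSource is an illustrative sketch, not part of the original file: a
// minimal CallSource implementation showing the contract the worker pool
// relies on. The type name and the idea of a CSV-backed source are
// hypothetical; a real source would parse detections in ProcessFile.
type stubCSVSource struct{}

func (stubCSVSource) Name() string { return "CSV" }

func (stubCSVSource) FindFiles(folder string) ([]string, error) {
    // Glob for hypothetical .csv detection files in the folder.
    return filepath.Glob(filepath.Join(folder, "*.csv"))
}

func (stubCSVSource) ProcessFile(path string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
    // Stub: report the file as skipped with no calls.
    return nil, false, true, nil
}

// A value of stubCSVSource could then be passed straight to callsFromSource,
// which routes it through the sequential or parallel path by batch size.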
package tools

import (
    "fmt"
    "strings"
    "time"

    "github.com/sixdouglas/suncalc"

    "skraak/utils"
)

// IsNightInput defines the input parameters for the isnight tool
type IsNightInput struct {
    FilePath string  `json:"file_path"`
    Lat      float64 `json:"lat"`
    Lng      float64 `json:"lng"`
    Timezone string  `json:"timezone,omitempty"`
}

// IsNightOutput defines the output structure for the isnight tool
type IsNightOutput struct {
    FilePath      string  `json:"file_path"`
    TimestampUTC  string  `json:"timestamp_utc"`
    SolarNight    bool    `json:"solar_night"`
    CivilNight    bool    `json:"civil_night"`
    DiurnalActive bool    `json:"diurnal_active"`
    MoonPhase     float64 `json:"moon_phase"`
    DurationSec   float64 `json:"duration_seconds"`
    TimestampSrc  string  `json:"timestamp_source"`
    MidpointUTC   string  `json:"midpoint_utc"`
    SunriseUTC    string  `json:"sunrise_utc,omitempty"`
    SunsetUTC     string  `json:"sunset_utc,omitempty"`
    DawnUTC       string  `json:"dawn_utc,omitempty"`
    DuskUTC       string  `json:"dusk_utc,omitempty"`
}

// IsNight determines if a WAV file was recorded at night based on its
// metadata timestamp and the given GPS coordinates.
//
// Timestamp resolution order:
//  1. AudioMoth comment (timezone embedded)
//  2. Filename timestamp + timezone offset (requires --timezone)
//  3. File modification time (system local time)
func IsNight(input IsNightInput) (IsNightOutput, error) {
    var output IsNightOutput

    // Step 1: Parse WAV header
    metadata, err := utils.ParseWAVHeader(input.FilePath)
    if err != nil {
        return output, fmt.Errorf("WAV header parsing failed: %w", err)
    }
    output.DurationSec = metadata.Duration

    // Step 2: Resolve timestamp (use file mod time as fallback)
    tsResult, err := utils.ResolveTimestamp(metadata, input.FilePath, input.Timezone, true, nil)
    if err != nil {
        return output, fmt.Errorf("cannot determine recording timestamp: %w", err)
    }

    // Determine timestamp source label
    tsSource := "file_mod_time"
    if tsResult.IsAudioMoth {
        tsSource = "audiomoth_comment"
    } else if utils.HasTimestampFilename(input.FilePath) {
        tsSource = "filename"
    }

    // Step 3: Calculate astronomical data using recording midpoint
    astroData := utils.CalculateAstronomicalData(
        tsResult.Timestamp.UTC(),
        metadata.Duration,
        input.Lat,
        input.Lng,
    )

    // Step 4: Get sun event times for informational output
    midpoint := utils.CalculateMidpointTime(tsResult.Timestamp.UTC(), metadata.Duration)
    sunTimes := suncalc.GetTimes(midpoint, input.Lat, input.Lng)

    output.FilePath = input.FilePath
    output.TimestampUTC = tsResult.Timestamp.UTC().Format(time.RFC3339)
    output.SolarNight = astroData.SolarNight
    output.CivilNight = astroData.CivilNight
    output.MoonPhase = astroData.MoonPhase
    output.TimestampSrc = tsSource
    output.MidpointUTC = midpoint.Format(time.RFC3339)

    populateSunTimes(&output, sunTimes, midpoint)
    return output, nil
}

// String returns a human-readable summary of the isnight result
func (o IsNightOutput) String() string {
    var sb strings.Builder
    fmt.Fprintf(&sb, "File: %s\n", o.FilePath)
    fmt.Fprintf(&sb, "Timestamp (UTC): %s\n", o.TimestampUTC)
    fmt.Fprintf(&sb, "Midpoint (UTC): %s\n", o.MidpointUTC)
    fmt.Fprintf(&sb, "Duration: %.1f seconds\n", o.DurationSec)
    fmt.Fprintf(&sb, "Source: %s\n", o.TimestampSrc)
    fmt.Fprintf(&sb, "Solar night: %v\n", o.SolarNight)
    fmt.Fprintf(&sb, "Civil night: %v\n", o.CivilNight)
    fmt.Fprintf(&sb, "Moon phase: %.2f\n", o.MoonPhase)
    if o.SunriseUTC != "" {
        fmt.Fprintf(&sb, "Sunrise (UTC): %s\n", o.SunriseUTC)
    }
    if o.SunsetUTC != "" {
        fmt.Fprintf(&sb, "Sunset (UTC): %s\n", o.SunsetUTC)
    }
    if o.DawnUTC != "" {
        fmt.Fprintf(&sb, "Dawn (UTC): %s\n", o.DawnUTC)
    }
    if o.DuskUTC != "" {
        fmt.Fprintf(&sb, "Dusk (UTC): %s\n", o.DuskUTC)
    }
    return sb.String()
}

// populateSunTimes fills in sun event times and diurnal status from suncalc results.
func populateSunTimes(output *IsNightOutput, sunTimes map[suncalc.DayTimeName]suncalc.DayTime, midpoint time.Time) {
    // Diurnal: midpoint is between dawn and sunset
    if dawn, ok := sunTimes[suncalc.Dawn]; ok && !dawn.Value.IsZero() {
        if sunset, ok := sunTimes[suncalc.Sunset]; ok && !sunset.Value.IsZero() {
            output.DiurnalActive = !midpoint.Before(dawn.Value) && !midpoint.After(sunset.Value)
        }
    }
    output.SunriseUTC = sunTimeUTC(sunTimes, suncalc.Sunrise)
    output.SunsetUTC = sunTimeUTC(sunTimes, suncalc.Sunset)
    output.DawnUTC = sunTimeUTC(sunTimes, suncalc.Dawn)
    output.DuskUTC = sunTimeUTC(sunTimes, suncalc.Dusk)
}

// sunTimeUTC returns the UTC RFC3339 string for a suncalc event, or "" if absent/zero.
func sunTimeUTC(sunTimes map[suncalc.DayTimeName]suncalc.DayTime, name suncalc.DayTimeName) string {
    if entry, ok := sunTimes[name]; ok && !entry.Value.IsZero() {
        return entry.Value.UTC().Format(time.RFC3339)
    }
    return ""
}
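// exampleIsNightUsage is an illustrative sketch, not part of the original
// file; the path and coordinates are hypothetical placeholders.
func exampleIsNightUsage() error {
    out, err := IsNight(IsNightInput{
        FilePath: "/data/20240101_031500.wav", // hypothetical recording
        Lat:      -45.9,                       // hypothetical site coordinates
        Lng:      170.5,
    })
    if err != nil {
        return err
    }
    fmt.Print(out.String())
    return nil
}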
package tools

import (
    "context"
    "database/sql"
    "fmt"
    "io/fs"
    "os"
    "path/filepath"
    "strings"
    "time"

    "skraak/db"
    "skraak/utils"
)

// ImportUnstructuredInput defines the input parameters for importing files into an unstructured dataset
type ImportUnstructuredInput struct {
    DBPath     string `json:"db_path"`
    DatasetID  string `json:"dataset_id"`
    FolderPath string `json:"folder_path"`
    Recursive  *bool  `json:"recursive,omitempty"`
}

// ImportUnstructuredOutput defines the output structure
type ImportUnstructuredOutput struct {
    TotalFiles     int                     `json:"total_files"`
    ImportedFiles  int                     `json:"imported_files"`
    SkippedFiles   int                     `json:"skipped_files"` // Duplicates
    FailedFiles    int                     `json:"failed_files"`
    TotalDuration  float64                 `json:"total_duration_seconds"`
    ProcessingTime string                  `json:"processing_time"`
    Errors         []utils.FileImportError `json:"errors,omitempty"`
}

// ImportUnstructured imports WAV files into an unstructured dataset.
// Files are stored with minimal metadata: hash, duration, sample_rate, file_mod_time as timestamp.
// No location/cluster hierarchy, no astronomical data, no AudioMoth parsing.
func ImportUnstructured(
    ctx context.Context,
    input ImportUnstructuredInput,
) (ImportUnstructuredOutput, error) {
    startTime := time.Now()
    var output ImportUnstructuredOutput

    // Default recursive to true
    recursive := true
    if input.Recursive != nil {
        recursive = *input.Recursive
    }

    // Validate input
    if err := validateUnstructuredInput(input); err != nil {
        return output, fmt.Errorf("validation failed: %w", err)
    }

    // Scan for WAV files (no DB needed)
    files, scanErrors := scanWavFiles(input.FolderPath, recursive)
    output.Errors = append(output.Errors, scanErrors...)
    output.TotalFiles = len(files)
    if len(files) == 0 {
        output.ProcessingTime = time.Since(startTime).String()
        return output, nil
    }

    // Process each file inside a single logged write transaction
    err := db.WithWriteTx(ctx, resolveDBPath(input.DBPath), "import_unstructured", func(database *sql.DB, tx *db.LoggedTx) error {
        for _, filePath := range files {
            fileResult, procErr := processUnstructuredFile(tx, filePath, input.DatasetID)
            if procErr != nil {
                output.FailedFiles++
                output.Errors = append(output.Errors, utils.FileImportError{
                    FileName: filepath.Base(filePath),
                    Error:    procErr.Error(),
                    Stage:    utils.StageProcess,
                })
                continue
            }
            if fileResult.Skipped {
                output.SkippedFiles++
            } else {
                output.ImportedFiles++
                output.TotalDuration += fileResult.Duration
            }
        }
        return nil
    })
    if err != nil {
        return output, err
    }

    output.ProcessingTime = time.Since(startTime).String()
    return output, nil
}

// unstructuredFileResult holds the result of processing a single file
type unstructuredFileResult struct {
    Skipped  bool    // True if duplicate
    Duration float64 // Duration in seconds
}

// processUnstructuredFile processes a single WAV file for unstructured import
func processUnstructuredFile(tx *db.LoggedTx, filePath, datasetID string) (*unstructuredFileResult, error) {
    result := &unstructuredFileResult{}

    // Step 1: Parse WAV header
    metadata, err := utils.ParseWAVHeader(filePath)
    if err != nil {
        return nil, fmt.Errorf("WAV header parsing failed: %w", err)
    }

    // Step 2: Calculate hash
    hash, err := utils.ComputeXXH64(filePath)
    if err != nil {
        return nil, fmt.Errorf("hash calculation failed: %w", err)
    }

    // Step 3: Check for duplicate - if exists, skip entirely (do not link to dataset)
    _, isDuplicate, err := utils.CheckDuplicateHash(tx, hash)
    if err != nil {
        return nil, fmt.Errorf("duplicate check failed: %w", err)
    }
    if isDuplicate {
        // File already exists in database - skip completely, do not link to dataset
        result.Skipped = true
        result.Duration = metadata.Duration
        return result, nil
    }

    // Step 4: Generate file ID
    fileID, err := utils.GenerateLongID()
    if err != nil {
        return nil, fmt.Errorf("ID generation failed: %w", err)
    }

    // Step 5: Use file modification time as timestamp (no timezone conversion)
    timestamp := metadata.FileModTime

    // Step 6: Insert into file table
    _, err = tx.Exec(`
        INSERT INTO file (
            id, file_name, xxh64_hash, location_id, cluster_id,
            timestamp_local, duration, sample_rate,
            maybe_solar_night, maybe_civil_night, moon_phase,
            active
        ) VALUES (?, ?, ?, NULL, NULL, ?, ?, ?, NULL, NULL, NULL, TRUE)`,
        fileID,
        filepath.Base(filePath),
        hash,
        timestamp,
        metadata.Duration,
        metadata.SampleRate,
    )
    if err != nil {
        return nil, fmt.Errorf("file insert failed: %w", err)
    }

    // Step 7: Insert into file_dataset table
    _, err = tx.Exec(
        "INSERT INTO file_dataset (file_id, dataset_id) VALUES (?, ?)",
        fileID, datasetID,
    )
    if err != nil {
        return nil, fmt.Errorf("file_dataset insert failed: %w", err)
    }

    result.Duration = metadata.Duration
    return result, nil
}

// validateUnstructuredInput validates the input parameters
func validateUnstructuredInput(input ImportUnstructuredInput) error {
    // Validate dataset ID format
    if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
        return err
    }
    // Verify folder exists
    info, err := os.Stat(input.FolderPath)
    if err != nil {
        return fmt.Errorf("folder not accessible: %w", err)
    }
    if !info.IsDir() {
        return fmt.Errorf("path is not a directory: %s", input.FolderPath)
    }
    return db.WithReadDB(resolveDBPath(input.DBPath), func(database *sql.DB) error {
        // Verify dataset exists and is active
        if _, err := db.DatasetExistsAndActive(database, input.DatasetID); err != nil {
            return err
        }
        // Verify dataset is 'unstructured' type
        if err := db.ValidateDatasetTypeUnstructured(database, input.DatasetID); err != nil {
            return err
        }
        return nil
    })
}

// scanWavFiles scans a folder for WAV files
func scanWavFiles(folderPath string, recursive bool) ([]string, []utils.FileImportError) {
    var files []string
    var errors []utils.FileImportError

    walkFunc := func(path string, d fs.DirEntry, err error) error {
        if err != nil {
            errors = append(errors, utils.FileImportError{
                FileName: path,
                Error:    err.Error(),
                Stage:    utils.StageScan,
            })
            return nil
        }
        // Skip directories if not recursive
        if d.IsDir() {
            if !recursive && path != folderPath {
                return fs.SkipDir
            }
            return nil
        }
        // Check for .wav extension (case-insensitive)
        if strings.HasSuffix(strings.ToLower(d.Name()), ".wav") {
            files = append(files, path)
        }
        return nil
    }

    if recursive {
        if err := filepath.WalkDir(folderPath, walkFunc); err != nil {
            errors = append(errors, utils.FileImportError{
                FileName: folderPath,
                Error:    err.Error(),
                Stage:    utils.StageScan,
            })
        }
    } else {
        // Non-recursive: only scan top-level
        entries, err := os.ReadDir(folderPath)
        if err != nil {
            errors = append(errors, utils.FileImportError{
                FileName: folderPath,
                Error:    err.Error(),
                Stage:    utils.StageScan,
            })
            return nil, errors
        }
        for _, entry := range entries {
            if !entry.IsDir() && strings.HasSuffix(strings.ToLower(entry.Name()), ".wav") {
                files = append(files, filepath.Join(folderPath, entry.Name()))
            }
        }
    }

    return files, errors
}
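// exampleImportUnstructured is an illustrative sketch, not part of the
// original file; the DB path, dataset ID, and folder are hypothetical.
func exampleImportUnstructured(ctx context.Context) error {
    out, err := ImportUnstructured(ctx, ImportUnstructuredInput{
        DBPath:     "skraak.db",    // hypothetical
        DatasetID:  "abc123def456", // hypothetical 12-char short ID
        FolderPath: "/recordings",  // hypothetical
        // Recursive left nil: defaults to true
    })
    if err != nil {
        return err
    }
    fmt.Printf("imported %d file(s), skipped %d duplicate(s)\n", out.ImportedFiles, out.SkippedFiles)
    return nil
}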
package tools

import (
    "testing"

    "skraak/utils"
)

func TestValidateSegmentImportInput(t *testing.T) {
    t.Run("invalid dataset ID - too short", func(t *testing.T) {
        input := ImportSegmentsInput{
            DatasetID: "abc",
        }
        err := validateSegmentImportInput(input)
        if err == nil {
            t.Fatal("expected error for short dataset ID")
        }
    })

    t.Run("invalid dataset ID - too long", func(t *testing.T) {
        input := ImportSegmentsInput{
            DatasetID: "abc123def456ghi789",
        }
        err := validateSegmentImportInput(input)
        if err == nil {
            t.Fatal("expected error for long dataset ID")
        }
    })

    t.Run("invalid dataset ID - invalid characters", func(t *testing.T) {
        input := ImportSegmentsInput{
            DatasetID: "abc123!!!456",
        }
        err := validateSegmentImportInput(input)
        if err == nil {
            t.Fatal("expected error for invalid characters in dataset ID")
        }
    })

    t.Run("invalid location ID", func(t *testing.T) {
        input := ImportSegmentsInput{
            DatasetID:  "abc123def456",
            LocationID: "invalid",
        }
        err := validateSegmentImportInput(input)
        if err == nil {
            t.Fatal("expected error for invalid location ID")
        }
    })

    t.Run("invalid cluster ID", func(t *testing.T) {
        input := ImportSegmentsInput{
            DatasetID:  "abc123def456",
            LocationID: "xyz789uvw012",
            ClusterID:  "invalid",
        }
        err := validateSegmentImportInput(input)
        if err == nil {
            t.Fatal("expected error for invalid cluster ID")
        }
    })
}

func TestCountTotalSegments(t *testing.T) {
    t.Run("empty", func(t *testing.T) {
        count := countTotalSegments(map[string]scannedDataFile{})
        if count != 0 {
            t.Errorf("expected 0, got %d", count)
        }
    })

    t.Run("single file - no segments", func(t *testing.T) {
        files := map[string]scannedDataFile{
            "file1": {Segments: []*utils.Segment{}},
        }
        count := countTotalSegments(files)
        if count != 0 {
            t.Errorf("expected 0, got %d", count)
        }
    })

    t.Run("single file - multiple segments", func(t *testing.T) {
        files := map[string]scannedDataFile{
            "file1": {Segments: []*utils.Segment{{}, {}, {}}},
        }
        count := countTotalSegments(files)
        if count != 3 {
            t.Errorf("expected 3, got %d", count)
        }
    })

    t.Run("multiple files", func(t *testing.T) {
        files := map[string]scannedDataFile{
            "file1": {Segments: []*utils.Segment{{}, {}}},
            "file2": {Segments: []*utils.Segment{{}}},
            "file3": {Segments: []*utils.Segment{{}, {}, {}, {}}},
        }
        count := countTotalSegments(files)
        if count != 7 {
            t.Errorf("expected 7, got %d", count)
        }
    })
}
package tools

import (
    "context"
    "database/sql"
    "fmt"
    "os"
    "path/filepath"
    "strings"
    "time"

    "skraak/db"
    "skraak/utils"
)

// ImportSegmentsInput defines the input parameters for the import_segments tool
type ImportSegmentsInput struct {
    DBPath          string `json:"db_path"`
    Folder          string `json:"folder"`
    Mapping         string `json:"mapping"`
    DatasetID       string `json:"dataset_id"`
    LocationID      string `json:"location_id"`
    ClusterID       string `json:"cluster_id"`
    ProgressHandler func(processed, total int, message string)
}

// ImportSegmentsOutput defines the output structure for the import_segments tool
type ImportSegmentsOutput struct {
    Summary  ImportSegmentsSummary `json:"summary"`
    Segments []SegmentImport       `json:"segments"`
    Errors   []ImportSegmentError  `json:"errors,omitempty"`
}

// ImportSegmentsSummary provides summary statistics for the import operation
type ImportSegmentsSummary struct {
    DataFilesFound     int   `json:"data_files_found"`
    DataFilesProcessed int   `json:"data_files_processed"`
    TotalSegments      int   `json:"total_segments"`
    ImportedSegments   int   `json:"imported_segments"`
    ImportedLabels     int   `json:"imported_labels"`
    ImportedSubtypes   int   `json:"imported_subtypes"`
    ProcessingTimeMs   int64 `json:"processing_time_ms"`
}

// SegmentImport represents an imported segment in the output
type SegmentImport struct {
    SegmentID string        `json:"segment_id"`
    FileName  string        `json:"file_name"`
    StartTime float64       `json:"start_time"`
    EndTime   float64       `json:"end_time"`
    FreqLow   float64       `json:"freq_low"`
    FreqHigh  float64       `json:"freq_high"`
    Labels    []LabelImport `json:"labels"`
}

// LabelImport represents an imported label in the output
type LabelImport struct {
    LabelID   string `json:"label_id"`
    Species   string `json:"species"`
    CallType  string `json:"calltype,omitempty"`
    Filter    string `json:"filter"`
    Certainty int    `json:"certainty"`
    Comment   string `json:"comment,omitempty"`
}

// ImportSegmentError records errors encountered during segment import
type ImportSegmentError struct {
    File    string            `json:"file,omitempty"`
    Stage   utils.ImportStage `json:"stage"`
    Message string            `json:"message"`
}

// scannedDataFile holds parsed data for a .data file
type scannedDataFile struct {
    DataPath string
    WavPath  string
    WavHash  string
    FileID   string
    Duration float64
    Segments []*utils.Segment
}

// ImportSegments imports segments from AviaNZ .data files into the database
func ImportSegments(ctx context.Context, input ImportSegmentsInput) (ImportSegmentsOutput, error) {
    startTime := time.Now()
    var output ImportSegmentsOutput
    output.Segments = make([]SegmentImport, 0)
    output.Errors = make([]ImportSegmentError, 0)

    // Phase A: Input Validation
    if err := validateSegmentImportInput(input); err != nil {
        return output, err
    }

    // Load mapping file
    mapping, err := utils.LoadMappingFile(input.Mapping)
    if err != nil {
        return output, fmt.Errorf("failed to load mapping file: %w", err)
    }

    // Find .data files
    dataFiles, err := utils.FindDataFiles(input.Folder)
    if err != nil {
        return output, fmt.Errorf("failed to find .data files: %w", err)
    }
    output.Summary.DataFilesFound = len(dataFiles)
    if len(dataFiles) == 0 {
        return output, fmt.Errorf("no .data files found in folder: %s", input.Folder)
    }

    // Phase B+C: Parse data files and validate against DB
    database, err := db.OpenWriteableDB(resolveDBPath(input.DBPath))
    if err != nil {
        return output, fmt.Errorf("failed to open database: %w", err)
    }
    defer database.Close()

    val, valErrors, err := validateAndPrepareSegments(database, input, mapping, dataFiles)
    output.Errors = append(output.Errors, valErrors...)
    if err != nil {
        return output, err
    }
    if val == nil || len(val.fileIDMap) == 0 {
        output.Summary.ProcessingTimeMs = time.Since(startTime).Milliseconds()
        return output, nil
    }

    // Phase D: Transactional Import
    importedSegments, importedLabels, importedSubtypes, fileUpdates, importErrors := importSegmentsIntoDB(
        ctx, database, val.fileIDMap, val.scannedFiles, mapping,
        val.filterIDMap, val.speciesIDMap, val.calltypeIDMap,
        input.DatasetID, input.ProgressHandler,
    )
    output.Errors = append(output.Errors, importErrors...)
    output.Segments = append(output.Segments, importedSegments...)

    // Phase E: Write IDs back to .data files
    if len(fileUpdates) > 0 {
        writeErrors := writeIDsToDataFiles(fileUpdates)
        output.Errors = append(output.Errors, writeErrors...)
    }

    output.Summary.DataFilesProcessed = len(val.fileIDMap)
    output.Summary.TotalSegments = countTotalSegments(val.fileIDMap)
    output.Summary.ImportedSegments = len(importedSegments)
    output.Summary.ImportedLabels = importedLabels
    output.Summary.ImportedSubtypes = importedSubtypes
    output.Summary.ProcessingTimeMs = time.Since(startTime).Milliseconds()
    return output, nil
}

// validateSegmentImportInput validates input parameters
func validateSegmentImportInput(input ImportSegmentsInput) error {
    // Validate folder exists
    if info, err := os.Stat(input.Folder); err != nil {
        return fmt.Errorf("folder does not exist: %s", input.Folder)
    } else if !info.IsDir() {
        return fmt.Errorf("path is not a folder: %s", input.Folder)
    }
    // Validate mapping file exists
    if _, err := os.Stat(input.Mapping); err != nil {
        return fmt.Errorf("mapping file does not exist: %s", input.Mapping)
    }
    // Validate IDs
    if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
        return err
    }
    if err := utils.ValidateShortID(input.LocationID, "location_id"); err != nil {
        return err
    }
    if err := utils.ValidateShortID(input.ClusterID, "cluster_id"); err != nil {
        return err
    }
    return nil
}

// validateSegmentHierarchy validates dataset/location/cluster relationships
func validateSegmentHierarchy(dbConn *sql.DB, datasetID, locationID, clusterID string) error {
    // Validate dataset exists and is structured
    if err := db.ValidateDatasetTypeForImport(dbConn, datasetID); err != nil {
        return err
    }
    // Validate location belongs to dataset
    if err := db.ValidateLocationBelongsToDataset(dbConn, locationID, datasetID); err != nil {
        return err
    }
    // Validate cluster belongs to location
    if err := db.ClusterBelongsToLocation(dbConn, clusterID, locationID); err != nil {
        return err
    }
    return nil
}

// scanAllDataFiles parses all .data files and collects unique values
func scanAllDataFiles(dataFiles []string, folder string) (
    []scannedDataFile,
    []ImportSegmentError,
    map[string]bool,
    map[string]bool,
    map[string]map[string]bool,
) {
    var scanned []scannedDataFile
    var errors []ImportSegmentError
    uniqueFilters := make(map[string]bool)
    uniqueSpecies := make(map[string]bool)
    uniqueCalltypes := make(map[string]map[string]bool) // species -> calltype -> true

    for _, dataPath := range dataFiles {
        // Find corresponding WAV file
        wavPath := strings.TrimSuffix(dataPath, ".data")
        if _, err := os.Stat(wavPath); err != nil {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(dataPath),
                Stage:   utils.StageValidation,
                Message: fmt.Sprintf("corresponding WAV file not found: %s", filepath.Base(wavPath)),
            })
            continue
        }

        // Parse .data file
        df, err := utils.ParseDataFile(dataPath)
        if err != nil {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(dataPath),
                Stage:   utils.StageValidation,
                Message: fmt.Sprintf("failed to parse .data file: %v", err),
            })
            continue
        }

        // Collect unique filters, species, calltypes
        for _, seg := range df.Segments {
            for _, label := range seg.Labels {
                uniqueFilters[label.Filter] = true
                uniqueSpecies[label.Species] = true
                if label.CallType != "" {
                    if uniqueCalltypes[label.Species] == nil {
                        uniqueCalltypes[label.Species] = make(map[string]bool)
                    }
                    uniqueCalltypes[label.Species][label.CallType] = true
                }
            }
        }

        scanned = append(scanned, scannedDataFile{
            DataPath: dataPath,
            WavPath:  wavPath,
            Duration: df.Meta.Duration,
            Segments: df.Segments,
        })
    }

    return scanned, errors, uniqueFilters, uniqueSpecies, uniqueCalltypes
}

// validateFiltersExist checks all filters exist in DB and returns ID map
func validateFiltersExist(dbConn *sql.DB, filterNames map[string]bool) (map[string]string, error) {
    filterIDMap := make(map[string]string)
    if len(filterNames) == 0 {
        return filterIDMap, nil
    }

    names := make([]string, 0, len(filterNames))
    for name := range filterNames {
        names = append(names, name)
    }
    query := `SELECT id, name FROM filter WHERE name IN (` + db.Placeholders(len(names)) + `) AND active = true`
    args := make([]any, len(names))
    for i, name := range names {
        args[i] = name
    }
    rows, err := dbConn.Query(query, args...)
    if err != nil {
        return nil, fmt.Errorf("failed to query filters: %w", err)
    }
    defer rows.Close()
    for rows.Next() {
        var id, name string
        if err := rows.Scan(&id, &name); err == nil {
            filterIDMap[name] = id
        }
    }

    // Check for missing filters
    var missing []string
    for name := range filterNames {
        if _, exists := filterIDMap[name]; !exists {
            missing = append(missing, name)
        }
    }
    if len(missing) > 0 {
        return nil, fmt.Errorf("filters not found in database: [%s]", strings.Join(missing, ", "))
    }
    return filterIDMap, nil
}

// loadSpeciesCalltypeIDs loads species and calltype ID maps
func loadSpeciesCalltypeIDs(
    dbConn *sql.DB,
    mapping utils.MappingFile,
    uniqueSpecies map[string]bool,
    uniqueCalltypes map[string]map[string]bool,
) (map[string]string, map[string]map[string]string, error) {
    speciesIDMap := make(map[string]string)
    calltypeIDMap := make(map[string]map[string]string) // (dbSpecies, dbCalltype) -> calltype_id

    // Collect all DB species labels from mapping
    dbSpeciesSet := make(map[string]bool)
    for dataSpecies := range uniqueSpecies {
        if dbSpecies, ok := mapping.GetDBSpecies(dataSpecies); ok {
            dbSpeciesSet[dbSpecies] = true
        }
    }

    // Load species IDs
    if len(dbSpeciesSet) > 0 {
        dbSpeciesList := make([]string, 0, len(dbSpeciesSet))
        for s := range dbSpeciesSet {
            dbSpeciesList = append(dbSpeciesList, s)
        }
        query := `SELECT id, label FROM species WHERE label IN (` + db.Placeholders(len(dbSpeciesList)) + `) AND active = true`
        args := make([]any, len(dbSpeciesList))
        for i, s := range dbSpeciesList {
            args[i] = s
        }
        rows, err := dbConn.Query(query, args...)
        if err != nil {
            return nil, nil, fmt.Errorf("failed to query species: %w", err)
        }
        defer rows.Close()
        for rows.Next() {
            var id, label string
            if err := rows.Scan(&id, &label); err == nil {
                speciesIDMap[label] = id
            }
        }
    }

    // Load calltype IDs
    for dataSpecies, ctSet := range uniqueCalltypes {
        dbSpecies, ok := mapping.GetDBSpecies(dataSpecies)
        if !ok {
            continue
        }
        if calltypeIDMap[dbSpecies] == nil {
            calltypeIDMap[dbSpecies] = make(map[string]string)
        }
        for dataCalltype := range ctSet {
            dbCalltype := mapping.GetDBCalltype(dataSpecies, dataCalltype)
            // Query calltype ID
            var calltypeID string
            err := dbConn.QueryRow(`
                SELECT ct.id
                FROM call_type ct
                JOIN species s ON ct.species_id = s.id
                WHERE s.label = ? AND ct.label = ? AND ct.active = true`,
                dbSpecies, dbCalltype).Scan(&calltypeID)
            if err == nil {
                calltypeIDMap[dbSpecies][dbCalltype] = calltypeID
            }
        }
    }

    return speciesIDMap, calltypeIDMap, nil
}

// validateAndMapFiles validates files exist by hash, are linked to dataset, and have no existing labels
func validateAndMapFiles(
    dbConn *sql.DB,
    scannedFiles []scannedDataFile,
    clusterID string,
    datasetID string,
) (map[string]scannedDataFile, []ImportSegmentError) {
    fileIDMap := make(map[string]scannedDataFile)
    var errors []ImportSegmentError

    for _, sf := range scannedFiles {
        // Compute hash
        hash, err := utils.ComputeXXH64(sf.WavPath)
        if err != nil {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(sf.WavPath),
                Stage:   utils.StageHash,
                Message: fmt.Sprintf("failed to compute hash: %v", err),
            })
            continue
        }
        sf.WavHash = hash

        // Find file by hash in cluster
        var fileID string
        var duration float64
        err = dbConn.QueryRow(`
            SELECT id, duration FROM file
            WHERE xxh64_hash = ? AND cluster_id = ? AND active = true`,
            hash, clusterID).Scan(&fileID, &duration)
        if err == sql.ErrNoRows {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(sf.WavPath),
                Stage:   utils.StageValidation,
                Message: fmt.Sprintf("file hash not found in database for cluster (hash: %s)", hash),
            })
            continue
        }
        if err != nil {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(sf.WavPath),
                Stage:   utils.StageValidation,
                Message: fmt.Sprintf("failed to query file: %v", err),
            })
            continue
        }
        sf.FileID = fileID
        sf.Duration = duration

        // Verify file is linked to dataset via file_dataset junction table (composite FK)
        var fileLinkedToDataset bool
        err = dbConn.QueryRow(`
            SELECT EXISTS(SELECT 1 FROM file_dataset WHERE file_id = ? AND dataset_id = ?)`,
            fileID, datasetID).Scan(&fileLinkedToDataset)
        if err != nil {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(sf.WavPath),
                Stage:   utils.StageValidation,
                Message: fmt.Sprintf("failed to verify file-dataset link: %v", err),
            })
            continue
        }
        if !fileLinkedToDataset {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(sf.WavPath),
                Stage:   utils.StageValidation,
                Message: fmt.Sprintf("file exists in cluster but is not linked to dataset %s", datasetID),
            })
            continue
        }

        // Check no existing labels for this file
        var labelCount int
        err = dbConn.QueryRow(`
            SELECT COUNT(*) FROM label l
            JOIN segment s ON l.segment_id = s.id
            WHERE s.file_id = ? AND l.active = true`, fileID).Scan(&labelCount)
        if err != nil {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(sf.WavPath),
                Stage:   utils.StageValidation,
                Message: fmt.Sprintf("failed to check existing labels: %v", err),
            })
            continue
        }
        if labelCount > 0 {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(sf.WavPath),
                Stage:   utils.StageValidation,
                Message: fmt.Sprintf("file already has %d label(s) - fresh imports only", labelCount),
            })
            continue
        }

        fileIDMap[fileID] = sf
    }

    return fileIDMap, errors
}

// dataFileUpdate holds data to write back to .data file after import
type dataFileUpdate struct {
    DataPath string
    WavHash  string
    LabelIDs map[int]map[int]string // segmentIndex -> labelIndex -> labelID
}

// importSegmentsIntoDB performs the transactional import
func importSegmentsIntoDB(
    ctx context.Context,
    database *sql.DB,
    fileIDMap map[string]scannedDataFile,
    scannedFiles []scannedDataFile,
    mapping utils.MappingFile,
    filterIDMap map[string]string,
    speciesIDMap map[string]string,
    calltypeIDMap map[string]map[string]string,
    datasetID string,
    progressHandler func(processed, total int, message string),
) ([]SegmentImport, int, int, []dataFileUpdate, []ImportSegmentError) {
    var importedSegments []SegmentImport
    var errors []ImportSegmentError
    importedLabels := 0
    importedSubtypes := 0
    var fileUpdates []dataFileUpdate

    tx, err := db.BeginLoggedTx(ctx, database, "import_segments")
    if err != nil {
        errors = append(errors, ImportSegmentError{
            Stage:   utils.StageImport,
            Message: fmt.Sprintf("failed to begin transaction: %v", err),
        })
        return nil, 0, 0, nil, errors
    }
    defer tx.Rollback()

    totalFiles := len(fileIDMap)
    processedFiles := 0
    for _, sf := range fileIDMap {
        if sf.FileID == "" {
            continue
        }
        processedFiles++
        if progressHandler != nil {
            progressHandler(processedFiles, totalFiles, filepath.Base(sf.DataPath))
        }

        fileUpdate := dataFileUpdate{
            DataPath: sf.DataPath,
            WavHash:  sf.WavHash,
            LabelIDs: make(map[int]map[int]string),
        }

        for segIdx, seg := range sf.Segments {
            segImp, labelIDs, subtypes, segErrs := importSegment(ctx, tx, seg, segIdx, sf, datasetID, mapping, filterIDMap, speciesIDMap, calltypeIDMap)
            errors = append(errors, segErrs...)
            importedSubtypes += subtypes
            if len(segImp.Labels) == 0 {
                // Delete orphaned segment (no labels succeeded)
                if _, err := tx.ExecContext(ctx, `DELETE FROM segment WHERE id = ?`, segImp.SegmentID); err != nil {
                    errors = append(errors, ImportSegmentError{
                        File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
                        Message: fmt.Sprintf("failed to delete orphaned segment: %v", err),
                    })
                }
            } else {
                importedSegments = append(importedSegments, segImp)
                importedLabels += len(labelIDs)
                fileUpdate.LabelIDs[segIdx] = labelIDs
            }
        }
        fileUpdates = append(fileUpdates, fileUpdate)
    }

    if err := tx.Commit(); err != nil {
        errors = append(errors, ImportSegmentError{
            Stage:   utils.StageImport,
            Message: fmt.Sprintf("failed to commit transaction: %v", err),
        })
        return nil, 0, 0, nil, errors
    }

    return importedSegments, importedLabels, importedSubtypes, fileUpdates, errors
}

// countTotalSegments counts total segments from validated files
func countTotalSegments(fileIDMap map[string]scannedDataFile) int {
    count := 0
    for _, sf := range fileIDMap {
        count += len(sf.Segments)
    }
    return count
}

// writeIDsToDataFiles writes skraak_hash and skraak_label_ids back to .data files
func writeIDsToDataFiles(fileUpdates []dataFileUpdate) []ImportSegmentError {
    var errors []ImportSegmentError
    for _, fu := range fileUpdates {
        // Parse the .data file
        df, err := utils.ParseDataFile(fu.DataPath)
        if err != nil {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(fu.DataPath),
                Stage:   utils.StageImport,
                Message: fmt.Sprintf("failed to re-parse .data file for writing: %v", err),
            })
            continue
        }

        // Write skraak_hash to metadata
        if df.Meta.Extra == nil {
            df.Meta.Extra = make(map[string]any)
        }
        df.Meta.Extra["skraak_hash"] = fu.WavHash

        // Write skraak_label_id to each label
        for segIdx, labelIDs := range fu.LabelIDs {
            if segIdx >= len(df.Segments) {
                continue
            }
            seg := df.Segments[segIdx]
            for labelIdx, labelID := range labelIDs {
                if labelIdx >= len(seg.Labels) {
                    continue
                }
                label := seg.Labels[labelIdx]
                if label.Extra == nil {
                    label.Extra = make(map[string]any)
                }
                label.Extra["skraak_label_id"] = labelID
            }
        }

        // Write the updated .data file
        if err := df.Write(fu.DataPath); err != nil {
            errors = append(errors, ImportSegmentError{
                File:    filepath.Base(fu.DataPath),
                Stage:   utils.StageImport,
                Message: fmt.Sprintf("failed to write updated .data file: %v", err),
            })
            continue
        }
    }
    return errors
}

// importSegment inserts a single segment and its labels into the DB.
func importSegment(
    ctx context.Context,
    tx *db.LoggedTx,
    seg *utils.Segment,
    segIdx int,
    sf scannedDataFile,
    datasetID string,
    mapping utils.MappingFile,
    filterIDMap map[string]string,
    speciesIDMap map[string]string,
    calltypeIDMap map[string]map[string]string,
) (SegmentImport, map[int]string, int, []ImportSegmentError) {
    var errors []ImportSegmentError

    if seg.StartTime >= seg.EndTime {
        errors = append(errors, ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("invalid segment bounds: start=%.2f >= end=%.2f", seg.StartTime, seg.EndTime),
        })
        return SegmentImport{}, nil, 0, errors
    }
    if seg.EndTime > sf.Duration {
        errors = append(errors, ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("segment end time (%.2f) exceeds file duration (%.2f)", seg.EndTime, sf.Duration),
        })
        return SegmentImport{}, nil, 0, errors
    }

    segmentID, err := utils.GenerateLongID()
    if err != nil {
        errors = append(errors, ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("failed to generate segment ID: %v", err),
        })
        return SegmentImport{}, nil, 0, errors
    }

    _, err = tx.ExecContext(ctx, `
        INSERT INTO segment (id, file_id, dataset_id, start_time, end_time, freq_low, freq_high, created_at, last_modified, active)
        VALUES (?, ?, ?, ?, ?, ?, ?, now(), now(), true)`,
        segmentID, sf.FileID, datasetID, seg.StartTime, seg.EndTime, seg.FreqLow, seg.FreqHigh)
    if err != nil {
        errors = append(errors, ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("failed to insert segment: %v", err),
        })
        return SegmentImport{}, nil, 0, errors
    }

    segImport := SegmentImport{
        SegmentID: segmentID,
        FileName:  filepath.Base(sf.WavPath),
        StartTime: seg.StartTime,
        EndTime:   seg.EndTime,
        FreqLow:   seg.FreqLow,
        FreqHigh:  seg.FreqHigh,
        Labels:    make([]LabelImport, 0),
    }

    labelIDs := make(map[int]string)
    var subtypesImported int
    for labelIdx, label := range seg.Labels {
        result := importSingleLabel(ctx, tx, label, segmentID, segIdx, labelIdx, sf, mapping, filterIDMap, speciesIDMap, calltypeIDMap)
        if result.hasError {
            errors = append(errors, result.err)
            continue
        }
        labelIDs[labelIdx] = result.labelID
        segImport.Labels = append(segImport.Labels, result.labelImport)
        subtypesImported += result.subtypesImported
    }

    return segImport, labelIDs, subtypesImported, errors
}

// importLabelResult holds the result of importing a single label.
type importLabelResult struct {
    labelImport      LabelImport
    labelID          string
    subtypesImported int
    err              ImportSegmentError
    hasError         bool
}

// importSingleLabel inserts a single label and its metadata/subtype into the DB.
func importSingleLabel(
    ctx context.Context,
    tx *db.LoggedTx,
    label *utils.Label,
    segmentID string,
    segIdx, labelIdx int,
    sf scannedDataFile,
    mapping utils.MappingFile,
    filterIDMap map[string]string,
    speciesIDMap map[string]string,
    calltypeIDMap map[string]map[string]string,
) importLabelResult {
    dbSpecies, ok := mapping.GetDBSpecies(label.Species)
    if !ok {
        return importLabelResult{err: ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("species not found in mapping: %s", label.Species),
        }, hasError: true}
    }
    speciesID, ok := speciesIDMap[dbSpecies]
    if !ok {
        return importLabelResult{err: ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("species ID not found: %s", dbSpecies),
        }, hasError: true}
    }
    filterID, ok := filterIDMap[label.Filter]
    if !ok {
        return importLabelResult{err: ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("filter ID not found: %s", label.Filter),
        }, hasError: true}
    }

    labelID, err := utils.GenerateLongID()
    if err != nil {
        return importLabelResult{err: ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("failed to generate label ID: %v", err),
        }, hasError: true}
    }

    _, err = tx.ExecContext(ctx, `
        INSERT INTO label (id, segment_id, species_id, filter_id, certainty, created_at, last_modified, active)
        VALUES (?, ?, ?, ?, ?, now(), now(), true)`,
        labelID, segmentID, speciesID, filterID, label.Certainty)
    if err != nil {
        return importLabelResult{err: ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("failed to insert label: %v", err),
        }, hasError: true}
    }

    // Insert label_metadata if comment exists
    if label.Comment != "" {
        escapedComment := strings.ReplaceAll(label.Comment, `"`, `\"`)
        metadataJSON := fmt.Sprintf(`{"comment": "%s"}`, escapedComment)
        if _, err := tx.ExecContext(ctx, `
            INSERT INTO label_metadata (label_id, json, created_at, last_modified, active)
            VALUES (?, ?, now(), now(), true)`, labelID, metadataJSON); err != nil {
            return importLabelResult{err: ImportSegmentError{
                File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
                Message: fmt.Sprintf("failed to insert label_metadata: %v", err),
            }, hasError: true}
        }
    }

    labelImport := LabelImport{
        LabelID:   labelID,
        Species:   dbSpecies,
        Filter:    label.Filter,
        Certainty: label.Certainty,
    }
    if label.Comment != "" {
        labelImport.Comment = label.Comment
    }

    // Insert label_subtype if calltype exists
    if label.CallType != "" {
        if err := importCalltype(ctx, tx, labelID, label, dbSpecies, filterID, mapping, calltypeIDMap, sf); err != nil {
            return importLabelResult{err: *err, hasError: true}
        }
        labelImport.CallType = mapping.GetDBCalltype(label.Species, label.CallType)
        return importLabelResult{labelImport: labelImport, labelID: labelID, subtypesImported: 1}
    }

    return importLabelResult{labelImport: labelImport, labelID: labelID}
}

// importCalltype inserts a label_subtype row for a calltype label.
func importCalltype(
    ctx context.Context,
    tx *db.LoggedTx,
    labelID string,
    label *utils.Label,
    dbSpecies string,
    filterID string,
    mapping utils.MappingFile,
    calltypeIDMap map[string]map[string]string,
    sf scannedDataFile,
) *ImportSegmentError {
    dbCalltype := mapping.GetDBCalltype(label.Species, label.CallType)
    calltypeID := ""
    if calltypeIDMap[dbSpecies] != nil {
        calltypeID = calltypeIDMap[dbSpecies][dbCalltype]
    }
    if calltypeID == "" {
        return &ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("calltype ID not found: %s/%s", dbSpecies, dbCalltype),
        }
    }

    subtypeID, err := utils.GenerateLongID()
    if err != nil {
        return &ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("failed to generate label_subtype ID: %v", err),
        }
    }

    _, err = tx.ExecContext(ctx, `
        INSERT INTO label_subtype (id, label_id, calltype_id, filter_id, certainty, created_at, last_modified, active)
        VALUES (?, ?, ?, ?, ?, now(), now(), true)`,
        subtypeID, labelID, calltypeID, filterID, label.Certainty)
    if err != nil {
        return &ImportSegmentError{
            File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
            Message: fmt.Sprintf("failed to insert label_subtype: %v", err),
        }
    }
    return nil
}

// segmentValidation holds the results of pre-import validation (phases B+C).
type segmentValidation struct {
    scannedFiles  []scannedDataFile
    filterIDMap   map[string]string
    speciesIDMap  map[string]string
    calltypeIDMap map[string]map[string]string
    fileIDMap     map[string]scannedDataFile
}

// validateAndPrepareSegments performs phases B+C: parse data files, validate DB state, and prepare ID maps.
func validateAndPrepareSegments(
    database *sql.DB,
    input ImportSegmentsInput,
    mapping utils.MappingFile,
    dataFiles []string,
) (*segmentValidation, []ImportSegmentError, error) {
    // Phase B: Parse all .data files and collect unique values
    scannedFiles, parseErrors, uniqueFilters, uniqueSpecies, uniqueCalltypes := scanAllDataFiles(dataFiles, input.Folder)
    if len(scannedFiles) == 0 {
        return nil, parseErrors, nil
    }

    // Validate dataset/location/cluster hierarchy
    if err := validateSegmentHierarchy(database, input.DatasetID, input.LocationID, input.ClusterID); err != nil {
        return nil, parseErrors, err
    }

    // Validate all filters exist
    filterIDMap, err := validateFiltersExist(database, uniqueFilters)
    if err != nil {
        return nil, parseErrors, fmt.Errorf("filter validation failed: %w", err)
    }

    // Validate mapping covers all species/calltypes and they exist in DB
    validationResult, err := utils.ValidateMappingAgainstDB(database, mapping, uniqueSpecies, uniqueCalltypes)
    if err != nil {
        return nil, parseErrors, fmt.Errorf("mapping validation failed: %w", err)
    }
    if validationResult.HasErrors() {
        return nil, parseErrors, fmt.Errorf("mapping validation failed: %s", validationResult.Error())
    }

    // Load species and calltype ID maps
    speciesIDMap, calltypeIDMap, err := loadSpeciesCalltypeIDs(database, mapping, uniqueSpecies, uniqueCalltypes)
    if err != nil {
        return nil, parseErrors, fmt.Errorf("failed to load species/calltype IDs: %w", err)
    }

    // Validate files: hash exists, linked to dataset, no existing labels
    fileIDMap, hashErrors := validateAndMapFiles(database, scannedFiles, input.ClusterID, input.DatasetID)
    allErrors := append(parseErrors, hashErrors...)

    return &segmentValidation{
        scannedFiles:  scannedFiles,
        filterIDMap:   filterIDMap,
        speciesIDMap:  speciesIDMap,
        calltypeIDMap: calltypeIDMap,
        fileIDMap:     fileIDMap,
    }, allErrors, nil
}
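// exampleImportSegments is an illustrative sketch, not part of the original
// file; all paths and IDs are hypothetical placeholders. It shows the full
// hierarchy an import needs: dataset -> location -> cluster, plus a species
// mapping file.
func exampleImportSegments(ctx context.Context) error {
    out, err := ImportSegments(ctx, ImportSegmentsInput{
        DBPath:     "skraak.db",         // hypothetical
        Folder:     "/recordings/site1", // hypothetical folder of WAV + .data pairs
        Mapping:    "mapping.json",      // hypothetical species/calltype mapping
        DatasetID:  "abc123def456",      // hypothetical short IDs
        LocationID: "xyz789uvw012",
        ClusterID:  "qrs345tuv678",
    })
    if err != nil {
        return err
    }
    fmt.Printf("imported %d segment(s), %d label(s)\n",
        out.Summary.ImportedSegments, out.Summary.ImportedLabels)
    return nil
}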
package tools

import (
    "context"
    "database/sql"
    "fmt"
    "os"
    "time"

    "skraak/db"
    "skraak/utils"
)

// ImportAudioFilesInput defines the input parameters for the import_audio_files tool
type ImportAudioFilesInput struct {
    DBPath     string `json:"db_path"`
    FolderPath string `json:"folder_path"`
    DatasetID  string `json:"dataset_id"`
    LocationID string `json:"location_id"`
    ClusterID  string `json:"cluster_id"`
    Recursive  *bool  `json:"recursive,omitempty"` // *bool because default is true; plain bool would make "not provided" indistinguishable from "false"
}

// ImportAudioFilesOutput defines the output structure for the import_audio_files tool
type ImportAudioFilesOutput struct {
    Summary ImportSummary           `json:"summary"`
    FileIDs []string                `json:"file_ids"`
    Errors  []utils.FileImportError `json:"errors,omitempty"`
}

// ImportSummary provides summary statistics for the import operation
type ImportSummary struct {
    TotalFiles     int     `json:"total_files"`
    ImportedFiles  int     `json:"imported_files"`
    SkippedFiles   int     `json:"skipped_files"` // Duplicates
    FailedFiles    int     `json:"failed_files"`
    AudioMothFiles int     `json:"audiomoth_files"`
    TotalDuration  float64 `json:"total_duration_seconds"`
    ProcessingTime string  `json:"processing_time"`
}

// ImportAudioFiles batch imports WAV files from a folder with hash-based duplicate detection
func ImportAudioFiles(
    ctx context.Context,
    input ImportAudioFilesInput,
) (ImportAudioFilesOutput, error) {
    startTime := time.Now()
    var output ImportAudioFilesOutput

    // Default recursive to true
    recursive := true
    if input.Recursive != nil {
        recursive = *input.Recursive
    }

    // Validate database hierarchy (dataset → location → cluster)
    if err := validateImportInput(input, resolveDBPath(input.DBPath)); err != nil {
        return output, fmt.Errorf("validation failed: %w", err)
    }

    // Open database
    database, err := db.OpenWriteableDB(resolveDBPath(input.DBPath))
    if err != nil {
        return output, fmt.Errorf("failed to open database: %w", err)
    }
    defer database.Close()

    // Set cluster path if empty
    err = utils.EnsureClusterPath(database, input.ClusterID, input.FolderPath)
    if err != nil {
        return output, fmt.Errorf("failed to set cluster path: %w", err)
    }

    // Import the cluster (ALL THE LOGIC IS HERE)
    tx, err := db.BeginLoggedTx(ctx, database, "import_audio_files")
    if err != nil {
        return output, fmt.Errorf("failed to begin transaction: %w", err)
    }
    clusterOutput, err := utils.ImportCluster(database, tx.UnderlyingTx(), utils.ClusterImportInput{
        FolderPath: input.FolderPath,
        DatasetID:  input.DatasetID,
        LocationID: input.LocationID,
        ClusterID:  input.ClusterID,
        Recursive:  recursive,
    })
    if err != nil {
        tx.Rollback()
        return output, fmt.Errorf("cluster import failed: %w", err)
    }
    if err := tx.Commit(); err != nil {
        return output, fmt.Errorf("transaction commit failed: %w", err)
    }

    // Map to output format
    output = ImportAudioFilesOutput{
        Summary: ImportSummary{
            TotalFiles:     clusterOutput.TotalFiles,
            ImportedFiles:  clusterOutput.ImportedFiles,
            SkippedFiles:   clusterOutput.SkippedFiles,
            FailedFiles:    clusterOutput.FailedFiles,
            AudioMothFiles: clusterOutput.AudioMothFiles,
            TotalDuration:  clusterOutput.TotalDuration,
            ProcessingTime: time.Since(startTime).String(),
        },
        FileIDs: []string{}, // File IDs not tracked currently
        Errors:  clusterOutput.Errors,
    }
    return output, nil
}

// validateImportInput validates all input parameters and database relationships
func validateImportInput(input ImportAudioFilesInput, dbPath string) error {
    // Verify folder exists
    info, err := os.Stat(input.FolderPath)
    if err != nil {
        return fmt.Errorf("folder not accessible: %w", err)
    }
    if !info.IsDir() {
        return fmt.Errorf("path is not a directory: %s", input.FolderPath)
    }
    return validateHierarchyIDs(input.DatasetID, input.LocationID, input.ClusterID, dbPath)
}

// validateHierarchyIDs validates dataset/location/cluster ID formats and database relationships
func validateHierarchyIDs(datasetID, locationID, clusterID, dbPath string) error {
    // Validate ID formats first (fast fail before DB queries)
    if err := utils.ValidateShortID(datasetID, "dataset_id"); err != nil {
        return err
    }
    if err := utils.ValidateShortID(locationID, "location_id"); err != nil {
        return err
    }
    if err := utils.ValidateShortID(clusterID, "cluster_id"); err != nil {
        return err
    }
    return db.WithReadDB(dbPath, func(database *sql.DB) error {
        // Verify dataset exists, is active, and is 'structured' type
        if err := db.ValidateDatasetTypeForImport(database, datasetID); err != nil {
            return err
        }
        // Verify location exists and belongs to dataset
        if err := db.ValidateLocationBelongsToDataset(database, locationID, datasetID); err != nil {
            return err
        }
        // Verify cluster exists and belongs to location
        if err := db.ClusterBelongsToLocation(database, clusterID, locationID); err != nil {
            return err
        }
        return nil
    })
}
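// exampleImportAudioFiles is an illustrative sketch, not part of the original
// file; the DB path, folder, and IDs are hypothetical placeholders.
func exampleImportAudioFiles(ctx context.Context) error {
    out, err := ImportAudioFiles(ctx, ImportAudioFilesInput{
        DBPath:     "skraak.db",         // hypothetical
        FolderPath: "/recordings/site1", // hypothetical
        DatasetID:  "abc123def456",      // hypothetical short IDs
        LocationID: "xyz789uvw012",
        ClusterID:  "qrs345tuv678",
        // Recursive left nil: defaults to true
    })
    if err != nil {
        return err
    }
    fmt.Printf("imported %d, skipped %d duplicate(s), failed %d\n",
        out.Summary.ImportedFiles, out.Summary.SkippedFiles, out.Summary.FailedFiles)
    return nil
}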
package toolsimport ("context""database/sql""fmt""os""path/filepath""strings""time""skraak/db""skraak/utils")// ImportFileInput defines the input parameters for the import_file tooltype ImportFileInput struct {FilePath string `json:"file_path"`DatasetID string `json:"dataset_id"`LocationID string `json:"location_id"`ClusterID string `json:"cluster_id"`}// ImportFileOutput defines the output structure for the import_file tooltype ImportFileOutput struct {FileID string `json:"file_id"`FileName string `json:"file_name"`Hash string `json:"hash"`Duration float64 `json:"duration_seconds"`SampleRate int `json:"sample_rate"`TimestampLocal time.Time `json:"timestamp_local"`IsAudioMoth bool `json:"is_audiomoth"`IsDuplicate bool `json:"is_duplicate"`ProcessingTime string `json:"processing_time"`Error *string `json:"error,omitempty"`}// ImportFile imports a single WAV file into the database with duplicate detectionfunc ImportFile(ctx context.Context,input ImportFileInput,) (ImportFileOutput, error) {startTime := time.Now()var output ImportFileOutput// Phase 1: Validate file path_, err := validateFilePath(input.FilePath)if err != nil {return output, fmt.Errorf("file validation failed: %w", err)}output.FileName = filepath.Base(input.FilePath)// Phase 2: Validate database hierarchyreturn output, fmt.Errorf("hierarchy validation failed: %w", err)}// Phase 3: Open database connection (single connection for all DB operations)if err != nil {return output, fmt.Errorf("database connection failed: %w", err)}defer database.Close()// Phase 4: Get location data for astronomical calculationslocData, err := utils.GetLocationData(database, input.LocationID)if err != nil {return output, fmt.Errorf("failed to get location data: %w", err)}// Phase 5: Process file metadataresult, err := utils.ProcessSingleFile(input.FilePath, locData.Latitude, locData.Longitude, locData.TimezoneID, true)if err != nil {errMsg := err.Error()output.Error = &errMsgoutput.ProcessingTime = time.Since(startTime).String()return output, fmt.Errorf("file processing failed: %w", err)}// Populate output with extracted metadataoutput.FileName = result.FileNameoutput.Hash = result.Hashoutput.Duration = result.Durationoutput.SampleRate = result.SampleRateoutput.TimestampLocal = result.TimestampLocaloutput.IsAudioMoth = result.IsAudioMoth// Phase 6: Ensure cluster path is setif err := utils.EnsureClusterPath(database, input.ClusterID, filepath.Dir(input.FilePath)); err != nil {return output, fmt.Errorf("failed to set cluster path: %w", err)}// Phase 7: Insert into databasefileID, isDuplicate, err := insertFileIntoDB(ctx, database, result, input.DatasetID, input.ClusterID, input.LocationID)if err != nil {errMsg := err.Error()output.Error = &errMsgoutput.ProcessingTime = time.Since(startTime).String()return output, fmt.Errorf("database insertion failed: %w", err)}output.FileID = fileIDoutput.IsDuplicate = isDuplicateoutput.ProcessingTime = time.Since(startTime).String()return output, nil}// validateFilePath validates the file exists, is a regular file, is a WAV file, and is not emptyfunc validateFilePath(filePath string) (os.FileInfo, error) {// Check file existsinfo, err := os.Stat(filePath)if err != nil {if os.IsNotExist(err) {return nil, fmt.Errorf("file does not exist: %s", filePath)}return nil, fmt.Errorf("cannot access file: %w", err)}// Check it's a regular fileif !info.Mode().IsRegular() {return nil, fmt.Errorf("path is not a regular file: %s", filePath)}// Check extension is .wav (case-insensitive)ext := strings.ToLower(filepath.Ext(filePath))if 
ext != ".wav" {return nil, fmt.Errorf("file must be a WAV file (got extension: %s)", ext)}// Check file is not emptyif info.Size() == 0 {return nil, fmt.Errorf("file is empty: %s", filePath)}return info, nil}// insertFileIntoDB inserts a single file into the database// Returns (fileID, isDuplicate, error)func insertFileIntoDB(ctx context.Context,database *sql.DB,result *utils.FileProcessingResult,datasetID, clusterID, locationID string,) (string, bool, error) {// Begin logged transactiontx, err := db.BeginLoggedTx(ctx, database, "import_audio_file")if err != nil {return "", false, fmt.Errorf("failed to begin transaction: %w", err)}defer tx.Rollback() // Rollback if not committed// Check for duplicate hashexistingID, isDup, err := utils.CheckDuplicateHash(tx, result.Hash)if err != nil {return "", false, err}if isDup {return existingID, true, nil}// Generate file IDfileID, err := utils.GenerateLongID()if err != nil {return "", false, fmt.Errorf("ID generation failed: %w", err)}// Insert file record_, err = tx.ExecContext(ctx, `INSERT INTO file (id, file_name, xxh64_hash, location_id, timestamp_local,cluster_id, duration, sample_rate, maybe_solar_night, maybe_civil_night,moon_phase, created_at, last_modified, active) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, now(), now(), true)`,fileID, result.FileName, result.Hash, locationID,result.TimestampLocal, clusterID, result.Duration, result.SampleRate,result.AstroData.SolarNight, result.AstroData.CivilNight, result.AstroData.MoonPhase,)if err != nil {return "", false, fmt.Errorf("file insert failed: %w", err)}// Insert file_dataset junction_, err = tx.ExecContext(ctx, `INSERT INTO file_dataset (file_id, dataset_id, created_at, last_modified)VALUES (?, ?, now(), now())`, fileID, datasetID)if err != nil {return "", false, fmt.Errorf("file_dataset insert failed: %w", err)}// If AudioMoth, insert moth_metadataif result.IsAudioMoth && result.MothData != nil {_, err = tx.ExecContext(ctx, `INSERT INTO moth_metadata (file_id, timestamp, recorder_id, gain, battery_v, temp_c,created_at, last_modified, active) VALUES (?, ?, ?, ?, ?, ?, now(), now(), true)`,fileID,result.MothData.Timestamp,&result.MothData.RecorderID,&result.MothData.Gain,&result.MothData.BatteryV,&result.MothData.TempC,)if err != nil {return "", false, fmt.Errorf("moth_metadata insert failed: %w", err)}}// Commit transactionif err = tx.Commit(); err != nil {return "", false, fmt.Errorf("transaction commit failed: %w", err)}return fileID, false, nil}database, err := db.OpenWriteableDB(resolveDBPath(input.DBPath))if err := validateHierarchyIDs(input.DatasetID, input.LocationID, input.ClusterID, resolveDBPath(input.DBPath)); err != nil {DBPath string `json:"db_path"`
package toolsimport ("sort""strings""skraak/utils")// CallsSummariseInput defines the input for the calls-summarise tooltype CallsSummariseInput struct {Folder string `json:"folder"`Brief bool `json:"brief"`Filter string `json:"filter,omitempty"`}// CallsSummariseOutput defines the output for the calls-summarise tooltype CallsSummariseOutput struct {Segments []SegmentSummary `json:"segments"`Folder string `json:"folder"`DataFilesRead int `json:"data_files_read"`DataFilesSkipped []string `json:"data_files_skipped"`TotalSegments int `json:"total_segments"`Filters map[string]FilterStats `json:"filters"`ReviewStatus ReviewStatus `json:"review_status"`Operators []string `json:"operators"`Reviewers []string `json:"reviewers"`Error *string `json:"error,omitempty"`}// SegmentSummary represents a single segment in the outputtype SegmentSummary struct {File string `json:"file"`StartTime float64 `json:"start_time"`EndTime float64 `json:"end_time"`Labels []LabelSummary `json:"labels"`}// LabelSummary represents a label in the output (omits empty fields)type LabelSummary struct {Filter string `json:"filter"`Certainty int `json:"certainty"`Species string `json:"species"`CallType string `json:"calltype,omitempty"`Comment string `json:"comment,omitempty"`Bookmark bool `json:"bookmark,omitempty"`}// FilterStats contains per-filter statisticstype FilterStats struct {Segments int `json:"segments"`Species map[string]int `json:"species"`Calltypes map[string]map[string]int `json:"calltypes,omitempty"` // species -> calltype -> count}// ReviewStatus contains review progress statisticstype ReviewStatus struct {Unreviewed int `json:"unreviewed"` // certainty < 100Confirmed int `json:"confirmed"` // certainty = 100DontKnow int `json:"dont_know"` // certainty = 0WithCallType int `json:"with_calltype"`WithComments int `json:"with_comments"`Bookmarked int `json:"bookmarked"`}// CallsSummarise reads all .data files in a folder and produces a summaryfunc CallsSummarise(input CallsSummariseInput) (CallsSummariseOutput, error) {var output CallsSummariseOutput// Find all .data filesfilePaths, err := utils.FindDataFiles(input.Folder)if err != nil {errMsg := err.Error()output.Error = &errMsgreturn output, err}// Initialize empty slices/maps (avoid null in JSON)output.Segments = make([]SegmentSummary, 0)output.Folder = input.Folderoutput.Filters = make(map[string]FilterStats)output.Operators = make([]string, 0)output.Reviewers = make([]string, 0)output.DataFilesSkipped = make([]string, 0)if len(filePaths) == 0 {return output, nil}// Track unique operators and reviewersoperatorSet := make(map[string]bool)reviewerSet := make(map[string]bool)// Count segments for totalif input.Brief {for _, fs := range output.Filters {output.TotalSegments += fs.Segments}} else {output.TotalSegments = len(output.Segments)}finaliseSummary(&output, operatorSet, reviewerSet, input.Brief)return output, nil}// summariseFiles processes all data files, populating output statsfunc summariseFiles(filePaths []string, input CallsSummariseInput, output *CallsSummariseOutput, operatorSet, reviewerSet map[string]bool) {for _, path := range filePaths {df, err := utils.ParseDataFile(path)if err != nil {output.DataFilesSkipped = append(output.DataFilesSkipped, path)continue}output.DataFilesRead++trackMeta(df.Meta, operatorSet, reviewerSet)var relPath stringif !input.Brief {relPath = extractRelativePath(input.Folder, path)}for _, seg := range df.Segments {filteredLabels := filterLabels(seg.Labels, input.Filter)if input.Filter != "" && len(filteredLabels) == 0 
{continue}updateStatsFromLabels(filteredLabels, output)if !input.Brief {output.Segments = append(output.Segments, SegmentSummary{File: relPath,StartTime: seg.StartTime,EndTime: seg.EndTime,Labels: buildLabelSummaries(filteredLabels),})}// trackMeta records operator and reviewer from file metadatafunc trackMeta(meta *utils.DataMeta, operatorSet, reviewerSet map[string]bool) {if meta == nil {return}if meta.Operator != "" {operatorSet[meta.Operator] = true}if meta.Reviewer != "" {reviewerSet[meta.Reviewer] = true}}// filterLabels returns labels matching the filter, or all labels if filter is emptyfunc filterLabels(labels []*utils.Label, filter string) []*utils.Label {if filter == "" {return labels}var filtered []*utils.Labelfor _, l := range labels {if l.Filter == filter {filtered = append(filtered, l)}}return filtered}// buildLabelSummaries converts labels to label summariesfunc buildLabelSummaries(labels []*utils.Label) []LabelSummary {var summaries []LabelSummaryfor _, l := range labels {ls := LabelSummary{Filter: l.Filter,Certainty: l.Certainty,Species: l.Species,}if l.CallType != "" {ls.CallType = l.CallType}if l.Comment != "" {ls.Comment = l.Comment}if l.Bookmark {ls.Bookmark = true}summaries = append(summaries, ls)}return summaries}// updateStatsFromLabels updates filter stats and review status from a set of labelsfunc updateStatsFromLabels(labels []*utils.Label, output *CallsSummariseOutput) {for _, l := range labels {updateFilterStats(l, output)updateReviewStatus(l, output)}}// updateFilterStats increments filter-level statistics for a single labelfunc updateFilterStats(l *utils.Label, output *CallsSummariseOutput) {fs, exists := output.Filters[l.Filter]if !exists {fs = FilterStats{Segments: 0,Species: make(map[string]int),Calltypes: make(map[string]map[string]int),}}if l.CallType != "" {if fs.Calltypes[l.Species] == nil {fs.Calltypes[l.Species] = make(map[string]int)}fs.Calltypes[l.Species][l.CallType]++}output.Filters[l.Filter] = fs}// updateReviewStatus increments review status counters for a single labelfunc updateReviewStatus(l *utils.Label, output *CallsSummariseOutput) {switch l.Certainty {case 100:output.ReviewStatus.Confirmed++case 0:output.ReviewStatus.DontKnow++default:output.ReviewStatus.Unreviewed++}if l.CallType != "" {output.ReviewStatus.WithCallType++}if l.Comment != "" {output.ReviewStatus.WithComments++}if l.Bookmark {output.ReviewStatus.Bookmarked++}// finaliseSummary sorts output, cleans empty maps, and converts sets to sorted slicesfunc finaliseSummary(output *CallsSummariseOutput, operatorSet, reviewerSet map[string]bool, brief bool) {// Clean up empty calltypes mapsfor filter, fs := range output.Filters {if len(fs.Calltypes) == 0 {fs.Calltypes = niloutput.Filters[filter] = fs}}// Convert sets to sorted slicesfor op := range operatorSet {output.Operators = append(output.Operators, op)}for r := range reviewerSet {output.Reviewers = append(output.Reviewers, r)}sort.Strings(output.Operators)sort.Strings(output.Reviewers)// Sort segments by file, then start timeif !brief {sort.Slice(output.Segments, func(i, j int) bool {if output.Segments[i].File != output.Segments[j].File {return output.Segments[i].File < output.Segments[j].File}return output.Segments[i].StartTime < output.Segments[j].StartTime})}}// extractRelativePath extracts the audio filename from a .data file path// e.g., "/folder/tx51_LISTENING_20260221_203004.WAV.data" -> "tx51_LISTENING_20260221_203004.WAV"// Preserves the original case of the extension as-is.func extractRelativePath(folder, dataPath 
string) string {// Get the filenamefilename := dataPathif idx := strings.LastIndex(dataPath, "/"); idx >= 0 {filename = dataPath[idx+1:]}// Remove .data extension, preserve everything elsereturn strings.TrimSuffix(filename, ".data")}}fs.Segments++fs.Species[l.Species]++}}}summariseFiles(filePaths, input, &output, operatorSet, reviewerSet)
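// Usage sketch (illustrative; the folder path is a placeholder):
//
//	out, err := CallsSummarise(CallsSummariseInput{Folder: "/recordings/2026-02", Brief: true})
//	if err != nil {
//		log.Fatal(err)
//	}
//	for filter, fs := range out.Filters {
//		fmt.Printf("%s: %d segments, %d species\n", filter, fs.Segments, len(fs.Species))
//	}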
package toolsimport ("fmt""os""strings""skraak/utils")// CallsShowImagesInput defines the input for the show-images tooltype CallsShowImagesInput struct {DataFilePath string `json:"data_file_path"`Color bool `json:"color"`ImageSize int `json:"image_size"`Sixel bool `json:"sixel"`ITerm bool `json:"iterm"`}// CallsShowImagesOutput defines the output for the show-images tooltype CallsShowImagesOutput struct {SegmentsShown int `json:"segments_shown"`WavFile string `json:"wav_file"`Error string `json:"error,omitempty"`}// CallsShowImages reads a .data file and displays spectrogram images for each segmentfunc CallsShowImages(input CallsShowImagesInput) (CallsShowImagesOutput, error) {var output CallsShowImagesOutput// Validate file existsif _, err := os.Stat(input.DataFilePath); os.IsNotExist(err) {output.Error = fmt.Sprintf("File not found: %s", input.DataFilePath)return output, fmt.Errorf("%s", output.Error)}// Derive WAV file path (strip .data suffix)wavPath := strings.TrimSuffix(input.DataFilePath, ".data")output.WavFile = wavPath// Check WAV file existsif _, err := os.Stat(wavPath); os.IsNotExist(err) {output.Error = fmt.Sprintf("WAV file not found: %s", wavPath)return output, fmt.Errorf("%s", output.Error)}// Parse .data file (includes labels for future filtering)dataFile, err := utils.ParseDataFile(input.DataFilePath)if err != nil {output.Error = err.Error()return output, fmt.Errorf("%s", output.Error)}if len(dataFile.Segments) == 0 {output.Error = "No segments found in .data file"return output, fmt.Errorf("%s", output.Error)}// Resolve image sizeimgSize := input.ImageSizeif imgSize == 0 {imgSize = utils.SpectrogramDisplaySize}// Select graphics protocolprotocol := utils.ProtocolKittyif input.ITerm {protocol = utils.ProtocolITerm} else if input.Sixel {protocol = utils.ProtocolSixel}// Generate spectrogram for each segment and outputfor i, seg := range dataFile.Segments {// Generate spectrogram imageimg, err := utils.GenerateSegmentSpectrogram(input.DataFilePath, seg.StartTime, seg.EndTime, input.Color, imgSize)if err != nil || img == nil {continue}// Print segment infolabelInfo := formatSegmentLabels(seg.Labels)fmt.Fprintf(os.Stderr, "Segment %d: %.1fs - %.1fs (%.1fs)%s\n",i+1, seg.StartTime, seg.EndTime, seg.EndTime-seg.StartTime, labelInfo)// Write to stdout via terminal graphics protocolif err := utils.WriteImage(img, os.Stdout, protocol); err != nil {output.Error = fmt.Sprintf("Failed to write image: %v", err)return output, fmt.Errorf("%s", output.Error)}fmt.Println() // Newline after image}output.SegmentsShown = len(dataFile.Segments)return output, nil}// formatSegmentLabels formats labels for display in segment infofunc formatSegmentLabels(labels []*utils.Label) string {if len(labels) == 0 {return ""}var parts []stringfor _, l := range labels {part := l.Speciesif l.CallType != "" {part += "/" + l.CallType}if l.Filter != "" {part += " [" + l.Filter + "]"}parts = append(parts, part)}return " " + strings.Join(parts, ", ")}
package toolsimport ("encoding/json""os""path/filepath""testing""skraak/utils")func TestPushCertaintyPromotesMatchingLabels(t *testing.T) {tempDir := t.TempDir()// File with two Kiwi segments: certainty=90 and certainty=70file1 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]], [10, 20, 100, 1000, [{"species": "Kiwi", "certainty": 70}]]]`file1Path := filepath.Join(tempDir, "file1.data")if err := os.WriteFile(file1Path, []byte(file1), 0644); err != nil {t.Fatal(err)}// File with one Tomtit at certainty=90 (must not be promoted when species=Kiwi)file2 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Tomtit", "certainty": 90}]]]`file2Path := filepath.Join(tempDir, "file2.data")if err := os.WriteFile(file2Path, []byte(file2), 0644); err != nil {t.Fatal(err)}result, err := PushCertainty(PushCertaintyConfig{Folder: tempDir,Species: "Kiwi",Reviewer: "TestReviewer",})if err != nil {t.Fatal(err)}if result.SegmentsUpdated != 1 {t.Errorf("expected 1 segment updated, got %d", result.SegmentsUpdated)}if result.FilesUpdated != 1 {t.Errorf("expected 1 file updated, got %d", result.FilesUpdated)}// Verify file1: certainty=90 Kiwi → 100, certainty=70 Kiwi → unchangeddf, err := utils.ParseDataFile(file1Path)if err != nil {t.Fatal(err)}if df.Segments[0].Labels[0].Certainty != 100 {t.Errorf("expected certainty=100, got %d", df.Segments[0].Labels[0].Certainty)}if df.Segments[1].Labels[0].Certainty != 70 {t.Errorf("expected certainty=70 unchanged, got %d", df.Segments[1].Labels[0].Certainty)}if df.Meta.Reviewer != "TestReviewer" {t.Errorf("expected reviewer=TestReviewer, got %q", df.Meta.Reviewer)}// Verify Tomtit file was not modifieddf2, err := utils.ParseDataFile(file2Path)if err != nil {t.Fatal(err)}if df2.Segments[0].Labels[0].Certainty != 90 {t.Errorf("Tomtit certainty should be unchanged at 90, got %d", df2.Segments[0].Labels[0].Certainty)}}func TestPushCertaintyFilterScope(t *testing.T) {tempDir := t.TempDir()// Segment has two labels from different filters, both Kiwi certainty=90data := []any{map[string]any{"Operator": "test"},[]any{0.0, 10.0, 100.0, 1000.0, []any{map[string]any{"species": "Kiwi", "certainty": 90, "filter": "model-a"},map[string]any{"species": "Kiwi", "certainty": 90, "filter": "model-b"},}},}raw, _ := json.Marshal(data)filePath := filepath.Join(tempDir, "file1.data")if err := os.WriteFile(filePath, raw, 0644); err != nil {t.Fatal(err)}// Push only model-aresult, err := PushCertainty(PushCertaintyConfig{Folder: tempDir,Filter: "model-a",Species: "Kiwi",Reviewer: "TestReviewer",})if err != nil {t.Fatal(err)}if result.SegmentsUpdated != 1 {t.Errorf("expected 1 segment updated, got %d", result.SegmentsUpdated)}// Verify only model-a label was promoted; model-b stays at 90df, err := utils.ParseDataFile(filePath)if err != nil {t.Fatal(err)}for _, label := range df.Segments[0].Labels {if label.Filter == "model-a" && label.Certainty != 100 {t.Errorf("model-a label should be 100, got %d", label.Certainty)}if label.Filter == "model-b" && label.Certainty != 90 {t.Errorf("model-b label should be unchanged at 90, got %d", label.Certainty)}}}
package toolsimport ("fmt""skraak/utils")// PushCertaintyConfig holds the configuration for push-certaintytype PushCertaintyConfig struct {Folder stringFile stringFilter stringSpecies stringCallType stringNight boolDay boolLat float64Lng float64Timezone stringReviewer string}// PushCertaintyResult holds the result of push-certaintytype PushCertaintyResult struct {SegmentsUpdated int `json:"segments_updated"`FilesUpdated int `json:"files_updated"`TimeFilteredCount int `json:"time_filtered_count"`}// PushCertainty promotes all certainty=90 segments matching the filter scope to certainty=100.// Uses identical filtering logic to LoadDataFiles so the scope matches calls classify exactly.func PushCertainty(config PushCertaintyConfig) (*PushCertaintyResult, error) {state, err := LoadDataFiles(ClassifyConfig{Folder: config.Folder,File: config.File,Filter: config.Filter,Species: config.Species,CallType: config.CallType,Certainty: 90,Sample: -1,Night: config.Night,Day: config.Day,Lat: config.Lat,Lng: config.Lng,Timezone: config.Timezone,})if err != nil {return nil, err}var segsUpdated, filesUpdated intfor i, df := range state.DataFiles {changed := falsefor _, seg := range state.FilteredSegs()[i] {for _, label := range seg.Labels {if labelMatchesPush(label, config.Filter, config.Species, config.CallType) {label.Certainty = 100changed = truesegsUpdated++}}}if changed {df.Meta.Reviewer = config.Reviewerif err := df.Write(df.FilePath); err != nil {return nil, fmt.Errorf("write %s: %w", df.FilePath, err)}filesUpdated++}}return &PushCertaintyResult{SegmentsUpdated: segsUpdated,FilesUpdated: filesUpdated,TimeFilteredCount: state.TimeFilteredCount,}, nil}// labelMatchesPush returns true if the label matches the push scope and has certainty=90.// Certainty is already guaranteed by LoadDataFiles, but we re-check to target only the// specific label that matched (a segment may carry labels from multiple filters).func labelMatchesPush(label *utils.Label, filter, species, callType string) bool {if filter != "" && label.Filter != filter {return false}if species != "" && label.Species != species {return false}if callType != "" && label.CallType != callType {return false}return label.Certainty == 90}
package toolsimport ("path/filepath""testing""skraak/utils")// helpersfunc seg(start, end float64, labels ...*utils.Label) *utils.Segment {return &utils.Segment{StartTime: start,EndTime: end,FreqLow: 100,FreqHigh: 8000,Labels: labels,}}func lbl(filter, species, calltype string, certainty int) *utils.Label {return &utils.Label{Filter: filter,Species: species,CallType: calltype,Certainty: certainty,}}func writeFile(t *testing.T, segs ...*utils.Segment) string {t.Helper()dir := t.TempDir()path := filepath.Join(dir, "test.data")df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "ML", Reviewer: "David", Duration: 3600},Segments: segs,}if err := df.Write(path); err != nil {t.Fatalf("write fixture: %v", err)}return path}func readFile(t *testing.T, path string) *utils.DataFile {t.Helper()df, err := utils.ParseDataFile(path)if err != nil {t.Fatalf("parse %s: %v", path, err)}return df}// findLabel returns the label with matching filter and time on the parsed file, or nil.func findLabel(df *utils.DataFile, filter string, start, end float64) *utils.Label {for _, s := range df.Segments {if s.StartTime != start || s.EndTime != end {continue}for _, l := range s.Labels {if l.Filter == filter {return l}}}return nil}const (fFrom = "opensoundscape-kiwi-1.2"fTo = "opensoundscape-kiwi-1.5")func TestPropagate_HappyPathSingle(t *testing.T) {path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v (%s)", err, out.Error)}if out.Propagated != 1 || out.TargetsExamined != 1 || out.SkippedConflict != 0 || out.SkippedNoOverlap != 0 {t.Fatalf("counts wrong: %+v", out)}df := readFile(t, path)target := findLabel(df, fTo, 100, 125)if target == nil {t.Fatal("target label missing")}if target.Species != "Kiwi" || target.CallType != "Male" || target.Certainty != 90 {t.Errorf("target not updated correctly: species=%q calltype=%q cert=%d", target.Species, target.CallType, target.Certainty)}if df.Meta.Reviewer != "Skraak" {t.Errorf("reviewer = %q, want Skraak", df.Meta.Reviewer)}}func TestPropagate_NoOverlap(t *testing.T) {path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(500, 525, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 0 || out.TargetsExamined != 1 || out.SkippedNoOverlap != 1 {t.Fatalf("counts wrong: %+v", out)}df := readFile(t, path)target := findLabel(df, fTo, 500, 525)if target.Certainty != 70 {t.Errorf("target should not be modified, cert=%d", target.Certainty)}if df.Meta.Reviewer != "David" {t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)}}func TestPropagate_SourceWrongSpecies_Ignored(t *testing.T) {path := writeFile(t,seg(100, 125, lbl(fFrom, "Weka", "", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 0 || out.SkippedNoOverlap != 1 {t.Fatalf("counts wrong: %+v", out)}}func TestPropagate_SourceWrongCertainty_Ignored(t *testing.T) {// cert=70 and cert=0 source labels must NOT count as sources.path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 70)),seg(200, 225, lbl(fFrom, "Don't Know", "", 
0)),seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),seg(200, 225, lbl(fTo, "Kiwi", "Male", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 0 || out.SkippedNoOverlap != 2 {t.Fatalf("counts wrong: %+v", out)}}func TestPropagate_SourceWrongFilter_Ignored(t *testing.T) {path := writeFile(t,seg(100, 125, lbl("some-other-filter", "Kiwi", "Male", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if !out.FiltersMissing || out.Propagated != 0 || out.TargetsExamined != 0 {t.Fatalf("expected FiltersMissing=true with zero counts, got: %+v", out)}}func TestPropagate_TargetCert100_NotTouched(t *testing.T) {// Target with cert=100 is human-verified — must NOT be overwritten.path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Male", 100)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.TargetsExamined != 0 || out.Propagated != 0 {t.Fatalf("cert=100 target must not be examined: %+v", out)}df := readFile(t, path)if df.Meta.Reviewer != "David" {t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)}}func TestPropagate_TargetCert90_NotTouched(t *testing.T) {// Target with cert=90 (already propagated earlier) must NOT be re-propagated.path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Female", 90)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.TargetsExamined != 0 || out.Propagated != 0 {t.Fatalf("cert=90 target must not be examined: %+v", out)}df := readFile(t, path)target := findLabel(df, fTo, 100, 125)if target.Certainty != 90 || target.CallType != "Female" {t.Errorf("cert=90 target was modified: %+v", target)}}func TestPropagate_TargetCert0_Propagated(t *testing.T) {// Target at cert=0 ("Don't Know" / "Noise") SHOULD be propagated when an// overlapping cert=100 source exists — rescues labels from the noise bucket// so they surface for review even if occasionally wrong.path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(100, 125, lbl(fTo, "Don't Know", "", 0)),seg(200, 225, lbl(fFrom, "Kiwi", "Female", 100)),seg(200, 225, lbl(fTo, "Noise", "", 0)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.TargetsExamined != 2 || out.Propagated != 2 {t.Fatalf("cert=0 targets must be propagated: %+v", out)}df := readFile(t, path)for _, c := range []struct {start, end float64calltype string}{{100, 125, "Male"}, {200, 225, "Female"}} {l := findLabel(df, fTo, c.start, c.end)if l == nil || l.Species != "Kiwi" || l.CallType != c.calltype || l.Certainty != 90 {t.Errorf("at %v-%v got %+v, want Kiwi+%s cert=90", c.start, c.end, l, c.calltype)}}}func TestPropagate_MultipleSourcesAgree(t *testing.T) {// Two overlapping sources with same calltype → propagate.path := writeFile(t,seg(100, 110, lbl(fFrom, "Kiwi", "Male", 100)),seg(105, 120, lbl(fFrom, "Kiwi", "Male", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := 
CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 1 || out.SkippedConflict != 0 {t.Fatalf("counts wrong: %+v", out)}df := readFile(t, path)target := findLabel(df, fTo, 100, 125)if target.CallType != "Male" {t.Errorf("calltype should be Male, got %q", target.CallType)}}func TestPropagate_MultipleSourcesConflict(t *testing.T) {// Two overlapping sources with different calltypes → conflict, skip, report.path := writeFile(t,seg(100, 110, lbl(fFrom, "Kiwi", "Male", 100)),seg(115, 120, lbl(fFrom, "Kiwi", "Female", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 0 || out.SkippedConflict != 1 {t.Fatalf("expected 1 conflict skip: %+v", out)}if len(out.Conflicts) != 1 {t.Fatalf("expected 1 conflict report, got %d", len(out.Conflicts))}if out.Conflicts[0].TargetStart != 100 || out.Conflicts[0].TargetEnd != 125 {t.Errorf("conflict target wrong: %+v", out.Conflicts[0])}if len(out.Conflicts[0].SourceChoices) != 2 {t.Errorf("expected 2 source choices, got %d", len(out.Conflicts[0].SourceChoices))}// Target must NOT be modified.df := readFile(t, path)target := findLabel(df, fTo, 100, 125)if target.CallType != "Duet" || target.Certainty != 70 {t.Errorf("conflicted target was modified: %+v", target)}if df.Meta.Reviewer != "David" {t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)}}func TestPropagate_EmptyCallTypePropagates(t *testing.T) {// Source with empty calltype → target gets empty calltype.path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Male", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 1 {t.Fatalf("expected propagated=1: %+v", out)}df := readFile(t, path)target := findLabel(df, fTo, 100, 125)if target.CallType != "" {t.Errorf("calltype should be cleared, got %q", target.CallType)}if target.Species != "Kiwi" || target.Certainty != 90 {t.Errorf("target fields wrong: %+v", target)}}func TestPropagate_SpeciesOverride(t *testing.T) {// Target species was different from --species; must be overwritten.path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(100, 125, lbl(fTo, "Don't Know", "", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 1 {t.Fatalf("expected propagated=1: %+v", out)}df := readFile(t, path)target := findLabel(df, fTo, 100, 125)if target.Species != "Kiwi" || target.CallType != "Male" || target.Certainty != 90 {t.Errorf("target not overwritten correctly: %+v", target)}}func TestPropagate_OverlapBoundaryExclusive(t *testing.T) {// Segments touching at a point (src ends exactly where tgt starts) do NOT overlap.path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(125, 150, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 0 || out.SkippedNoOverlap != 1 {t.Fatalf("touching boundary must not count as overlap: %+v", out)}}func 
TestPropagate_OverlapPartial(t *testing.T) {// 1-second overlap is enough.path := writeFile(t,seg(100, 126, lbl(fFrom, "Kiwi", "Male", 100)),seg(125, 150, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 1 {t.Fatalf("expected propagated=1: %+v", out)}}func TestPropagate_SupersetEitherDirection(t *testing.T) {// Source engulfs target.path1 := writeFile(t,seg(100, 200, lbl(fFrom, "Kiwi", "Male", 100)),seg(110, 150, lbl(fTo, "Kiwi", "Duet", 70)),)if out, _ := CallsPropagate(CallsPropagateInput{File: path1, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}); out.Propagated != 1 {t.Errorf("source-engulfs-target: %+v", out)}// Target engulfs source.path2 := writeFile(t,seg(110, 150, lbl(fFrom, "Kiwi", "Male", 100)),seg(100, 200, lbl(fTo, "Kiwi", "Duet", 70)),)if out, _ := CallsPropagate(CallsPropagateInput{File: path2, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}); out.Propagated != 1 {t.Errorf("target-engulfs-source: %+v", out)}}func TestPropagate_MissingFlags(t *testing.T) {cases := []struct {name stringin CallsPropagateInput}{{"no file", CallsPropagateInput{FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}},{"no from", CallsPropagateInput{File: "x", ToFilter: fTo, Species: "Kiwi"}},{"no to", CallsPropagateInput{File: "x", FromFilter: fFrom, Species: "Kiwi"}},{"no species", CallsPropagateInput{File: "x", FromFilter: fFrom, ToFilter: fTo}},}for _, c := range cases {t.Run(c.name, func(t *testing.T) {_, err := CallsPropagate(c.in)if err == nil {t.Errorf("expected error")}})}}func TestPropagate_SameFromAndTo(t *testing.T) {_, err := CallsPropagate(CallsPropagateInput{File: "x", FromFilter: fFrom, ToFilter: fFrom, Species: "Kiwi",})if err == nil {t.Error("expected error when --from == --to")}}func TestPropagate_NonexistentFile(t *testing.T) {_, err := CallsPropagate(CallsPropagateInput{File: "/nonexistent/path.data", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err == nil {t.Error("expected error for nonexistent file")}}func TestPropagate_RealisticMixed(t *testing.T) {// Mimics the 20260228_211500.WAV.data case: cert=0 "Don't Know" and cert=100 Kiwi sources// coexist; only cert=100 Kiwi gets propagated.path := writeFile(t,// Sources (kiwi-1.2)seg(45, 52.5, lbl(fFrom, "Don't Know", "", 0)),seg(142.5, 177.5, lbl(fFrom, "Kiwi", "Male", 100)),seg(195, 217.5, lbl(fFrom, "Don't Know", "", 0)),seg(647.5, 682.5, lbl(fFrom, "Kiwi", "Female", 100)),seg(815, 855, lbl(fFrom, "Kiwi", "Duet", 100)),// Targets (kiwi-1.5)seg(147.5, 167.5, lbl(fTo, "Kiwi", "Male", 70)),seg(647.5, 672.5, lbl(fTo, "Kiwi", "Female", 70)),seg(815, 852.5, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.TargetsExamined != 3 || out.Propagated != 3 || out.SkippedConflict != 0 {t.Fatalf("counts wrong: %+v", out)}df := readFile(t, path)expect := []struct {start, end float64calltype string}{{147.5, 167.5, "Male"},{647.5, 672.5, "Female"},{815, 852.5, "Duet"},}for _, e := range expect {l := findLabel(df, fTo, e.start, e.end)if l == nil || l.Certainty != 90 || l.CallType != e.calltype || l.Species != "Kiwi" {t.Errorf("at %v-%v got %+v, want Kiwi+%s cert=90", e.start, e.end, l, e.calltype)}}}func TestPropagate_NoWriteIfNothingChanged(t *testing.T) {// File with only non-target segments should not be rewritten 
(reviewer unchanged).path := writeFile(t,seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),)out, err := CallsPropagate(CallsPropagateInput{File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.Propagated != 0 || out.TargetsExamined != 0 {t.Fatalf("expected no activity: %+v", out)}df := readFile(t, path)if df.Meta.Reviewer != "David" {t.Errorf("reviewer should not be touched, got %q", df.Meta.Reviewer)}}// writeFileAt is like writeFile but puts the file inside an existing dir// with a caller-provided basename (must end in .data).func writeFileAt(t *testing.T, dir, base string, segs ...*utils.Segment) string {t.Helper()path := filepath.Join(dir, base)df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "ML", Reviewer: "David", Duration: 3600},Segments: segs,}if err := df.Write(path); err != nil {t.Fatalf("write fixture: %v", err)}return path}func TestPropagateFolder_AggregatesAndSkipsMissing(t *testing.T) {dir := t.TempDir()// File A: both filters present, one clean propagation.aPath := writeFileAt(t, dir, "a.wav.data",seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),)// File B: only target filter — missing source, must be skipped silently.bPath := writeFileAt(t, dir, "b.wav.data",seg(200, 225, lbl(fTo, "Kiwi", "Duet", 70)),)// File C: only source filter — missing target, must be skipped silently.writeFileAt(t, dir, "c.wav.data",seg(300, 325, lbl(fFrom, "Kiwi", "Male", 100)),)// File D: both filters, but no overlap → targets examined, none propagated.dPath := writeFileAt(t, dir, "d.wav.data",seg(400, 425, lbl(fFrom, "Kiwi", "Male", 100)),seg(500, 525, lbl(fTo, "Kiwi", "Duet", 70)),)out, err := CallsPropagateFolder(CallsPropagateFolderInput{Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}assertPropagateStats(t, out, CallsPropagateFolderOutput{FilesTotal: 4,FilesWithBothFilters: 2,FilesSkippedNoFilter: 2,FilesChanged: 1,FilesErrored: 0,TargetsExamined: 2,Propagated: 1,SkippedNoOverlap: 1,})t.Run("file_a_propagated", func(t *testing.T) {aDf := readFile(t, aPath)if aDf.Meta.Reviewer != "Skraak" {t.Errorf("reviewer: got %q, want Skraak", aDf.Meta.Reviewer)}if l := findLabel(aDf, fTo, 100, 125); l == nil || l.Certainty != 90 || l.CallType != "Male" {t.Errorf("target label: got %+v, want cert=90 calltype=Male", l)}})t.Run("file_b_skipped", func(t *testing.T) {bDf := readFile(t, bPath)if bDf.Meta.Reviewer != "David" {t.Errorf("reviewer should not be touched, got %q", bDf.Meta.Reviewer)}})t.Run("file_d_no_overlap", func(t *testing.T) {dDf := readFile(t, dPath)if dDf.Meta.Reviewer != "David" {t.Errorf("reviewer should not be touched, got %q", dDf.Meta.Reviewer)}if l := findLabel(dDf, fTo, 500, 525); l == nil || l.Certainty != 70 {t.Errorf("target label should be unchanged cert=70, got %+v", l)}})}func TestPropagateFolder_EmptyFolder(t *testing.T) {dir := t.TempDir()out, err := CallsPropagateFolder(CallsPropagateFolderInput{Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.FilesTotal != 0 || out.Propagated != 0 {t.Errorf("expected empty result, got %+v", out)}}func TestPropagateFolder_MissingRequiredFlags(t *testing.T) {dir := t.TempDir()cases := []CallsPropagateFolderInput{{Folder: "", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"},{Folder: dir, FromFilter: "", ToFilter: fTo, Species: "Kiwi"},{Folder: dir, FromFilter: fFrom, ToFilter: "", Species: 
"Kiwi"},{Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: ""},{Folder: dir, FromFilter: fFrom, ToFilter: fFrom, Species: "Kiwi"},}for i, in := range cases {if _, err := CallsPropagateFolder(in); err == nil {t.Errorf("case %d: expected error for input %+v", i, in)}}}func TestPropagateFolder_NonexistentFolder(t *testing.T) {_, err := CallsPropagateFolder(CallsPropagateFolderInput{Folder: "/nonexistent/path/xyz", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err == nil {t.Fatal("expected error for nonexistent folder")}}func TestPropagateFolder_ConflictsTaggedWithFile(t *testing.T) {dir := t.TempDir()// Two sources with different calltypes both overlapping one target.writeFileAt(t, dir, "conflict.wav.data",seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),seg(110, 130, lbl(fFrom, "Kiwi", "Female", 100)),seg(100, 130, lbl(fTo, "Kiwi", "", 70)),)out, err := CallsPropagateFolder(CallsPropagateFolderInput{Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",})if err != nil {t.Fatalf("unexpected error: %v", err)}if out.SkippedConflict != 1 || len(out.Conflicts) != 1 {t.Fatalf("expected one conflict, got %+v", out)}if out.Conflicts[0].File == "" {t.Errorf("conflict should be tagged with file path, got %+v", out.Conflicts[0])}}}// assertPropagateStats checks output stats against expected values.func assertPropagateStats(t *testing.T, got, want CallsPropagateFolderOutput) {t.Helper()checks := []struct {name stringgot intwant int}{{"FilesTotal", got.FilesTotal, want.FilesTotal},{"FilesWithBothFilters", got.FilesWithBothFilters, want.FilesWithBothFilters},{"FilesSkippedNoFilter", got.FilesSkippedNoFilter, want.FilesSkippedNoFilter},{"FilesChanged", got.FilesChanged, want.FilesChanged},{"FilesErrored", got.FilesErrored, want.FilesErrored},{"TargetsExamined", got.TargetsExamined, want.TargetsExamined},{"Propagated", got.Propagated, want.Propagated},{"SkippedNoOverlap", got.SkippedNoOverlap, want.SkippedNoOverlap},}for _, c := range checks {if c.got != c.want {t.Errorf("%s: got %d, want %d", c.name, c.got, c.want)}}
package toolsimport ("fmt""os""skraak/utils")type CallsPropagateInput struct {File string `json:"file"`FromFilter string `json:"from_filter"`ToFilter string `json:"to_filter"`Species string `json:"species"`}type CallsPropagateOutput struct {File string `json:"file"`FromFilter string `json:"from_filter"`ToFilter string `json:"to_filter"`Species string `json:"species"`FiltersMissing bool `json:"filters_missing,omitempty"`TargetsExamined int `json:"targets_examined"`Propagated int `json:"propagated"`SkippedNoOverlap int `json:"skipped_no_overlap"`SkippedConflict int `json:"skipped_conflict"`Conflicts []PropagateConflict `json:"conflicts,omitempty"`Changes []PropagateChange `json:"changes,omitempty"`Error string `json:"error,omitempty"`}type CallsPropagateFolderInput struct {Folder string `json:"folder"`FromFilter string `json:"from_filter"`ToFilter string `json:"to_filter"`Species string `json:"species"`}type CallsPropagateFolderOutput struct {Folder string `json:"folder"`FromFilter string `json:"from_filter"`ToFilter string `json:"to_filter"`Species string `json:"species"`FilesTotal int `json:"files_total"`FilesWithBothFilters int `json:"files_with_both_filters"`FilesSkippedNoFilter int `json:"files_skipped_no_filter"`FilesChanged int `json:"files_changed"`FilesErrored int `json:"files_errored"`TargetsExamined int `json:"targets_examined"`Propagated int `json:"propagated"`SkippedNoOverlap int `json:"skipped_no_overlap"`SkippedConflict int `json:"skipped_conflict"`Conflicts []PropagateConflict `json:"conflicts,omitempty"`Errors []CallsPropagateOutput `json:"errors,omitempty"`Error string `json:"error,omitempty"`}type PropagateConflict struct {File string `json:"file,omitempty"`TargetStart float64 `json:"target_start"`TargetEnd float64 `json:"target_end"`TargetCallType string `json:"target_calltype,omitempty"`SourceChoices []PropagateSourceChoice `json:"source_choices"`}type PropagateSourceChoice struct {Start float64 `json:"start"`End float64 `json:"end"`Species string `json:"species"`CallType string `json:"calltype,omitempty"`}type PropagateChange struct {TargetStart float64 `json:"target_start"`TargetEnd float64 `json:"target_end"`PrevSpecies string `json:"prev_species"`PrevCallType string `json:"prev_calltype,omitempty"`PrevCertainty int `json:"prev_certainty"`NewSpecies string `json:"new_species"`NewCallType string `json:"new_calltype,omitempty"`NewCertainty int `json:"new_certainty"`}// CallsPropagate copies verified classifications (certainty==100) from one filter's// segments to overlapping target segments of another filter, within a single .data file.// Target labels with certainty==70 (ML-unverified) or certainty==0 (Don't Know / Noise)// are updated — targets at certainty==100 (human-verified) and certainty==90 (already// propagated) are left alone. 
Only source labels matching --species are considered.// Propagated target labels are set to certainty=90 and file reviewer is set to "Skraak".func CallsPropagate(input CallsPropagateInput) (CallsPropagateOutput, error) {output := CallsPropagateOutput{File: input.File,FromFilter: input.FromFilter,ToFilter: input.ToFilter,Species: input.Species,}if err := validatePropagateInput(&output, input); err != nil {return output, err}df, err := utils.ParseDataFile(input.File)if err != nil {output.Error = fmt.Sprintf("parse %s: %v", input.File, err)return output, fmt.Errorf("%s", output.Error)}// Fast path: skip files that don't contain both filters at all.if !hasBothFilters(df, input.FromFilter, input.ToFilter) {output.FiltersMissing = truereturn output, nil}sources := collectPropagateSources(df, input.FromFilter, input.Species)propagateTargets(df, sources, input, &output)if output.Propagated > 0 {df.Meta.Reviewer = "Skraak"if err := df.Write(input.File); err != nil {output.Error = fmt.Sprintf("write %s: %v", input.File, err)return output, fmt.Errorf("%s", output.Error)}}return output, nil}// validatePropagateInput checks required fields and file existencefunc validatePropagateInput(output *CallsPropagateOutput, input CallsPropagateInput) error {checks := []struct {val stringmsg string}{{input.File, "--file is required"},{input.FromFilter, "--from is required"},{input.ToFilter, "--to is required"},{input.Species, "--species is required"},}for _, c := range checks {if c.val == "" {output.Error = c.msgreturn fmt.Errorf("%s", c.msg)}}if input.FromFilter == input.ToFilter {output.Error = "--from and --to must differ"return fmt.Errorf("%s", output.Error)}if _, err := os.Stat(input.File); os.IsNotExist(err) {output.Error = fmt.Sprintf("file not found: %s", input.File)return fmt.Errorf("%s", output.Error)}// hasBothFilters checks whether the data file contains both from and to filtersfunc hasBothFilters(df *utils.DataFile, fromFilter, toFilter string) bool {hasFrom, hasTo := false, falsefor _, seg := range df.Segments {for _, lbl := range seg.Labels {if lbl.Filter == fromFilter {hasFrom = true}if lbl.Filter == toFilter {hasTo = true}if hasFrom && hasTo {return true}}}// sourceRef pairs a segment with its matching source labeltype sourceRef struct {seg *utils.Segmentlabel *utils.Label}// collectPropagateSources gathers verified source labels (certainty==100) for the given filter/speciesfunc collectPropagateSources(df *utils.DataFile, fromFilter, species string) []sourceRef {var sources []sourceReffor _, seg := range df.Segments {for _, lbl := range seg.Labels {if lbl.Filter == fromFilter && lbl.Species == species && lbl.Certainty == 100 {sources = append(sources, sourceRef{seg: seg, label: lbl})break}}}// propagateTargets iterates target segments, finds overlapping sources, and applies agreed classificationsfunc propagateTargets(df *utils.DataFile, sources []sourceRef, input CallsPropagateInput, output *CallsPropagateOutput) {for _, tSeg := range df.Segments {toLabel := findUpdatableTargetLabel(tSeg.Labels, input.ToFilter)if toLabel == nil {continue}output.TargetsExamined++overlaps := findOverlappingSources(sources, tSeg)if len(overlaps) == 0 {output.SkippedNoOverlap++continue}agreedCallType, conflict := resolveCallType(overlaps)if conflict {output.SkippedConflict++output.Conflicts = append(output.Conflicts, buildConflictRecord(tSeg, toLabel, overlaps))continue}applyPropagation(toLabel, input.Species, agreedCallType, tSeg, output)}}// findUpdatableTargetLabel finds a target label with certainty 70 or 0 for 
the given filterfunc findUpdatableTargetLabel(labels []*utils.Label, toFilter string) *utils.Label {for _, lbl := range labels {if lbl.Filter == toFilter && (lbl.Certainty == 70 || lbl.Certainty == 0) {return lbl}}return nil}// findOverlappingSources returns sources whose segments overlap with the target segmentfunc findOverlappingSources(sources []sourceRef, tSeg *utils.Segment) []sourceRef {var overlaps []sourceReffor _, s := range sources {if s.seg.StartTime < tSeg.EndTime && tSeg.StartTime < s.seg.EndTime {overlaps = append(overlaps, s)}// resolveCallType checks if all overlapping sources agree on a call type.// Returns the agreed call type and whether there is a conflict.func resolveCallType(overlaps []sourceRef) (string, bool) {agreedCallType := overlaps[0].label.CallTypefor _, s := range overlaps[1:] {if s.label.CallType != agreedCallType {return "", true}}return agreedCallType, false}// buildConflictRecord creates a PropagateConflict from overlapping disagreeing sourcesfunc buildConflictRecord(tSeg *utils.Segment, toLabel *utils.Label, overlaps []sourceRef) PropagateConflict {choices := make([]PropagateSourceChoice, 0, len(overlaps))for _, s := range overlaps {choices = append(choices, PropagateSourceChoice{Start: s.seg.StartTime,End: s.seg.EndTime,Species: s.label.Species,CallType: s.label.CallType,})}return PropagateConflict{TargetStart: tSeg.StartTime,TargetEnd: tSeg.EndTime,TargetCallType: toLabel.CallType,SourceChoices: choices,}// applyPropagation updates the target label and records the changefunc applyPropagation(toLabel *utils.Label, species, callType string, tSeg *utils.Segment, output *CallsPropagateOutput) {change := PropagateChange{TargetStart: tSeg.StartTime,TargetEnd: tSeg.EndTime,PrevSpecies: toLabel.Species,PrevCallType: toLabel.CallType,PrevCertainty: toLabel.Certainty,NewSpecies: species,NewCallType: callType,NewCertainty: 90,}output.Propagated++output.Changes = append(output.Changes, change)}// CallsPropagateFolder runs CallsPropagate against every .data file in a folder,// aggregating counts. Files that do not contain both --from and --to filters are// skipped silently (counted as files_skipped_no_filter). 
Parse/write errors on// individual files are collected in Errors; they don't abort the run.func CallsPropagateFolder(input CallsPropagateFolderInput) (CallsPropagateFolderOutput, error) {output := CallsPropagateFolderOutput{Folder: input.Folder,FromFilter: input.FromFilter,ToFilter: input.ToFilter,Species: input.Species,}if input.Folder == "" {output.Error = "--folder is required"return output, fmt.Errorf("%s", output.Error)}if input.FromFilter == "" {output.Error = "--from is required"return output, fmt.Errorf("%s", output.Error)}if input.ToFilter == "" {output.Error = "--to is required"return output, fmt.Errorf("%s", output.Error)}if input.Species == "" {output.Error = "--species is required"return output, fmt.Errorf("%s", output.Error)}if input.FromFilter == input.ToFilter {output.Error = "--from and --to must differ"return output, fmt.Errorf("%s", output.Error)}info, err := os.Stat(input.Folder)if err != nil {output.Error = fmt.Sprintf("folder not found: %s", input.Folder)return output, fmt.Errorf("%s", output.Error)}if !info.IsDir() {output.Error = fmt.Sprintf("not a directory: %s", input.Folder)return output, fmt.Errorf("%s", output.Error)}files, err := utils.FindDataFiles(input.Folder)if err != nil {output.Error = fmt.Sprintf("list .data files: %v", err)return output, fmt.Errorf("%s", output.Error)}output.FilesTotal = len(files)for _, f := range files {fileOut, err := CallsPropagate(CallsPropagateInput{File: f,FromFilter: input.FromFilter,ToFilter: input.ToFilter,Species: input.Species,})if err != nil {output.FilesErrored++output.Errors = append(output.Errors, fileOut)continue}if fileOut.FiltersMissing {output.FilesSkippedNoFilter++continue}output.FilesWithBothFilters++output.TargetsExamined += fileOut.TargetsExaminedoutput.Propagated += fileOut.Propagatedoutput.SkippedNoOverlap += fileOut.SkippedNoOverlapoutput.SkippedConflict += fileOut.SkippedConflictif fileOut.Propagated > 0 {output.FilesChanged++}for _, c := range fileOut.Conflicts {c.File = foutput.Conflicts = append(output.Conflicts, c)}}return output, nil}toLabel.Species = speciestoLabel.CallType = callTypetoLabel.Certainty = 90}}return overlaps}return sources}return false}return nil}
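// Usage sketch (illustrative; the folder path is a placeholder). Note that
// overlap is half-open, as implemented in findOverlappingSources: a source
// ending exactly where a target starts does not count
// (s.StartTime < t.EndTime && t.StartTime < s.EndTime).
//
//	out, err := CallsPropagateFolder(CallsPropagateFolderInput{
//		Folder:     "/recordings/2026-02",
//		FromFilter: "opensoundscape-kiwi-1.2",
//		ToFilter:   "opensoundscape-kiwi-1.5",
//		Species:    "Kiwi",
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Printf("%d propagated, %d conflicts across %d files\n",
//		out.Propagated, out.SkippedConflict, out.FilesChanged)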
package toolsimport ("path/filepath""testing""skraak/utils")func TestCallsModifyBookmark(t *testing.T) {// Create a temp .data file with a bookmarked segmenttmpDir := t.TempDir()tmpFile := filepath.Join(tmpDir, "test.data")df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "test", Duration: 60},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 15.0,FreqLow: 100,FreqHigh: 5000,Labels: []*utils.Label{{Species: "Kiwi", Certainty: 80, Filter: "myfilter", CallType: "Duet", Bookmark: true},},},},}if err := df.Write(tmpFile); err != nil {t.Fatalf("failed to write test file: %v", err)}// Test 1: Adding bookmark when already true should do nothingbookmark := trueresult, err := CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "10-15",Certainty: 80,Bookmark: &bookmark,})// Should return error "no changes needed"if err == nil {t.Errorf("expected error 'no changes needed' when bookmark already true, got nil")}if result.Error != "No changes needed: all values already match" {t.Errorf("expected 'no changes needed' error, got: %s", result.Error)}// Verify bookmark is still true in the filedf2, err := utils.ParseDataFile(tmpFile)if err != nil {t.Fatalf("failed to parse file: %v", err)}if !df2.Segments[0].Labels[0].Bookmark {t.Errorf("bookmark should still be true, got false")}}func TestCallsModifyBookmarkFalse(t *testing.T) {// Create a temp .data file WITHOUT a bookmarktmpDir := t.TempDir()tmpFile := filepath.Join(tmpDir, "test.data")df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "test", Duration: 60},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 15.0,FreqLow: 100,FreqHigh: 5000,Labels: []*utils.Label{{Species: "Kiwi", Certainty: 80, Filter: "myfilter", CallType: "Duet", Bookmark: false},},},},}if err := df.Write(tmpFile); err != nil {t.Fatalf("failed to write test file: %v", err)}// Test: Adding bookmark when false should set it to truebookmark := trueresult, err := CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "10-15",Certainty: 80,Bookmark: &bookmark,})if err != nil {t.Errorf("unexpected error: %v", err)}if result.Bookmark == nil || !*result.Bookmark {t.Errorf("expected bookmark=true in result, got %v", result.Bookmark)}// Verify bookmark is true in the filedf2, err := utils.ParseDataFile(tmpFile)if err != nil {t.Fatalf("failed to parse file: %v", err)}if !df2.Segments[0].Labels[0].Bookmark {t.Errorf("bookmark should be true, got false")}}func TestCallsModifyCommentAdditive(t *testing.T) {// Create a temp .data file with an existing commenttmpDir := t.TempDir()tmpFile := filepath.Join(tmpDir, "test.data")df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "test", Duration: 60},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 15.0,FreqLow: 100,FreqHigh: 5000,Labels: []*utils.Label{{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Comment: "First observation"},},},},}if err := df.Write(tmpFile); err != nil {t.Fatalf("failed to write test file: %v", err)}// Test: Adding comment should be additiveresult, err := CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "10-15",Certainty: 80,Comment: "Good example",})if err != nil {t.Errorf("unexpected error: %v", err)}expectedComment := "First observation | Good example"if result.Comment != expectedComment {t.Errorf("expected comment=%q, got %q", expectedComment, result.Comment)}// Verify comment in filedf2, err := utils.ParseDataFile(tmpFile)if err != nil {t.Fatalf("failed to parse file: %v", err)}if 
df2.Segments[0].Labels[0].Comment != expectedComment {t.Errorf("expected comment in file=%q, got %q", expectedComment, df2.Segments[0].Labels[0].Comment)}}func TestCallsModifyCommentAdditiveMultiple(t *testing.T) {// Create a temp .data file and add multiple commentstmpDir := t.TempDir()tmpFile := filepath.Join(tmpDir, "test.data")df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "test", Duration: 60},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 15.0,FreqLow: 100,FreqHigh: 5000,Labels: []*utils.Label{{Species: "Kiwi", Certainty: 80, Filter: "myfilter"},},},},}if err := df.Write(tmpFile); err != nil {t.Fatalf("failed to write test file: %v", err)}// Add first comment_, err := CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "10-15",Certainty: 80,Comment: "First",})if err != nil {t.Fatalf("unexpected error on first comment: %v", err)}// Add second comment_, err = CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "10-15",Certainty: 80,Comment: "Second",})if err != nil {t.Fatalf("unexpected error on second comment: %v", err)}// Add third commentresult, err := CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "10-15",Certainty: 80,Comment: "Third",})if err != nil {t.Fatalf("unexpected error on third comment: %v", err)}expectedComment := "First | Second | Third"if result.Comment != expectedComment {t.Errorf("expected comment=%q, got %q", expectedComment, result.Comment)}}func TestCallsModifyCommentTooLong(t *testing.T) {// Create a temp .data file with an existing long commenttmpDir := t.TempDir()tmpFile := filepath.Join(tmpDir, "test.data")existingComment := "This is a fairly long existing comment that takes up space"df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "test", Duration: 60},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 15.0,FreqLow: 100,FreqHigh: 5000,Labels: []*utils.Label{{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Comment: existingComment},},},},}if err := df.Write(tmpFile); err != nil {t.Fatalf("failed to write test file: %v", err)}// Test: Adding a long comment that would exceed 140 chars should faillongNewComment := "This is another very long comment that when combined with the existing one will exceed the limit"result, err := CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "10-15",Certainty: 80,Comment: longNewComment,})if err == nil {t.Errorf("expected error for combined comment exceeding 140 chars, got nil")}if result.Error == "" {t.Errorf("expected error message, got empty")}// Verify original comment is preserveddf2, err := utils.ParseDataFile(tmpFile)if err != nil {t.Fatalf("failed to parse file: %v", err)}if df2.Segments[0].Labels[0].Comment != existingComment {t.Errorf("original comment should be preserved, got %q", df2.Segments[0].Labels[0].Comment)}}func TestCallsModifyPreservesBookmarkOnOtherChange(t *testing.T) {// Create a temp .data file with a bookmarktmpDir := t.TempDir()tmpFile := filepath.Join(tmpDir, "test.data")df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "test", Duration: 60},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 15.0,FreqLow: 100,FreqHigh: 5000,Labels: []*utils.Label{{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Bookmark: true},},},},}if err := df.Write(tmpFile); err != nil {t.Fatalf("failed to write test file: %v", err)}// Change certainty (without passing --bookmark) - bookmark should be preservedresult, err := 
CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "10-15",Certainty: 100,// No Bookmark set})if err != nil {t.Errorf("unexpected error: %v", err)}if result.Bookmark != nil {t.Errorf("bookmark should not be in output when not changed, got %v", result.Bookmark)}// Verify bookmark is still true in the filedf2, err := utils.ParseDataFile(tmpFile)if err != nil {t.Fatalf("failed to parse file: %v", err)}if !df2.Segments[0].Labels[0].Bookmark {t.Errorf("bookmark should still be true after changing certainty, got false")}}func TestCallsModifyInvalidSegment(t *testing.T) {tmpDir := t.TempDir()tmpFile := filepath.Join(tmpDir, "test.data")df := &utils.DataFile{Meta: &utils.DataMeta{Operator: "test", Duration: 60},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 15.0,FreqLow: 100,FreqHigh: 5000,Labels: []*utils.Label{{Species: "Kiwi", Certainty: 80, Filter: "myfilter"},},},},}if err := df.Write(tmpFile); err != nil {t.Fatalf("failed to write test file: %v", err)}// Test: Non-existent segment should errorresult, err := CallsModify(CallsModifyInput{File: tmpFile,Reviewer: "tester",Filter: "myfilter",Segment: "99-100",Certainty: 80,})if err == nil {t.Errorf("expected error for non-existent segment, got nil")}if result.Error == "" {t.Errorf("expected error message, got empty")}}
package tools

import (
	"fmt"
	"math"
	"os"
	"strings"

	"skraak/utils"
)

// CallsModifyInput defines the input for the modify tool
type CallsModifyInput struct {
	File      string `json:"file"`
	Reviewer  string `json:"reviewer"`
	Filter    string `json:"filter"`
	Segment   string `json:"segment"`
	Certainty int    `json:"certainty"`
	Species   string `json:"species"`
	Bookmark  *bool  `json:"bookmark"`
	Comment   string `json:"comment"`
}

// CallsModifyOutput defines the output for the modify tool
type CallsModifyOutput struct {
	File          string `json:"file"`
	SegmentStart  int    `json:"segment_start"`
	SegmentEnd    int    `json:"segment_end"`
	Species       string `json:"species,omitempty"`
	CallType      string `json:"calltype,omitempty"`
	Certainty     int    `json:"certainty,omitempty"`
	Bookmark      *bool  `json:"bookmark,omitempty"`
	Comment       string `json:"comment,omitempty"`
	PreviousValue string `json:"previous_value,omitempty"`
	Error         string `json:"error,omitempty"`
}

// validateModifyInput checks required fields and comment constraints.
func validateModifyInput(input CallsModifyInput) error {
	if input.File == "" {
		return fmt.Errorf("--file is required")
	}
	if input.Reviewer == "" {
		return fmt.Errorf("--reviewer is required")
	}
	if input.Filter == "" {
		return fmt.Errorf("--filter is required")
	}
	if input.Segment == "" {
		return fmt.Errorf("--segment is required")
	}
	if len(input.Comment) > 140 {
		return fmt.Errorf("--comment must be 140 characters or less")
	}
	for i, r := range input.Comment {
		if r > 127 {
			return fmt.Errorf("--comment must be ASCII only (non-ASCII at position %d)", i)
		}
	}
	return nil
}

// resolveSpecies parses species+calltype from the input species string.
// If the input species is empty, keeps the existing label values.
func resolveSpecies(inputSpecies string, label *utils.Label) (species, callType string) {
	if inputSpecies == "" {
		return label.Species, label.CallType
	}
	if before, after, ok := strings.Cut(inputSpecies, "+"); ok {
		return before, after
	}
	return inputSpecies, ""
}

// hasModifyChanges checks whether any field would actually change.
func hasModifyChanges(newSpecies, newCallType string, input CallsModifyInput, label *utils.Label) bool {
	if newSpecies != label.Species || newCallType != label.CallType {
		return true
	}
	if input.Certainty != label.Certainty {
		return true
	}
	if input.Bookmark != nil && *input.Bookmark != label.Bookmark {
		return true
	}
	if input.Comment != "" {
		return true
	}
	return false
}

// applyLabelChanges updates the label and data file, populating the output.
func applyLabelChanges(label *utils.Label, dataFile *utils.DataFile, input CallsModifyInput, newSpecies, newCallType string, output *CallsModifyOutput) error {
	dataFile.Meta.Reviewer = input.Reviewer
	label.Species = newSpecies
	label.CallType = newCallType
	output.Species = newSpecies
	output.CallType = newCallType
	label.Certainty = input.Certainty
	output.Certainty = input.Certainty
	if input.Bookmark != nil && *input.Bookmark != label.Bookmark {
		label.Bookmark = *input.Bookmark
		output.Bookmark = input.Bookmark
	}
	if input.Comment != "" {
		var newComment string
		if label.Comment != "" {
			newComment = label.Comment + " | " + input.Comment
		} else {
			newComment = input.Comment
		}
		if len(newComment) > 140 {
			return fmt.Errorf("combined comment exceeds 140 characters (%d)", len(newComment))
		}
		label.Comment = newComment
		output.Comment = newComment
	}
	return nil
}

// CallsModify modifies a label in a .data file
func CallsModify(input CallsModifyInput) (CallsModifyOutput, error) {
	var output CallsModifyOutput
	if err := validateModifyInput(input); err != nil {
		output.Error = err.Error()
		return output, err
	}
	startTime, endTime, err := parseSegmentRange(input.Segment)
	if err != nil {
		output.Error = err.Error()
		return output, err
	}
	output.File = input.File
	output.SegmentStart = startTime
	output.SegmentEnd = endTime
	if _, err := os.Stat(input.File); os.IsNotExist(err) {
		output.Error = fmt.Sprintf("File not found: %s", input.File)
		return output, fmt.Errorf("%s", output.Error)
	}
	dataFile, err := utils.ParseDataFile(input.File)
	if err != nil {
		output.Error = fmt.Sprintf("Failed to parse file: %v", err)
		return output, fmt.Errorf("%s", output.Error)
	}
	segment := findSegment(dataFile.Segments, startTime, endTime, input.Filter)
	if segment == nil {
		output.Error = fmt.Sprintf("No segment found matching time range %d-%d", startTime, endTime)
		return output, fmt.Errorf("%s", output.Error)
	}
	targetLabel := findLabelByFilter(segment, input.Filter)
	if targetLabel == nil {
		output.Error = fmt.Sprintf("No label found with filter '%s' in segment %d-%d", input.Filter, startTime, endTime)
		return output, fmt.Errorf("%s", output.Error)
	}
	output.PreviousValue = formatLabel(targetLabel)
	newSpecies, newCallType := resolveSpecies(input.Species, targetLabel)
	if !hasModifyChanges(newSpecies, newCallType, input, targetLabel) {
		output.Error = "No changes needed: all values already match"
		return output, fmt.Errorf("%s", output.Error)
	}
	if err := applyLabelChanges(targetLabel, dataFile, input, newSpecies, newCallType, &output); err != nil {
		output.Error = err.Error()
		return output, err
	}
	if err := dataFile.Write(input.File); err != nil {
		output.Error = fmt.Sprintf("Failed to save file: %v", err)
		return output, fmt.Errorf("%s", output.Error)
	}
	return output, nil
}

// parseSegmentRange parses "12-15" format into start and end integers
func parseSegmentRange(s string) (int, int, error) {
	parts := strings.Split(s, "-")
	if len(parts) != 2 {
		return 0, 0, fmt.Errorf("invalid segment format: %s (expected start-end, e.g., 12-15)", s)
	}
	var start, end int
	if _, err := fmt.Sscanf(parts[0], "%d", &start); err != nil {
		return 0, 0, fmt.Errorf("invalid start time: %s", parts[0])
	}
	if _, err := fmt.Sscanf(parts[1], "%d", &end); err != nil {
		return 0, 0, fmt.Errorf("invalid end time: %s", parts[1])
	}
	if start < 0 || end < 0 {
		return 0, 0, fmt.Errorf("times must be non-negative")
	}
	if start >= end {
		return 0, 0, fmt.Errorf("start time must be less than end time")
	}
	return start, end, nil
}

// findSegment finds a segment matching the time range using floor/ceil matching.
// It also checks that the segment contains a label with the specified filter,
// so that duplicate segments (same time range, different filters) are resolved correctly.
func findSegment(segments []*utils.Segment, startTime, endTime int, filter string) *utils.Segment {
	for _, seg := range segments {
		segStart := int(math.Floor(seg.StartTime))
		segEnd := int(math.Ceil(seg.EndTime))
		if segEnd == segStart {
			segEnd = segStart + 1 // minimum 1 second
		}
		if segStart == startTime && segEnd == endTime {
			for _, label := range seg.Labels {
				if label.Filter == filter {
					return seg
				}
			}
		}
	}
	return nil
}

// findLabelByFilter finds the first label matching the given filter in a segment.
func findLabelByFilter(segment *utils.Segment, filter string) *utils.Label {
	for _, label := range segment.Labels {
		if label.Filter == filter {
			return label
		}
	}
	return nil
}

// formatLabel formats a label for display
func formatLabel(label *utils.Label) string {
	result := label.Species
	if label.CallType != "" {
		result += "+" + label.CallType
	}
	result += fmt.Sprintf(" (%d%%)", label.Certainty)
	return result
}
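// Usage sketch for CallsModify (file path and filter name below are
// illustrative, not fixed values from this package):
//
//	on := true
//	out, err := CallsModify(CallsModifyInput{
//		File:      "20230610_150000.wav.data", // hypothetical .data file
//		Reviewer:  "tester",
//		Filter:    "myfilter", // must match an existing label's filter
//		Segment:   "10-15",    // floor(start)-ceil(end) of the segment
//		Certainty: 100,
//		Bookmark:  &on,
//	})
//	// out.PreviousValue records the label as it was before the change.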
package tools

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"skraak/utils"
)

// CallsFromRavenInput defines the input for the calls-from-raven tool
type CallsFromRavenInput struct {
	Folder          string          `json:"folder"`
	File            string          `json:"file"`
	Delete          bool            `json:"delete"`
	ProgressHandler ProgressHandler `json:"-"` // Optional progress callback
}

// CallsFromRavenOutput defines the output for the calls-from-raven tool
type CallsFromRavenOutput struct {
	Calls            []ClusteredCall `json:"calls"`
	TotalCalls       int             `json:"total_calls"`
	SpeciesCount     map[string]int  `json:"species_count"`
	DataFilesWritten int             `json:"data_files_written"`
	DataFilesSkipped int             `json:"data_files_skipped"`
	FilesProcessed   int             `json:"files_processed"`
	FilesDeleted     int             `json:"files_deleted"`
	Filter           string          `json:"filter"`
	Error            *string         `json:"error,omitempty"`
}

// ravenSource implements CallSource for Raven selection files
type ravenSource struct{}

func (ravenSource) Name() string { return "Raven" }

func (ravenSource) FindFiles(folder string) ([]string, error) {
	var files []string
	entries, err := os.ReadDir(folder)
	if err != nil {
		return nil, err
	}
	for _, entry := range entries {
		name := entry.Name()
		if strings.HasSuffix(name, ".selections.txt") {
			files = append(files, filepath.Join(folder, name))
		}
	}
	return files, nil
}

func (ravenSource) ProcessFile(ravenFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
	return processRavenFileCached(ravenFile, cache)
}

// CallsFromRaven processes Raven selection files and writes .data files
func CallsFromRaven(input CallsFromRavenInput) (CallsFromRavenOutput, error) {
	src := ravenSource{}
	commonInput := CallsFromSourceInput(input)
	commonOutput, err := callsFromSource(src, commonInput)
	// Convert to Raven-specific output type
	var output CallsFromRavenOutput
	output.Calls = commonOutput.Calls
	output.TotalCalls = commonOutput.TotalCalls
	output.SpeciesCount = commonOutput.SpeciesCount
	output.DataFilesWritten = commonOutput.DataFilesWritten
	output.DataFilesSkipped = commonOutput.DataFilesSkipped
	output.FilesProcessed = commonOutput.FilesProcessed
	output.FilesDeleted = commonOutput.FilesDeleted
	output.Filter = commonOutput.Filter
	output.Error = commonOutput.Error
	return output, err
}

// RavenSelection represents a single Raven selection
type RavenSelection struct {
	StartTime float64
	EndTime   float64
	FreqLow   float64
	FreqHigh  float64
	Species   string
}

// ravenColumnIndices holds the column index positions for a Raven file
type ravenColumnIndices struct {
	beginTimeIdx int
	endTimeIdx   int
	lowFreqIdx   int
	highFreqIdx  int
	speciesIdx   int
}

// parseRavenHeader finds column indices from a tab-separated header line
func parseRavenHeader(header []string) (ravenColumnIndices, error) {
	idx := ravenColumnIndices{beginTimeIdx: -1, endTimeIdx: -1, lowFreqIdx: -1, highFreqIdx: -1, speciesIdx: -1}
	for i, col := range header {
		switch col {
		case "Begin Time (s)":
			idx.beginTimeIdx = i
		case "End Time (s)":
			idx.endTimeIdx = i
		case "Low Freq (Hz)":
			idx.lowFreqIdx = i
		case "High Freq (Hz)":
			idx.highFreqIdx = i
		case "Species":
			idx.speciesIdx = i
		}
	}
	if idx.beginTimeIdx == -1 || idx.endTimeIdx == -1 || idx.speciesIdx == -1 {
		return idx, fmt.Errorf("missing required columns in Raven file")
	}
	return idx, nil
}

// parseRavenSelections reads all selection rows from a scanner and returns parsed selections
func parseRavenSelections(scanner *bufio.Scanner, idx ravenColumnIndices) ([]RavenSelection, error) {
	var selections []RavenSelection
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			continue
		}
		fields := strings.Split(line, "\t")
		if len(fields) <= idx.speciesIdx {
			continue
		}
		sel, err := parseRavenRow(fields, idx)
		if err != nil {
			return nil, err
		}
		selections = append(selections, sel)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading file: %w", err)
	}
	return selections, nil
}

// parseRavenRow parses a single tab-separated row into a RavenSelection
func parseRavenRow(fields []string, idx ravenColumnIndices) (RavenSelection, error) {
	var sel RavenSelection
	startTime, err := strconv.ParseFloat(fields[idx.beginTimeIdx], 64)
	if err != nil {
		return sel, fmt.Errorf("failed to parse begin time %q: %w", fields[idx.beginTimeIdx], err)
	}
	sel.StartTime = startTime
	endTime, err := strconv.ParseFloat(fields[idx.endTimeIdx], 64)
	if err != nil {
		return sel, fmt.Errorf("failed to parse end time %q: %w", fields[idx.endTimeIdx], err)
	}
	sel.EndTime = endTime
	if idx.lowFreqIdx >= 0 && idx.lowFreqIdx < len(fields) {
		freqLow, err := strconv.ParseFloat(fields[idx.lowFreqIdx], 64)
		if err != nil {
			return sel, fmt.Errorf("failed to parse low freq %q: %w", fields[idx.lowFreqIdx], err)
		}
		sel.FreqLow = freqLow
	}
	if idx.highFreqIdx >= 0 && idx.highFreqIdx < len(fields) {
		freqHigh, err := strconv.ParseFloat(fields[idx.highFreqIdx], 64)
		if err != nil {
			return sel, fmt.Errorf("failed to parse high freq %q: %w", fields[idx.highFreqIdx], err)
		}
		sel.FreqHigh = freqHigh
	}
	sel.Species = fields[idx.speciesIdx]
	return sel, nil
}

// deriveWAVBaseName extracts the base WAV filename from a Raven .selections.txt filename
func deriveWAVBaseName(ravenFile string) string {
	base := filepath.Base(ravenFile)
	nameWithoutSuffix := strings.TrimSuffix(base, ".selections.txt")
	idx := strings.Index(nameWithoutSuffix, ".Table.")
	if idx > 0 {
		nameWithoutSuffix = nameWithoutSuffix[:idx]
	}
	return nameWithoutSuffix
}

// resolveWAVPath finds the WAV file corresponding to a Raven file
func resolveWAVPath(ravenFile string, cache *DirCache) string {
	baseName := deriveWAVBaseName(ravenFile)
	if cache != nil {
		return cache.FindWAV(baseName)
	}
	return findWAVFile(filepath.Dir(ravenFile), baseName)
}

// processRavenFileCached processes a single Raven selection file using a DirCache for WAV lookup
func processRavenFileCached(ravenFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
	file, err := os.Open(ravenFile)
	if err != nil {
		return nil, false, false, fmt.Errorf("failed to open file: %w", err)
	}
	defer func() { _ = file.Close() }()
	scanner := bufio.NewScanner(file)
	if !scanner.Scan() {
		return nil, false, false, fmt.Errorf("empty file")
	}
	header := strings.Split(scanner.Text(), "\t")
	idx, err := parseRavenHeader(header)
	if err != nil {
		return nil, false, false, err
	}
	selections, err := parseRavenSelections(scanner, idx)
	if err != nil {
		return nil, false, false, err
	}
	if len(selections) == 0 {
		return nil, false, true, nil
	}
	// Find WAV file
	wavPath := resolveWAVPath(ravenFile, cache)
	if wavPath == "" {
		return nil, false, true, nil
	}
	sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
	if err != nil {
		return nil, false, true, nil
	}
	dataPath := wavPath + ".data"
	segments := buildRavenSegments(selections, sampleRate)
	meta := AviaNZMeta{Operator: "Raven", Duration: duration}
	reviewer := "None"
	meta.Reviewer = &reviewer
	if err := writeDotDataFileSafe(dataPath, segments, "Raven", meta); err != nil {
		return nil, false, false, err
	}
	var calls []ClusteredCall
	for _, sel := range selections {
		calls = append(calls, ClusteredCall{
			File:      wavPath,
			StartTime: sel.StartTime,
			EndTime:   sel.EndTime,
			EbirdCode: sel.Species,
			Segments:  1,
		})
	}
	return calls, true, false, nil
}

// buildRavenSegments converts Raven selections to AviaNZ segments
func buildRavenSegments(selections []RavenSelection, sampleRate int) []AviaNZSegment {
	var segments []AviaNZSegment
	for _, sel := range selections {
		labels := []AviaNZLabel{{
			Species:   sel.Species,
			Certainty: 70, // Default certainty for Raven (no confidence metric)
			Filter:    "Raven",
		}}
		// Use frequency range from Raven, or full band if not specified
		freqLow := sel.FreqLow
		freqHigh := sel.FreqHigh
		if freqLow == 0 && freqHigh == 0 {
			freqHigh = float64(sampleRate)
		}
		segments = append(segments, AviaNZSegment{sel.StartTime, sel.EndTime, freqLow, freqHigh, labels})
	}
	return segments
}
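// Name-matching sketch: deriveWAVBaseName strips the ".selections.txt" suffix
// and any ".Table.N" infix, so (illustrative filenames):
//
//	deriveWAVBaseName("20230610_150000.Table.1.selections.txt") // "20230610_150000"
//	deriveWAVBaseName("20230610_150000.selections.txt")         // "20230610_150000"
//
// The resulting basename is then resolved case-insensitively against the
// directory via DirCache.FindWAV or findWAVFile.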
package tools

import (
	"os"
	"path/filepath"
	"testing"

	"skraak/utils"
)

func TestCallsFromPreds_EmptyFilterError(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "preds.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}
	// Create a dummy WAV file (minimal valid WAV)
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)
	// Test with empty filter (should error)
	input := CallsFromPredsInput{CSVPath: csvPath, Filter: "", WriteDotData: true, ProgressHandler: nil}
	output, err := CallsFromPreds(input)
	// Should return an error
	if err == nil {
		t.Error("expected error for empty filter, got nil")
	}
	if output.Error == nil || *output.Error == "" {
		t.Error("expected error message in output, got empty")
	}
}

func TestCallsFromPreds_NewDataFile(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predsST_test-filter_2025-01-01.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}
	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)
	// Test with the filter parsed from the filename
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "", // Will parse from filename
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	if output.Filter != "test-filter" {
		t.Errorf("expected filter 'test-filter', got '%s'", output.Filter)
	}
	// Verify the .data file was created
	dataPath := wavPath + ".data"
	if _, err := os.Stat(dataPath); os.IsNotExist(err) {
		t.Error("expected .data file to be created")
	}
	// Verify content
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 1 {
		t.Errorf("expected 1 segment, got %d", len(df.Segments))
	}
	if len(df.Segments[0].Labels) != 1 {
		t.Errorf("expected 1 label, got %d", len(df.Segments[0].Labels))
	}
	if df.Segments[0].Labels[0].Filter != "test-filter" {
		t.Errorf("expected filter 'test-filter', got '%s'", df.Segments[0].Labels[0].Filter)
	}
}

func TestCallsFromPreds_ExistingDataFileSameFilter(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predsST_existing-filter_2025-01-01.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}
	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)
	// Create an existing .data file with the same filter
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Manual", "Reviewer": "David", "Duration": 10.0},[5.0, 8.0, 0, 44100, [{"species": "morepork", "certainty": 90, "filter": "existing-filter"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	// Test with the same filter (should error)
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "", // Will parse from filename -> "existing-filter"
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)
	// Should return an error
	if err == nil {
		t.Error("expected error for same filter, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}
	// Verify the original .data file is unchanged
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 1 {
		t.Errorf("expected original 1 segment, got %d", len(df.Segments))
	}
	if df.Segments[0].Labels[0].Species != "morepork" {
		t.Errorf("expected original species 'morepork', got '%s'", df.Segments[0].Labels[0].Species)
	}
}

func TestCallsFromPreds_ExistingDataFileDifferentFilter(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predsST_new-filter_2025-01-01.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}
	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)
	// Create an existing .data file with a different filter
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Manual", "Reviewer": "David", "Duration": 10.0},[5.0, 8.0, 0, 44100, [{"species": "morepork", "certainty": 90, "filter": "old-filter"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	// Test with a different filter (should merge)
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "", // Will parse from filename -> "new-filter"
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	// Verify the .data file has the merged content
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 2 {
		t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
	}
	// Check segments are sorted by start time
	if df.Segments[0].StartTime > df.Segments[1].StartTime {
		t.Error("expected segments to be sorted by start time")
	}
	// Check both filters are present
	filters := make(map[string]bool)
	for _, seg := range df.Segments {
		for _, label := range seg.Labels {
			filters[label.Filter] = true
		}
	}
	if !filters["old-filter"] {
		t.Error("expected 'old-filter' to be present")
	}
	if !filters["new-filter"] {
		t.Error("expected 'new-filter' to be present")
	}
}

func TestCallsFromPreds_ExistingDataFileParseError(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predsST_test-filter_2025-01-01.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}
	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)
	// Create a corrupted .data file
	dataPath := wavPath + ".data"
	corruptedData := `this is not valid json`
	if err := os.WriteFile(dataPath, []byte(corruptedData), 0644); err != nil {
		t.Fatal(err)
	}
	// Test (should error due to parse failure)
	input := CallsFromPredsInput{CSVPath: csvPath, Filter: "", WriteDotData: true, ProgressHandler: nil}
	output, err := CallsFromPreds(input)
	// Should return an error
	if err == nil {
		t.Error("expected error for corrupted .data file, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}
	// Verify the original file is unchanged
	content, err := os.ReadFile(dataPath)
	if err != nil {
		t.Fatal(err)
	}
	if string(content) != corruptedData {
		t.Error("expected corrupted file to remain unchanged")
	}
}

func TestCallsFromPreds_ExplicitFilter(t *testing.T) {
	// Create a temp CSV file with a non-standard name
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predictions.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}
	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)
	// Test with an explicit filter
	input := CallsFromPredsInput{CSVPath: csvPath, Filter: "my-custom-filter", WriteDotData: true, ProgressHandler: nil}
	output, err := CallsFromPreds(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.Filter != "my-custom-filter" {
		t.Errorf("expected filter 'my-custom-filter', got '%s'", output.Filter)
	}
	// Verify the .data file uses the explicit filter
	dataPath := wavPath + ".data"
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if df.Segments[0].Labels[0].Filter != "my-custom-filter" {
		t.Errorf("expected filter 'my-custom-filter' in .data file, got '%s'", df.Segments[0].Labels[0].Filter)
	}
}

func TestCallsFromPreds_NonParsableFilenameNoFilter(t *testing.T) {
	// Create a temp CSV file with a non-standard name that can't be parsed
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "random_name.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}
	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)
	// Test with no filter and a non-parsable filename (should error)
	input := CallsFromPredsInput{CSVPath: csvPath, Filter: "", WriteDotData: true, ProgressHandler: nil}
	output, err := CallsFromPreds(input)
	// Should return an error
	if err == nil {
		t.Error("expected error for unparsable filename with no filter, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}
}

// createMinimalWAV creates a minimal valid WAV file for testing
func createMinimalWAV(t *testing.T, path string, sampleRate int, duration float64) {
	t.Helper()
	numSamples := int(float64(sampleRate) * duration)
	dataSize := numSamples * 2 // 16-bit mono
	// WAV header (44 bytes)
	header := make([]byte, 44)
	// RIFF header
	copy(header[0:4], "RIFF")
	totalSize := uint32(36 + dataSize)
	header[4] = byte(totalSize)
	header[5] = byte(totalSize >> 8)
	header[6] = byte(totalSize >> 16)
	header[7] = byte(totalSize >> 24)
	copy(header[8:12], "WAVE")
	// fmt chunk
	copy(header[12:16], "fmt ")
	chunkSize := uint32(16)
	header[16] = byte(chunkSize)
	header[17] = byte(chunkSize >> 8)
	header[18] = byte(chunkSize >> 16)
	header[19] = byte(chunkSize >> 24)
	audioFormat := uint16(1) // PCM
	header[20] = byte(audioFormat)
	header[21] = byte(audioFormat >> 8)
	numChannels := uint16(1)
	header[22] = byte(numChannels)
	header[23] = byte(numChannels >> 8)
	header[24] = byte(sampleRate)
	header[25] = byte(sampleRate >> 8)
	header[26] = byte(sampleRate >> 16)
	header[27] = byte(sampleRate >> 24)
	byteRate := uint32(sampleRate * 2)
	header[28] = byte(byteRate)
	header[29] = byte(byteRate >> 8)
	header[30] = byte(byteRate >> 16)
	header[31] = byte(byteRate >> 24)
	blockAlign := uint16(2)
	header[32] = byte(blockAlign)
	header[33] = byte(blockAlign >> 8)
	bitsPerSample := uint16(16)
	header[34] = byte(bitsPerSample)
	header[35] = byte(bitsPerSample >> 8)
	// data chunk
	copy(header[36:40], "data")
	header[40] = byte(dataSize)
	header[41] = byte(dataSize >> 8)
	header[42] = byte(dataSize >> 16)
	header[43] = byte(dataSize >> 24)
	// Create the file with header and silence
	file, err := os.Create(path)
	if err != nil {
		t.Fatal(err)
	}
	defer file.Close()
	if _, err := file.Write(header); err != nil {
		t.Fatal(err)
	}
	// Write silence (zeros)
	silence := make([]byte, dataSize)
	if _, err := file.Write(silence); err != nil {
		t.Fatal(err)
	}
}
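// Byte layout written by createMinimalWAV above (canonical 44-byte PCM WAV
// header; all multi-byte fields are little-endian):
//
//	0-3   "RIFF"            22-23 channels (1)
//	4-7   file size - 8     24-27 sample rate
//	8-11  "WAVE"            28-31 byte rate (sampleRate*2)
//	12-15 "fmt "            32-33 block align (2)
//	16-19 fmt size (16)     34-35 bits per sample (16)
//	20-21 format (1 = PCM)  36-39 "data", 40-43 data size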
package tools

import (
	"encoding/csv"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"

	"skraak/utils"
)

// Constants for clustering algorithm
const (
	CLUSTER_GAP_MULTIPLIER     = 2  // Gap threshold = CLUSTER_GAP_MULTIPLIER * clip_duration (use 3 for kiwi)
	MIN_DETECTIONS_PER_CLUSTER = 0  // 1 = filter out single detections (used for kiwi, whose calls run ~30 s); 0 = let single detections pass through
	DEFAULT_CERTAINTY          = 70 // .data certainty:70
	DOT_DATA_WORKERS           = 8  // Number of parallel workers for .data file writing
)

// ignoredColumns lists header columns that are never treated as ebird code columns.
var ignoredColumns = map[string]bool{"NotKiwi": true, "0.0": true}

// ClusteredCall represents a clustered bird call detection
type ClusteredCall struct {
	File      string  `json:"file"`
	StartTime float64 `json:"start_time"`
	EndTime   float64 `json:"end_time"`
	EbirdCode string  `json:"ebird_code"`
	Segments  int     `json:"segments"`
}

// CallsFromPredsInput defines the input for the calls-from-preds tool
type CallsFromPredsInput struct {
	CSVPath         string          `json:"csv_path"`
	Filter          string          `json:"filter"`
	WriteDotData    bool            `json:"write_dot_data"`
	GapMultiplier   int             `json:"gap_multiplier"`
	MinDetections   int             `json:"min_detections"`
	ProgressHandler ProgressHandler `json:"-"` // Optional progress callback (not serialized)
}

// ProgressHandler is a callback function for reporting progress during long operations.
// processed: number of items processed so far
// total: total number of items to process
// message: optional status message
type ProgressHandler func(processed, total int, message string)

// CallsFromPredsOutput defines the output for the calls-from-preds tool
type CallsFromPredsOutput struct {
	Calls            []ClusteredCall `json:"calls"`
	TotalCalls       int             `json:"total_calls"`
	ClipDuration     float64         `json:"clip_duration"`
	GapThreshold     float64         `json:"gap_threshold"`
	SpeciesCount     map[string]int  `json:"species_count"`
	DataFilesWritten int             `json:"data_files_written"`
	DataFilesSkipped int             `json:"data_files_skipped"`
	Filter           string          `json:"filter"`
	Error            *string         `json:"error,omitempty"`
}

// predFileSpeciesKey groups detections by file and ebird code
type predFileSpeciesKey struct {
	File      string
	EbirdCode string
}

// CallsFromPreds reads a predictions CSV and clusters detections into continuous bird calls
func CallsFromPreds(input CallsFromPredsInput) (CallsFromPredsOutput, error) {
	var output CallsFromPredsOutput
	// Determine filter: use provided filter, or parse from CSV filename
	filter := input.Filter
	if filter == "" {
		filter = ParseFilterFromFilename(input.CSVPath)
	}
	if filter == "" {
		errMsg := "Filter must be specified via --filter flag or parsable from CSV filename"
		output.Error = &errMsg
		return output, fmt.Errorf("%s", errMsg)
	}
	output.Filter = filter
	_, detections, clipDuration, err := readPredCSV(input.CSVPath)
	if err != nil {
		errMsg := err.Error()
		output.Error = &errMsg
		return output, err
	}
	output.ClipDuration = clipDuration
	gapMultiplier := CLUSTER_GAP_MULTIPLIER
	if input.GapMultiplier > 0 {
		gapMultiplier = input.GapMultiplier
	}
	minDetections := MIN_DETECTIONS_PER_CLUSTER
	if input.MinDetections >= 0 {
		minDetections = input.MinDetections
	}
	gapThreshold := float64(gapMultiplier) * clipDuration
	output.GapThreshold = gapThreshold
	allCalls, speciesCount := clusterDetections(detections, clipDuration, gapThreshold, minDetections)
	output.Calls = allCalls
	output.TotalCalls = len(allCalls)
	output.SpeciesCount = speciesCount
	if input.WriteDotData {
		dataFilesWritten, dataFilesSkipped, err := writeDotFiles(input.CSVPath, filter, allCalls, input.ProgressHandler)
		if err != nil {
			errMsg := fmt.Sprintf("Error writing .data files: %v", err)
			output.Error = &errMsg
			return output, fmt.Errorf("%s", errMsg)
		}
		output.DataFilesWritten = dataFilesWritten
		output.DataFilesSkipped = dataFilesSkipped
	}
	return output, nil
}

// readPredCSV opens and reads a predictions CSV, returning column mappings, detections, and clip duration
func readPredCSV(csvPath string) (predCSVColumns, map[predFileSpeciesKey][]float64, float64, error) {
	file, err := os.Open(csvPath)
	if err != nil {
		return predCSVColumns{}, nil, 0, fmt.Errorf("failed to open CSV file: %w", err)
	}
	defer func() { _ = file.Close() }()
	reader := csv.NewReader(file)
	reader.ReuseRecord = true
	header, err := reader.Read()
	if err != nil {
		return predCSVColumns{}, nil, 0, fmt.Errorf("failed to read CSV header: %w", err)
	}
	cols, err := findPredCSVColumns(header)
	if err != nil {
		return predCSVColumns{}, nil, 0, err
	}
	detections, clipDuration, err := readPredCSVRows(reader, cols)
	if err != nil {
		return predCSVColumns{}, nil, 0, err
	}
	return cols, detections, clipDuration, nil
}

// predCSVColumns holds the column indices for a predictions CSV
type predCSVColumns struct {
	fileIdx      int
	startTimeIdx int
	endTimeIdx   int
	ebirdCodes   []string
	ebirdIdx     []int
}

// findPredCSVColumns parses the CSV header to find column indices
func findPredCSVColumns(header []string) (predCSVColumns, error) {
	cols := predCSVColumns{fileIdx: -1, startTimeIdx: -1, endTimeIdx: -1}
	for i, col := range header {
		switch col {
		case "file":
			cols.fileIdx = i
		case "start_time":
			cols.startTimeIdx = i
		case "end_time":
			cols.endTimeIdx = i
		default:
			if ignoredColumns[col] {
				continue
			}
			cols.ebirdCodes = append(cols.ebirdCodes, col)
			cols.ebirdIdx = append(cols.ebirdIdx, i)
		}
	}
	if cols.fileIdx == -1 || cols.startTimeIdx == -1 || cols.endTimeIdx == -1 {
		return cols, fmt.Errorf("CSV must have 'file', 'start_time', and 'end_time' columns")
	}
	if len(cols.ebirdCodes) == 0 {
		return cols, fmt.Errorf("CSV must have at least one ebird code column")
	}
	return cols, nil
}

// readPredCSVRows reads all CSV data rows and returns detections grouped by file+species, plus clip duration
func readPredCSVRows(reader *csv.Reader, cols predCSVColumns) (map[predFileSpeciesKey][]float64, float64, error) {
	detections := make(map[predFileSpeciesKey][]float64)
	clipDuration := 0.0
	record, err := reader.Read()
	if err == io.EOF {
		return detections, 0, nil
	}
	if err != nil {
		return nil, 0, fmt.Errorf("failed to read first CSV row: %w", err)
	}
	startTime, _ := strconv.ParseFloat(record[cols.startTimeIdx], 64)
	endTime, _ := strconv.ParseFloat(record[cols.endTimeIdx], 64)
	clipDuration = endTime - startTime
	addDetectionsFromRow(record, cols, startTime, detections)
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, 0, fmt.Errorf("failed to read CSV row: %w", err)
		}
		startTime, _ = strconv.ParseFloat(record[cols.startTimeIdx], 64)
		addDetectionsFromRow(record, cols, startTime, detections)
	}
	return detections, clipDuration, nil
}

// addDetectionsFromRow adds positive detections from a single CSV row
func addDetectionsFromRow(record []string, cols predCSVColumns, startTime float64, detections map[predFileSpeciesKey][]float64) {
	fileName := record[cols.fileIdx]
	for i, idx := range cols.ebirdIdx {
		if record[idx] == "1" {
			key := predFileSpeciesKey{File: fileName, EbirdCode: cols.ebirdCodes[i]}
			detections[key] = append(detections[key], startTime)
		}
	}
}

// clusterDetections groups detections into clusters and produces sorted ClusteredCalls
func clusterDetections(detections map[predFileSpeciesKey][]float64, clipDuration, gapThreshold float64, minDetections int) ([]ClusteredCall, map[string]int) {
	var allCalls []ClusteredCall
	speciesCount := make(map[string]int)
	for key, startTimes := range detections {
		sort.Float64s(startTimes)
		clusters := clusterStartTimes(startTimes, gapThreshold)
		for _, cluster := range clusters {
			if len(cluster) <= minDetections {
				continue
			}
			allCalls = append(allCalls, ClusteredCall{
				File:      key.File,
				StartTime: cluster[0],
				EndTime:   cluster[len(cluster)-1] + clipDuration,
				EbirdCode: key.EbirdCode,
				Segments:  len(cluster),
			})
			speciesCount[key.EbirdCode]++
		}
	}
	sort.Slice(allCalls, func(i, j int) bool {
		if allCalls[i].File != allCalls[j].File {
			return allCalls[i].File < allCalls[j].File
		}
		return allCalls[i].StartTime < allCalls[j].StartTime
	})
	return allCalls, speciesCount
}

// DirCache caches directory entries for fast WAV file lookup.
// Scans the directory once and builds a map from lowercased basename to full filename.
// Safe for concurrent read-only use after construction.
type DirCache struct {
	dir    string
	wavMap map[string]string // lowercase basename -> filename with original case (e.g. "20230610_150000" -> "20230610_150000.WAV")
	dirMap map[string]string // lowercase basename -> filename for any file (used by from-raven for .selections.txt etc.)
}

// NewDirCache creates a DirCache by scanning the directory once.
func NewDirCache(dir string) *DirCache {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return &DirCache{dir: dir, wavMap: make(map[string]string), dirMap: make(map[string]string)}
	}
	wavMap := make(map[string]string, len(entries))
	dirMap := make(map[string]string, len(entries))
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		name := entry.Name()
		ext := filepath.Ext(name)
		base := strings.TrimSuffix(name, ext)
		dirMap[strings.ToLower(base)] = name
		if strings.EqualFold(ext, ".wav") {
			wavMap[strings.ToLower(base)] = name
		}
	}
	return &DirCache{dir: dir, wavMap: wavMap, dirMap: dirMap}
}

// FindWAV looks up a WAV file by basename (case-insensitive).
// Returns the full path with correct case, or empty string if not found.
func (dc *DirCache) FindWAV(baseName string) string {
	if name, ok := dc.wavMap[strings.ToLower(baseName)]; ok {
		return filepath.Join(dc.dir, name)
	}
	return ""
}

// FindFile looks up any file by basename (case-insensitive).
// Returns the full path with correct case, or empty string if not found.
func (dc *DirCache) FindFile(baseName string) string {
	if name, ok := dc.dirMap[strings.ToLower(baseName)]; ok {
		return filepath.Join(dc.dir, name)
	}
	return ""
}

// findWAVFile finds a WAV file in the directory with case-insensitive matching.
// baseName is the filename without extension (e.g., "20230610_150000").
// Returns the full path with correct case, or empty string if not found.
//
// Deprecated: Use DirCache.FindWAV for batch operations to avoid repeated directory scans.
func findWAVFile(dir, baseName string) string {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return ""
	}
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		name := entry.Name()
		ext := filepath.Ext(name)
		nameNoExt := strings.TrimSuffix(name, ext)
		if nameNoExt == baseName && strings.EqualFold(ext, ".wav") {
			return filepath.Join(dir, name)
		}
	}
	return ""
}

// writeDotFiles writes AviaNZ .data files for each audio file with calls.
// Uses parallel workers for improved performance on large batches.
func writeDotFiles(csvPath, filter string, calls []ClusteredCall, progress ProgressHandler) (int, int, error) {
	// Base directory is the directory containing the CSV file
	csvDir := filepath.Dir(csvPath)
	// Group calls by file (using extracted filename)
	callsByFile := make(map[string][]ClusteredCall)
	for _, call := range calls {
		filename := filepath.Base(call.File)
		callsByFile[filename] = append(callsByFile[filename], call)
	}
	// Report initial progress
	if progress != nil {
		progress(0, len(callsByFile), "Processing WAV files")
	}
	// If small batch, process sequentially (avoid goroutine overhead)
	if len(callsByFile) < 10 {
		return writeDotFilesSequential(csvDir, filter, callsByFile, progress)
	}
	// Parallel processing for larger batches
	return writeDotFilesParallel(csvDir, filter, callsByFile, progress)
}

// dotDataJob represents a single file to process
type dotDataJob struct {
	filename  string
	fileCalls []ClusteredCall
}

// dotDataResult represents the result of processing a single file
type dotDataResult struct {
	filename string
	written  bool
	err      error
}

// writeDotFilesSequential processes files one at a time (for small batches)
func writeDotFilesSequential(csvDir, filter string, callsByFile map[string][]ClusteredCall, progress ProgressHandler) (int, int, error) {
	dataFilesWritten := 0
	dataFilesSkipped := 0
	total := len(callsByFile)
	processed := 0
	for filename, fileCalls := range callsByFile {
		// Find WAV file with correct case
		baseName := strings.TrimSuffix(filename, filepath.Ext(filename))
		wavPath := findWAVFile(csvDir, baseName)
		if wavPath == "" {
			dataFilesSkipped++
			processed++
			if progress != nil {
				progress(processed, total, "")
			}
			continue
		}
		dataPath := wavPath + ".data"
		sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
		if err != nil {
			dataFilesSkipped++
			processed++
			if progress != nil {
				progress(processed, total, "")
			}
			continue
		}
		// Build segments and metadata
		meta, segments := buildAviaNZMetaAndSegments(fileCalls, filter, duration, sampleRate)
		if err := writeDotDataFileSafe(dataPath, segments, filter, meta); err != nil {
			return dataFilesWritten, dataFilesSkipped, fmt.Errorf("failed to write %s: %w", dataPath, err)
		}
		dataFilesWritten++
		processed++
		if progress != nil {
			progress(processed, total, "")
		}
	}
	return dataFilesWritten, dataFilesSkipped, nil
}

// writeDotFilesParallel processes files concurrently using a worker pool
func writeDotFilesParallel(csvDir, filter string, callsByFile map[string][]ClusteredCall, progress ProgressHandler) (int, int, error) {
	total := len(callsByFile)
	var processed atomic.Int32
	// Create job and result channels
	jobs := make(chan dotDataJob, len(callsByFile))
	results := make(chan dotDataResult, len(callsByFile))
	// Start workers
	var wg sync.WaitGroup
	for range DOT_DATA_WORKERS {
		wg.Add(1)
		go dotDataWorker(csvDir, filter, jobs, results, &wg)
	}
	// Send jobs
	for filename, fileCalls := range callsByFile {
		jobs <- dotDataJob{filename: filename, fileCalls: fileCalls}
	}
	close(jobs)
	// Wait for workers to finish
	go func() {
		wg.Wait()
		close(results)
	}()
	// Collect results with progress reporting
	dataFilesWritten := 0
	dataFilesSkipped := 0
	var firstErr error
	for result := range results {
		if result.err != nil && firstErr == nil {
			firstErr = result.err
		}
		if result.written {
			dataFilesWritten++
		} else {
			dataFilesSkipped++
		}
		// Report progress
		if progress != nil {
			current := int(processed.Add(1))
			progress(current, total, "")
		}
	}
	return dataFilesWritten, dataFilesSkipped, firstErr
}

// dotDataWorker processes files from the jobs channel
func dotDataWorker(csvDir, filter string, jobs <-chan dotDataJob, results chan<- dotDataResult, wg *sync.WaitGroup) {
	defer wg.Done()
	for job := range jobs {
		// Find WAV file with correct case
		baseName := strings.TrimSuffix(job.filename, filepath.Ext(job.filename))
		wavPath := findWAVFile(csvDir, baseName)
		if wavPath == "" {
			results <- dotDataResult{filename: job.filename, written: false, err: nil}
			continue
		}
		dataPath := wavPath + ".data"
		sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
		if err != nil {
			results <- dotDataResult{filename: job.filename, written: false, err: nil}
			continue
		}
		// Build segments and metadata
		meta, segments := buildAviaNZMetaAndSegments(job.fileCalls, filter, duration, sampleRate)
		if err := writeDotDataFileSafe(dataPath, segments, filter, meta); err != nil {
			results <- dotDataResult{filename: job.filename, written: false, err: fmt.Errorf("failed to write %s: %w", dataPath, err)}
			continue
		}
		results <- dotDataResult{filename: job.filename, written: true, err: nil}
	}
}

// buildAviaNZMetaAndSegments creates metadata and segments for a .data file
func buildAviaNZMetaAndSegments(calls []ClusteredCall, filter string, duration float64, sampleRate int) (AviaNZMeta, []AviaNZSegment) {
	// Create metadata
	reviewer := "None"
	meta := AviaNZMeta{Operator: "Auto", Reviewer: &reviewer, Duration: duration}
	// Build segments array
	var segments []AviaNZSegment
	for _, call := range calls {
		// Create labels for this segment
		labels := []AviaNZLabel{{Species: call.EbirdCode, Certainty: DEFAULT_CERTAINTY, Filter: filter}}
		// Create segment: [start, end, freq_low, freq_high, labels]
		// freq_low=0, freq_high=sampleRate for full-band segments
		segments = append(segments, AviaNZSegment{call.StartTime, call.EndTime, 0, sampleRate, labels})
	}
	return meta, segments
}

// writeAviaNZDataFile writes a new .data file to disk (does not check for existing files)
func writeAviaNZDataFile(path string, data []any) error {
	file, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("failed to create file: %w", err)
	}
	defer func() { _ = file.Close() }()
	encoder := json.NewEncoder(file)
	encoder.SetIndent("", "") // No indentation for compact output
	if err := encoder.Encode(data); err != nil {
		return fmt.Errorf("failed to encode JSON: %w", err)
	}
	return nil
}

// writeDotDataFileSafe safely writes or merges .data files:
//   - if the file doesn't exist: write a new file
//   - if the file exists with the same filter: return an error (refuse to clobber)
//   - if the file exists with a different filter: merge segments and write
//   - if the file exists but can't be parsed: return an error (refuse to clobber)
func writeDotDataFileSafe(path string, newSegments []AviaNZSegment, filter string, meta AviaNZMeta) error {
	// Check if file exists
	if _, err := os.Stat(path); err == nil {
		// File exists - parse and check
		existing, err := utils.ParseDataFile(path)
		if err != nil {
			return fmt.Errorf("cannot parse existing %s: %w (refusing to clobber)", path, err)
		}
		// Check for duplicate filter
		for _, seg := range existing.Segments {
			if seg.HasFilterLabel(filter) {
				return fmt.Errorf("%s already contains filter '%s' (refusing to clobber)", path, filter)
			}
		}
		// Append new segments (different filter - safe to merge)
		for _, newSeg := range newSegments {
			existing.Segments = append(existing.Segments, convertAviaNZSegment(newSeg, filter))
		}
		// Sort by start time
		sort.Slice(existing.Segments, func(i, j int) bool {
			return existing.Segments[i].StartTime < existing.Segments[j].StartTime
		})
		return existing.Write(path)
	}
	// File doesn't exist - write new
	data := buildDataFileFromSegments(meta, newSegments)
	return writeAviaNZDataFile(path, data)
}

// convertAviaNZSegment converts an AviaNZSegment to utils.Segment
func convertAviaNZSegment(seg AviaNZSegment, filter string) *utils.Segment {
	labels := seg[4].([]AviaNZLabel)
	utilsLabels := make([]*utils.Label, len(labels))
	for i, l := range labels {
		utilsLabels[i] = &utils.Label{Species: l.Species, Certainty: l.Certainty, Filter: filter}
	}
	// Handle freq values (could be int or float64 depending on how they were created)
	var freqLow, freqHigh float64
	switch v := seg[2].(type) {
	case int:
		freqLow = float64(v)
	case float64:
		freqLow = v
	}
	switch v := seg[3].(type) {
	case int:
		freqHigh = float64(v)
	case float64:
		freqHigh = v
	}
	return &utils.Segment{
		StartTime: seg[0].(float64),
		EndTime:   seg[1].(float64),
		FreqLow:   freqLow,
		FreqHigh:  freqHigh,
		Labels:    utilsLabels,
	}
}

// buildDataFileFromSegments builds the data file structure from meta and segments
func buildDataFileFromSegments(meta AviaNZMeta, segments []AviaNZSegment) []any {
	result := make([]any, 0, 1+len(segments))
	result = append(result, meta)
	for _, seg := range segments {
		result = append(result, seg)
	}
	return result
}

// ParseFilterFromFilename extracts the filter name from a preds CSV filename:
// "predsST_opensoundscape-kiwi-1.2_2025-11-12.csv" -> "opensoundscape-kiwi-1.2".
// Returns an empty string if parsing fails.
func ParseFilterFromFilename(csvPath string) string {
	filename := filepath.Base(csvPath)
	// Remove .csv extension
	name := strings.TrimSuffix(filename, ".csv")
	// Split on underscore
	parts := strings.Split(name, "_")
	if len(parts) == 3 {
		return parts[1]
	}
	return ""
}

// clusterStartTimes groups consecutive start times into clusters
// where the gap between consecutive times is <= gapThreshold.
func clusterStartTimes(startTimes []float64, gapThreshold float64) [][]float64 {
	if len(startTimes) == 0 {
		return nil
	}
	var clusters [][]float64
	currentCluster := []float64{startTimes[0]}
	for i := 1; i < len(startTimes); i++ {
		gap := startTimes[i] - startTimes[i-1]
		if gap <= gapThreshold {
			// Same cluster
			currentCluster = append(currentCluster, startTimes[i])
		} else {
			// New cluster
			clusters = append(clusters, currentCluster)
			currentCluster = []float64{startTimes[i]}
		}
	}
	// Don't forget the last cluster
	clusters = append(clusters, currentCluster)
	return clusters
}
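// Worked example for clusterStartTimes (input values are illustrative): with
// a 3 s clip duration and the default CLUSTER_GAP_MULTIPLIER of 2, the gap
// threshold is 6 s, so
//
//	clusterStartTimes([]float64{0, 3, 6, 30, 33}, 6.0)
//	// -> [[0 3 6] [30 33]]
//
// and clusterDetections turns the first cluster into one ClusteredCall
// spanning 0-9 s (last start time + clip duration).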
package tools

import (
	"os"
	"path/filepath"
	"testing"

	"skraak/utils"
)

// ============================================
// BirdNET Tests
// ============================================

func TestCallsFromBirda_NewDataFile(t *testing.T) {
	tmpDir := t.TempDir()
	// Create a minimal WAV file
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	// Create a BirdNET results file
	birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
	birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Turdus migratorius,American Robin,0.85,/some/path/test.WAV\n"
	if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromBirdaInput{File: birdaPath}
	output, err := CallsFromBirda(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	if output.Filter != "BirdNET" {
		t.Errorf("expected filter 'BirdNET', got '%s'", output.Filter)
	}
	if output.TotalCalls != 1 {
		t.Errorf("expected 1 call, got %d", output.TotalCalls)
	}
	// Verify the .data file was created
	dataPath := wavPath + ".data"
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 1 {
		t.Errorf("expected 1 segment, got %d", len(df.Segments))
	}
	if df.Segments[0].Labels[0].Filter != "BirdNET" {
		t.Errorf("expected filter 'BirdNET', got '%s'", df.Segments[0].Labels[0].Filter)
	}
	if df.Segments[0].Labels[0].Certainty != 85 {
		t.Errorf("expected certainty 85, got %d", df.Segments[0].Labels[0].Certainty)
	}
}

func TestCallsFromBirda_ExistingSameFilter(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Existing Bird", "certainty": 90, "filter": "BirdNET"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
	birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,New Bird,New Bird,0.85,test.WAV\n"
	if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromBirdaInput{File: birdaPath}
	output, err := CallsFromBirda(input)
	if err == nil {
		t.Error("expected error for same filter, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}
}

func TestCallsFromBirda_ExistingDifferentFilter(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Kiwi", "certainty": 90, "filter": "Manual"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
	birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Robin,Robin,0.85,test.WAV\n"
	if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromBirdaInput{File: birdaPath}
	output, err := CallsFromBirda(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 2 {
		t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
	}
}

func TestCallsFromBirda_DeleteOption(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
	birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Robin,Robin,0.85,test.WAV\n"
	if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromBirdaInput{File: birdaPath, Delete: true}
	output, err := CallsFromBirda(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.FilesDeleted != 1 {
		t.Errorf("expected 1 file deleted, got %d", output.FilesDeleted)
	}
	if _, err := os.Stat(birdaPath); !os.IsNotExist(err) {
		t.Error("expected BirdNET file to be deleted")
	}
}

func TestCallsFromBirda_FolderMode(t *testing.T) {
	tmpDir := t.TempDir()
	for i := range 2 {
		wavPath := filepath.Join(tmpDir, "test"+string(rune('0'+i))+".WAV")
		createMinimalWAV(t, wavPath, 16000, 60.0)
		birdaPath := filepath.Join(tmpDir, "test"+string(rune('0'+i))+".BirdNET.results.csv")
		birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Bird,Bird,0.85,test.WAV\n"
		if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
			t.Fatal(err)
		}
	}
	input := CallsFromBirdaInput{Folder: tmpDir}
	output, err := CallsFromBirda(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.FilesProcessed != 2 {
		t.Errorf("expected 2 files processed, got %d", output.FilesProcessed)
	}
	if output.DataFilesWritten != 2 {
		t.Errorf("expected 2 data files written, got %d", output.DataFilesWritten)
	}
}

// ============================================
// Raven Tests
// ============================================

func TestCallsFromRaven_NewDataFile(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromRavenInput{File: ravenPath}
	output, err := CallsFromRaven(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	if output.Filter != "Raven" {
		t.Errorf("expected filter 'Raven', got '%s'", output.Filter)
	}
	dataPath := wavPath + ".data"
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if df.Segments[0].FreqLow != 1000 {
		t.Errorf("expected freq_low 1000, got %f", df.Segments[0].FreqLow)
	}
	if df.Segments[0].FreqHigh != 5000 {
		t.Errorf("expected freq_high 5000, got %f", df.Segments[0].FreqHigh)
	}
}

func TestCallsFromRaven_ExistingSameFilter(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Existing", "certainty": 90, "filter": "Raven"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tNew\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromRavenInput{File: ravenPath}
	output, err := CallsFromRaven(input)
	if err == nil {
		t.Error("expected error for same filter, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}
}

func TestCallsFromRaven_ExistingDifferentFilter(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Kiwi", "certainty": 90, "filter": "BirdNET"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tMorepork\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromRavenInput{File: ravenPath}
	output, err := CallsFromRaven(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 2 {
		t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
	}
}

func TestCallsFromRaven_DeleteOption(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromRavenInput{File: ravenPath, Delete: true}
	output, err := CallsFromRaven(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.FilesDeleted != 1 {
		t.Errorf("expected 1 file deleted, got %d", output.FilesDeleted)
	}
	if _, err := os.Stat(ravenPath); !os.IsNotExist(err) {
		t.Error("expected Raven file to be deleted")
	}
}

func TestCallsFromRaven_MultipleSelections(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n2\tSpectrogram 1\t1\t10.0\t15.0\t2000\t6000\tMorepork\n3\tSpectrogram 1\t1\t20.0\t25.0\t1500\t4500\tTui\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}
	input := CallsFromRavenInput{File: ravenPath}
	output, err := CallsFromRaven(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.TotalCalls != 3 {
		t.Errorf("expected 3 calls, got %d", output.TotalCalls)
	}
	if output.SpeciesCount["Kiwi"] != 1 || output.SpeciesCount["Morepork"] != 1 || output.SpeciesCount["Tui"] != 1 {
		t.Errorf("unexpected species count: %v", output.SpeciesCount)
	}
}
package tools

import (
    "encoding/csv"
    "fmt"
    "io"
    "os"
    "path/filepath"
    "strconv"
    "strings"

    "skraak/utils"
)

// CallsFromBirdaInput defines the input for the calls-from-birda tool
type CallsFromBirdaInput struct {
    Folder          string          `json:"folder"`
    File            string          `json:"file"`
    Delete          bool            `json:"delete"`
    ProgressHandler ProgressHandler `json:"-"` // Optional progress callback
}

// CallsFromBirdaOutput defines the output for the calls-from-birda tool
type CallsFromBirdaOutput struct {
    Calls            []ClusteredCall `json:"calls"`
    TotalCalls       int             `json:"total_calls"`
    SpeciesCount     map[string]int  `json:"species_count"`
    DataFilesWritten int             `json:"data_files_written"`
    DataFilesSkipped int             `json:"data_files_skipped"`
    FilesProcessed   int             `json:"files_processed"`
    FilesDeleted     int             `json:"files_deleted"`
    Filter           string          `json:"filter"`
    Error            *string         `json:"error,omitempty"`
}

// birdaSource implements CallSource for BirdNET results files
type birdaSource struct{}

func (birdaSource) Name() string { return "BirdNET" }

func (birdaSource) FindFiles(folder string) ([]string, error) {
    var files []string
    entries, err := os.ReadDir(folder)
    if err != nil {
        return nil, err
    }
    for _, entry := range entries {
        name := entry.Name()
        if strings.HasSuffix(name, ".BirdNET.results.csv") {
            files = append(files, filepath.Join(folder, name))
        }
    }
    return files, nil
}

func (birdaSource) ProcessFile(birdaFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
    return processBirdaFileCached(birdaFile, cache)
}

// CallsFromBirda processes BirdNET results files and writes .data files
func CallsFromBirda(input CallsFromBirdaInput) (CallsFromBirdaOutput, error) {
    src := birdaSource{}
    commonInput := CallsFromSourceInput(input)
    commonOutput, err := callsFromSource(src, commonInput)

    // Convert to Birda-specific output type
    var output CallsFromBirdaOutput
    output.Calls = commonOutput.Calls
    output.TotalCalls = commonOutput.TotalCalls
    output.SpeciesCount = commonOutput.SpeciesCount
    output.DataFilesWritten = commonOutput.DataFilesWritten
    output.DataFilesSkipped = commonOutput.DataFilesSkipped
    output.FilesProcessed = commonOutput.FilesProcessed
    output.FilesDeleted = commonOutput.FilesDeleted
    output.Filter = commonOutput.Filter
    output.Error = commonOutput.Error
    return output, err
}

// BirdNETDetection represents a single BirdNET detection
type BirdNETDetection struct {
    StartTime      float64
    EndTime        float64
    ScientificName string
    CommonName     string
    Confidence     float64
    WAVPath        string
}

// birdaColumnIndices holds the parsed column positions from a BirdNET CSV header.
type birdaColumnIndices struct {
    startIdx      int
    endIdx        int
    commonNameIdx int
    confidenceIdx int
    fileIdx       int
}

// parseBirdaCSVHeader reads the CSV header row and returns column indices.
func parseBirdaCSVHeader(reader *csv.Reader) (birdaColumnIndices, error) {
    header, err := reader.Read()
    if err != nil {
        return birdaColumnIndices{}, fmt.Errorf("failed to read header: %w", err)
    }
    idx := birdaColumnIndices{startIdx: -1, endIdx: -1, commonNameIdx: -1, confidenceIdx: -1, fileIdx: -1}
    for i, col := range header {
        col = strings.TrimPrefix(col, "\ufeff")
        switch col {
        case "Start (s)":
            idx.startIdx = i
        case "End (s)":
            idx.endIdx = i
        case "Common name":
            idx.commonNameIdx = i
        case "Confidence":
            idx.confidenceIdx = i
        case "File":
            idx.fileIdx = i
        }
    }
    if idx.startIdx == -1 || idx.endIdx == -1 || idx.commonNameIdx == -1 || idx.confidenceIdx == -1 {
        return birdaColumnIndices{}, fmt.Errorf("missing required columns in BirdNET file")
    }
    return idx, nil
}

// readBirdaDetections reads all detection records from a BirdNET CSV.
func readBirdaDetections(reader *csv.Reader, idx birdaColumnIndices) ([]BirdNETDetection, error) {
    var detections []BirdNETDetection
    for {
        record, err := reader.Read()
        if err == io.EOF {
            break
        }
        if err != nil {
            return nil, fmt.Errorf("failed to read record: %w", err)
        }
        var det BirdNETDetection
        startTime, perr := strconv.ParseFloat(record[idx.startIdx], 64)
        if perr != nil {
            return nil, fmt.Errorf("failed to parse start time %q: %w", record[idx.startIdx], perr)
        }
        det.StartTime = startTime
        endTime, perr := strconv.ParseFloat(record[idx.endIdx], 64)
        if perr != nil {
            return nil, fmt.Errorf("failed to parse end time %q: %w", record[idx.endIdx], perr)
        }
        det.EndTime = endTime
        det.CommonName = record[idx.commonNameIdx]
        confidence, perr := strconv.ParseFloat(record[idx.confidenceIdx], 64)
        if perr != nil {
            return nil, fmt.Errorf("failed to parse confidence %q: %w", record[idx.confidenceIdx], perr)
        }
        det.Confidence = confidence
        if idx.fileIdx >= 0 && idx.fileIdx < len(record) {
            det.WAVPath = record[idx.fileIdx]
        }
        detections = append(detections, det)
    }
    return detections, nil
}

// resolveBirdaWAVPath finds the WAV file associated with a BirdNET results file.
func resolveBirdaWAVPath(birdaFile string, firstWAVPath string, cache *DirCache) string {
    if firstWAVPath != "" {
        if _, err := os.Stat(firstWAVPath); err == nil {
            return firstWAVPath
        }
    }
    dir := filepath.Dir(birdaFile)
    base := filepath.Base(birdaFile)
    baseName := strings.TrimSuffix(base, ".BirdNET.results.csv")
    if cache != nil {
        return cache.FindWAV(baseName)
    }
    return findWAVFile(dir, baseName)
}

// processBirdaFileCached processes a single BirdNET results file using a DirCache for WAV lookup
func processBirdaFileCached(birdaFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
    file, err := os.Open(birdaFile)
    if err != nil {
        return nil, false, false, fmt.Errorf("failed to open file: %w", err)
    }
    defer func() { _ = file.Close() }()

    reader := csv.NewReader(file)
    idx, err := parseBirdaCSVHeader(reader)
    if err != nil {
        return nil, false, false, err
    }
    detections, err := readBirdaDetections(reader, idx)
    if err != nil {
        return nil, false, false, err
    }
    if len(detections) == 0 {
        return nil, false, true, nil
    }

    wavPath := resolveBirdaWAVPath(birdaFile, detections[0].WAVPath, cache)
    if wavPath == "" {
        return nil, false, true, nil
    }
    sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
    if err != nil {
        return nil, false, true, nil
    }

    dataPath := wavPath + ".data"
    segments := buildBirdNETSegments(detections, sampleRate)
    meta := AviaNZMeta{Operator: "BirdNET", Duration: duration}
    reviewer := "None"
    meta.Reviewer = &reviewer
    if err := writeDotDataFileSafe(dataPath, segments, "BirdNET", meta); err != nil {
        return nil, false, false, err
    }

    var calls []ClusteredCall
    for _, det := range detections {
        calls = append(calls, ClusteredCall{
            File:      wavPath,
            StartTime: det.StartTime,
            EndTime:   det.EndTime,
            EbirdCode: det.CommonName,
            Segments:  1,
        })
    }
    return calls, true, false, nil
}

// buildBirdNETSegments converts BirdNET detections to AviaNZ segments
func buildBirdNETSegments(detections []BirdNETDetection, sampleRate int) []AviaNZSegment {
    var segments []AviaNZSegment
    for _, det := range detections {
        // Convert confidence (0.0-1.0) to certainty (0-100)
        certainty := min(max(int(det.Confidence*100), 0), 100)
        labels := []AviaNZLabel{{
            Species:   det.CommonName,
            Certainty: certainty,
            Filter:    "BirdNET",
        }}
        segment := AviaNZSegment{
            det.StartTime,
            det.EndTime,
            0,          // freq_low
            sampleRate, // freq_high (full band)
            labels,
        }
        segments = append(segments, segment)
    }
    return segments
}
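// Illustrative usage sketch (not part of the tool itself): running
// CallsFromBirda over a folder of BirdNET results and printing the tally.
// The folder path is a placeholder; the input and output fields are the
// ones defined above.
//
//	out, err := CallsFromBirda(CallsFromBirdaInput{
//		Folder: "/recordings/2021-10", // hypothetical path
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Printf("wrote %d .data files, skipped %d\n", out.DataFilesWritten, out.DataFilesSkipped)
//	for species, n := range out.SpeciesCount {
//		fmt.Printf("  %s: %d\n", species, n)
//	}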
package tools

import (
    "os"
    "path/filepath"
    "testing"
)

func TestDetectAnomalies_LabelMismatch(t *testing.T) {
    dir := t.TempDir()
    // Same time range, different calltypes across two models
    data := `[{"Operator":"test"},` +
        `[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-a"},` +
        `{"species":"Kiwi","calltype":"Male","certainty":100,"filter":"model-b"}]]]`
    if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
        t.Fatal(err)
    }
    out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
    if err != nil {
        t.Fatal(err)
    }
    if out.LabelMismatches != 1 {
        t.Errorf("expected 1 label mismatch, got %d", out.LabelMismatches)
    }
    if out.CertaintyMismatches != 0 {
        t.Errorf("expected 0 certainty mismatches, got %d", out.CertaintyMismatches)
    }
    if out.Anomalies[0].Type != "label_mismatch" {
        t.Errorf("expected label_mismatch, got %s", out.Anomalies[0].Type)
    }
}

func TestDetectAnomalies_CertaintyMismatch(t *testing.T) {
    dir := t.TempDir()
    // Same time range, same labels, different certainty
    data := `[{"Operator":"test"},` +
        `[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":90,"filter":"model-a"},` +
        `{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-b"}]]]`
    if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
        t.Fatal(err)
    }
    out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
    if err != nil {
        t.Fatal(err)
    }
    if out.CertaintyMismatches != 1 {
        t.Errorf("expected 1 certainty mismatch, got %d", out.CertaintyMismatches)
    }
    if out.LabelMismatches != 0 {
        t.Errorf("expected 0 label mismatches, got %d", out.LabelMismatches)
    }
}

func TestDetectAnomalies_NoAnomalyWhenAgreement(t *testing.T) {
    dir := t.TempDir()
    data := `[{"Operator":"test"},` +
        `[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-a"},` +
        `{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-b"}]]]`
    if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
        t.Fatal(err)
    }
    out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
    if err != nil {
        t.Fatal(err)
    }
    if out.AnomaliesTotal != 0 {
        t.Errorf("expected 0 anomalies, got %d", out.AnomaliesTotal)
    }
}

func TestDetectAnomalies_LonelySegmentSkipped(t *testing.T) {
    dir := t.TempDir()
    // model-a has a segment, model-b has no segment in this file
    data := `[{"Operator":"test"},` +
        `[0,10,100,1000,[{"species":"Kiwi","certainty":100,"filter":"model-a"}]]]`
    if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
        t.Fatal(err)
    }
    out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
    if err != nil {
        t.Fatal(err)
    }
    if out.AnomaliesTotal != 0 {
        t.Errorf("lonely segment should be skipped, got %d anomalies", out.AnomaliesTotal)
    }
    if out.FilesWithAllModels != 0 {
        t.Error("file missing a model should not count as FilesWithAllModels")
    }
}

func TestDetectAnomalies_FailsWithOneModel(t *testing.T) {
    dir := t.TempDir()
    _, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a"}})
    if err == nil {
        t.Error("expected error with only 1 model")
    }
}
package tools

import (
    "fmt"
    "os"
    "path/filepath"

    "skraak/utils"
)

type DetectAnomaliesInput struct {
    Folder  string
    Models  []string // at least 2 filter names
    Species []string // optional scope; empty = all species
}

type DetectAnomaliesOutput struct {
    Folder              string    `json:"folder"`
    Models              []string  `json:"models"`
    FilesExamined       int       `json:"files_examined"`
    FilesWithAllModels  int       `json:"files_with_all_models"`
    AnomaliesTotal      int       `json:"anomalies_total"`
    LabelMismatches     int       `json:"label_mismatches"`
    CertaintyMismatches int       `json:"certainty_mismatches"`
    Anomalies           []Anomaly `json:"anomalies,omitempty"`
    Error               string    `json:"error,omitempty"`
}

type Anomaly struct {
    File     string           `json:"file"`
    Type     string           `json:"type"` // "label_mismatch" | "certainty_mismatch"
    Segments []AnomalySegment `json:"segments"`
}

type AnomalySegment struct {
    Model     string  `json:"model"`
    Start     float64 `json:"start"`
    End       float64 `json:"end"`
    Species   string  `json:"species"`
    CallType  string  `json:"calltype,omitempty"`
    Certainty int     `json:"certainty"`
}

// validateAnomalyInput validates the input parameters for DetectAnomalies.
func validateAnomalyInput(input DetectAnomaliesInput) error {
    if len(input.Models) < 2 {
        return fmt.Errorf("at least 2 --model values required")
    }
    for i, a := range input.Models {
        for j, b := range input.Models {
            if i != j && a == b {
                return fmt.Errorf("duplicate --model values are not allowed")
            }
        }
    }
    info, err := os.Stat(input.Folder)
    if err != nil {
        return fmt.Errorf("folder not found: %s", input.Folder)
    }
    if !info.IsDir() {
        return fmt.Errorf("not a directory: %s", input.Folder)
    }
    return nil
}

// DetectAnomalies compares corresponding segments across multiple ML model filters
// within each .data file. Segments are matched by time overlap (same logic as propagate).
// Lonely segments (no overlap in one or more models) are silently skipped.
// Anomalies are flagged when overlapping segments disagree on species+calltype,
// or when labels match but certainty values differ.
func DetectAnomalies(input DetectAnomaliesInput) (DetectAnomaliesOutput, error) {
    folder := filepath.Clean(input.Folder)
    output := DetectAnomaliesOutput{
        Folder: folder,
        Models: input.Models,
    }
    if err := validateAnomalyInput(input); err != nil {
        output.Error = err.Error()
        return output, err
    }
    files, err := utils.FindDataFiles(folder)
    if err != nil {
        output.Error = fmt.Sprintf("list .data files: %v", err)
        return output, fmt.Errorf("%s", output.Error)
    }
    scopeSet := make(map[string]bool, len(input.Species))
    for _, s := range input.Species {
        scopeSet[s] = true
    }
    for _, path := range files {
        df, err := utils.ParseDataFile(path)
        if err != nil {
            continue
        }
        output.FilesExamined++
        anomalies := detectAnomaliesInFile(df, path, input.Models, scopeSet)
        if anomalies == nil {
            // file didn't have all models present
            continue
        }
        output.FilesWithAllModels++
        for _, a := range anomalies {
            if a.Type == "label_mismatch" {
                output.LabelMismatches++
            } else {
                output.CertaintyMismatches++
            }
        }
        output.Anomalies = append(output.Anomalies, anomalies...)
    }
    output.AnomaliesTotal = len(output.Anomalies)
    return output, nil
}

// labeledSeg pairs a segment with the specific label matching the model filter.
type labeledSeg struct {
    seg   *utils.Segment
    label *utils.Label
}

// detectAnomaliesInFile returns nil if the file doesn't contain all required models.
func detectAnomaliesInFile(df *utils.DataFile, path string, models []string, scope map[string]bool) []Anomaly {
    modelSegs := collectModelSegments(df, models)
    // Skip file if any model is entirely absent.
    for _, model := range models {
        if len(modelSegs[model]) == 0 {
            return nil
        }
    }
    // Non-nil so callers can tell "all models present, no anomalies" apart
    // from "model missing" (nil), matching the doc comment above.
    anomalies := []Anomaly{}
    for _, anchor := range modelSegs[models[0]] {
        if !inScope(anchor, scope) {
            continue
        }
        matches := findOverlappingMatches(anchor, models, modelSegs)
        if matches == nil {
            continue
        }
        group := buildComparisonGroup(anchor, models, matches)
        if a := checkGroupAnomaly(group, path, models); a != nil {
            anomalies = append(anomalies, *a)
        }
    }
    return anomalies
}

// collectModelSegments groups labeled segments by model filter name.
func collectModelSegments(df *utils.DataFile, models []string) map[string][]labeledSeg {
    modelSegs := make(map[string][]labeledSeg, len(models))
    for _, seg := range df.Segments {
        for _, lbl := range seg.Labels {
            for _, model := range models {
                if lbl.Filter == model {
                    modelSegs[model] = append(modelSegs[model], labeledSeg{seg: seg, label: lbl})
                    break
                }
            }
        }
    }
    return modelSegs
}

// inScope returns true if the anchor's label is within the species scope filter.
func inScope(anchor labeledSeg, scope map[string]bool) bool {
    if len(scope) == 0 {
        return true
    }
    key := anchor.label.Species
    if anchor.label.CallType != "" {
        key += "+" + anchor.label.CallType
    }
    return scope[key] || scope[anchor.label.Species]
}

// findOverlappingMatches returns matches[model] = overlapping segments from that model,
// or nil if any model has no overlap (lonely anchor).
func findOverlappingMatches(anchor labeledSeg, models []string, modelSegs map[string][]labeledSeg) map[string][]labeledSeg {
    matches := make(map[string][]labeledSeg, len(models)-1)
    for _, model := range models[1:] {
        for _, candidate := range modelSegs[model] {
            if overlaps(anchor.seg, candidate.seg) {
                matches[model] = append(matches[model], candidate)
            }
        }
        if len(matches[model]) == 0 {
            return nil
        }
    }
    return matches
}

// buildComparisonGroup assembles anchor + first match per other model.
func buildComparisonGroup(anchor labeledSeg, models []string, matches map[string][]labeledSeg) []labeledSeg {
    group := []labeledSeg{anchor}
    for _, model := range models[1:] {
        group = append(group, matches[model][0])
    }
    return group
}

// checkGroupAnomaly checks a comparison group for label or certainty mismatches.
func checkGroupAnomaly(group []labeledSeg, path string, models []string) *Anomaly {
    refSpecies := group[0].label.Species
    refCallType := group[0].label.CallType
    for _, ls := range group[1:] {
        if ls.label.Species != refSpecies || ls.label.CallType != refCallType {
            a := Anomaly{File: path, Type: "label_mismatch", Segments: buildAnomalySegs(group, models)}
            return &a
        }
    }
    refCertainty := group[0].label.Certainty
    for _, ls := range group[1:] {
        if ls.label.Certainty != refCertainty {
            a := Anomaly{File: path, Type: "certainty_mismatch", Segments: buildAnomalySegs(group, models)}
            return &a
        }
    }
    return nil
}

func buildAnomalySegs(group []labeledSeg, models []string) []AnomalySegment {
    segs := make([]AnomalySegment, len(group))
    for i, ls := range group {
        segs[i] = AnomalySegment{
            Model:     models[i],
            Start:     ls.seg.StartTime,
            End:       ls.seg.EndTime,
            Species:   ls.label.Species,
            CallType:  ls.label.CallType,
            Certainty: ls.label.Certainty,
        }
    }
    return segs
}

// overlaps returns true if two segments share any time overlap.
func overlaps(a, b *utils.Segment) bool {
    return a.StartTime < b.EndTime && b.StartTime < a.EndTime
}
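// A small illustration of the overlap rule above (sketch only, using the
// Segment fields already exercised in the tests): intervals that merely touch
// (end == start) do not match, so a 0-10s detection pairs with a 9-12s one
// but not with a 10-20s one.
//
//	a := &utils.Segment{StartTime: 0, EndTime: 10}
//	b := &utils.Segment{StartTime: 10, EndTime: 20}
//	c := &utils.Segment{StartTime: 9, EndTime: 12}
//	fmt.Println(overlaps(a, b)) // false — touching endpoints only
//	fmt.Println(overlaps(a, c)) // true — 1s of shared time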
package tools

import (
    "encoding/csv"
    "os"
    "path/filepath"
    "strings"
    "testing"

    "skraak/utils"
)

// --- test helpers (test file only) ---

func writeDataFile(t *testing.T, dir, name string, df *utils.DataFile) {
    t.Helper()
    if err := df.Write(filepath.Join(dir, name)); err != nil {
        t.Fatalf("write .data file %s: %v", name, err)
    }
}

func writeMapping(t *testing.T, dir, json string) {
    t.Helper()
    if err := os.WriteFile(filepath.Join(dir, "mapping.json"), []byte(json), 0644); err != nil {
        t.Fatalf("write mapping.json: %v", err)
    }
}

// parseCSV reads the output CSV, returning header and rows.
func parseCSV(t *testing.T, path string) ([]string, [][]string) {
    t.Helper()
    f, err := os.Open(path)
    if err != nil {
        t.Fatalf("open CSV %s: %v", path, err)
    }
    defer f.Close()
    r := csv.NewReader(f)
    header, err := r.Read()
    if err != nil {
        t.Fatalf("read header: %v", err)
    }
    rows, err := r.ReadAll()
    if err != nil {
        t.Fatalf("read rows: %v", err)
    }
    return header, rows
}

// clipLabels calls CallsClipLabels with standard test parameters.
func clipLabels(t *testing.T, dir string, extra ...func(*CallsClipLabelsInput)) CallsClipLabelsOutput {
    t.Helper()
    input := CallsClipLabelsInput{
        Folder:          dir,
        MappingPath:     filepath.Join(dir, "mapping.json"),
        OutputPath:      filepath.Join(dir, "clip_labels.csv"),
        ClipDuration:    5,
        ClipOverlap:     0,
        MinLabelOverlap: 0.25,
        FinalClip:       "full",
    }
    for _, fn := range extra {
        fn(&input)
    }
    out, err := CallsClipLabels(input)
    if err != nil {
        t.Fatalf("CallsClipLabels: %v", err)
    }
    return out
}

// --- tests ---

func TestClipLabels_RealClassTrue(t *testing.T) {
    dir := t.TempDir()
    writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 20},
        Segments: []*utils.Segment{
            {
                StartTime: 3, EndTime: 8, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
            },
        },
    })
    writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
    out := clipLabels(t, dir)
    header, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))
    // Header: file, start_time, end_time, Kiwi
    if len(header) != 4 || header[3] != "Kiwi" {
        t.Fatalf("header = %v, want [..., Kiwi]", header)
    }
    // Clip 0-5 overlaps segment 3-8 by 2s ≥ 0.25 → Kiwi=True
    // Clip 5-10 overlaps segment 3-8 by 3s ≥ 0.25 → Kiwi=True
    // Clip 10-15, 15-20 → Kiwi=False
    kiwiCol := 3
    for i, row := range rows {
        switch row[1] {
        case "0.0", "5.0":
            if row[kiwiCol] != "True" {
                t.Errorf("row %d (start=%s): Kiwi=%s, want True", i, row[1], row[kiwiCol])
            }
        case "10.0", "15.0":
            if row[kiwiCol] != "False" {
                t.Errorf("row %d (start=%s): Kiwi=%s, want False", i, row[1], row[kiwiCol])
            }
        }
    }
    if out.PerClassTrueCount["Kiwi"] != 2 {
        t.Errorf("PerClassTrueCount[Kiwi] = %d, want 2", out.PerClassTrueCount["Kiwi"])
    }
}

func TestClipLabels_GapClipsAllFalse(t *testing.T) {
    dir := t.TempDir()
    // 15s file, Kiwi segment 0-5 only → clips 5-10 and 10-15 are gaps
    writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 15},
        Segments: []*utils.Segment{
            {
                StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
            },
        },
    })
    writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
    out := clipLabels(t, dir)
    if out.ClipsAllFalseGap != 2 {
        t.Errorf("ClipsAllFalseGap = %d, want 2", out.ClipsAllFalseGap)
    }
    if out.PerClassTrueCount["Kiwi"] != 1 {
        t.Errorf("PerClassTrueCount[Kiwi] = %d, want 1", out.PerClassTrueCount["Kiwi"])
    }
    if out.RowsWritten != 3 {
        t.Errorf("RowsWritten = %d, want 3", out.RowsWritten)
    }
}

func TestClipLabels_NegativeOverridesPositive(t *testing.T) {
    dir := t.TempDir()
    // Kiwi segment 0-8, Not segment 0-4 → clip 0-5 overlaps both → __NEGATIVE__ wins
    // Clip 5-10 overlaps only Kiwi (3s) → True
    writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 10},
        Segments: []*utils.Segment{
            {
                StartTime: 0, EndTime: 8, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
            },
            {
                StartTime: 0, EndTime: 4, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Not", Certainty: 100, Filter: "f1"}},
            },
        },
    })
    writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Not":{"species":"__NEGATIVE__"}}`)
    out := clipLabels(t, dir)
    if out.ClipsNegative != 1 {
        t.Errorf("ClipsNegative = %d, want 1", out.ClipsNegative)
    }
    _, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))
    // Clip 0-5: negative hit → all-False (Not overlaps 0-4 by 4s)
    if rows[0][3] != "False" {
        t.Errorf("clip 0-5 Kiwi = %s, want False (overridden by __NEGATIVE__)", rows[0][3])
    }
    // Clip 5-10: only Kiwi overlaps (3s) → True
    if rows[1][3] != "True" {
        t.Errorf("clip 5-10 Kiwi = %s, want True", rows[1][3])
    }
}

func TestClipLabels_IgnoreExcludesClip(t *testing.T) {
    dir := t.TempDir()
    // Don't Know segment 0-5, Kiwi segment 6-10
    // Clip 0-5 overlaps __IGNORE__ → excluded
    // Clip 5-10 overlaps Kiwi → emitted with True
    writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 15},
        Segments: []*utils.Segment{
            {
                StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Don't Know", Certainty: 0, Filter: "f1"}},
            },
            {
                StartTime: 6, EndTime: 10, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
            },
        },
    })
    writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Don't Know":{"species":"__IGNORE__"}}`)
    out := clipLabels(t, dir)
    if out.ClipsIgnored != 1 {
        t.Errorf("ClipsIgnored = %d, want 1", out.ClipsIgnored)
    }
    if out.SegmentsIgnored != 1 {
        t.Errorf("SegmentsIgnored = %d, want 1", out.SegmentsIgnored)
    }
    // Only 2 rows: clip 5-10 (Kiwi=True) and clip 10-15 (gap)
    if out.RowsWritten != 2 {
        t.Errorf("RowsWritten = %d, want 2", out.RowsWritten)
    }
}

func TestClipLabels_FilterRestrictsLabels(t *testing.T) {
    dir := t.TempDir()
    // Same time range, two filters. Only "wanted" should contribute.
    writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 10},
        Segments: []*utils.Segment{
            {
                StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{
                    {Species: "Kiwi", Certainty: 100, Filter: "wanted"},
                    {Species: "Not", Certainty: 100, Filter: "unwanted"},
                },
            },
        },
    })
    writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Not":{"species":"__NEGATIVE__"}}`)
    out := clipLabels(t, dir, func(in *CallsClipLabelsInput) { in.Filter = "wanted" })
    // Only Kiwi from "wanted" filter → clip 0-5 should be Kiwi=True
    // Not from "unwanted" filter should be ignored → no __NEGATIVE__ override
    if out.ClipsNegative != 0 {
        t.Errorf("ClipsNegative = %d, want 0 (Not filter excluded)", out.ClipsNegative)
    }
    if out.PerClassTrueCount["Kiwi"] != 1 {
        t.Errorf("PerClassTrueCount[Kiwi] = %d, want 1", out.PerClassTrueCount["Kiwi"])
    }
}

func TestClipLabels_MappingCoverageError(t *testing.T) {
    dir := t.TempDir()
    writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 10},
        Segments: []*utils.Segment{
            {
                StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Mystery", Certainty: 100, Filter: "f1"}},
            },
        },
    })
    writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
    input := CallsClipLabelsInput{
        Folder:          dir,
        MappingPath:     filepath.Join(dir, "mapping.json"),
        OutputPath:      filepath.Join(dir, "clip_labels.csv"),
        ClipDuration:    5,
        ClipOverlap:     0,
        MinLabelOverlap: 0.25,
        FinalClip:       "full",
    }
    _, err := CallsClipLabels(input)
    if err == nil {
        t.Fatal("expected error for missing species in mapping")
    }
    if !strings.Contains(err.Error(), "Mystery") {
        t.Errorf("error should mention missing species, got: %v", err)
    }
}

func TestClipLabels_AppendMode(t *testing.T) {
    dir := t.TempDir()
    writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
    // First file
    writeDataFile(t, dir, "a.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 5},
        Segments: []*utils.Segment{
            {
                StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
            },
        },
    })
    out1 := clipLabels(t, dir)
    if out1.RowsWritten != 1 {
        t.Fatalf("first run: RowsWritten = %d, want 1", out1.RowsWritten)
    }
    // Second run appends to the same output file from the same input folder,
    // so every row is a repeat — it should fail on duplicate detection.
    _, err := CallsClipLabels(CallsClipLabelsInput{
        Folder:          dir,
        MappingPath:     filepath.Join(dir, "mapping.json"),
        OutputPath:      filepath.Join(dir, "clip_labels.csv"),
        ClipDuration:    5,
        ClipOverlap:     0,
        MinLabelOverlap: 0.25,
        FinalClip:       "full",
    })
    if err == nil {
        t.Fatal("expected duplicate error on second run with same folder")
    }
    if !strings.Contains(err.Error(), "duplicate") {
        t.Errorf("error should mention duplicate, got: %v", err)
    }
}

func TestClipLabels_MultipleFiles(t *testing.T) {
    dir := t.TempDir()
    writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
    writeDataFile(t, dir, "a.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 10},
        Segments: []*utils.Segment{
            {
                StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
            },
        },
    })
    writeDataFile(t, dir, "b.wav.data", &utils.DataFile{
        Meta: &utils.DataMeta{Duration: 5},
        Segments: []*utils.Segment{
            {
                StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
                Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
            },
        },
    })
    out := clipLabels(t, dir)
    if out.DataFilesParsed != 2 {
        t.Errorf("DataFilesParsed = %d, want 2", out.DataFilesParsed)
    }
    // a: 2 clips (0-5, 5-10), b: 1 clip (0-5) = 3 total
    if out.RowsWritten != 3 {
        t.Errorf("RowsWritten = %d, want 3", out.RowsWritten)
    }
    _, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))
    files := map[string]int{}
    for _, r := range rows {
        files[r[0]]++
    }
    if len(files) != 2 {
        t.Errorf("expected 2 distinct files in CSV, got %d", len(files))
    }
}
package tools

import (
    "encoding/csv"
    "fmt"
    "io"
    "os"
    "path/filepath"
    "slices"
    "sort"
    "strconv"
    "strings"

    "skraak/utils"
)

// CallsClipLabelsInput configures the clip-labels exporter.
type CallsClipLabelsInput struct {
    Folder          string  `json:"folder"`
    MappingPath     string  `json:"mapping"`
    Filter          string  `json:"filter,omitempty"`
    OutputPath      string  `json:"output"`
    ClipDuration    float64 `json:"clip_duration"`
    ClipOverlap     float64 `json:"clip_overlap"`
    MinLabelOverlap float64 `json:"min_label_overlap"`
    FinalClip       string  `json:"final_clip"`
}

// CallsClipLabelsOutput summarises a run.
type CallsClipLabelsOutput struct {
    Folder            string         `json:"folder"`
    OutputPath        string         `json:"output"`
    Filter            string         `json:"filter,omitempty"`
    Classes           []string       `json:"classes"`
    DataFilesParsed   int            `json:"data_files_parsed"`
    ClipsNegative     int            `json:"clips_negative"`      // emitted, all-False because of __NEGATIVE__
    ClipsIgnored      int            `json:"clips_ignored"`       // excluded from output because of __IGNORE__ overlap
    SegmentsIgnored   int            `json:"segments_ignored"`    // segments whose species maps to __IGNORE__
    ClipsAllFalseGap  int            `json:"clips_all_false_gap"` // emitted, all-False because no overlap
    PerClassTrueCount map[string]int `json:"per_class_true_count"`
    AppendedToFile    bool           `json:"appended_to_file"`
    ExistingRowsFound int            `json:"existing_rows_found"`
    RowsWritten       int            `json:"rows_written"`
}

// resolvedSeg is a segment that has been classified by the mapping and is
// ready for overlap-checking against clip windows.
type resolvedSeg struct {
    start, end float64
    kind       utils.MappingKind
    classIdx   int // valid only when kind == utils.MappingReal
}

// clipDisposition describes the outcome for a single clip window.
type clipDisposition int

const (
    dispoLabelled clipDisposition = iota // at least one class column is True
    dispoNegative                        // __NEGATIVE__ hit, all class columns False
    dispoGap                             // no segment overlaps, all class columns False
    dispoIgnored                         // __IGNORE__ hit, clip excluded from output
)

// clipLabelsRow is one row of the output CSV.
type clipLabelsRow struct {
    file  string
    start float64
    end   float64
    flags []bool
}

// rowKey is used for duplicate detection.
type rowKey struct {
    file  string
    start string
    end   string
}

// parsedClipFile holds a parsed .data file for clip-labels processing.
type parsedClipFile struct {
    path string
    df   *utils.DataFile
}

// validateClipLabelsInput validates the input parameters and returns the parsed finalClipMode.
func validateClipLabelsInput(input CallsClipLabelsInput) (utils.FinalClipMode, error) {
    finalClipMode, err := utils.ParseFinalClipMode(input.FinalClip)
    if err != nil {
        return 0, err
    }
    if input.ClipDuration <= 0 {
        return 0, fmt.Errorf("--clip-duration must be > 0, got %v", input.ClipDuration)
    }
    if input.ClipOverlap < 0 || input.ClipOverlap >= input.ClipDuration {
        return 0, fmt.Errorf("--clip-overlap must be in [0, clip-duration), got %v", input.ClipOverlap)
    }
    if input.MinLabelOverlap <= 0 {
        return 0, fmt.Errorf("--min-label-overlap must be > 0, got %v", input.MinLabelOverlap)
    }
    return finalClipMode, nil
}

// parseClipLabelsDataFiles finds and parses .data files, collecting species seen.
func parseClipLabelsDataFiles(folder, filter string, mapping utils.MappingFile) ([]parsedClipFile, error) {
    dataPaths, err := utils.FindDataFiles(folder)
    if err != nil {
        return nil, fmt.Errorf("scan folder %s: %w", folder, err)
    }
    if len(dataPaths) == 0 {
        return nil, fmt.Errorf("no .data files found in %s", folder)
    }
    parsed := make([]parsedClipFile, 0, len(dataPaths))
    speciesSeen := map[string]bool{}
    for _, p := range dataPaths {
        df, err := utils.ParseDataFile(p)
        if err != nil {
            return nil, fmt.Errorf("parse %s: %w", p, err)
        }
        if df.Meta == nil || df.Meta.Duration <= 0 {
            return nil, fmt.Errorf("missing or non-positive Duration in %s (cannot generate clips)", p)
        }
        for _, seg := range df.Segments {
            for _, lbl := range seg.Labels {
                if filter != "" && lbl.Filter != filter {
                    continue
                }
                speciesSeen[lbl.Species] = true
            }
        }
        parsed = append(parsed, parsedClipFile{path: p, df: df})
    }
    if missing := mapping.ValidateCoversSpecies(speciesSeen); len(missing) > 0 {
        return nil, fmt.Errorf("mapping.json is missing entries for species: %s\n(run /data-mapping to regenerate)", strings.Join(missing, ", "))
    }
    return parsed, nil
}

// dedupClipLabelsRows checks for duplicate rows within new rows and against existing CSV rows.
func dedupClipLabelsRows(rows []clipLabelsRow, existing map[rowKey]bool) error {
    dedup := make(map[rowKey]bool, len(existing)+len(rows))
    for k := range existing {
        dedup[k] = true
    }
    for _, r := range rows {
        k := rowKey{file: r.file, start: formatTime(r.start), end: formatTime(r.end)}
        if dedup[k] {
            return fmt.Errorf("duplicate clip detected: file=%s start=%s end=%s", k.file, k.start, k.end)
        }
        dedup[k] = true
    }
    return nil
}

// CallsClipLabels reads .data files from a single folder and writes a CSV in
// OpenSoundScape's clip_labels format: one row per clip per file, with one
// True/False column per class in the mapping.
//
// Mirrors BoxedAnnotations.clip_labels(): every clip window is emitted; a
// column is True when any annotation of that class overlaps the window by
// ≥ min_label_overlap seconds. Sentinel mappings (__NEGATIVE__, __IGNORE__)
// get no column and contribute no labels.
func CallsClipLabels(input CallsClipLabelsInput) (CallsClipLabelsOutput, error) {
    out := CallsClipLabelsOutput{
        Folder:            input.Folder,
        OutputPath:        input.OutputPath,
        PerClassTrueCount: map[string]int{},
    }
    finalClipMode, err := validateClipLabelsInput(input)
    if err != nil {
        return out, err
    }
    mapping, err := utils.LoadMappingFile(input.MappingPath)
    if err != nil {
        return out, fmt.Errorf("load mapping %s: %w", input.MappingPath, err)
    }
    classes := mapping.Classes()
    if len(classes) == 0 {
        return out, fmt.Errorf("mapping.json has no real (non-sentinel) classes")
    }
    out.Classes = classes
    out.Filter = input.Filter
    classIdx := map[string]int{}
    for i, c := range classes {
        classIdx[c] = i
    }
    parsed, err := parseClipLabelsDataFiles(input.Folder, input.Filter, mapping)
    if err != nil {
        return out, err
    }
    out.DataFilesParsed = len(parsed)
    expectedHeader := append([]string{"file", "start_time", "end_time"}, classes...)
    existing, appendMode, err := loadExistingRows(input.OutputPath, expectedHeader)
    if err != nil {
        return out, err
    }
    out.AppendedToFile = appendMode
    out.ExistingRowsFound = len(existing)
    cwd, err := os.Getwd()
    if err != nil {
        return out, fmt.Errorf("getwd: %w", err)
    }
    folderAbs, err := filepath.Abs(input.Folder)
    if err != nil {
        return out, fmt.Errorf("abs %s: %w", input.Folder, err)
    }
    rows := make([]clipLabelsRow, 0, 1024)
    for _, pf := range parsed {
        fileRows, err := processClipLabelsFile(pf.path, pf.df, mapping, classIdx, classes, input, finalClipMode, cwd, folderAbs, &out)
        if err != nil {
            return out, err
        }
        rows = append(rows, fileRows...)
    }
    if err := dedupClipLabelsRows(rows, existing); err != nil {
        return out, err
    }
    if err := writeRows(input.OutputPath, expectedHeader, rows, appendMode); err != nil {
        return out, err
    }
    out.RowsWritten = len(rows)
    sort.Strings(out.Classes)
    return out, nil
}

// processClipLabelsFile generates clip-labels rows for a single .data file.
func processClipLabelsFile(
    path string,
    df *utils.DataFile,
    mapping utils.MappingFile,
    classIdx map[string]int,
    classes []string,
    input CallsClipLabelsInput,
    finalClipMode utils.FinalClipMode,
    cwd, folderAbs string,
    out *CallsClipLabelsOutput,
) ([]clipLabelsRow, error) {
    windows, err := utils.GenerateClipTimes(
        df.Meta.Duration,
        input.ClipDuration,
        input.ClipOverlap,
        finalClipMode,
        10,
    )
    if err != nil {
        return nil, fmt.Errorf("generate clip windows for %s: %w", path, err)
    }
    if len(windows) == 0 {
        return nil, nil
    }
    segs := resolveSegments(df.Segments, input.Filter, input.MinLabelOverlap, mapping, classIdx, out)
    rel, err := computeWavRelPath(path, cwd, folderAbs)
    if err != nil {
        return nil, err
    }
    return labelClipWindows(windows, segs, rel, classes, input.MinLabelOverlap, out), nil
}

// resolveSegments maps segments to their classification and filters out mismatches.
func resolveSegments(
    segments []*utils.Segment,
    filter string,
    minLabelOverlap float64,
    mapping utils.MappingFile,
    classIdx map[string]int,
    out *CallsClipLabelsOutput,
) []resolvedSeg {
    segs := make([]resolvedSeg, 0, len(segments))
    for _, seg := range segments {
        if seg.EndTime-seg.StartTime < minLabelOverlap {
            continue
        }
        for _, lbl := range seg.Labels {
            if filter != "" && lbl.Filter != filter {
                continue
            }
            canon, kind, ok := mapping.Classify(lbl.Species)
            if !ok {
                continue
            }
            switch kind {
            case utils.MappingIgn:
                out.SegmentsIgnored++
                segs = append(segs, resolvedSeg{start: seg.StartTime, end: seg.EndTime, kind: kind})
            case utils.MappingNeg:
                segs = append(segs, resolvedSeg{start: seg.StartTime, end: seg.EndTime, kind: kind})
            case utils.MappingReal:
                idx, present := classIdx[canon]
                if !present {
                    continue
                }
                segs = append(segs, resolvedSeg{start: seg.StartTime, end: seg.EndTime, kind: kind, classIdx: idx})
            }
        }
    }
    return segs
}

// computeWavRelPath computes the relative path from cwd to the WAV file corresponding to a .data file.
func computeWavRelPath(dataPath, cwd, folderAbs string) (string, error) {
    wavName := strings.TrimSuffix(filepath.Base(dataPath), ".data")
    wavAbs := filepath.Join(folderAbs, wavName)
    rel, err := filepath.Rel(cwd, wavAbs)
    if err != nil {
        rel = wavAbs
    }
    // Ensure relative paths start with ./ to match OPSO / pandas convention.
    if rel != "" && !filepath.IsAbs(rel) && !strings.HasPrefix(rel, "."+string(filepath.Separator)) {
        rel = "." + string(filepath.Separator) + rel
    }
    return rel, nil
}

// labelClipWindows classifies each clip window and builds the output rows.
func labelClipWindows(windows []utils.ClipWindow, segs []resolvedSeg, rel string, classes []string, minLabelOverlap float64, out *CallsClipLabelsOutput) []clipLabelsRow {
    var rows []clipLabelsRow
    for _, w := range windows {
        dispo, classHits := classifyClip(w, segs, minLabelOverlap, len(classes))
        if dispo == dispoIgnored {
            out.ClipsIgnored++
            continue
        }
        row := clipLabelsRow{
            file:  rel,
            start: w.Start,
            end:   w.End,
            flags: make([]bool, len(classes)),
        }
        switch dispo {
        case dispoNegative:
            out.ClipsNegative++
        case dispoGap:
            out.ClipsAllFalseGap++
        case dispoLabelled:
            for i, hit := range classHits {
                if hit {
                    row.flags[i] = true
                    out.PerClassTrueCount[classes[i]]++
                }
            }
        }
        rows = append(rows, row)
    }
    return rows
}

// classifyClip determines the disposition of a single clip window against
// the resolved segments. Priority: __IGNORE__ > __NEGATIVE__ > class labels.
func classifyClip(w utils.ClipWindow, segs []resolvedSeg, minLabelOverlap float64, nClasses int) (clipDisposition, []bool) {
    ignoreHit := false
    negativeHit := false
    classHits := make([]bool, nClasses)
    for _, s := range segs {
        if overlapSeconds(s.start, s.end, w.Start, w.End) < minLabelOverlap {
            continue
        }
        switch s.kind {
        case utils.MappingIgn:
            ignoreHit = true
        case utils.MappingNeg:
            negativeHit = true
        case utils.MappingReal:
            classHits[s.classIdx] = true
        }
    }
    if ignoreHit {
        return dispoIgnored, nil
    }
    if negativeHit {
        return dispoNegative, classHits
    }
    for _, hit := range classHits {
        if hit {
            return dispoLabelled, classHits
        }
    }
    return dispoGap, classHits
}

// loadExistingRows reads an existing output CSV and returns its row keys
// (for deduplication) and whether we're in append mode.
func loadExistingRows(outputPath string, expectedHeader []string) (map[rowKey]bool, bool, error) {
    fi, err := os.Stat(outputPath)
    if err != nil {
        if os.IsNotExist(err) {
            return nil, false, nil
        }
        return nil, false, fmt.Errorf("stat %s: %w", outputPath, err)
    }
    if fi.Size() == 0 {
        return nil, false, nil
    }
    f, err := os.Open(outputPath)
    if err != nil {
        return nil, false, fmt.Errorf("open existing %s: %w", outputPath, err)
    }
    defer func() { _ = f.Close() }()
    r := csv.NewReader(f)
    r.FieldsPerRecord = -1
    header, err := r.Read()
    if err != nil {
        return nil, false, fmt.Errorf("read header of existing %s: %w", outputPath, err)
    }
    if !slices.Equal(header, expectedHeader) {
        return nil, false, fmt.Errorf("column-set mismatch in existing %s\n  existing: %s\n  new:      %s",
            outputPath, strings.Join(header, ","), strings.Join(expectedHeader, ","))
    }
    existing := map[rowKey]bool{}
    for {
        rec, err := r.Read()
        if err == io.EOF {
            break
        }
        if err != nil {
            return nil, false, fmt.Errorf("read row of existing %s: %w", outputPath, err)
        }
        if len(rec) < 3 {
            return nil, false, fmt.Errorf("malformed row in existing %s: %v", outputPath, rec)
        }
        existing[rowKey{file: rec[0], start: rec[1], end: rec[2]}] = true
    }
    return existing, true, nil
}

// overlapSeconds returns the duration of overlap between two half-open intervals.
func overlapSeconds(aStart, aEnd, bStart, bEnd float64) float64 {
    lo := max(aStart, bStart)
    hi := min(aEnd, bEnd)
    if hi <= lo {
        return 0
    }
    return hi - lo
}

// formatTime renders a float to match pandas' default float repr in to_csv:
// always at least one decimal place, no trailing zeros beyond what's needed.
// e.g. 5 → "5.0", 5.5 → "5.5", 3.5001250000 → "3.500125".
func formatTime(v float64) string {
    s := strconv.FormatFloat(v, 'f', -1, 64)
    if !strings.ContainsRune(s, '.') {
        s += ".0"
    }
    return s
}

// writeRows writes the clip-labels rows to a CSV file.
func writeRows(path string, header []string, rows []clipLabelsRow, appendMode bool) error {
    var f *os.File
    var err error
    if appendMode {
        f, err = os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0644)
    } else {
        f, err = os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
    }
    if err != nil {
        return fmt.Errorf("open %s for write: %w", path, err)
    }
    defer func() { _ = f.Close() }()
    w := csv.NewWriter(f)
    if !appendMode {
        if err := w.Write(header); err != nil {
            return fmt.Errorf("write header: %w", err)
        }
    }
    if len(rows) == 0 {
        w.Flush()
        return w.Error()
    }
    rec := make([]string, 3+len(rows[0].flags))
    for _, r := range rows {
        rec[0] = r.file
        rec[1] = formatTime(r.start)
        rec[2] = formatTime(r.end)
        for i, b := range r.flags {
            if b {
                rec[3+i] = "True"
            } else {
                rec[3+i] = "False"
            }
        }
        if err := w.Write(rec); err != nil {
            return fmt.Errorf("write row: %w", err)
        }
    }
    w.Flush()
    return w.Error()
}
package tools

import (
    "encoding/binary"
    "math"
    "os"
    "testing"

    "skraak/utils"
)

const benchWAV = "../audio/20211028_211500.WAV"

// ==================== WAV I/O ====================

func BenchmarkReadWAV(b *testing.B) {
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        _, _, err := utils.ReadWAVSamples(benchWAV)
        if err != nil {
            b.Fatal(err)
        }
    }
}

func BenchmarkConvertToFloat64_16bit(b *testing.B) {
    // Simulate 16-bit mono WAV data (same size as test file: 14.32M samples)
    numSamples := 14320000
    data := make([]byte, numSamples*2)
    for i := range numSamples {
        binary.LittleEndian.PutUint16(data[i*2:], uint16(i%65536))
    }
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        _ = convertToFloat64Bench(data, 16, 1)
    }
}

// Duplicate of convertToFloat64 for benchmarking (unexported in utils)
func convertToFloat64Bench(data []byte, bitsPerSample, channels int) []float64 {
    bytesPerSample := bitsPerSample / 8
    blockAlign := bytesPerSample * channels
    numSamples := len(data) / blockAlign
    samples := make([]float64, numSamples)
    for i := range numSamples {
        offset := i * blockAlign
        sample := int16(binary.LittleEndian.Uint16(data[offset : offset+2]))
        samples[i] = float64(sample) / 32768.0
    }
    return samples
}

func BenchmarkWriteWAV(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    b.Logf("segment samples=%d", len(segSamples))
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        f, _ := os.CreateTemp("", "bench_*.wav")
        utils.WriteWAVFile(f.Name(), segSamples, sr)
        f.Close()
        os.Remove(f.Name())
    }
}

// ==================== Resample ====================

func BenchmarkResampleRate_48k(b *testing.B) {
    samples, _, _ := utils.ReadWAVSamples(benchWAV)
    b.Logf("resampling %d samples 48000->16000", len(samples))
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        utils.ResampleRate(samples, 48000, 16000)
    }
}

func BenchmarkResampleRate_250k(b *testing.B) {
    samples, _, _ := utils.ReadWAVSamples(benchWAV)
    b.Logf("resampling %d samples 250000->16000", len(samples))
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        utils.ResampleRate(samples, 250000, 16000)
    }
}

// ==================== Spectrogram pipeline ====================

func BenchmarkExtractSegment(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    b.Logf("full file: %d samples, sr=%d", len(samples), sr)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        seg := utils.ExtractSegmentSamples(samples, sr, 872, 895)
        if len(seg) == 0 {
            b.Fatal("empty segment")
        }
    }
}

func BenchmarkPowerSpectrumFFT_512(b *testing.B) {
    n := 512
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    frameData := make([]float64, n)
    power := make([]float64, n/2+1)
    scratch := make([]complex128, n)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        // Simulate the windowing step (Hann) + FFT
        for j := range n {
            frameData[j] = segSamples[j] * 0.5 * (1.0 - math.Cos(2.0*math.Pi*float64(j)/float64(n-1)))
        }
        utils.PowerSpectrumFFT(frameData, power, scratch)
    }
}

func BenchmarkSpectrogram_23s(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    cfg := utils.DefaultSpectrogramConfig(16000)
    b.Logf("segment samples=%d, windowSize=%d, hopSize=%d", len(segSamples), cfg.WindowSize, cfg.HopSize)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        spect := utils.GenerateSpectrogram(segSamples, cfg)
        if spect == nil {
            b.Fatal("nil spectrogram")
        }
    }
}

func BenchmarkSpectrogram_60s(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 0, 60)
    cfg := utils.DefaultSpectrogramConfig(16000)
    b.Logf("60s segment samples=%d", len(segSamples))
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        spect := utils.GenerateSpectrogram(segSamples, cfg)
        if spect == nil {
            b.Fatal("nil spectrogram")
        }
    }
}

// ==================== Image creation & resize ====================

func BenchmarkCreateGrayscaleImage(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    cfg := utils.DefaultSpectrogramConfig(16000)
    spect := utils.GenerateSpectrogram(segSamples, cfg)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        img := utils.CreateGrayscaleImage(spect)
        if img == nil {
            b.Fatal("nil image")
        }
    }
}

func BenchmarkCreateRGBImage(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    cfg := utils.DefaultSpectrogramConfig(16000)
    spect := utils.GenerateSpectrogram(segSamples, cfg)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        colorData := utils.ApplyL4Colormap(spect)
        img := utils.CreateRGBImage(colorData)
        if img == nil {
            b.Fatal("nil image")
        }
    }
}

func BenchmarkApplyL4Colormap(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    cfg := utils.DefaultSpectrogramConfig(16000)
    spect := utils.GenerateSpectrogram(segSamples, cfg)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        colorData := utils.ApplyL4Colormap(spect)
        if colorData == nil {
            b.Fatal("nil colormap")
        }
    }
}

func BenchmarkResizeGray224(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    cfg := utils.DefaultSpectrogramConfig(16000)
    spect := utils.GenerateSpectrogram(segSamples, cfg)
    img := utils.CreateGrayscaleImage(spect)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        resized := utils.ResizeImage(img, 224, 224)
        if resized == nil {
            b.Fatal("nil resize")
        }
    }
}

func BenchmarkResizeGray448(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    cfg := utils.DefaultSpectrogramConfig(16000)
    spect := utils.GenerateSpectrogram(segSamples, cfg)
    img := utils.CreateGrayscaleImage(spect)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        resized := utils.ResizeImage(img, 448, 448)
        if resized == nil {
            b.Fatal("nil resize")
        }
    }
}

// ==================== PNG write ====================

func BenchmarkWritePNG_224(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    cfg := utils.DefaultSpectrogramConfig(16000)
    spect := utils.GenerateSpectrogram(segSamples, cfg)
    img := utils.CreateGrayscaleImage(spect)
    resized := utils.ResizeImage(img, 224, 224)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        f, _ := os.CreateTemp("", "bench_*.png")
        utils.WritePNG(resized, f)
        f.Close()
        os.Remove(f.Name())
    }
}

// ==================== Full pipeline ====================

func BenchmarkFullPipelineGray224(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
        outputSR := sr
        if sr > 16000 {
            segSamples = utils.ResampleRate(segSamples, sr, 16000)
            outputSR = 16000
        }
        cfg := utils.DefaultSpectrogramConfig(outputSR)
        spect := utils.GenerateSpectrogram(segSamples, cfg)
        img := utils.CreateGrayscaleImage(spect)
        resized := utils.ResizeImage(img, 224, 224)
        f, _ := os.CreateTemp("", "bench_*.png")
        utils.WritePNG(resized, f)
        f.Close()
        os.Remove(f.Name())
        utils.WriteWAVFile(f.Name(), segSamples, outputSR)
        os.Remove(f.Name())
        _ = resized
    }
}

func BenchmarkFullPipelineColor448(b *testing.B) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
        outputSR := sr
        if sr > 16000 {
            segSamples = utils.ResampleRate(segSamples, sr, 16000)
            outputSR = 16000
        }
        cfg := utils.DefaultSpectrogramConfig(outputSR)
        spect := utils.GenerateSpectrogram(segSamples, cfg)
        colorData := utils.ApplyL4Colormap(spect)
        img := utils.CreateRGBImage(colorData)
        resized := utils.ResizeImage(img, 448, 448)
        f, _ := os.CreateTemp("", "bench_*.png")
        utils.WritePNG(resized, f)
        f.Close()
        os.Remove(f.Name())
        utils.WriteWAVFile(f.Name(), segSamples, outputSR)
        os.Remove(f.Name())
        _ = resized
    }
}

// ==================== Data dimension report ====================

func TestPipelineDimensions(t *testing.T) {
    samples, sr, _ := utils.ReadWAVSamples(benchWAV)
    segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
    t.Logf("Input: %d samples, sr=%d, segment=%d samples (%.1fs)",
        len(samples), sr, len(segSamples), float64(len(segSamples))/float64(sr))
    cfg := utils.DefaultSpectrogramConfig(16000)
    numFrames := (len(segSamples)-cfg.WindowSize)/cfg.HopSize + 1
    numBins := cfg.WindowSize/2 + 1
    t.Logf("Spectrogram: %d freq bins x %d time frames = %d values",
        numBins, numFrames, numBins*numFrames)
    spect := utils.GenerateSpectrogram(segSamples, cfg)
    t.Logf("Output: %d x %d (freq x time)", len(spect), len(spect[0]))
    img := utils.CreateGrayscaleImage(spect)
    t.Logf("Grayscale image: %dx%d pixels, %d bytes",
        img.Bounds().Dx(), img.Bounds().Dy(), img.Bounds().Dx()*img.Bounds().Dy())
    resized := utils.ResizeImage(img, 224, 224)
    t.Logf("Resized 224: %dx%d", resized.Bounds().Dx(), resized.Bounds().Dy())
    resized448 := utils.ResizeImage(img, 448, 448)
    t.Logf("Resized 448: %dx%d", resized448.Bounds().Dx(), resized448.Bounds().Dy())
}
package tools

import (
    "fmt"
    "image"
    "math"
    "os"
    "path/filepath"
    "runtime"
    "strings"
    "sync"

    "skraak/utils"
)

// CallsClipInput defines the input for the clip tool
type CallsClipInput struct {
    File      string `json:"file"`
    Folder    string `json:"folder"`
    Output    string `json:"output"`
    Prefix    string `json:"prefix"`
    Filter    string `json:"filter"`
    Species   string `json:"species"`
    Certainty int    `json:"certainty"`
    Size      int    `json:"size"`
    Color     bool   `json:"color"`
    Night     bool   `json:"night"`
    Day       bool   `json:"day"`
    Location  string `json:"location,omitempty"`
}

// CallsClipOutput defines the output for the clip tool
type CallsClipOutput struct {
    FilesProcessed  int      `json:"files_processed"`
    SegmentsClipped int      `json:"segments_clipped"`
    NightSkipped    int      `json:"night_skipped,omitempty"`
    DaySkipped      int      `json:"day_skipped,omitempty"`
    OutputFiles     []string `json:"output_files"`
    Errors          []string `json:"errors,omitempty"`
}

// CallsClip processes .data files and generates audio/image clips for matching segments
func CallsClip(input CallsClipInput) (CallsClipOutput, error) {
    var output CallsClipOutput

    // Validate required flags
    if err := validateClipInput(&output, input); err != nil {
        return output, err
    }

    // Parse species+calltype
    speciesName, callType := utils.ParseSpeciesCallType(input.Species)

    // Parse location into lat/lng/timezone
    var lat, lng float64
    var timezone string
    if input.Location != "" {
        var err error
        lat, lng, timezone, err = utils.ParseLocation(input.Location)
        if err != nil {
            output.Errors = append(output.Errors, err.Error())
            return output, err
        }
    }

    // Get list of .data files
    filePaths, err := resolveClipFiles(&output, input)
    if err != nil {
        return output, err
    }

    // Create output folder if it doesn't exist
    if err := os.MkdirAll(input.Output, 0755); err != nil {
        output.Errors = append(output.Errors, fmt.Sprintf("failed to create output folder: %v", err))
        return output, err
    }

    // Clamp image size to valid range
    imgSize := utils.ClampImageSize(input.Size)

    // Process .data files (parallel for larger batches)
    if len(filePaths) <= 2 {
        processFilesSequential(&output, filePaths, input, speciesName, callType, imgSize, lat, lng, timezone)
    } else {
        processFilesParallel(&output, filePaths, input, speciesName, callType, imgSize, lat, lng, timezone)
    }
    return output, nil
}

// validateClipInput validates required flags for clip generation.
func validateClipInput(output *CallsClipOutput, input CallsClipInput) error {
    if input.File == "" && input.Folder == "" {
        output.Errors = append(output.Errors, "either --file or --folder is required")
        return fmt.Errorf("missing required flag: --file or --folder")
    }
    if input.Output == "" {
        output.Errors = append(output.Errors, "--output is required")
        return fmt.Errorf("missing required flag: --output")
    }
    if input.Prefix == "" {
        output.Errors = append(output.Errors, "--prefix is required")
        return fmt.Errorf("missing required flag: --prefix")
    }
    return nil
}

// resolveClipFiles returns the list of .data file paths from input.
func resolveClipFiles(output *CallsClipOutput, input CallsClipInput) ([]string, error) {
    if input.File != "" {
        return []string{input.File}, nil
    }
    filePaths, err := utils.FindDataFiles(input.Folder)
    if err != nil {
        output.Errors = append(output.Errors, fmt.Sprintf("failed to find .data files: %v", err))
        return nil, err
    }
    if len(filePaths) == 0 {
        output.Errors = append(output.Errors, "no .data files found")
        return nil, fmt.Errorf("no .data files found")
    }
    return filePaths, nil
}

// processFilesSequential processes .data files one at a time.
func processFilesSequential(output *CallsClipOutput, filePaths []string, input CallsClipInput, speciesName, callType string, imgSize int, lat, lng float64, timezone string) {
    for _, dataPath := range filePaths {
        clips, skipped, errs := processFile(dataPath, input.Output, input.Prefix, input.Filter, speciesName, callType, input.Certainty, imgSize, input.Color, input.Night, input.Day, lat, lng, timezone)
        accumulateFileResult(output, clips, skipped, errs, input.Night)
    }
}

// processFilesParallel processes .data files using worker goroutines.
func processFilesParallel(output *CallsClipOutput, filePaths []string, input CallsClipInput, speciesName, callType string, imgSize int, lat, lng float64, timezone string) {
    type fileResult struct {
        clips   []string
        skipped int
        errs    []string
    }
    workers := min(runtime.NumCPU(), 8, len(filePaths))
    jobs := make(chan string, len(filePaths))
    results := make(chan fileResult, len(filePaths))
    var wg sync.WaitGroup
    for range workers {
        wg.Go(func() {
            for dataPath := range jobs {
                clips, skipped, errs := processFile(dataPath, input.Output, input.Prefix, input.Filter, speciesName, callType, input.Certainty, imgSize, input.Color, input.Night, input.Day, lat, lng, timezone)
                results <- fileResult{clips: clips, skipped: skipped, errs: errs}
            }
        })
    }
    for _, dataPath := range filePaths {
        jobs <- dataPath
    }
    close(jobs) // no more work: lets workers drain the queue and exit
    go func() {
        wg.Wait()
        close(results)
    }()
    for r := range results {
        accumulateFileResult(output, r.clips, r.skipped, r.errs, input.Night)
    }
}

// accumulateFileResult merges a single file's results into the output.
func accumulateFileResult(output *CallsClipOutput, clips []string, skipped int, errs []string, night bool) {
    output.SegmentsClipped += len(clips)
    if night {
        output.NightSkipped += skipped
    } else {
        output.DaySkipped += skipped
    }
    output.OutputFiles = append(output.OutputFiles, clips...)
    output.Errors = append(output.Errors, errs...)
    if len(clips) > 0 || len(errs) == 0 {
        output.FilesProcessed++
    }
}

// processFile processes a single .data file and returns generated clips, time-filter-skipped count, and errors
func processFile(dataPath, outputDir, prefix, filter, speciesName, callType string, certainty, imgSize int, color, night, day bool, lat, lng float64, timezone string) ([]string, int, []string) {
    var clips []string
    var errors []string

    // Parse .data file
    dataFile, err := utils.ParseDataFile(dataPath)
    if err != nil {
        errors = append(errors, fmt.Sprintf("%s: failed to parse: %v", dataPath, err))
        return nil, 0, errors
    }

    // Get WAV basename (without path and extensions)
    wavPath := filepath.Clean(strings.TrimSuffix(dataPath, ".data"))
    basename := filepath.Base(wavPath)
    basename = strings.TrimSuffix(basename, filepath.Ext(basename))

    // Filter segments
    matchingSegments := filterSegments(dataFile.Segments, filter, speciesName, callType, certainty)
    if len(matchingSegments) == 0 {
        return nil, 0, nil
    }

    // Day/night filter: check WAV header only (cheaper than reading full audio).
    if night || day {
        skipped, err := checkDayNightFilter(wavPath, night, day, lat, lng, timezone)
        if err != nil || skipped {
            if skipped {
                return nil, 1, nil
            }
            return nil, 0, nil
        }
    }

    // Read WAV samples once
    samples, sampleRate, err := utils.ReadWAVSamples(wavPath)
    if err != nil {
        errors = append(errors, fmt.Sprintf("%s: failed to read WAV: %v", dataPath, err))
        return nil, 0, errors
    }

    // Process matching segments
    clips, errors = processSegments(matchingSegments, dataPath, samples, sampleRate, outputDir, prefix, basename, imgSize, color)
    return clips, 0, errors
}

// filterSegments returns segments matching the given filter criteria.
func filterSegments(segments []*utils.Segment, filter, speciesName, callType string, certainty int) []*utils.Segment {
    var matching []*utils.Segment
    for _, seg := range segments {
        if seg.SegmentMatchesFilters(filter, speciesName, callType, certainty) {
            matching = append(matching, seg)
        }
    }
    return matching
}

// checkDayNightFilter applies day/night filtering. Returns (skipped=true, nil) if the
// recording should be skipped, (false, nil) if it passes, or (false, err) on failure.
func checkDayNightFilter(wavPath string, night, day bool, lat, lng float64, timezone string) (bool, error) {
    result, err := IsNight(IsNightInput{
        FilePath: wavPath,
        Lat:      lat,
        Lng:      lng,
        Timezone: timezone,
    })
    if err != nil {
        fmt.Fprintf(os.Stderr, "warning: skipping %s (isnight error: %v)\n", wavPath, err)
        return false, err
    }
    if night && !result.SolarNight {
        fmt.Fprintf(os.Stderr, "skipped (daytime): %s\n", wavPath)
        return true, nil
    }
    if day && !result.DiurnalActive {
        fmt.Fprintf(os.Stderr, "skipped (nighttime): %s\n", wavPath)
        return true, nil
    }
    return false, nil
}

// processSegments generates clips for matching segments, using parallel processing for larger batches.
func processSegments(segments []*utils.Segment, dataPath string, samples []float64, sampleRate int, outputDir, prefix, basename string, imgSize int, color bool) ([]string, []string) {
    var clips []string
    var errors []string
    if len(segments) <= 2 {
        for _, seg := range segments {
            clipFiles, err := generateClip(samples, sampleRate, outputDir, prefix, basename, seg.StartTime, seg.EndTime, imgSize, color)
            if err != nil {
                errors = append(errors, fmt.Sprintf("%s: segment %.0f-%.0f: %v", dataPath, seg.StartTime, seg.EndTime, err))
                continue
            }
            clips = append(clips, clipFiles...)
        }
    } else {
        clips, errors = processSegmentsParallel(segments, dataPath, samples, sampleRate, outputDir, prefix, basename, imgSize, color)
    }
    return clips, errors
}

// processSegmentsParallel generates clips for segments using worker goroutines.
func processSegmentsParallel(segments []*utils.Segment, dataPath string, samples []float64, sampleRate int, outputDir, prefix, basename string, imgSize int, color bool) ([]string, []string) {
    type segResult struct {
        clips []string
        err   string
    }
    workers := min(runtime.NumCPU(), len(segments))
    jobs := make(chan *utils.Segment, len(segments))
    results := make(chan segResult, len(segments))
    var wg sync.WaitGroup
    for range workers {
        wg.Go(func() {
            for seg := range jobs {
                clipFiles, err := generateClip(samples, sampleRate, outputDir, prefix, basename, seg.StartTime, seg.EndTime, imgSize, color)
                if err != nil {
                    results <- segResult{err: fmt.Sprintf("%s: segment %.0f-%.0f: %v", dataPath, seg.StartTime, seg.EndTime, err)}
                } else {
                    results <- segResult{clips: clipFiles}
                }
            }
        })
    }
    for _, seg := range segments {
        jobs <- seg
    }
    close(jobs)
    go func() {
        wg.Wait()
        close(results)
    }()
    var clips []string
    var errors []string
    for r := range results {
        if r.err != "" {
            errors = append(errors, r.err)
        } else {
            clips = append(clips, r.clips...)
        }
    }
    return clips, errors
}

// generateClip generates PNG and WAV files for a segment
func generateClip(samples []float64, sampleRate int, outputDir, prefix, basename string, startTime, endTime float64, imgSize int, color bool) ([]string, error) {
    var files []string

    // Calculate integer times for filename
    startInt := int(math.Floor(startTime))
    endInt := int(math.Ceil(endTime))

    // Build base filename
    baseName := fmt.Sprintf("%s_%s_%d_%d", prefix, basename, startInt, endInt)
    wavPath := filepath.Join(outputDir, baseName+".wav")

    // Extract segment samples
    segSamples := utils.ExtractSegmentSamples(samples, sampleRate, startTime, endTime)
    if len(segSamples) == 0 {
        return nil, fmt.Errorf("no samples in segment")
    }

    // Determine output sample rate (downsample if > 16kHz)
    outputSampleRate := sampleRate
    if sampleRate > utils.DefaultMaxSampleRate {
        segSamples = utils.ResampleRate(segSamples, sampleRate, utils.DefaultMaxSampleRate)
        outputSampleRate = utils.DefaultMaxSampleRate
    }

    pngPath := filepath.Join(outputDir, baseName+".png")
    spectSampleRate := outputSampleRate
    config := utils.DefaultSpectrogramConfig(spectSampleRate)
    spectrogram := utils.GenerateSpectrogram(segSamples, config)
    if spectrogram == nil {
        return nil, fmt.Errorf("failed to generate spectrogram")
    }

    // Create image (grayscale or color)
    var img image.Image
    if color {
        colorData := utils.ApplyL4Colormap(spectrogram)
        img = utils.CreateRGBImage(colorData)
    } else {
        img = utils.CreateGrayscaleImage(spectrogram)
    }
    if img == nil {
        return nil, fmt.Errorf("failed to create image")
    }
    resized := utils.ResizeImage(img, imgSize, imgSize)

    // Write PNG (O_EXCL fails atomically if file exists)
    pngFile, err := os.OpenFile(pngPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0644)
    if err != nil {
        if os.IsExist(err) {
            return nil, fmt.Errorf("file already exists: %s", pngPath)
        }
        return nil, fmt.Errorf("failed to create PNG: %w", err)
    }
    if err := utils.WritePNG(resized, pngFile); err != nil {
        _ = pngFile.Close()
        return nil, fmt.Errorf("failed to write PNG: %w", err)
    }
    if err := pngFile.Close(); err != nil {
        return nil, fmt.Errorf("failed to close PNG: %w", err)
    }
    files = append(files, pngPath)

    // Write WAV
    if err := utils.WriteWAVFile(wavPath, segSamples, outputSampleRate); err != nil {
        return nil, fmt.Errorf("failed to write WAV: %w", err)
    }
    files = append(files, wavPath)
    return files, nil
}
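// Hypothetical invocation sketch (not part of the tool): clip Kiwi segments
// from a folder into 224px grayscale spectrogram PNGs plus WAV snippets,
// subject to the given certainty threshold. The paths are placeholders; the
// fields are those of CallsClipInput above.
//
//	out, err := CallsClip(CallsClipInput{
//		Folder:    "/recordings/2021-10", // hypothetical
//		Output:    "/clips/kiwi",         // hypothetical
//		Prefix:    "C05",
//		Species:   "Kiwi",
//		Certainty: 100,
//		Size:      224,
//	})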
package toolsimport ("testing""skraak/utils")func NewClassifyState(config ClassifyConfig, dataFiles []*utils.DataFile) *ClassifyState {hasFilter := config.Filter != "" || config.Species != "" || config.Certainty >= 0cached := make([][]*utils.Segment, len(dataFiles))for i, df := range dataFiles {if !hasFilter {cached[i] = df.Segments} else {for _, seg := range df.Segments {if seg.SegmentMatchesFilters(config.Filter, config.Species, config.CallType, config.Certainty) {cached[i] = append(cached[i], seg)}}}}total := 0for _, segs := range cached {total += len(segs)}return &ClassifyState{Config: config,DataFiles: dataFiles,filteredSegs: cached,totalSegs: total,}}func TestParseKeyBuffer(t *testing.T) {bindings := []KeyBinding{{Key: "k", Species: "Kiwi"},{Key: "d", Species: "Kiwi", CallType: "Duet"},{Key: "n", Species: "Don't Know"},{Key: "p", Species: "Morepork"},}state := NewClassifyState(ClassifyConfig{Bindings: bindings, Certainty: -1}, nil)tests := []struct {key stringwant *BindingResultwantNil bool}{{"k", &BindingResult{Species: "Kiwi"}, false},{"d", &BindingResult{Species: "Kiwi", CallType: "Duet"}, false},{"n", &BindingResult{Species: "Don't Know"}, false},{"p", &BindingResult{Species: "Morepork"}, false},{"x", nil, true}, // unknown key}for _, tt := range tests {got := state.ParseKeyBuffer(tt.key)if tt.wantNil {if got != nil {t.Errorf("ParseKeyBuffer(%q) = %v, want nil", tt.key, got)}} else {if got == nil {t.Errorf("ParseKeyBuffer(%q) = nil, want %+v", tt.key, tt.want)continue}if got.Species != tt.want.Species {t.Errorf("ParseKeyBuffer(%q).Species = %q, want %q", tt.key, got.Species, tt.want.Species)}if got.CallType != tt.want.CallType {t.Errorf("ParseKeyBuffer(%q).CallType = %q, want %q", tt.key, got.CallType, tt.want.CallType)}}}}func TestApplyBinding(t *testing.T) {bindings := []KeyBinding{{Key: "k", Species: "Kiwi"},{Key: "n", Species: "Don't Know"},{Key: "d", Species: "Kiwi", CallType: "Duet"},}df := &utils.DataFile{Meta: &utils.DataMeta{},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 20.0,Labels: []*utils.Label{{Species: "Unknown", Certainty: 50, Filter: "test-filter", CallType: "OldType"},},},},}state := NewClassifyState(ClassifyConfig{Filter: "test-filter",Reviewer: "David",Bindings: bindings,Certainty: -1,}, []*utils.DataFile{df})// Apply "k" = Kiwi (no calltype, should remove existing calltype)result := &BindingResult{Species: "Kiwi"}state.ApplyBinding(result)// Check label was updatedif len(df.Segments[0].Labels) != 1 {t.Errorf("expected 1 label, got %d", len(df.Segments[0].Labels))}if df.Segments[0].Labels[0].Species != "Kiwi" {t.Errorf("expected Species=Kiwi, got %s", df.Segments[0].Labels[0].Species)}if df.Segments[0].Labels[0].Certainty != 100 {t.Errorf("expected Certainty=100, got %d", df.Segments[0].Labels[0].Certainty)}if df.Segments[0].Labels[0].CallType != "" {t.Errorf("expected CallType='', got %s (should be removed)", df.Segments[0].Labels[0].CallType)}if df.Meta.Reviewer != "David" {t.Errorf("expected Reviewer=David, got %s", df.Meta.Reviewer)}// Apply "d" = Kiwi/Duet (should set calltype)result = &BindingResult{Species: "Kiwi", CallType: "Duet"}state.ApplyBinding(result)if df.Segments[0].Labels[0].CallType != "Duet" {t.Errorf("expected CallType=Duet, got %s", df.Segments[0].Labels[0].CallType)}// Apply "n" = Don't Know (certainty should be 0)result = &BindingResult{Species: "Don't Know"}state.ApplyBinding(result)if df.Segments[0].Labels[0].Species != "Don't Know" {t.Errorf("expected Species=Don't Know, got %s", df.Segments[0].Labels[0].Species)}if 
df.Segments[0].Labels[0].Certainty != 0 {t.Errorf("expected Certainty=0 for Don't Know, got %d", df.Segments[0].Labels[0].Certainty)}}func TestApplyBindingCallTypeRemoval(t *testing.T) {bindings := []KeyBinding{{Key: "k", Species: "Kiwi"}, // no calltype}df := &utils.DataFile{Meta: &utils.DataMeta{},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 20.0,Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "test-filter", CallType: "Male"},},},},}state := NewClassifyState(ClassifyConfig{Filter: "test-filter",Reviewer: "David",Bindings: bindings,Certainty: -1,}, []*utils.DataFile{df})// Apply "k" = Kiwi (should remove Male calltype)result := &BindingResult{Species: "Kiwi"}state.ApplyBinding(result)if df.Segments[0].Labels[0].CallType != "" {t.Errorf("expected CallType='', got %s (should be removed)", df.Segments[0].Labels[0].CallType)}}func TestConfirmLabelDontKnow(t *testing.T) {df := &utils.DataFile{Meta: &utils.DataMeta{},Segments: []*utils.Segment{{StartTime: 10.0,EndTime: 20.0,Labels: []*utils.Label{{Species: "Don't Know", Certainty: 0, Filter: "test-filter"},},},},}state := NewClassifyState(ClassifyConfig{Filter: "test-filter",Reviewer: "David",Certainty: -1,}, []*utils.DataFile{df})// ConfirmLabel on Don't Know should be a no-opif state.ConfirmLabel() {t.Error("ConfirmLabel() should return false for Don't Know (certainty=0)")}label := df.Segments[0].Labels[0]if label.Species != "Don't Know" {t.Errorf("Species should remain Don't Know, got %s", label.Species)}if label.Certainty != 0 {t.Errorf("Certainty should remain 0, got %d", label.Certainty)}if state.Dirty {t.Error("State should not be dirty after confirming Don't Know")}}
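// TestSegmentNavigationSketch is a hedged addition (not part of the original
// suite): it exercises NextSegment/PrevSegment crossing a file boundary,
// using only the navigation API defined in classify.go.
func TestSegmentNavigationSketch(t *testing.T) {
	df1 := &utils.DataFile{FilePath: "/t/f1.data", Segments: []*utils.Segment{{StartTime: 0, EndTime: 10}, {StartTime: 10, EndTime: 20}}}
	df2 := &utils.DataFile{FilePath: "/t/f2.data", Segments: []*utils.Segment{{StartTime: 0, EndTime: 10}}}
	state := NewClassifyState(ClassifyConfig{Certainty: -1}, []*utils.DataFile{df1, df2})
	if !state.NextSegment() || !state.NextSegment() {
		t.Fatal("expected NextSegment to advance into the second file")
	}
	if state.FileIdx != 1 || state.SegmentIdx != 0 {
		t.Errorf("expected position (1,0), got (%d,%d)", state.FileIdx, state.SegmentIdx)
	}
	if state.NextSegment() {
		t.Error("expected NextSegment to return false at the end")
	}
	if !state.PrevSegment() {
		t.Fatal("expected PrevSegment to step back across the file boundary")
	}
	if state.FileIdx != 0 || state.SegmentIdx != 1 {
		t.Errorf("expected position (0,1), got (%d,%d)", state.FileIdx, state.SegmentIdx)
	}
}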
package tools

import (
	"os"
	"path/filepath"
	"testing"
)

// writeDataFileContent creates a .data file in dir with the given raw content.
func writeDataFileContent(t *testing.T, dir, name, content string) {
	t.Helper()
	if err := os.WriteFile(filepath.Join(dir, name), []byte(content), 0644); err != nil {
		t.Fatal(err)
	}
}

// mustLoadDataFiles is a test helper that calls LoadDataFiles and fatals on error.
func mustLoadDataFiles(t *testing.T, config ClassifyConfig) *ClassifyState {
	t.Helper()
	state, err := LoadDataFiles(config)
	if err != nil {
		t.Fatal(err)
	}
	return state
}

// assertFileSegCounts checks file count and total segment count match expected values.
func assertFileSegCounts(t *testing.T, state *ClassifyState, wantFiles, wantSegs int, label string) {
	t.Helper()
	if len(state.DataFiles) != wantFiles {
		t.Errorf("%s: expected %d files, got %d", label, wantFiles, len(state.DataFiles))
	}
	if state.TotalSegments() != wantSegs {
		t.Errorf("%s: expected %d segments total, got %d", label, wantSegs, state.TotalSegments())
	}
}

const (
	kiwiSeg   = `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]]]`
	tomtitSeg = `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Tomtit", "certainty": 90}]]]`
)

func TestLoadDataFilesFiltersFilesWithNoMatchingSegments(t *testing.T) {
	tempDir := t.TempDir()
	writeDataFileContent(t, tempDir, "file1.data", kiwiSeg)
	writeDataFileContent(t, tempDir, "file2.data", tomtitSeg)
	writeDataFileContent(t, tempDir, "file3.data", kiwiSeg)
	t.Run("no_filter", func(t *testing.T) {
		state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Certainty: -1})
		assertFileSegCounts(t, state, 3, 3, "No filter")
	})
	t.Run("species_kiwi", func(t *testing.T) {
		state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1})
		assertFileSegCounts(t, state, 2, 2, "Species=Kiwi")
	})
	t.Run("species_tomtit", func(t *testing.T) {
		state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "Tomtit", Certainty: -1})
		assertFileSegCounts(t, state, 1, 1, "Species=Tomtit")
	})
	t.Run("species_nonexistent", func(t *testing.T) {
		state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "NonExistent", Certainty: -1})
		assertFileSegCounts(t, state, 0, 0, "Species=NonExistent")
	})
}

func TestLoadDataFilesWithMixedSegments(t *testing.T) {
	tempDir := t.TempDir()
	file := `[{"Operator": "test"},[0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]],[10, 20, 100, 1000, [{"species": "Tomtit", "certainty": 80}]],[20, 30, 100, 1000, [{"species": "Kiwi", "certainty": 95}]]]`
	writeDataFileContent(t, tempDir, "mixed.data", file)
	state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1})
	if len(state.DataFiles) != 1 {
		t.Errorf("Expected 1 file, got %d", len(state.DataFiles))
	}
	// TotalSegments uses the cached filtered segments, so only the Kiwi ones count
	if state.TotalSegments() != 2 {
		t.Errorf("Species=Kiwi: expected 2 segments, got %d", state.TotalSegments())
	}
	// The DataFile should still have all 3 segments internally
	if len(state.DataFiles[0].Segments) != 3 {
		t.Errorf("DataFile should have 3 segments internally, got %d", len(state.DataFiles[0].Segments))
	}
}

// Test that the original DataFile segments are not modified (immutable filtering)
func TestFilteringDoesNotModifyOriginalSegments(t *testing.T) {
	tempDir := t.TempDir()
	file := `[{"Operator": "test"},[0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]],[10, 20, 100, 1000, 
[{"species": "Tomtit", "certainty": 80}]]]`writeDataFileContent(t, tempDir, "test.data", file)state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1})// Original segments should be untouchedoriginalSegments := state.DataFiles[0].Segmentsif len(originalSegments) != 2 {t.Errorf("Original should have 2 segments, got %d", len(originalSegments))}// Verify all original segments are preservedspecies := []string{}for _, seg := range originalSegments {if len(seg.Labels) > 0 {species = append(species, seg.Labels[0].Species)}}if len(species) != 2 || species[0] != "Kiwi" || species[1] != "Tomtit" {t.Errorf("Original segments should have both species, got %v", species)}}func TestLoadDataFilesCertaintyPruning(t *testing.T) {tempDir := t.TempDir()writeDataFileContent(t, tempDir, "file1.data", `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 70}]]]`)writeDataFileContent(t, tempDir, "file2.data", `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 100}]]]`)state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Certainty: 100})assertFileSegCounts(t, state, 1, 1, "Certainty=100")// CurrentSegment should work (not nil) because file1 was prunedseg := state.CurrentSegment()if seg == nil {t.Error("CurrentSegment should not be nil after pruning")}}return state}}
package toolsimport ("math/rand""testing""skraak/utils")func TestTotalSegmentsRespectsFilters(t *testing.T) {// Create test data files with different species and filtersdf1 := &utils.DataFile{FilePath: "/test/file1.data",Segments: []*utils.Segment{{StartTime: 0,EndTime: 10,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0"},},},{StartTime: 10,EndTime: 20,Labels: []*utils.Label{{Species: "Tomtit", Filter: "model-1.0"},},},},}df2 := &utils.DataFile{FilePath: "/test/file2.data",Segments: []*utils.Segment{{StartTime: 0,EndTime: 10,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0"},},},},}// Test 1: No filters - should count all segments (3)state1 := NewClassifyState(ClassifyConfig{Certainty: -1}, []*utils.DataFile{df1, df2})if got := state1.TotalSegments(); got != 3 {t.Errorf("No filters: expected 3 segments, got %d", got)}// Test 2: Filter by species "Kiwi" - should count only Kiwi segments (2)state2 := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df1, df2})if got := state2.TotalSegments(); got != 2 {t.Errorf("Species=Kiwi: expected 2 segments, got %d", got)}// Test 3: Filter by species "Tomtit" - should count only Tomtit segments (1)state3 := NewClassifyState(ClassifyConfig{Species: "Tomtit", Certainty: -1}, []*utils.DataFile{df1, df2})if got := state3.TotalSegments(); got != 1 {t.Errorf("Species=Tomtit: expected 1 segment, got %d", got)}// Test 4: Filter by filter name "model-1.0" - should count all segments (3)state4 := NewClassifyState(ClassifyConfig{Filter: "model-1.0", Certainty: -1}, []*utils.DataFile{df1, df2})if got := state4.TotalSegments(); got != 3 {t.Errorf("Filter=model-1.0: expected 3 segments, got %d", got)}// Test 5: Filter by non-existent species - should count 0state5 := NewClassifyState(ClassifyConfig{Species: "NonExistent", Certainty: -1}, []*utils.DataFile{df1, df2})if got := state5.TotalSegments(); got != 0 {t.Errorf("Species=NonExistent: expected 0 segments, got %d", got)}// Test 6: Combined filter + speciesdf3 := &utils.DataFile{FilePath: "/test/file3.data",Segments: []*utils.Segment{{StartTime: 0,EndTime: 10,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0", CallType: "Duet"},},},{StartTime: 10,EndTime: 20,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-2.0", CallType: "Male"},},},},}state6 := NewClassifyState(ClassifyConfig{Filter: "model-1.0", Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df3})if got := state6.TotalSegments(); got != 1 {t.Errorf("Filter=model-1.0 + Species=Kiwi: expected 1 segment, got %d", got)}}func TestCurrentSegmentNumberWithFilters(t *testing.T) {// Create test data filesdf1 := &utils.DataFile{FilePath: "/test/file1.data",Segments: []*utils.Segment{{StartTime: 0,EndTime: 10,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0"},},},{StartTime: 10,EndTime: 20,Labels: []*utils.Label{{Species: "Tomtit", Filter: "model-1.0"},},},},}df2 := &utils.DataFile{FilePath: "/test/file2.data",Segments: []*utils.Segment{{StartTime: 0,EndTime: 10,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0"},},},},}// Test: Filter by species "Kiwi", at file 2, segment 0// Should report current segment as 2 (first Kiwi in df1 + first Kiwi in df2)state := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df1, df2})state.FileIdx = 1 // at df2state.SegmentIdx = 0if got := state.CurrentSegmentNumber(); got != 2 {t.Errorf("Species=Kiwi, at file 2, seg 0: expected current segment 2, got %d", got)}}func TestCertaintyFiltering(t *testing.T) {// 
Create test data files with different certainty levelsdf := &utils.DataFile{FilePath: "/test/file1.data",Segments: []*utils.Segment{{StartTime: 0,EndTime: 10,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0", Certainty: 70},},},{StartTime: 10,EndTime: 20,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0", Certainty: 100},},},{StartTime: 20,EndTime: 30,Labels: []*utils.Label{{Species: "Tomtit", Filter: "model-1.0", Certainty: 70},},},},}// Test 1: Filter by certainty 70 - should get 2 segmentsstate1 := NewClassifyState(ClassifyConfig{Certainty: 70}, []*utils.DataFile{df})if got := state1.TotalSegments(); got != 2 {t.Errorf("Certainty=70: expected 2 segments, got %d", got)}// Test 2: Filter by certainty 100 - should get 1 segmentstate2 := NewClassifyState(ClassifyConfig{Certainty: 100}, []*utils.DataFile{df})if got := state2.TotalSegments(); got != 1 {t.Errorf("Certainty=100: expected 1 segment, got %d", got)}// Test 3: Filter by certainty 0 - should get 0 segmentsstate3 := NewClassifyState(ClassifyConfig{Certainty: 0}, []*utils.DataFile{df})if got := state3.TotalSegments(); got != 0 {t.Errorf("Certainty=0: expected 0 segments, got %d", got)}// Test 4: Combined species + certaintystate4 := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: 70}, []*utils.DataFile{df})if got := state4.TotalSegments(); got != 1 {t.Errorf("Species=Kiwi + Certainty=70: expected 1 segment, got %d", got)}}func TestSampling(t *testing.T) {makeSegs := func(n int) []*utils.Segment {s := make([]*utils.Segment, n)for i := range s {s[i] = &utils.Segment{StartTime: float64(i), EndTime: float64(i + 1)}}return s}df1 := &utils.DataFile{FilePath: "/test/f1.data", Segments: makeSegs(6)}df2 := &utils.DataFile{FilePath: "/test/f2.data", Segments: makeSegs(4)}kept := []*utils.DataFile{df1, df2}cached := [][]*utils.Segment{df1.Segments, df2.Segments}countTotal := func(c [][]*utils.Segment) int {n := 0for _, s := range c {n += len(s)}return n}// 50% of 10 → 5k, c := applySampling(kept, cached, 50, rand.New(rand.NewSource(42)))if got := countTotal(c); got != 5 {t.Errorf("sample 50%%: expected 5, got %d", got)}// Files must be in original chronological orderfor i := 1; i < len(k); i++ {if k[i].FilePath < k[i-1].FilePath {t.Errorf("sample 50%%: files out of order at index %d", i)}}// 10% of 10 → 1_, c2 := applySampling(kept, cached, 10, rand.New(rand.NewSource(42)))if got := countTotal(c2); got != 1 {t.Errorf("sample 10%%: expected 1, got %d", got)}// 1% of 10 → clamp to 1_, c3 := applySampling(kept, cached, 1, rand.New(rand.NewSource(42)))if got := countTotal(c3); got != 1 {t.Errorf("sample 1%%: expected 1 (clamped), got %d", got)}// 99% of 10 → 9_, c4 := applySampling(kept, cached, 99, rand.New(rand.NewSource(42)))if got := countTotal(c4); got != 9 {t.Errorf("sample 99%%: expected 9, got %d", got)}}func TestCertaintyPruning(t *testing.T) {// Simulate the bug: first file has no matching certainty segmentsdf1 := &utils.DataFile{FilePath: "/test/file1.data",Segments: []*utils.Segment{{StartTime: 0,EndTime: 10,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0", Certainty: 70},},},},}df2 := &utils.DataFile{FilePath: "/test/file2.data",Segments: []*utils.Segment{{StartTime: 0,EndTime: 10,Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0", Certainty: 100},},},},}// Without pruning (old bug): file1 is first, has no certainty=100 segments// CurrentSegment() would return nil even though TotalSegments() > 0state := NewClassifyState(ClassifyConfig{Certainty: 100}, []*utils.DataFile{df1, df2})// 
TotalSegments should be 1 (only file2 has certainty 100)
	if got := state.TotalSegments(); got != 1 {
		t.Errorf("Certainty=100: expected 1 segment, got %d", got)
	}
	// CurrentSegment should work if files are properly pruned.
	// Note: this test assumes LoadDataFiles does the pruning;
	// here we test the state after manual construction.
}

func TestCallTypeNoneFiltering(t *testing.T) {
	// Create test data: Kiwi with calltype, Kiwi without, Tomtit without
	df := &utils.DataFile{
		FilePath: "/test/file1.data",
		Segments: []*utils.Segment{
			{StartTime: 0, EndTime: 10, Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0", CallType: "Male"}}},
			{StartTime: 10, EndTime: 20, Labels: []*utils.Label{{Species: "Kiwi", Filter: "model-1.0"}}},   // no calltype
			{StartTime: 20, EndTime: 30, Labels: []*utils.Label{{Species: "Tomtit", Filter: "model-1.0"}}}, // no calltype, wrong species
		},
	}
	// Test 1: --species Kiwi+_ should match only Kiwi with no calltype (1 segment)
	state1 := NewClassifyState(ClassifyConfig{Species: "Kiwi", CallType: utils.CallTypeNone, Certainty: -1}, []*utils.DataFile{df})
	if got := state1.TotalSegments(); got != 1 {
		t.Errorf("Species=Kiwi+_: expected 1 segment, got %d", got)
	}
	// Test 2: --species Kiwi should still match all Kiwi (2 segments)
	state2 := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df})
	if got := state2.TotalSegments(); got != 2 {
		t.Errorf("Species=Kiwi: expected 2 segments, got %d", got)
	}
	// Test 3: --species Kiwi+Male should still work as before (1 segment)
	state3 := NewClassifyState(ClassifyConfig{Species: "Kiwi", CallType: "Male", Certainty: -1}, []*utils.DataFile{df})
	if got := state3.TotalSegments(); got != 1 {
		t.Errorf("Species=Kiwi+Male: expected 1 segment, got %d", got)
	}
}
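// TestFormatLabelsSketch is a hedged addition (not part of the original
// suite) pinning down the display format produced by FormatLabels in
// classify.go: species, "/calltype", " (N%)", then " [filter]".
func TestFormatLabelsSketch(t *testing.T) {
	labels := []*utils.Label{{Species: "Kiwi", CallType: "Duet", Certainty: 100, Filter: "model-1.0"}}
	if got, want := FormatLabels(labels, ""), "Kiwi/Duet (100%) [model-1.0]"; got != want {
		t.Errorf("FormatLabels = %q, want %q", got, want)
	}
}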
package tools

import (
	"fmt"
	"math/rand"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strings"
	"time"

	"skraak/utils"
)

// KeyBinding maps a key to a species/calltype
type KeyBinding struct {
	Key      string // single char: "k", "n", "p"
	Species  string // "Kiwi", "Don't Know", "Morepork"
	CallType string // "Duet", "Female", "Male" (optional)
}

// ClassifyConfig holds the configuration for classification
type ClassifyConfig struct {
	Folder    string
	File      string
	Filter    string
	Species   string // scope to this species (optional)
	CallType  string // scope to this calltype within species (optional)
	Certainty int    // scope to this certainty value, -1 = no filter (optional)
	Sample    int    // random sample percentage 1-99, -1 = no sampling, 100 = no-op
	Goto      string // goto this file on startup (optional, basename match)
	Reviewer  string
	Color     bool
	ImageSize int // spectrogram display size in pixels (0 = default)
	Sixel     bool
	ITerm     bool
	Bindings  []KeyBinding
	// SecondaryBindings maps a primary binding key to per-species calltype
	// keys. Invoked via Shift+primary-key: the species is labeled without
	// advancing, and the next key is interpreted as a calltype.
	SecondaryBindings map[string]map[string]string
	Night             bool
	Day               bool
	Lat               float64
	Lng               float64
	Timezone          string
}

// ClassifyState holds the current state for TUI
type ClassifyState struct {
	Config            ClassifyConfig
	DataFiles         []*utils.DataFile
	filteredSegs      [][]*utils.Segment // cached at load time, parallel to DataFiles
	totalSegs         int                // pre-computed total segment count
	FileIdx           int
	SegmentIdx        int
	Dirty             bool
	Player            *utils.AudioPlayer
	PlaybackSpeed     float64 // Current playback speed (1.0 = normal, 0.5 = half speed)
	TimeFilteredCount int     // files skipped by --night or --day filter
}

// BindingResult represents parsed key result
type BindingResult struct {
	Species  string
	CallType string // empty string = remove calltype
}

// findDataFilePaths resolves the list of .data file paths from config.
func findDataFilePaths(config ClassifyConfig) ([]string, error) {
	if config.File != "" {
		return []string{config.File}, nil
	}
	paths, err := utils.FindDataFiles(config.Folder)
	if err != nil {
		return nil, fmt.Errorf("find data files: %w", err)
	}
	return paths, nil
}

// filterDataFileSegments applies segment and day/night filters to a single data file.
// Returns the filtered segments and whether the file should be kept.
// If the file is filtered out (no matching segments, or time-of-day), returns nil, false.
func filterDataFileSegments(df *utils.DataFile, config ClassifyConfig) ([]*utils.Segment, bool, int) {
	segs := filterSegmentsByLabel(df.Segments, config)
	if segs == nil {
		return nil, false, 0
	}
	timeFiltered := 0
	if config.Night || config.Day {
		keep, tf := filterByTimeOfDay(df.FilePath, config)
		if !keep {
			return nil, false, tf
		}
	}
	return segs, true, timeFiltered
}

// parseAndSortDataFiles finds, parses, and sorts .data files from the config.
func parseAndSortDataFiles(config ClassifyConfig) ([]*utils.DataFile, error) {
	filePaths, err := findDataFilePaths(config)
	if err != nil {
		return nil, err
	}
	if len(filePaths) == 0 {
		return nil, fmt.Errorf("no .data files found")
	}
	var dataFiles []*utils.DataFile
	for _, path := range filePaths {
		df, err := utils.ParseDataFile(path)
		if err != nil {
			continue
		}
		dataFiles = append(dataFiles, df)
	}
	if len(dataFiles) == 0 {
		return nil, fmt.Errorf("no valid .data files")
	}
	sort.Slice(dataFiles, func(i, j int) bool {
		return dataFiles[i].FilePath < dataFiles[j].FilePath
	})
	return dataFiles, nil
}

// filterDataFiles applies segment filters to each data file, returning kept files and their segments.
func filterDataFiles(dataFiles []*utils.DataFile, config ClassifyConfig) ([]*utils.DataFile, [][]*utils.Segment, int) {
	var kept []*utils.DataFile
	var cachedSegs [][]*utils.Segment
	var timeFiltered int
	for _, df := range dataFiles {
		segs, keep, tf := filterDataFileSegments(df, config)
		timeFiltered += tf
		if !keep {
			continue
		}
		kept = append(kept, df)
		cachedSegs = append(cachedSegs, segs)
	}
	return kept, cachedSegs, timeFiltered
}

// buildClassifyState constructs the ClassifyState, handling --goto file positioning.
func buildClassifyState(config ClassifyConfig, dataFiles []*utils.DataFile, filteredSegs [][]*utils.Segment, timeFiltered int) (*ClassifyState, error) {
	total := 0
	for _, segs := range filteredSegs {
		total += len(segs)
	}
	state := &ClassifyState{
		Config:            config,
		DataFiles:         dataFiles,
		filteredSegs:      filteredSegs,
		totalSegs:         total,
		TimeFilteredCount: timeFiltered,
	}
	if config.Goto == "" {
		return state, nil
	}
	for i, df := range state.DataFiles {
		base := df.FilePath[strings.LastIndex(df.FilePath, "/")+1:]
		if base == config.Goto {
			state.FileIdx = i
			return state, nil
		}
	}
	return nil, fmt.Errorf("goto file not found (or has no matching segments): %s", config.Goto)
}

// applySampling randomly selects sample% of segments from the filtered set.
// The returned files and segments preserve the original chronological order.
func applySampling(kept []*utils.DataFile, cachedSegs [][]*utils.Segment, sample int, rng *rand.Rand) ([]*utils.DataFile, [][]*utils.Segment) {
	flat := make([]struct{ fileIdx, segIdx int }, 0)
	for fi, segs := range cachedSegs {
		for si := range segs {
			flat = append(flat, struct{ fileIdx, segIdx int }{fi, si})
		}
	}
	targetCount := max(len(flat)*sample/100, 1)
	rng.Shuffle(len(flat), func(i, j int) { flat[i], flat[j] = flat[j], flat[i] })
	selected := flat[:targetCount]
	// Restore chronological order before rebuilding
	sort.Slice(selected, func(i, j int) bool {
		if selected[i].fileIdx != selected[j].fileIdx {
			return selected[i].fileIdx < selected[j].fileIdx
		}
		return selected[i].segIdx < selected[j].segIdx
	})
	newCached := make([][]*utils.Segment, len(cachedSegs))
	for _, ref := range selected {
		newCached[ref.fileIdx] = append(newCached[ref.fileIdx], cachedSegs[ref.fileIdx][ref.segIdx])
	}
	var newKept []*utils.DataFile
	var finalCached [][]*utils.Segment
	for i, segs := range newCached {
		if len(segs) > 0 {
			newKept = append(newKept, kept[i])
			finalCached = append(finalCached, segs)
		}
	}
	return newKept, finalCached
}

// FilteredSegs returns the cached filtered segments parallel to DataFiles.
func (s *ClassifyState) FilteredSegs() [][]*utils.Segment {
	return s.filteredSegs
}

// CurrentFile returns the current data file
func (s *ClassifyState) CurrentFile() *utils.DataFile {
	if s.FileIdx >= len(s.DataFiles) {
		return nil
	}
	return s.DataFiles[s.FileIdx]
}

// CurrentSegment returns the current segment
func (s *ClassifyState) CurrentSegment() *utils.Segment {
	if s.FileIdx >= len(s.filteredSegs) {
		return nil
	}
	segs := s.filteredSegs[s.FileIdx]
	if s.SegmentIdx >= len(segs) {
		return nil
	}
	return segs[s.SegmentIdx]
}

// TotalSegments returns total segments to review
func (s *ClassifyState) TotalSegments() int {
	return s.totalSegs
}

// CurrentSegmentNumber returns 1-based segment number
func (s *ClassifyState) CurrentSegmentNumber() int {
	count := 0
	for i := 0; i < s.FileIdx; i++ {
		count += len(s.filteredSegs[i])
	}
	return count + s.SegmentIdx + 1
}

// NextSegment moves to the next segment, returns false if at end
func (s *ClassifyState) NextSegment() bool {
	if s.FileIdx >= len(s.filteredSegs) {
		return false
	}
	segs := s.filteredSegs[s.FileIdx]
	if s.SegmentIdx+1 < len(segs) {
		s.SegmentIdx++
		return true
	}
	// Move to next file
	if s.FileIdx+1 < len(s.DataFiles) {
		s.FileIdx++
		s.SegmentIdx = 0
		return true
	}
	return false
}

// PrevSegment moves to the previous segment, returns false if at start
func (s *ClassifyState) PrevSegment() bool {
	if s.SegmentIdx > 0 {
		s.SegmentIdx--
		return true
	}
	// Move to previous file
	if s.FileIdx > 0 {
		s.FileIdx--
		segs := s.filteredSegs[s.FileIdx]
		s.SegmentIdx = max(len(segs)-1, 0)
		return true
	}
	return false
}

// ParseKeyBuffer parses a single key into binding result
func (s *ClassifyState) ParseKeyBuffer(key string) *BindingResult {
	for _, b := range s.Config.Bindings {
		if b.Key ==
key {return &BindingResult{Species: b.Species,CallType: b.CallType,}}}return nil}// SetComment sets the comment on the current segment's filter label.// Returns the previous comment (for undo) or empty string if none.func (s *ClassifyState) SetComment(comment string) string {seg := s.CurrentSegment()if seg == nil {return ""}df := s.CurrentFile()if df == nil {return ""}// Set reviewerdf.Meta.Reviewer = s.Config.Reviewer// Get labels matching filterfilterLabels := seg.GetFilterLabels(s.Config.Filter)var oldComment stringif len(filterLabels) == 0 {// No matching labels, add new one with commentlabel := &utils.Label{Species: "Don't Know",Certainty: 0,Filter: s.Config.Filter,Comment: comment,}seg.Labels = append(seg.Labels, label)} else {// Set comment on first matching labeloldComment = filterLabels[0].CommentfilterLabels[0].Comment = comment}s.Dirty = truereturn oldComment}// GetCurrentComment returns the comment on the current segment's filter label.func (s *ClassifyState) GetCurrentComment() string {seg := s.CurrentSegment()if seg == nil {return ""}filterLabels := seg.GetFilterLabels(s.Config.Filter)if len(filterLabels) == 0 {return ""}return filterLabels[0].Comment}// ApplyBinding applies a binding result to the current segmentfunc (s *ClassifyState) ApplyBinding(result *BindingResult) {seg := s.CurrentSegment()if seg == nil {return}df := s.CurrentFile()if df == nil {return}// Set reviewerdf.Meta.Reviewer = s.Config.Reviewer// Get labels matching filterfilterLabels := seg.GetFilterLabels(s.Config.Filter)// Determine certainty: 0 for Don't Know, 100 for otherscertainty := 100if result.Species == "Don't Know" {certainty = 0}if len(filterLabels) == 0 {// No matching labels, add new oneseg.Labels = append(seg.Labels, &utils.Label{Species: result.Species,Certainty: certainty,Filter: s.Config.Filter,CallType: result.CallType,})} else {// Edit first matching label, remove restfilterLabels[0].Species = result.SpeciesfilterLabels[0].Certainty = certaintyfilterLabels[0].CallType = result.CallType // always set (empty = remove)// Remove extra matching labelsif len(filterLabels) > 1 {var newLabels []*utils.Labelfor _, l := range seg.Labels {keep := !slices.Contains(filterLabels[1:], l)if keep {newLabels = append(newLabels, l)}}seg.Labels = newLabels}}// Re-sort labelssort.Slice(seg.Labels, func(i, j int) bool {return seg.Labels[i].Species < seg.Labels[j].Species})s.Dirty = true}// ApplyCallTypeOnly sets the CallType on the current segment's first// filter-matching label. Used after a Shift+primary keypress labeled the// species and we now receive the secondary key for the calltype.// No-op if there is no matching label to update.func (s *ClassifyState) ApplyCallTypeOnly(callType string) {seg := s.CurrentSegment()if seg == nil {return}df := s.CurrentFile()if df == nil {return}filterLabels := seg.GetFilterLabels(s.Config.Filter)if len(filterLabels) == 0 {return}df.Meta.Reviewer = s.Config.ReviewerfilterLabels[0].CallType = callTypes.Dirty = true}// HasSecondary reports whether the given primary key has any secondary// (calltype) bindings configured.func (s *ClassifyState) HasSecondary(primaryKey string) bool {return len(s.Config.SecondaryBindings[primaryKey]) > 0}// ConfirmLabel upgrades the current segment's existing filter label certainty// to 100. 
Returns true if a write is needed (label existed and was below 100).// Returns false for Don't Know (certainty=0) — confirming a Don't Know is a no-op;// the caller should just advance to the next segment.func (s *ClassifyState) ConfirmLabel() bool {seg := s.CurrentSegment()if seg == nil {return false}filterLabels := seg.GetFilterLabels(s.Config.Filter)if len(filterLabels) == 0 {return false}if filterLabels[0].Certainty == 0 {return false}if filterLabels[0].Certainty == 100 {return false}df := s.CurrentFile()if df == nil {return false}df.Meta.Reviewer = s.Config.ReviewerfilterLabels[0].Certainty = 100s.Dirty = truereturn true}// Save saves the current filefunc (s *ClassifyState) Save() error {df := s.CurrentFile()if df == nil {return nil}if !s.Dirty {return nil}err := df.Write(df.FilePath)if err != nil {return err}s.Dirty = falsereturn nil}// getFilterLabel returns the label matching the current filter, or first label if no filter.func (s *ClassifyState) getFilterLabel(seg *utils.Segment) *utils.Label {if s.Config.Filter == "" {if len(seg.Labels) > 0 {return seg.Labels[0]}return nil}for _, label := range seg.Labels {if label.Filter == s.Config.Filter {return label}}return nil}// getOrCreateFilterLabel gets existing label or creates new one for the current filter.func (s *ClassifyState) getOrCreateFilterLabel(seg *utils.Segment) *utils.Label {label := s.getFilterLabel(seg)if label != nil {return label}// Create new labellabel = &utils.Label{Species: "Don't Know",Certainty: 0,Filter: s.Config.Filter,}seg.Labels = append(seg.Labels, label)s.Dirty = truereturn label}// HasBookmark returns true if current segment has a bookmark on the filter label.func (s *ClassifyState) HasBookmark() bool {seg := s.CurrentSegment()if seg == nil {return false}label := s.getFilterLabel(seg)return label != nil && label.Bookmark}// ToggleBookmark toggles the bookmark on the current segment's filter label.func (s *ClassifyState) ToggleBookmark() {seg := s.CurrentSegment()if seg == nil {return}df := s.CurrentFile()if df == nil {return}// Set reviewerdf.Meta.Reviewer = s.Config.Reviewerlabel := s.getOrCreateFilterLabel(seg)label.Bookmark = !label.Bookmarks.Dirty = true}// NextBookmark navigates to the next bookmark, wrapping around if needed.// Returns false if no bookmarks found (back at start position).func (s *ClassifyState) NextBookmark() bool {startFile := s.FileIdxstartSeg := s.SegmentIdxfirst := truefor {// Advance to next segmentif !s.NextSegment() {// Wrap to start of folders.FileIdx = 0s.SegmentIdx = 0}// Check if we've looped back to startif !first && s.FileIdx == startFile && s.SegmentIdx == startSeg {return false // full circle, no bookmark found}first = false// Check if current segment has bookmarkif s.hasFilterBookmark() {return true}}}// PrevBookmark navigates to the previous bookmark, wrapping around if needed.// Returns false if no bookmarks found (back at start position).func (s *ClassifyState) PrevBookmark() bool {startFile := s.FileIdxstartSeg := s.SegmentIdxfirst := truefor {// Move to previous segmentif !s.PrevSegment() {// Wrap to end of folders.FileIdx = len(s.DataFiles) - 1segs := s.filteredSegs[s.FileIdx]s.SegmentIdx = max(len(segs)-1, 0)}// Check if we've looped back to startif !first && s.FileIdx == startFile && s.SegmentIdx == startSeg {return false // full circle, no bookmark found}first = false// Check if current segment has bookmarkif s.hasFilterBookmark() {return true}}}// hasFilterBookmark checks if current segment has bookmark on filter-matching label.func (s *ClassifyState) 
hasFilterBookmark() bool {
	seg := s.CurrentSegment()
	if seg == nil {
		return false
	}
	label := s.getFilterLabel(seg)
	return label != nil && label.Bookmark
}

// FormatLabels formats labels for display
func FormatLabels(labels []*utils.Label, filter string) string {
	var parts []string
	for _, l := range labels {
		if filter != "" && l.Filter != filter {
			continue
		}
		part := l.Species
		if l.CallType != "" {
			part += "/" + l.CallType
		}
		part += fmt.Sprintf(" (%d%%)", l.Certainty)
		if l.Filter != "" {
			part += " [" + l.Filter + "]"
		}
		if l.Comment != "" {
			part += fmt.Sprintf(" \"%s\"", l.Comment)
		}
		parts = append(parts, part)
	}
	return strings.Join(parts, ", ")
}

// LoadDataFiles loads all .data files for classification.
func LoadDataFiles(config ClassifyConfig) (*ClassifyState, error) {
	dataFiles, err := parseAndSortDataFiles(config)
	if err != nil {
		return nil, err
	}
	kept, cachedSegs, timeFiltered := filterDataFiles(dataFiles, config)
	if config.Sample > 0 && config.Sample < 100 {
		rng := rand.New(rand.NewSource(time.Now().UnixNano()))
		kept, cachedSegs = applySampling(kept, cachedSegs, config.Sample, rng)
	}
	return buildClassifyState(config, kept, cachedSegs, timeFiltered)
}

// filterByTimeOfDay checks --night/--day time-of-day filter for a .data file.
// Returns (keep, timeFilteredCount).
func filterByTimeOfDay(dataFilePath string, config ClassifyConfig) (bool, int) {
	wavPath := filepath.Clean(strings.TrimSuffix(dataFilePath, ".data"))
	result, err := IsNight(IsNightInput{
		FilePath: wavPath,
		Lat:      config.Lat,
		Lng:      config.Lng,
		Timezone: config.Timezone,
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "warning: skipping %s (isnight error: %v)\n", wavPath, err)
		return false, 1
	}
	if config.Night && !result.SolarNight {
		return false, 1
	}
	if config.Day && !result.DiurnalActive {
		return false, 1
	}
	return true, 0
}

// filterSegmentsByLabel applies label/species/certainty filters, returning matching segments.
// Returns nil if no segments match (caller should skip the file).
func filterSegmentsByLabel(segments []*utils.Segment, config ClassifyConfig) []*utils.Segment {
	hasFilter := config.Filter != "" || config.Species != "" || config.Certainty >= 0
	if !hasFilter {
		return segments
	}
	var segs []*utils.Segment
	for _, seg := range segments {
		if seg.SegmentMatchesFilters(config.Filter, config.Species, config.CallType, config.Certainty) {
			segs = append(segs, seg)
		}
	}
	return segs // nil if empty, caller treats as "skip"
}
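// Illustrative configuration (a sketch, not from the original source) showing
// how SecondaryBindings pairs with Bindings: Shift+K labels Kiwi without
// advancing, and the next keypress ("m", "f", or "d") is read as the
// calltype for that label via ApplyCallTypeOnly.
//
//	cfg := ClassifyConfig{
//		Certainty: -1,
//		Bindings:  []KeyBinding{{Key: "k", Species: "Kiwi"}},
//		SecondaryBindings: map[string]map[string]string{
//			"k": {"m": "Male", "f": "Female", "d": "Duet"},
//		},
//	}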
package tools

import (
	"context"
	"database/sql"
	"encoding/csv"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"skraak/db"
	"skraak/utils"
)

// BulkFileImportInput defines the input parameters for the bulk_file_import tool
type BulkFileImportInput struct {
	DBPath      string `json:"db_path"`
	DatasetID   string `json:"dataset_id"`
	CSVPath     string `json:"csv_path"`
	LogFilePath string `json:"log_file_path"`
}

// BulkFileImportOutput defines the output structure for the bulk_file_import tool
type BulkFileImportOutput struct {
	TotalLocations    int      `json:"total_locations"`
	ClustersCreated   int      `json:"clusters_created"`
	ClustersExisting  int      `json:"clusters_existing"`
	TotalFilesScanned int      `json:"total_files_scanned"`
	FilesImported     int      `json:"files_imported"`
	FilesDuplicate    int      `json:"files_duplicate"`
	FilesError        int      `json:"files_error"`
	ProcessingTime    string   `json:"processing_time"`
	Errors            []string `json:"errors,omitempty"`
}

// failOutput sets error details and processing time on the output before returning.
func (o *BulkFileImportOutput) failOutput(errs []string, startTime time.Time) {
	o.Errors = errs
	o.ProcessingTime = time.Since(startTime).String()
}

// bulkLocationData holds CSV row data for a location
type bulkLocationData struct {
	LocationName  string
	LocationID    string
	DirectoryPath string
	DateRange     string
	SampleRate    int
	FileCount     int
}

// bulkImportStats tracks import statistics for a single cluster
type bulkImportStats struct {
	TotalFiles     int
	ImportedFiles  int
	DuplicateFiles int
	ErrorFiles     int
}

// progressLogger handles writing to both log file and internal buffer
type progressLogger struct {
	file   *os.File
	buffer *strings.Builder
}

// Log writes a formatted message with timestamp to both log file and buffer
func (l *progressLogger) Log(format string, args ...any) {
	timestamp := time.Now().Format("2006-01-02 15:04:05")
	message := fmt.Sprintf(format, args...)
	line := fmt.Sprintf("[%s] %s\n", timestamp, message)
	// Write to file; log write failures are non-fatal for import progress
	if _, err := l.file.WriteString(line); err != nil {
		fmt.Fprintf(os.Stderr, "Warning: log write failed: %v\n", err)
	}
	if err := l.file.Sync(); err != nil {
		fmt.Fprintf(os.Stderr, "Warning: log sync failed: %v\n", err)
	}
	// Also keep in memory for potential error reporting
	l.buffer.WriteString(line)
}

// BulkFileImport imports WAV files across multiple locations using CSV specification
func BulkFileImport(ctx context.Context, input BulkFileImportInput) (BulkFileImportOutput, error) {
	startTime := time.Now()
	var output BulkFileImportOutput
	// Open log file
	logFile, err := os.OpenFile(input.LogFilePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return output, fmt.Errorf("failed to open log file: %w", err)
	}
	defer func() { _ = logFile.Close() }()
	logger := &progressLogger{
		file:   logFile,
		buffer: &strings.Builder{},
	}
	logger.Log("Starting bulk file import for dataset %s", input.DatasetID)

	// Phase 0: Validate input
	logger.Log("Validating input parameters...")
	if err := bulkValidateInput(input); err != nil {
		logger.Log("ERROR: Validation failed: %v", err)
		output.failOutput([]string{fmt.Sprintf("validation failed: %v", err)}, startTime)
		return output, fmt.Errorf("validation failed: %w", err)
	}
	logger.Log("Validation complete")

	// Phase 1: Read CSV
	logger.Log("Reading CSV file: %s", input.CSVPath)
	locations, err := bulkReadCSV(input.CSVPath)
	if err != nil {
		logger.Log("ERROR: Failed to read CSV: %v", err)
		output.failOutput([]string{fmt.Sprintf("failed to read CSV: %v", err)}, startTime)
		return output, fmt.Errorf("failed to read CSV: %w", err)
	}
	logger.Log("Loaded %d locations from CSV", len(locations))
	output.TotalLocations = len(locations)

	// Phase 1.5: Validate all location_ids belong to the dataset
	logger.Log("Validating location_ids belong to dataset...")
	if err := bulkValidateLocations(logger, locations, input.DatasetID, resolveDBPath(input.DBPath)); err != nil {
		output.failOutput([]string{err.Error()}, startTime)
		return output, err
	}
	logger.Log("Location validation complete")

	// Phase 2: Create/Validate Clusters
	logger.Log("=== Phase 1: Creating/Validating Clusters ===")
	database, err := db.OpenWriteableDB(resolveDBPath(input.DBPath))
	if err != nil {
		logger.Log("ERROR: Failed to open database: %v", err)
		output.failOutput([]string{fmt.Sprintf("failed to open database: %v", err)}, startTime)
		return output, fmt.Errorf("failed to open database: %w", err)
	}
	defer database.Close()
	clusterIDMap, created, existing, err := bulkCreateClusters(ctx, database, logger, locations, input.DatasetID)
	if err != nil {
		output.failOutput(output.Errors, startTime)
		return output, err
	}
	output.ClustersCreated = created
	output.ClustersExisting = existing

	// Phase 3: Import files
	logger.Log("=== Phase 2: Importing Files ===")
	fileStats, errs := bulkImportAllFiles(database, logger, locations, clusterIDMap, input.DatasetID)
	output.TotalFilesScanned = fileStats.TotalFiles
	output.FilesImported = fileStats.ImportedFiles
	output.FilesDuplicate = fileStats.DuplicateFiles
	output.FilesError = fileStats.ErrorFiles
	output.Errors = append(output.Errors, errs...)
	if len(errs) > 0 {
		output.ProcessingTime = time.Since(startTime).String()
		return output, fmt.Errorf("failed to import files: %s", errs[0])
	}
	logger.Log("=== Import Complete ===")
	logger.Log("Total files scanned: %d", fileStats.TotalFiles)
	logger.Log("Files imported: %d", fileStats.ImportedFiles)
	logger.Log("Duplicates skipped: %d", fileStats.DuplicateFiles)
	logger.Log("Errors: %d", fileStats.ErrorFiles)
	logger.Log("Processing time: %s", time.Since(startTime).Round(time.Second))
	output.ProcessingTime = time.Since(startTime).String()
	return output, nil
}

// bulkValidateInput validates input parameters
func bulkValidateInput(input BulkFileImportInput) error {
	// Validate ID format first (fast fail before DB queries)
	if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
		return err
	}
	// Verify CSV file exists
	if _, err := os.Stat(input.CSVPath); err != nil {
		return fmt.Errorf("CSV file not accessible: %w", err)
	}
	// Verify log file path is writable
	logDir := filepath.Dir(input.LogFilePath)
	if _, err := os.Stat(logDir); err != nil {
		return fmt.Errorf("log file directory not accessible: %w", err)
	}
	// Open database for validation queries
	database, err := db.OpenReadOnlyDB(resolveDBPath(input.DBPath))
	if err != nil {
		return fmt.Errorf("failed to open database: %w", err)
	}
	defer database.Close()
	// Verify dataset exists and is structured
	if err := db.ValidateDatasetTypeForImport(database, input.DatasetID); err != nil {
		return err
	}
	return nil
}

// bulkValidateLocations validates that all location_ids in the CSV belong to the dataset.
// Returns an error if validation fails.
func bulkValidateLocations(logger *progressLogger, locations []bulkLocationData, datasetID string, dbPath string) error {
	readDB, err := db.OpenReadOnlyDB(dbPath)
	if err != nil {
		logger.Log("ERROR: Failed to open database: %v", err)
		return fmt.Errorf("failed to open database: %w", err)
	}
	locationErrors := bulkValidateLocationsBelongToDataset(readDB, locations, datasetID)
	readDB.Close()
	if len(locationErrors) > 0 {
		for _, locErr := range locationErrors {
			logger.Log("ERROR: %s", locErr)
		}
		return fmt.Errorf("location validation failed: %d location(s) do not belong to dataset %s", len(locationErrors), datasetID)
	}
	return nil
}

// bulkValidateLocationsBelongToDataset validates that all unique location_ids in the CSV belong to the dataset
func bulkValidateLocationsBelongToDataset(dbConn *sql.DB, locations []bulkLocationData, datasetID string) []string {
	var errors []string
	// Collect unique location_ids
	uniqueLocations := make(map[string]bool)
	for _, loc := range locations {
		uniqueLocations[loc.LocationID] = true
	}
	// Validate each unique location_id
	for locationID := range uniqueLocations {
		if err := db.ValidateLocationBelongsToDataset(dbConn, locationID, datasetID); err != nil {
			errors = append(errors, err.Error())
		}
	}
	return errors
}

// bulkCreateClusters creates or validates clusters for all locations.
// Returns the cluster ID map, counts of created/existing clusters, and any error.
func bulkCreateClusters(ctx context.Context, database *sql.DB, logger *progressLogger, locations []bulkLocationData, datasetID string) (map[string]string, int, int, error) {
	clusterIDMap := make(map[string]string)
	created := 0
	existing := 0
	for i, loc := range locations {
		logger.Log("[%d/%d] Processing location: %s", i+1, len(locations), loc.LocationName)
		var existingClusterID string
		err := database.QueryRow(`SELECT id FROM cluster
			WHERE location_id = ? AND name = ? AND active = true`, loc.LocationID, loc.DateRange).Scan(&existingClusterID)
		var clusterID string
		if err == sql.ErrNoRows {
			clusterID, err = bulkCreateCluster(ctx, database, datasetID, loc.LocationID, loc.DateRange, loc.SampleRate)
			if err != nil {
				logger.Log("ERROR: Failed to create cluster for location %s: %v", loc.LocationName, err)
				return nil, 0, 0, fmt.Errorf("failed to create cluster: %w", err)
			}
			logger.Log("  Created cluster: %s", clusterID)
			created++
		} else if err != nil {
			logger.Log("ERROR: Failed to check cluster for location %s: %v", loc.LocationName, err)
			return nil, 0, 0, fmt.Errorf("failed to check cluster: %w", err)
		} else {
			clusterID = existingClusterID
			logger.Log("  Using existing cluster: %s", clusterID)
			existing++
		}
		compositeKey := loc.LocationID + "|" + loc.DateRange
		clusterIDMap[compositeKey] = clusterID
	}
	return clusterIDMap, created, existing, nil
}

// bulkImportAllFiles imports files for all locations using the cluster ID map.
// Returns aggregate stats and any error messages.
func bulkImportAllFiles(database *sql.DB, logger *progressLogger, locations []bulkLocationData, clusterIDMap map[string]string, datasetID string) (bulkImportStats, []string) {
	var total bulkImportStats
	var errs []string
	for i, loc := range locations {
		compositeKey := loc.LocationID + "|" + loc.DateRange
		clusterID, ok := clusterIDMap[compositeKey]
		if !ok {
			continue
		}
		logger.Log("[%d/%d] Importing files for: %s", i+1, len(locations), loc.LocationName)
		logger.Log("  Directory: %s", loc.DirectoryPath)
		if _, err := os.Stat(loc.DirectoryPath); os.IsNotExist(err) {
			logger.Log("  WARNING: Directory not found, skipping")
			continue
		}
		stats, err := bulkImportFilesForCluster(database, logger, loc.DirectoryPath, datasetID, loc.LocationID, clusterID)
		if err != nil {
			errMsg := fmt.Sprintf("Failed to import files for location %s: %v", loc.LocationName, err)
			logger.Log("ERROR: %s", errMsg)
			return total, []string{errMsg}
		}
		logger.Log("  Scanned: %d files", stats.TotalFiles)
		logger.Log("  Imported: %d, Duplicates: %d", stats.ImportedFiles, stats.DuplicateFiles)
		if stats.ErrorFiles > 0 {
			logger.Log("  Errors: %d files", stats.ErrorFiles)
		}
		total.TotalFiles += stats.TotalFiles
		total.ImportedFiles += stats.ImportedFiles
		total.DuplicateFiles += stats.DuplicateFiles
		total.ErrorFiles += stats.ErrorFiles
	}
	return total, errs
}

func bulkReadCSV(path string) ([]bulkLocationData, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer func() { _ = file.Close() }()
	reader := csv.NewReader(file)
	records, err := reader.ReadAll()
	if err != nil {
		return nil, err
	}
	if len(records) == 0 {
		return nil, fmt.Errorf("CSV file is empty")
	}
	var locations []bulkLocationData
	for i, record := range records {
		if i == 0 {
			continue // Skip header
		}
		if len(record) < 6 {
			return nil, fmt.Errorf("CSV row %d has insufficient columns (expected 6, got %d)", i+1, len(record))
		}
		// Validate required string fields are non-empty
		locationName := strings.TrimSpace(record[0])
		if locationName == "" {
			return nil, fmt.Errorf("empty location_name in row %d", i+1)
		}
		directoryPath := strings.TrimSpace(record[2])
		if directoryPath == "" {
			return nil, fmt.Errorf("empty directory_path in row %d", i+1)
		}
		dateRange := strings.TrimSpace(record[3])
		if dateRange == "" {
			return nil, fmt.Errorf("empty date_range in row %d", i+1)
		}
		// Validate location_id format
		locationID := record[1]
		if err := utils.ValidateShortID(locationID, "location_id"); err != nil {
			return nil, fmt.Errorf("invalid location_id in row %d: %v", i+1, err)
		}
		sampleRate, err := strconv.Atoi(record[4])
		if err != nil {
			return nil, fmt.Errorf("invalid sample_rate in row %d: %v", i+1, err)
		}
		// Validate sample rate is in reasonable range
		if err := utils.ValidateSampleRate(sampleRate); err != nil {
			return nil, fmt.Errorf("invalid sample_rate in row %d: %v", i+1, err)
		}
		fileCount, err := strconv.Atoi(record[5])
		if err != nil {
			return nil, fmt.Errorf("invalid file_count in row %d: %v", i+1, err)
		}
		locations = append(locations, bulkLocationData{
			LocationName:  locationName,
			LocationID:    locationID,
			DirectoryPath: directoryPath,
			DateRange:     dateRange,
			SampleRate:    sampleRate,
			FileCount:     fileCount,
		})
	}
	return locations, nil
}

// bulkCreateCluster creates a new cluster in the database
func bulkCreateCluster(ctx context.Context, database *sql.DB, datasetID, locationID, name string, sampleRate int) (string, error) {
	// Generate a 12-character nanoid
	clusterID, err := utils.GenerateShortID()
	if err != nil {
		return "", fmt.Errorf("failed to generate cluster ID: %v", err)
	}
	now := time.Now().UTC()
	// Get location name for the path
	var locationName string
	err = database.QueryRow("SELECT name FROM location WHERE id = ?", locationID).Scan(&locationName)
	if err != nil {
		return "", fmt.Errorf("failed to get location name: %v", err)
	}
	// Normalize path: replace spaces and special characters
	path := strings.ReplaceAll(locationName, " ", "_")
	path = strings.ReplaceAll(path, "/", "_")
	tx, err := db.BeginLoggedTx(ctx, database, "bulk_file_import")
	if err != nil {
		return "", fmt.Errorf("failed to begin transaction: %w", err)
	}
	defer tx.Rollback()
	_, err = tx.ExecContext(ctx, `INSERT INTO cluster (id, dataset_id, location_id, name, path, sample_rate, active, created_at, last_modified)
		VALUES (?, ?, ?, ?, ?, ?, true, ?, ?)`, clusterID, datasetID, locationID, name, path, sampleRate, now, now)
	if err != nil {
		return "", fmt.Errorf("failed to insert cluster: %w", err)
	}
	if err = tx.Commit(); err != nil {
		return "", fmt.Errorf("failed to commit cluster creation: %w", err)
	}
	return clusterID, nil
}

// bulkImportFilesForCluster imports all WAV files for a single cluster
func bulkImportFilesForCluster(database *sql.DB, logger *progressLogger, folderPath, datasetID, locationID, clusterID string) (*bulkImportStats, error) {
	stats := &bulkImportStats{}
	// Check if directory exists
	if _, err := os.Stat(folderPath); os.IsNotExist(err) {
		logger.Log("  WARNING: Directory not found, skipping")
		return stats, nil
	}
	// Import the cluster (SAME LOGIC AS import_files.go)
	logger.Log("  Importing cluster %s", clusterID)
	ctx := context.Background()
	tx, err := db.BeginLoggedTx(ctx, database, "import_audio_files")
	if err != nil {
		return nil, fmt.Errorf("failed to begin transaction: %w", err)
	}
	defer tx.Rollback()
	clusterOutput, err := utils.ImportCluster(database, tx.UnderlyingTx(), utils.ClusterImportInput{
		FolderPath: folderPath,
		DatasetID:  datasetID,
		LocationID: locationID,
		ClusterID:  clusterID,
		Recursive:  true,
	})
	if err != nil {
		return nil, err
	}
	if err := tx.Commit(); err != nil {
		return nil, fmt.Errorf("transaction commit failed: %w", err)
	}
	// Map to bulk import stats
	stats.TotalFiles = clusterOutput.TotalFiles
	stats.ImportedFiles = clusterOutput.ImportedFiles
	stats.DuplicateFiles = clusterOutput.SkippedFiles
	stats.ErrorFiles = clusterOutput.FailedFiles
	// Log errors
	for i, fileErr := range clusterOutput.Errors {
		if i < 5 { // Log first 5
			logger.Log("  ERROR: %s: %s", fileErr.FileName, fileErr.Error)
		}
	}
	logger.Log("  Complete: %d imported, %d duplicates, %d errors", stats.ImportedFiles, stats.DuplicateFiles, stats.ErrorFiles)
	return stats, nil
}
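// Illustrative CSV layout for bulkReadCSV (values are examples only; the
// header row is skipped and columns are positional):
//
//	location_name,location_id,directory_path,date_range,sample_rate,file_count
//	North Ridge,abc123def456,/recordings/north_ridge/2023-10,2023-10,16000,744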
// Package-level variable to store database path// Deprecated: use Input.DBPath instead. Will be removed after all callers are migrated.var dbPath string// SetDBPath sets the database path for the tools package// Deprecated: use Input.DBPath instead. Will be removed after all callers are migrated.func SetDBPath(path string) {dbPath = path}
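// Migration sketch (assumed usage, mirroring the deprecation note above):
// new callers should populate the input field rather than the global, e.g.
//
//	out, err := BulkFileImport(ctx, BulkFileImportInput{
//		DBPath:      "/data/skraak.db",
//		DatasetID:   "abc123def456",
//		CSVPath:     "/data/import.csv",
//		LogFilePath: "/data/import.log",
//	})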
// resolveDBPath returns the DBPath from the input if set, otherwise falls back
// to the package-level dbPath. This supports the incremental migration from
// the global variable to explicit input fields.
func resolveDBPath(inputPath string) string {
	if inputPath != "" {
		return inputPath
	}
	return dbPath
}
package impimport ("context""database/sql""fmt""io/fs""os""path/filepath""strings""time""skraak/db""skraak/utils")// ImportUnstructuredInput defines the input parameters for importing files into an unstructured datasettype ImportUnstructuredInput struct {DBPath string `json:"db_path"`DatasetID string `json:"dataset_id"`FolderPath string `json:"folder_path"`Recursive *bool `json:"recursive,omitempty"`}// ImportUnstructuredOutput defines the output structuretype ImportUnstructuredOutput struct {TotalFiles int `json:"total_files"`ImportedFiles int `json:"imported_files"`SkippedFiles int `json:"skipped_files"` // DuplicatesFailedFiles int `json:"failed_files"`TotalDuration float64 `json:"total_duration_seconds"`ProcessingTime string `json:"processing_time"`Errors []utils.FileImportError `json:"errors,omitempty"`}// ImportUnstructured imports WAV files into an unstructured dataset// Files are stored with minimal metadata: hash, duration, sample_rate, file_mod_time as timestamp// No location/cluster hierarchy, no astronomical data, no AudioMoth parsingfunc ImportUnstructured(ctx context.Context,input ImportUnstructuredInput,) (ImportUnstructuredOutput, error) {startTime := time.Now()var output ImportUnstructuredOutput// Default recursive to truerecursive := trueif input.Recursive != nil {recursive = *input.Recursive}// Validate inputif err := validateUnstructuredInput(input); err != nil {return output, fmt.Errorf("validation failed: %w", err)}// Scan for WAV files (no DB needed)files, scanErrors := scanWavFiles(input.FolderPath, recursive)output.Errors = append(output.Errors, scanErrors...)output.TotalFiles = len(files)if len(files) == 0 {output.ProcessingTime = time.Since(startTime).String()return output, nil}err := db.WithWriteTx(ctx, db.ResolveDBPath(input.DBPath, ""), "import_unstructured", func(database *sql.DB, tx *db.LoggedTx) error {// Process each filefor _, filePath := range files {fileResult, procErr := processUnstructuredFile(tx, filePath, input.DatasetID)if procErr != nil {output.FailedFiles++output.Errors = append(output.Errors, utils.FileImportError{FileName: filepath.Base(filePath),Error: procErr.Error(),Stage: utils.StageProcess,})continue}if fileResult.Skipped {output.SkippedFiles++} else {output.ImportedFiles++output.TotalDuration += fileResult.Duration}}return nil})if err != nil {return output, err}output.ProcessingTime = time.Since(startTime).String()return output, nil}// unstructuredFileResult holds the result of processing a single filetype unstructuredFileResult struct {Skipped bool // True if duplicateDuration float64 // Duration in seconds}// processUnstructuredFile processes a single WAV file for unstructured importfunc processUnstructuredFile(tx *db.LoggedTx, filePath, datasetID string) (*unstructuredFileResult, error) {result := &unstructuredFileResult{}// Step 1: Parse WAV headermetadata, err := utils.ParseWAVHeader(filePath)if err != nil {return nil, fmt.Errorf("WAV header parsing failed: %w", err)}// Step 2: Calculate hashhash, err := utils.ComputeXXH64(filePath)if err != nil {return nil, fmt.Errorf("hash calculation failed: %w", err)}// Step 3: Check for duplicate - if exists, skip entirely (do not link to dataset)_, isDuplicate, err := utils.CheckDuplicateHash(tx, hash)if err != nil {return nil, fmt.Errorf("duplicate check failed: %w", err)}if isDuplicate {// File already exists in database - skip completely, do not link to datasetresult.Skipped = trueresult.Duration = metadata.Durationreturn result, nil}// Step 4: Generate file IDfileID, err := 
utils.GenerateLongID()if err != nil {return nil, fmt.Errorf("ID generation failed: %w", err)}// Step 5: Use file modification time as timestamp (no timezone conversion)timestamp := metadata.FileModTime// Step 6: Insert into file table_, err = tx.Exec(`INSERT INTO file (id, file_name, xxh64_hash, location_id, cluster_id,timestamp_local, duration, sample_rate,maybe_solar_night, maybe_civil_night, moon_phase,active) VALUES (?, ?, ?, NULL, NULL, ?, ?, ?, NULL, NULL, NULL, TRUE)`,fileID,filepath.Base(filePath),hash,timestamp,metadata.Duration,metadata.SampleRate,)if err != nil {return nil, fmt.Errorf("file insert failed: %w", err)}// Step 7: Insert into file_dataset table_, err = tx.Exec("INSERT INTO file_dataset (file_id, dataset_id) VALUES (?, ?)",fileID, datasetID,)if err != nil {return nil, fmt.Errorf("file_dataset insert failed: %w", err)}result.Duration = metadata.Durationreturn result, nil}// validateUnstructuredInput validates the input parametersfunc validateUnstructuredInput(input ImportUnstructuredInput) error {// Validate dataset ID formatif err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {return err}// Verify folder existsinfo, err := os.Stat(input.FolderPath)if err != nil {return fmt.Errorf("folder not accessible: %w", err)}if !info.IsDir() {return fmt.Errorf("path is not a directory: %s", input.FolderPath)}return db.WithReadDB(db.ResolveDBPath(input.DBPath, ""), func(database *sql.DB) error {// Verify dataset exists and is activeif _, err := db.DatasetExistsAndActive(database, input.DatasetID); err != nil {return err}// Verify dataset is 'unstructured' typeif err := db.ValidateDatasetTypeUnstructured(database, input.DatasetID); err != nil {return err}return nil})}// scanWavFiles scans a folder for WAV filesfunc scanWavFiles(folderPath string, recursive bool) ([]string, []utils.FileImportError) {var files []stringvar errors []utils.FileImportErrorwalkFunc := func(path string, d fs.DirEntry, err error) error {if err != nil {errors = append(errors, utils.FileImportError{FileName: path,Error: err.Error(),Stage: utils.StageScan,})return nil}// Skip directories if not recursiveif d.IsDir() {if !recursive && path != folderPath {return fs.SkipDir}return nil}// Check for .wav extension (case-insensitive)if strings.HasSuffix(strings.ToLower(d.Name()), ".wav") {files = append(files, path)}return nil}if recursive {if err := filepath.WalkDir(folderPath, walkFunc); err != nil {errors = append(errors, utils.FileImportError{FileName: folderPath,Error: err.Error(),Stage: utils.StageScan,})}} else {// Non-recursive: only scan top-levelentries, err := os.ReadDir(folderPath)if err != nil {errors = append(errors, utils.FileImportError{FileName: folderPath,Error: err.Error(),Stage: utils.StageScan,})return nil, errors}for _, entry := range entries {if !entry.IsDir() && strings.HasSuffix(strings.ToLower(entry.Name()), ".wav") {files = append(files, filepath.Join(folderPath, entry.Name()))}}}return files, errors}
package imp

import (
	"testing"

	"skraak/utils"
)

func TestValidateSegmentImportInput(t *testing.T) {
	t.Run("invalid dataset ID - too short", func(t *testing.T) {
		input := ImportSegmentsInput{
			DatasetID: "abc",
		}
		err := validateSegmentImportInput(input)
		if err == nil {
			t.Fatal("expected error for short dataset ID")
		}
	})

	t.Run("invalid dataset ID - too long", func(t *testing.T) {
		input := ImportSegmentsInput{
			DatasetID: "abc123def456ghi789",
		}
		err := validateSegmentImportInput(input)
		if err == nil {
			t.Fatal("expected error for long dataset ID")
		}
	})

	t.Run("invalid dataset ID - invalid characters", func(t *testing.T) {
		input := ImportSegmentsInput{
			DatasetID: "abc123!!!456",
		}
		err := validateSegmentImportInput(input)
		if err == nil {
			t.Fatal("expected error for invalid characters in dataset ID")
		}
	})

	t.Run("invalid location ID", func(t *testing.T) {
		input := ImportSegmentsInput{
			DatasetID:  "abc123def456",
			LocationID: "invalid",
		}
		err := validateSegmentImportInput(input)
		if err == nil {
			t.Fatal("expected error for invalid location ID")
		}
	})

	t.Run("invalid cluster ID", func(t *testing.T) {
		input := ImportSegmentsInput{
			DatasetID:  "abc123def456",
			LocationID: "xyz789uvw012",
			ClusterID:  "invalid",
		}
		err := validateSegmentImportInput(input)
		if err == nil {
			t.Fatal("expected error for invalid cluster ID")
		}
	})
}

func TestCountTotalSegments(t *testing.T) {
	t.Run("empty", func(t *testing.T) {
		count := countTotalSegments(map[string]scannedDataFile{})
		if count != 0 {
			t.Errorf("expected 0, got %d", count)
		}
	})

	t.Run("single file - no segments", func(t *testing.T) {
		files := map[string]scannedDataFile{
			"file1": {Segments: []*utils.Segment{}},
		}
		count := countTotalSegments(files)
		if count != 0 {
			t.Errorf("expected 0, got %d", count)
		}
	})

	t.Run("single file - multiple segments", func(t *testing.T) {
		files := map[string]scannedDataFile{
			"file1": {Segments: []*utils.Segment{{}, {}, {}}},
		}
		count := countTotalSegments(files)
		if count != 3 {
			t.Errorf("expected 3, got %d", count)
		}
	})

	t.Run("multiple files", func(t *testing.T) {
		files := map[string]scannedDataFile{
			"file1": {Segments: []*utils.Segment{{}, {}}},
			"file2": {Segments: []*utils.Segment{{}}},
			"file3": {Segments: []*utils.Segment{{}, {}, {}, {}}},
		}
		count := countTotalSegments(files)
		if count != 7 {
			t.Errorf("expected 7, got %d", count)
		}
	})
}
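// A happy-path companion to the invalid-ID tests above: a minimal sketch that
// assumes 12-character alphanumeric IDs satisfy utils.ValidateShortID (as the
// IDs in the existing tests suggest) and uses a temp folder plus an empty
// mapping file so the os.Stat checks in validateSegmentImportInput pass.
package imp

import (
	"os"
	"path/filepath"
	"testing"
)

func TestValidateSegmentImportInput_ValidSketch(t *testing.T) {
	dir := t.TempDir()
	mapping := filepath.Join(dir, "mapping.json")
	if err := os.WriteFile(mapping, []byte("{}"), 0o644); err != nil {
		t.Fatal(err)
	}
	input := ImportSegmentsInput{
		Folder:     dir,
		Mapping:    mapping,
		DatasetID:  "abc123def456", // hypothetical but format-valid IDs
		LocationID: "xyz789uvw012",
		ClusterID:  "qrs345tuv678",
	}
	if err := validateSegmentImportInput(input); err != nil {
		t.Fatalf("expected valid input to pass validation, got: %v", err)
	}
}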
package imp

import (
	"context"
	"database/sql"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"

	"skraak/db"
	"skraak/utils"
)

// ImportSegmentsInput defines the input parameters for the import_segments tool
type ImportSegmentsInput struct {
	DBPath          string `json:"db_path"`
	Folder          string `json:"folder"`
	Mapping         string `json:"mapping"`
	DatasetID       string `json:"dataset_id"`
	LocationID      string `json:"location_id"`
	ClusterID       string `json:"cluster_id"`
	ProgressHandler func(processed, total int, message string)
}

// ImportSegmentsOutput defines the output structure for the import_segments tool
type ImportSegmentsOutput struct {
	Summary  ImportSegmentsSummary `json:"summary"`
	Segments []SegmentImport       `json:"segments"`
	Errors   []ImportSegmentError  `json:"errors,omitempty"`
}

// ImportSegmentsSummary provides summary statistics for the import operation
type ImportSegmentsSummary struct {
	DataFilesFound     int   `json:"data_files_found"`
	DataFilesProcessed int   `json:"data_files_processed"`
	TotalSegments      int   `json:"total_segments"`
	ImportedSegments   int   `json:"imported_segments"`
	ImportedLabels     int   `json:"imported_labels"`
	ImportedSubtypes   int   `json:"imported_subtypes"`
	ProcessingTimeMs   int64 `json:"processing_time_ms"`
}

// SegmentImport represents an imported segment in the output
type SegmentImport struct {
	SegmentID string        `json:"segment_id"`
	FileName  string        `json:"file_name"`
	StartTime float64       `json:"start_time"`
	EndTime   float64       `json:"end_time"`
	FreqLow   float64       `json:"freq_low"`
	FreqHigh  float64       `json:"freq_high"`
	Labels    []LabelImport `json:"labels"`
}

// LabelImport represents an imported label in the output
type LabelImport struct {
	LabelID   string `json:"label_id"`
	Species   string `json:"species"`
	CallType  string `json:"calltype,omitempty"`
	Filter    string `json:"filter"`
	Certainty int    `json:"certainty"`
	Comment   string `json:"comment,omitempty"`
}

// ImportSegmentError records errors encountered during segment import
type ImportSegmentError struct {
	File    string            `json:"file,omitempty"`
	Stage   utils.ImportStage `json:"stage"`
	Message string            `json:"message"`
}

// scannedDataFile holds parsed data for a .data file
type scannedDataFile struct {
	DataPath string
	WavPath  string
	WavHash  string
	FileID   string
	Duration float64
	Segments []*utils.Segment
}

// segmentValidation holds the results of pre-import validation (phases B+C).
type segmentValidation struct {
	scannedFiles  []scannedDataFile
	filterIDMap   map[string]string
	speciesIDMap  map[string]string
	calltypeIDMap map[string]map[string]string
	fileIDMap     map[string]scannedDataFile
}

// validateAndPrepareSegments performs phases B+C: parse data files, validate DB state, and prepare ID maps.
func validateAndPrepareSegments(
	database *sql.DB,
	input ImportSegmentsInput,
	mapping utils.MappingFile,
	dataFiles []string,
) (*segmentValidation, []ImportSegmentError, error) {
	// Phase B: Parse all .data files and collect unique values
	scannedFiles, parseErrors, uniqueFilters, uniqueSpecies, uniqueCalltypes := scanAllDataFiles(dataFiles, input.Folder)
	if len(scannedFiles) == 0 {
		return nil, parseErrors, nil
	}

	// Validate the dataset/location/cluster hierarchy
	if err := validateSegmentHierarchy(database, input.DatasetID, input.LocationID, input.ClusterID); err != nil {
		return nil, parseErrors, err
	}

	// Validate all filters exist
	filterIDMap, err := validateFiltersExist(database, uniqueFilters)
	if err != nil {
		return nil, parseErrors, fmt.Errorf("filter validation failed: %w", err)
	}

	// Validate the mapping covers all species/calltypes and that they exist in the DB
	validationResult, err := utils.ValidateMappingAgainstDB(database, mapping, uniqueSpecies, uniqueCalltypes)
	if err != nil {
		return nil, parseErrors, fmt.Errorf("mapping validation failed: %w", err)
	}
	if validationResult.HasErrors() {
		return nil, parseErrors, fmt.Errorf("mapping validation failed: %s", validationResult.Error())
	}

	// Load species and calltype ID maps
	speciesIDMap, calltypeIDMap, err := loadSpeciesCalltypeIDs(database, mapping, uniqueSpecies, uniqueCalltypes)
	if err != nil {
		return nil, parseErrors, fmt.Errorf("failed to load species/calltype IDs: %w", err)
	}

	// Validate files: hash exists, linked to dataset, no existing labels
	fileIDMap, hashErrors := validateAndMapFiles(database, scannedFiles, input.ClusterID, input.DatasetID)
	allErrors := append(parseErrors, hashErrors...)

	return &segmentValidation{
		scannedFiles:  scannedFiles,
		filterIDMap:   filterIDMap,
		speciesIDMap:  speciesIDMap,
		calltypeIDMap: calltypeIDMap,
		fileIDMap:     fileIDMap,
	}, allErrors, nil
}

// ImportSegments imports segments from AviaNZ .data files into the database
func ImportSegments(ctx context.Context, input ImportSegmentsInput) (ImportSegmentsOutput, error) {
	startTime := time.Now()
	var output ImportSegmentsOutput
	output.Segments = make([]SegmentImport, 0)
	output.Errors = make([]ImportSegmentError, 0)

	// Phase A: Input validation
	if err := validateSegmentImportInput(input); err != nil {
		return output, err
	}

	// Load the mapping file
	mapping, err := utils.LoadMappingFile(input.Mapping)
	if err != nil {
		return output, fmt.Errorf("failed to load mapping file: %w", err)
	}

	// Find .data files
	dataFiles, err := utils.FindDataFiles(input.Folder)
	if err != nil {
		return output, fmt.Errorf("failed to find .data files: %w", err)
	}
	output.Summary.DataFilesFound = len(dataFiles)
	if len(dataFiles) == 0 {
		return output, fmt.Errorf("no .data files found in folder: %s", input.Folder)
	}

	// Phases B+C: Parse data files and validate against the DB
	database, err := db.OpenWriteableDB(db.ResolveDBPath(input.DBPath, ""))
	if err != nil {
		return output, fmt.Errorf("failed to open database: %w", err)
	}
	defer database.Close()

	val, valErrors, err := validateAndPrepareSegments(database, input, mapping, dataFiles)
	output.Errors = append(output.Errors, valErrors...)
	if err != nil {
		return output, err
	}
	if val == nil || len(val.fileIDMap) == 0 {
		output.Summary.ProcessingTimeMs = time.Since(startTime).Milliseconds()
		return output, nil
	}

	// Phase D: Transactional import
	importedSegments, importedLabels, importedSubtypes, fileUpdates, importErrors := importSegmentsIntoDB(
		ctx, database, val.fileIDMap, val.scannedFiles, mapping, val.filterIDMap, val.speciesIDMap, val.calltypeIDMap, input.DatasetID, input.ProgressHandler,
	)
	output.Errors = append(output.Errors, importErrors...)
	output.Segments = append(output.Segments, importedSegments...)

	// Phase E: Write IDs back to the .data files
	if len(fileUpdates) > 0 {
		writeErrors := writeIDsToDataFiles(fileUpdates)
		output.Errors = append(output.Errors, writeErrors...)
	}

	output.Summary.DataFilesProcessed = len(val.fileIDMap)
	output.Summary.TotalSegments = countTotalSegments(val.fileIDMap)
	output.Summary.ImportedSegments = len(importedSegments)
	output.Summary.ImportedLabels = importedLabels
	output.Summary.ImportedSubtypes = importedSubtypes
	output.Summary.ProcessingTimeMs = time.Since(startTime).Milliseconds()
	return output, nil
}

// validateSegmentImportInput validates input parameters
func validateSegmentImportInput(input ImportSegmentsInput) error {
	// Validate ID formats first (fast fail before filesystem checks); this also
	// lets the unit tests exercise ID validation without a real folder.
	if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
		return err
	}
	if err := utils.ValidateShortID(input.LocationID, "location_id"); err != nil {
		return err
	}
	if err := utils.ValidateShortID(input.ClusterID, "cluster_id"); err != nil {
		return err
	}
	// Validate the folder exists
	if info, err := os.Stat(input.Folder); err != nil {
		return fmt.Errorf("folder does not exist: %s", input.Folder)
	} else if !info.IsDir() {
		return fmt.Errorf("path is not a folder: %s", input.Folder)
	}
	// Validate the mapping file exists
	if _, err := os.Stat(input.Mapping); err != nil {
		return fmt.Errorf("mapping file does not exist: %s", input.Mapping)
	}
	return nil
}

// validateSegmentHierarchy validates dataset/location/cluster relationships
func validateSegmentHierarchy(dbConn *sql.DB, datasetID, locationID, clusterID string) error {
	// Validate the dataset exists and is structured
	if err := db.ValidateDatasetTypeForImport(dbConn, datasetID); err != nil {
		return err
	}
	// Validate the location belongs to the dataset
	if err := db.ValidateLocationBelongsToDataset(dbConn, locationID, datasetID); err != nil {
		return err
	}
	// Validate the cluster belongs to the location
	if err := db.ClusterBelongsToLocation(dbConn, clusterID, locationID); err != nil {
		return err
	}
	return nil
}

// scanAllDataFiles parses all .data files and collects unique values
func scanAllDataFiles(dataFiles []string, folder string) (
	[]scannedDataFile,
	[]ImportSegmentError,
	map[string]bool,
	map[string]bool,
	map[string]map[string]bool,
) {
	var scanned []scannedDataFile
	var errors []ImportSegmentError
	uniqueFilters := make(map[string]bool)
	uniqueSpecies := make(map[string]bool)
	uniqueCalltypes := make(map[string]map[string]bool) // species -> calltype -> true

	for _, dataPath := range dataFiles {
		// Find the corresponding WAV file
		wavPath := strings.TrimSuffix(dataPath, ".data")
		if _, err := os.Stat(wavPath); err != nil {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(dataPath),
				Stage:   utils.StageValidation,
				Message: fmt.Sprintf("corresponding WAV file not found: %s", filepath.Base(wavPath)),
			})
			continue
		}

		// Parse the .data file
		df, err := utils.ParseDataFile(dataPath)
		if err != nil {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(dataPath),
				Stage:   utils.StageValidation,
				Message: fmt.Sprintf("failed to parse .data file: %v", err),
			})
			continue
		}

		// Collect unique filters, species, and calltypes
		for _, seg := range df.Segments {
			for _, label := range seg.Labels {
				uniqueFilters[label.Filter] = true
				uniqueSpecies[label.Species] = true
				if label.CallType != "" {
					if uniqueCalltypes[label.Species] == nil {
						uniqueCalltypes[label.Species] = make(map[string]bool)
					}
					uniqueCalltypes[label.Species][label.CallType] = true
				}
			}
		}

		scanned = append(scanned, scannedDataFile{
			DataPath: dataPath,
			WavPath:  wavPath,
			Duration: df.Meta.Duration,
			Segments: df.Segments,
		})
	}

	return scanned, errors, uniqueFilters, uniqueSpecies, uniqueCalltypes
}

// validateFiltersExist checks all filters exist in the DB and returns an ID map
func validateFiltersExist(dbConn *sql.DB, filterNames map[string]bool) (map[string]string, error) {
	filterIDMap := make(map[string]string)
	if len(filterNames) == 0 {
		return filterIDMap, nil
	}

	names := make([]string, 0, len(filterNames))
	for name := range filterNames {
		names = append(names, name)
	}
	query := `SELECT id, name FROM filter WHERE name IN (` + db.Placeholders(len(names)) + `) AND active = true`
	args := make([]any, len(names))
	for i, name := range names {
		args[i] = name
	}
	rows, err := dbConn.Query(query, args...)
	if err != nil {
		return nil, fmt.Errorf("failed to query filters: %w", err)
	}
	defer rows.Close()
	for rows.Next() {
		var id, name string
		if err := rows.Scan(&id, &name); err == nil {
			filterIDMap[name] = id
		}
	}

	// Check for missing filters
	var missing []string
	for name := range filterNames {
		if _, exists := filterIDMap[name]; !exists {
			missing = append(missing, name)
		}
	}
	if len(missing) > 0 {
		return nil, fmt.Errorf("filters not found in database: [%s]", strings.Join(missing, ", "))
	}
	return filterIDMap, nil
}

// loadSpeciesCalltypeIDs loads species and calltype ID maps
func loadSpeciesCalltypeIDs(
	dbConn *sql.DB,
	mapping utils.MappingFile,
	uniqueSpecies map[string]bool,
	uniqueCalltypes map[string]map[string]bool,
) (map[string]string, map[string]map[string]string, error) {
	speciesIDMap := make(map[string]string)
	calltypeIDMap := make(map[string]map[string]string) // dbSpecies -> dbCalltype -> calltype_id

	// Collect all DB species labels from the mapping
	dbSpeciesSet := make(map[string]bool)
	for dataSpecies := range uniqueSpecies {
		if dbSpecies, ok := mapping.GetDBSpecies(dataSpecies); ok {
			dbSpeciesSet[dbSpecies] = true
		}
	}

	// Load species IDs
	if len(dbSpeciesSet) > 0 {
		dbSpeciesList := make([]string, 0, len(dbSpeciesSet))
		for s := range dbSpeciesSet {
			dbSpeciesList = append(dbSpeciesList, s)
		}
		query := `SELECT id, label FROM species WHERE label IN (` + db.Placeholders(len(dbSpeciesList)) + `) AND active = true`
		args := make([]any, len(dbSpeciesList))
		for i, s := range dbSpeciesList {
			args[i] = s
		}
		rows, err := dbConn.Query(query, args...)
		if err != nil {
			return nil, nil, fmt.Errorf("failed to query species: %w", err)
		}
		defer rows.Close()
		for rows.Next() {
			var id, label string
			if err := rows.Scan(&id, &label); err == nil {
				speciesIDMap[label] = id
			}
		}
	}

	// Load calltype IDs
	for dataSpecies, ctSet := range uniqueCalltypes {
		dbSpecies, ok := mapping.GetDBSpecies(dataSpecies)
		if !ok {
			continue
		}
		if calltypeIDMap[dbSpecies] == nil {
			calltypeIDMap[dbSpecies] = make(map[string]string)
		}
		for dataCalltype := range ctSet {
			dbCalltype := mapping.GetDBCalltype(dataSpecies, dataCalltype)
			// Query the calltype ID
			var calltypeID string
			err := dbConn.QueryRow(`
				SELECT ct.id
				FROM call_type ct
				JOIN species s ON ct.species_id = s.id
				WHERE s.label = ? AND ct.label = ? AND ct.active = true`, dbSpecies, dbCalltype).Scan(&calltypeID)
			if err == nil {
				calltypeIDMap[dbSpecies][dbCalltype] = calltypeID
			}
		}
	}

	return speciesIDMap, calltypeIDMap, nil
}

// validateAndMapFiles validates that files exist by hash, are linked to the dataset, and have no existing labels
func validateAndMapFiles(
	dbConn *sql.DB,
	scannedFiles []scannedDataFile,
	clusterID string,
	datasetID string,
) (map[string]scannedDataFile, []ImportSegmentError) {
	fileIDMap := make(map[string]scannedDataFile)
	var errors []ImportSegmentError

	for _, sf := range scannedFiles {
		// Compute the hash
		hash, err := utils.ComputeXXH64(sf.WavPath)
		if err != nil {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(sf.WavPath),
				Stage:   utils.StageHash,
				Message: fmt.Sprintf("failed to compute hash: %v", err),
			})
			continue
		}
		sf.WavHash = hash

		// Find the file by hash within the cluster
		var fileID string
		var duration float64
		err = dbConn.QueryRow(`
			SELECT id, duration FROM file
			WHERE xxh64_hash = ? AND cluster_id = ? AND active = true`, hash, clusterID).Scan(&fileID, &duration)
		if err == sql.ErrNoRows {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(sf.WavPath),
				Stage:   utils.StageValidation,
				Message: fmt.Sprintf("file hash not found in database for cluster (hash: %s)", hash),
			})
			continue
		}
		if err != nil {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(sf.WavPath),
				Stage:   utils.StageValidation,
				Message: fmt.Sprintf("failed to query file: %v", err),
			})
			continue
		}
		sf.FileID = fileID
		sf.Duration = duration

		// Verify the file is linked to the dataset via the file_dataset junction table (composite FK)
		var fileLinkedToDataset bool
		err = dbConn.QueryRow(`
			SELECT EXISTS(SELECT 1 FROM file_dataset WHERE file_id = ? AND dataset_id = ?)`, fileID, datasetID).Scan(&fileLinkedToDataset)
		if err != nil {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(sf.WavPath),
				Stage:   utils.StageValidation,
				Message: fmt.Sprintf("failed to verify file-dataset link: %v", err),
			})
			continue
		}
		if !fileLinkedToDataset {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(sf.WavPath),
				Stage:   utils.StageValidation,
				Message: fmt.Sprintf("file exists in cluster but is not linked to dataset %s", datasetID),
			})
			continue
		}

		// Check there are no existing labels for this file
		var labelCount int
		err = dbConn.QueryRow(`
			SELECT COUNT(*) FROM label l
			JOIN segment s ON l.segment_id = s.id
			WHERE s.file_id = ? AND l.active = true`, fileID).Scan(&labelCount)
		if err != nil {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(sf.WavPath),
				Stage:   utils.StageValidation,
				Message: fmt.Sprintf("failed to check existing labels: %v", err),
			})
			continue
		}
		if labelCount > 0 {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(sf.WavPath),
				Stage:   utils.StageValidation,
				Message: fmt.Sprintf("file already has %d label(s) - fresh imports only", labelCount),
			})
			continue
		}

		fileIDMap[fileID] = sf
	}

	return fileIDMap, errors
}

// dataFileUpdate holds data to write back to a .data file after import
type dataFileUpdate struct {
	DataPath string
	WavHash  string
	LabelIDs map[int]map[int]string // segmentIndex -> labelIndex -> labelID
}

// importLabelResult holds the result of importing a single label.
type importLabelResult struct {
	labelImport      LabelImport
	labelID          string
	subtypesImported int
	err              ImportSegmentError
	hasError         bool
}

// importSingleLabel inserts a single label and its metadata/subtype into the DB.
func importSingleLabel(
	ctx context.Context,
	tx *db.LoggedTx,
	label *utils.Label,
	segmentID string,
	segIdx, labelIdx int,
	sf scannedDataFile,
	mapping utils.MappingFile,
	filterIDMap map[string]string,
	speciesIDMap map[string]string,
	calltypeIDMap map[string]map[string]string,
) importLabelResult {
	dbSpecies, ok := mapping.GetDBSpecies(label.Species)
	if !ok {
		return importLabelResult{err: ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("species not found in mapping: %s", label.Species),
		}, hasError: true}
	}
	speciesID, ok := speciesIDMap[dbSpecies]
	if !ok {
		return importLabelResult{err: ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("species ID not found: %s", dbSpecies),
		}, hasError: true}
	}
	filterID, ok := filterIDMap[label.Filter]
	if !ok {
		return importLabelResult{err: ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("filter ID not found: %s", label.Filter),
		}, hasError: true}
	}

	labelID, err := utils.GenerateLongID()
	if err != nil {
		return importLabelResult{err: ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("failed to generate label ID: %v", err),
		}, hasError: true}
	}

	_, err = tx.ExecContext(ctx, `
		INSERT INTO label (id, segment_id, species_id, filter_id, certainty, created_at, last_modified, active)
		VALUES (?, ?, ?, ?, ?, now(), now(), true)`, labelID, segmentID, speciesID, filterID, label.Certainty)
	if err != nil {
		return importLabelResult{err: ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("failed to insert label: %v", err),
		}, hasError: true}
	}

	// Insert label_metadata if a comment exists. Marshal with encoding/json so
	// quotes, backslashes, and control characters are all escaped correctly.
	if label.Comment != "" {
		metadataJSON, mErr := json.Marshal(map[string]string{"comment": label.Comment})
		if mErr != nil {
			return importLabelResult{err: ImportSegmentError{
				File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
				Message: fmt.Sprintf("failed to encode comment: %v", mErr),
			}, hasError: true}
		}
		if _, err := tx.ExecContext(ctx, `
			INSERT INTO label_metadata (label_id, json, created_at, last_modified, active)
			VALUES (?, ?, now(), now(), true)`, labelID, string(metadataJSON)); err != nil {
			return importLabelResult{err: ImportSegmentError{
				File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
				Message: fmt.Sprintf("failed to insert label_metadata: %v", err),
			}, hasError: true}
		}
	}

	labelImport := LabelImport{
		LabelID:   labelID,
		Species:   dbSpecies,
		Filter:    label.Filter,
		Certainty: label.Certainty,
	}
	if label.Comment != "" {
		labelImport.Comment = label.Comment
	}

	// Insert label_subtype if a calltype exists
	if label.CallType != "" {
		if err := importCalltype(ctx, tx, labelID, label, dbSpecies, filterID, mapping, calltypeIDMap, sf); err != nil {
			return importLabelResult{err: *err, hasError: true}
		}
		labelImport.CallType = mapping.GetDBCalltype(label.Species, label.CallType)
		return importLabelResult{labelImport: labelImport, labelID: labelID, subtypesImported: 1}
	}

	return importLabelResult{labelImport: labelImport, labelID: labelID}
}

// importCalltype inserts a label_subtype row for a calltype label.
func importCalltype(
	ctx context.Context,
	tx *db.LoggedTx,
	labelID string,
	label *utils.Label,
	dbSpecies string,
	filterID string,
	mapping utils.MappingFile,
	calltypeIDMap map[string]map[string]string,
	sf scannedDataFile,
) *ImportSegmentError {
	dbCalltype := mapping.GetDBCalltype(label.Species, label.CallType)
	calltypeID := ""
	if calltypeIDMap[dbSpecies] != nil {
		calltypeID = calltypeIDMap[dbSpecies][dbCalltype]
	}
	if calltypeID == "" {
		return &ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("calltype ID not found: %s/%s", dbSpecies, dbCalltype),
		}
	}

	subtypeID, err := utils.GenerateLongID()
	if err != nil {
		return &ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("failed to generate label_subtype ID: %v", err),
		}
	}

	_, err = tx.ExecContext(ctx, `
		INSERT INTO label_subtype (id, label_id, calltype_id, filter_id, certainty, created_at, last_modified, active)
		VALUES (?, ?, ?, ?, ?, now(), now(), true)`, subtypeID, labelID, calltypeID, filterID, label.Certainty)
	if err != nil {
		return &ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("failed to insert label_subtype: %v", err),
		}
	}
	return nil
}

// importSegmentsIntoDB performs the transactional import
func importSegmentsIntoDB(
	ctx context.Context,
	database *sql.DB,
	fileIDMap map[string]scannedDataFile,
	scannedFiles []scannedDataFile,
	mapping utils.MappingFile,
	filterIDMap map[string]string,
	speciesIDMap map[string]string,
	calltypeIDMap map[string]map[string]string,
	datasetID string,
	progressHandler func(processed, total int, message string),
) ([]SegmentImport, int, int, []dataFileUpdate, []ImportSegmentError) {
	var importedSegments []SegmentImport
	var errors []ImportSegmentError
	importedLabels := 0
	importedSubtypes := 0
	var fileUpdates []dataFileUpdate

	tx, err := db.BeginLoggedTx(ctx, database, "import_segments")
	if err != nil {
		errors = append(errors, ImportSegmentError{
			Stage:   utils.StageImport,
			Message: fmt.Sprintf("failed to begin transaction: %v", err),
		})
		return nil, 0, 0, nil, errors
	}
	defer tx.Rollback()

	totalFiles := len(fileIDMap)
	processedFiles := 0
	for _, sf := range fileIDMap {
		if sf.FileID == "" {
			continue
		}
		processedFiles++
		if progressHandler != nil {
			progressHandler(processedFiles, totalFiles, filepath.Base(sf.DataPath))
		}

		fileUpdate := dataFileUpdate{
			DataPath: sf.DataPath,
			WavHash:  sf.WavHash,
			LabelIDs: make(map[int]map[int]string),
		}

		for segIdx, seg := range sf.Segments {
			segImp, labelIDs, subtypes, segErrs := importSegment(ctx, tx, seg, segIdx, sf, datasetID, mapping, filterIDMap, speciesIDMap, calltypeIDMap)
			errors = append(errors, segErrs...)
			importedSubtypes += subtypes
			if len(segImp.Labels) == 0 {
				// Delete the orphaned segment (no labels succeeded)
				if _, err := tx.ExecContext(ctx, `DELETE FROM segment WHERE id = ?`, segImp.SegmentID); err != nil {
					errors = append(errors, ImportSegmentError{
						File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
						Message: fmt.Sprintf("failed to delete orphaned segment: %v", err),
					})
				}
			} else {
				importedSegments = append(importedSegments, segImp)
				importedLabels += len(labelIDs)
				fileUpdate.LabelIDs[segIdx] = labelIDs
			}
		}

		fileUpdates = append(fileUpdates, fileUpdate)
	}

	if err := tx.Commit(); err != nil {
		errors = append(errors, ImportSegmentError{
			Stage:   utils.StageImport,
			Message: fmt.Sprintf("failed to commit transaction: %v", err),
		})
		return nil, 0, 0, nil, errors
	}

	return importedSegments, importedLabels, importedSubtypes, fileUpdates, errors
}

// importSegment inserts a single segment and its labels into the DB.
func importSegment(
	ctx context.Context,
	tx *db.LoggedTx,
	seg *utils.Segment,
	segIdx int,
	sf scannedDataFile,
	datasetID string,
	mapping utils.MappingFile,
	filterIDMap map[string]string,
	speciesIDMap map[string]string,
	calltypeIDMap map[string]map[string]string,
) (SegmentImport, map[int]string, int, []ImportSegmentError) {
	var errors []ImportSegmentError

	if seg.StartTime >= seg.EndTime {
		errors = append(errors, ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("invalid segment bounds: start=%.2f >= end=%.2f", seg.StartTime, seg.EndTime),
		})
		return SegmentImport{}, nil, 0, errors
	}
	if seg.EndTime > sf.Duration {
		errors = append(errors, ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("segment end time (%.2f) exceeds file duration (%.2f)", seg.EndTime, sf.Duration),
		})
		return SegmentImport{}, nil, 0, errors
	}

	segmentID, err := utils.GenerateLongID()
	if err != nil {
		errors = append(errors, ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("failed to generate segment ID: %v", err),
		})
		return SegmentImport{}, nil, 0, errors
	}

	_, err = tx.ExecContext(ctx, `
		INSERT INTO segment (id, file_id, dataset_id, start_time, end_time, freq_low, freq_high, created_at, last_modified, active)
		VALUES (?, ?, ?, ?, ?, ?, ?, now(), now(), true)`, segmentID, sf.FileID, datasetID, seg.StartTime, seg.EndTime, seg.FreqLow, seg.FreqHigh)
	if err != nil {
		errors = append(errors, ImportSegmentError{
			File: filepath.Base(sf.DataPath), Stage: utils.StageImport,
			Message: fmt.Sprintf("failed to insert segment: %v", err),
		})
		return SegmentImport{}, nil, 0, errors
	}

	segImport := SegmentImport{
		SegmentID: segmentID,
		FileName:  filepath.Base(sf.WavPath),
		StartTime: seg.StartTime,
		EndTime:   seg.EndTime,
		FreqLow:   seg.FreqLow,
		FreqHigh:  seg.FreqHigh,
		Labels:    make([]LabelImport, 0),
	}

	labelIDs := make(map[int]string)
	var subtypesImported int
	for labelIdx, label := range seg.Labels {
		result := importSingleLabel(ctx, tx, label, segmentID, segIdx, labelIdx, sf, mapping, filterIDMap, speciesIDMap, calltypeIDMap)
		if result.hasError {
			errors = append(errors, result.err)
			continue
		}
		labelIDs[labelIdx] = result.labelID
		segImport.Labels = append(segImport.Labels, result.labelImport)
		subtypesImported += result.subtypesImported
	}

	return segImport, labelIDs, subtypesImported, errors
}

// countTotalSegments counts total segments across the validated files
func countTotalSegments(fileIDMap map[string]scannedDataFile) int {
	count := 0
	for _, sf := range fileIDMap {
		count += len(sf.Segments)
	}
	return count
}

// writeIDsToDataFiles writes skraak_hash and skraak_label_ids back to the .data files
func writeIDsToDataFiles(fileUpdates []dataFileUpdate) []ImportSegmentError {
	var errors []ImportSegmentError
	for _, fu := range fileUpdates {
		// Parse the .data file
		df, err := utils.ParseDataFile(fu.DataPath)
		if err != nil {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(fu.DataPath),
				Stage:   utils.StageImport,
				Message: fmt.Sprintf("failed to re-parse .data file for writing: %v", err),
			})
			continue
		}

		// Write skraak_hash to the metadata
		if df.Meta.Extra == nil {
			df.Meta.Extra = make(map[string]any)
		}
		df.Meta.Extra["skraak_hash"] = fu.WavHash

		// Write skraak_label_id to each label
		for segIdx, labelIDs := range fu.LabelIDs {
			if segIdx >= len(df.Segments) {
				continue
			}
			seg := df.Segments[segIdx]
			for labelIdx, labelID := range labelIDs {
				if labelIdx >= len(seg.Labels) {
					continue
				}
				label := seg.Labels[labelIdx]
				if label.Extra == nil {
					label.Extra = make(map[string]any)
				}
				label.Extra["skraak_label_id"] = labelID
			}
		}

		// Write the updated .data file
		if err := df.Write(fu.DataPath); err != nil {
			errors = append(errors, ImportSegmentError{
				File:    filepath.Base(fu.DataPath),
				Stage:   utils.StageImport,
				Message: fmt.Sprintf("failed to write updated .data file: %v", err),
			})
			continue
		}
	}
	return errors
}
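// Usage sketch for ImportSegments (compiled, not run: no Output comment). The
// import path "skraak/imp" and all paths/IDs are assumptions. It shows the
// ProgressHandler callback shape; on success, skraak_hash and skraak_label_id
// are written back into the .data files by phase E.
package imp_test

import (
	"context"
	"fmt"

	"skraak/imp"
)

func ExampleImportSegments() {
	out, err := imp.ImportSegments(context.Background(), imp.ImportSegmentsInput{
		DBPath:     "skraak.db",    // hypothetical
		Folder:     "/data/avianz", // hypothetical folder of .data files
		Mapping:    "mapping.json", // hypothetical species/calltype mapping
		DatasetID:  "abc123def456",
		LocationID: "xyz789uvw012",
		ClusterID:  "qrs345tuv678",
		ProgressHandler: func(processed, total int, message string) {
			fmt.Printf("[%d/%d] %s\n", processed, total, message)
		},
	})
	if err != nil {
		fmt.Println("import failed:", err)
		return
	}
	fmt.Printf("%d segments, %d labels imported\n",
		out.Summary.ImportedSegments, out.Summary.ImportedLabels)
}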
package imp

import (
	"context"
	"database/sql"
	"fmt"
	"os"
	"time"

	"skraak/db"
	"skraak/utils"
)

// ImportAudioFilesInput defines the input parameters for the import_audio_files tool
type ImportAudioFilesInput struct {
	DBPath     string `json:"db_path"`
	FolderPath string `json:"folder_path"`
	DatasetID  string `json:"dataset_id"`
	LocationID string `json:"location_id"`
	ClusterID  string `json:"cluster_id"`
	Recursive  *bool  `json:"recursive,omitempty"` // *bool because the default is true; a plain bool would make "not provided" indistinguishable from "false"
}

// ImportAudioFilesOutput defines the output structure for the import_audio_files tool
type ImportAudioFilesOutput struct {
	Summary ImportSummary           `json:"summary"`
	FileIDs []string                `json:"file_ids"`
	Errors  []utils.FileImportError `json:"errors,omitempty"`
}

// ImportSummary provides summary statistics for the import operation
type ImportSummary struct {
	TotalFiles     int     `json:"total_files"`
	ImportedFiles  int     `json:"imported_files"`
	SkippedFiles   int     `json:"skipped_files"` // Duplicates
	FailedFiles    int     `json:"failed_files"`
	AudioMothFiles int     `json:"audiomoth_files"`
	TotalDuration  float64 `json:"total_duration_seconds"`
	ProcessingTime string  `json:"processing_time"`
}

// ImportAudioFiles batch imports WAV files from a folder with hash-based duplicate detection
func ImportAudioFiles(ctx context.Context, input ImportAudioFilesInput) (ImportAudioFilesOutput, error) {
	startTime := time.Now()
	var output ImportAudioFilesOutput

	// Default recursive to true
	recursive := true
	if input.Recursive != nil {
		recursive = *input.Recursive
	}

	// Validate the folder and the database hierarchy (dataset → location → cluster)
	if err := validateImportInput(input, db.ResolveDBPath(input.DBPath, "")); err != nil {
		return output, fmt.Errorf("validation failed: %w", err)
	}

	// Open the database
	database, err := db.OpenWriteableDB(db.ResolveDBPath(input.DBPath, ""))
	if err != nil {
		return output, fmt.Errorf("failed to open database: %w", err)
	}
	defer database.Close()

	// Set the cluster path if empty
	err = utils.EnsureClusterPath(database, input.ClusterID, input.FolderPath)
	if err != nil {
		return output, fmt.Errorf("failed to set cluster path: %w", err)
	}

	// Import the cluster (the core logic lives in utils.ImportCluster)
	tx, err := db.BeginLoggedTx(ctx, database, "import_audio_files")
	if err != nil {
		return output, fmt.Errorf("failed to begin transaction: %w", err)
	}
	clusterOutput, err := utils.ImportCluster(database, tx.UnderlyingTx(), utils.ClusterImportInput{
		FolderPath: input.FolderPath,
		DatasetID:  input.DatasetID,
		LocationID: input.LocationID,
		ClusterID:  input.ClusterID,
		Recursive:  recursive,
	})
	if err != nil {
		tx.Rollback()
		return output, fmt.Errorf("cluster import failed: %w", err)
	}
	if err := tx.Commit(); err != nil {
		return output, fmt.Errorf("transaction commit failed: %w", err)
	}

	// Map to the output format
	output = ImportAudioFilesOutput{
		Summary: ImportSummary{
			TotalFiles:     clusterOutput.TotalFiles,
			ImportedFiles:  clusterOutput.ImportedFiles,
			SkippedFiles:   clusterOutput.SkippedFiles,
			FailedFiles:    clusterOutput.FailedFiles,
			AudioMothFiles: clusterOutput.AudioMothFiles,
			TotalDuration:  clusterOutput.TotalDuration,
			ProcessingTime: time.Since(startTime).String(),
		},
		FileIDs: []string{}, // File IDs are not tracked currently
		Errors:  clusterOutput.Errors,
	}
	return output, nil
}

// validateImportInput validates all input parameters and database relationships
func validateImportInput(input ImportAudioFilesInput, dbPath string) error {
	// Verify the folder exists
	info, err := os.Stat(input.FolderPath)
	if err != nil {
		return fmt.Errorf("folder not accessible: %w", err)
	}
	if !info.IsDir() {
		return fmt.Errorf("path is not a directory: %s", input.FolderPath)
	}
	return validateHierarchyIDs(input.DatasetID, input.LocationID, input.ClusterID, dbPath)
}

// validateHierarchyIDs validates dataset/location/cluster ID formats and database relationships
func validateHierarchyIDs(datasetID, locationID, clusterID, dbPath string) error {
	// Validate ID formats first (fast fail before DB queries)
	if err := utils.ValidateShortID(datasetID, "dataset_id"); err != nil {
		return err
	}
	if err := utils.ValidateShortID(locationID, "location_id"); err != nil {
		return err
	}
	if err := utils.ValidateShortID(clusterID, "cluster_id"); err != nil {
		return err
	}

	return db.WithReadDB(dbPath, func(database *sql.DB) error {
		// Verify the dataset exists, is active, and is 'structured' type
		if err := db.ValidateDatasetTypeForImport(database, datasetID); err != nil {
			return err
		}
		// Verify the location exists and belongs to the dataset
		if err := db.ValidateLocationBelongsToDataset(database, locationID, datasetID); err != nil {
			return err
		}
		// Verify the cluster exists and belongs to the location
		if err := db.ClusterBelongsToLocation(database, clusterID, locationID); err != nil {
			return err
		}
		return nil
	})
}
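// Usage sketch for ImportAudioFiles showing how to override the recursive
// default (true) via the *bool field. The import path "skraak/imp" and the
// values are assumptions; compiled, not run.
package imp_test

import (
	"context"
	"fmt"

	"skraak/imp"
)

func ExampleImportAudioFiles() {
	recursive := false // override the default of true
	out, err := imp.ImportAudioFiles(context.Background(), imp.ImportAudioFilesInput{
		DBPath:     "skraak.db",       // hypothetical
		FolderPath: "/data/cluster01", // hypothetical
		DatasetID:  "abc123def456",
		LocationID: "xyz789uvw012",
		ClusterID:  "qrs345tuv678",
		Recursive:  &recursive,
	})
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("imported %d files (%d AudioMoth)\n",
		out.Summary.ImportedFiles, out.Summary.AudioMothFiles)
}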
package imp

import (
	"context"
	"database/sql"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"

	"skraak/db"
	"skraak/utils"
)

// ImportFileInput defines the input parameters for the import_file tool
type ImportFileInput struct {
	DBPath     string `json:"db_path"`
	FilePath   string `json:"file_path"`
	DatasetID  string `json:"dataset_id"`
	LocationID string `json:"location_id"`
	ClusterID  string `json:"cluster_id"`
}

// ImportFileOutput defines the output structure for the import_file tool
type ImportFileOutput struct {
	FileID         string    `json:"file_id"`
	FileName       string    `json:"file_name"`
	Hash           string    `json:"hash"`
	Duration       float64   `json:"duration_seconds"`
	SampleRate     int       `json:"sample_rate"`
	TimestampLocal time.Time `json:"timestamp_local"`
	IsAudioMoth    bool      `json:"is_audiomoth"`
	IsDuplicate    bool      `json:"is_duplicate"`
	ProcessingTime string    `json:"processing_time"`
	Error          *string   `json:"error,omitempty"`
}

// ImportFile imports a single WAV file into the database with duplicate detection
func ImportFile(ctx context.Context, input ImportFileInput) (ImportFileOutput, error) {
	startTime := time.Now()
	var output ImportFileOutput

	// Phase 1: Validate the file path
	_, err := validateFilePath(input.FilePath)
	if err != nil {
		return output, fmt.Errorf("file validation failed: %w", err)
	}
	output.FileName = filepath.Base(input.FilePath)

	// Phase 2: Validate the database hierarchy
	if err := validateHierarchyIDs(input.DatasetID, input.LocationID, input.ClusterID, db.ResolveDBPath(input.DBPath, "")); err != nil {
		return output, fmt.Errorf("hierarchy validation failed: %w", err)
	}

	// Phase 3: Open a database connection (a single connection for all DB operations)
	database, err := db.OpenWriteableDB(db.ResolveDBPath(input.DBPath, ""))
	if err != nil {
		return output, fmt.Errorf("database connection failed: %w", err)
	}
	defer database.Close()

	// Phase 4: Get location data for astronomical calculations
	locData, err := utils.GetLocationData(database, input.LocationID)
	if err != nil {
		return output, fmt.Errorf("failed to get location data: %w", err)
	}

	// Phase 5: Process file metadata
	result, err := utils.ProcessSingleFile(input.FilePath, locData.Latitude, locData.Longitude, locData.TimezoneID, true)
	if err != nil {
		errMsg := err.Error()
		output.Error = &errMsg
		output.ProcessingTime = time.Since(startTime).String()
		return output, fmt.Errorf("file processing failed: %w", err)
	}

	// Populate the output with the extracted metadata
	output.FileName = result.FileName
	output.Hash = result.Hash
	output.Duration = result.Duration
	output.SampleRate = result.SampleRate
	output.TimestampLocal = result.TimestampLocal
	output.IsAudioMoth = result.IsAudioMoth

	// Phase 6: Ensure the cluster path is set
	if err := utils.EnsureClusterPath(database, input.ClusterID, filepath.Dir(input.FilePath)); err != nil {
		return output, fmt.Errorf("failed to set cluster path: %w", err)
	}

	// Phase 7: Insert into the database
	fileID, isDuplicate, err := insertFileIntoDB(ctx, database, result, input.DatasetID, input.ClusterID, input.LocationID)
	if err != nil {
		errMsg := err.Error()
		output.Error = &errMsg
		output.ProcessingTime = time.Since(startTime).String()
		return output, fmt.Errorf("database insertion failed: %w", err)
	}

	output.FileID = fileID
	output.IsDuplicate = isDuplicate
	output.ProcessingTime = time.Since(startTime).String()
	return output, nil
}

// validateFilePath validates that the file exists, is a regular file, is a WAV file, and is not empty
func validateFilePath(filePath string) (os.FileInfo, error) {
	// Check the file exists
	info, err := os.Stat(filePath)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, fmt.Errorf("file does not exist: %s", filePath)
		}
		return nil, fmt.Errorf("cannot access file: %w", err)
	}
	// Check it's a regular file
	if !info.Mode().IsRegular() {
		return nil, fmt.Errorf("path is not a regular file: %s", filePath)
	}
	// Check the extension is .wav (case-insensitive)
	ext := strings.ToLower(filepath.Ext(filePath))
	if ext != ".wav" {
		return nil, fmt.Errorf("file must be a WAV file (got extension: %s)", ext)
	}
	// Check the file is not empty
	if info.Size() == 0 {
		return nil, fmt.Errorf("file is empty: %s", filePath)
	}
	return info, nil
}

// insertFileIntoDB inserts a single file into the database.
// Returns (fileID, isDuplicate, error).
func insertFileIntoDB(
	ctx context.Context,
	database *sql.DB,
	result *utils.FileProcessingResult,
	datasetID, clusterID, locationID string,
) (string, bool, error) {
	// Begin a logged transaction
	tx, err := db.BeginLoggedTx(ctx, database, "import_audio_file")
	if err != nil {
		return "", false, fmt.Errorf("failed to begin transaction: %w", err)
	}
	defer tx.Rollback() // Rollback if not committed

	// Check for a duplicate hash
	existingID, isDup, err := utils.CheckDuplicateHash(tx, result.Hash)
	if err != nil {
		return "", false, err
	}
	if isDup {
		return existingID, true, nil
	}

	// Generate the file ID
	fileID, err := utils.GenerateLongID()
	if err != nil {
		return "", false, fmt.Errorf("ID generation failed: %w", err)
	}

	// Insert the file record
	_, err = tx.ExecContext(ctx, `
		INSERT INTO file (id, file_name, xxh64_hash, location_id, timestamp_local,
			cluster_id, duration, sample_rate, maybe_solar_night, maybe_civil_night,
			moon_phase, created_at, last_modified, active)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, now(), now(), true)`,
		fileID, result.FileName, result.Hash, locationID,
		result.TimestampLocal, clusterID, result.Duration, result.SampleRate,
		result.AstroData.SolarNight, result.AstroData.CivilNight, result.AstroData.MoonPhase,
	)
	if err != nil {
		return "", false, fmt.Errorf("file insert failed: %w", err)
	}

	// Insert the file_dataset junction row
	_, err = tx.ExecContext(ctx, `
		INSERT INTO file_dataset (file_id, dataset_id, created_at, last_modified)
		VALUES (?, ?, now(), now())`, fileID, datasetID)
	if err != nil {
		return "", false, fmt.Errorf("file_dataset insert failed: %w", err)
	}

	// If AudioMoth, insert moth_metadata
	if result.IsAudioMoth && result.MothData != nil {
		_, err = tx.ExecContext(ctx, `
			INSERT INTO moth_metadata (file_id, timestamp, recorder_id, gain, battery_v, temp_c,
				created_at, last_modified, active)
			VALUES (?, ?, ?, ?, ?, ?, now(), now(), true)`,
			fileID,
			result.MothData.Timestamp,
			&result.MothData.RecorderID,
			&result.MothData.Gain,
			&result.MothData.BatteryV,
			&result.MothData.TempC,
		)
		if err != nil {
			return "", false, fmt.Errorf("moth_metadata insert failed: %w", err)
		}
	}

	// Commit the transaction
	if err = tx.Commit(); err != nil {
		return "", false, fmt.Errorf("transaction commit failed: %w", err)
	}
	return fileID, false, nil
}
package imp

import (
	"context"
	"database/sql"
	"encoding/csv"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"skraak/db"
	"skraak/utils"
)

// BulkFileImportInput defines the input parameters for the bulk_file_import tool
type BulkFileImportInput struct {
	DBPath      string `json:"db_path"`
	DatasetID   string `json:"dataset_id"`
	CSVPath     string `json:"csv_path"`
	LogFilePath string `json:"log_file_path"`
}

// BulkFileImportOutput defines the output structure for the bulk_file_import tool
type BulkFileImportOutput struct {
	TotalLocations    int      `json:"total_locations"`
	ClustersCreated   int      `json:"clusters_created"`
	ClustersExisting  int      `json:"clusters_existing"`
	TotalFilesScanned int      `json:"total_files_scanned"`
	FilesImported     int      `json:"files_imported"`
	FilesDuplicate    int      `json:"files_duplicate"`
	FilesError        int      `json:"files_error"`
	ProcessingTime    string   `json:"processing_time"`
	Errors            []string `json:"errors,omitempty"`
}

// bulkLocationData holds CSV row data for a location
type bulkLocationData struct {
	LocationName  string
	LocationID    string
	DirectoryPath string
	DateRange     string
	SampleRate    int
	FileCount     int
}

// bulkImportStats tracks import statistics for a single cluster
type bulkImportStats struct {
	TotalFiles     int
	ImportedFiles  int
	DuplicateFiles int
	ErrorFiles     int
}

// progressLogger handles writing to both the log file and an internal buffer
type progressLogger struct {
	file   *os.File
	buffer *strings.Builder
}

// Log writes a formatted message with a timestamp to both the log file and the buffer
func (l *progressLogger) Log(format string, args ...any) {
	timestamp := time.Now().Format("2006-01-02 15:04:05")
	message := fmt.Sprintf(format, args...)
	line := fmt.Sprintf("[%s] %s\n", timestamp, message)
	// Write to the file; log write failures are non-fatal for import progress
	if _, err := l.file.WriteString(line); err != nil {
		fmt.Fprintf(os.Stderr, "Warning: log write failed: %v\n", err)
	}
	if err := l.file.Sync(); err != nil {
		fmt.Fprintf(os.Stderr, "Warning: log sync failed: %v\n", err)
	}
	// Also keep in memory for potential error reporting
	l.buffer.WriteString(line)
}

// failOutput sets error details and processing time on the output before returning.
func (o *BulkFileImportOutput) failOutput(errs []string, startTime time.Time) {
	o.Errors = errs
	o.ProcessingTime = time.Since(startTime).String()
}

// BulkFileImport imports WAV files across multiple locations using a CSV specification
func BulkFileImport(ctx context.Context, input BulkFileImportInput) (BulkFileImportOutput, error) {
	startTime := time.Now()
	var output BulkFileImportOutput

	// Open the log file
	logFile, err := os.OpenFile(input.LogFilePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return output, fmt.Errorf("failed to open log file: %w", err)
	}
	defer func() { _ = logFile.Close() }()

	logger := &progressLogger{
		file:   logFile,
		buffer: &strings.Builder{},
	}
	logger.Log("Starting bulk file import for dataset %s", input.DatasetID)

	// Phase 0: Validate input
	logger.Log("Validating input parameters...")
	if err := bulkValidateInput(input); err != nil {
		logger.Log("ERROR: Validation failed: %v", err)
		output.failOutput([]string{fmt.Sprintf("validation failed: %v", err)}, startTime)
		return output, fmt.Errorf("validation failed: %w", err)
	}
	logger.Log("Validation complete")

	// Phase 1: Read the CSV
	logger.Log("Reading CSV file: %s", input.CSVPath)
	locations, err := bulkReadCSV(input.CSVPath)
	if err != nil {
		logger.Log("ERROR: Failed to read CSV: %v", err)
		output.failOutput([]string{fmt.Sprintf("failed to read CSV: %v", err)}, startTime)
		return output, fmt.Errorf("failed to read CSV: %w", err)
	}
	logger.Log("Loaded %d locations from CSV", len(locations))
	output.TotalLocations = len(locations)

	// Phase 1.5: Validate all location_ids belong to the dataset
	logger.Log("Validating location_ids belong to dataset...")
	if err := bulkValidateLocations(logger, locations, input.DatasetID, db.ResolveDBPath(input.DBPath, "")); err != nil {
		output.failOutput([]string{err.Error()}, startTime)
		return output, err
	}
	logger.Log("Location validation complete")

	// Phase 2: Create/validate clusters
	logger.Log("=== Phase 1: Creating/Validating Clusters ===")
	database, err := db.OpenWriteableDB(db.ResolveDBPath(input.DBPath, ""))
	if err != nil {
		logger.Log("ERROR: Failed to open database: %v", err)
		output.failOutput([]string{fmt.Sprintf("failed to open database: %v", err)}, startTime)
		return output, fmt.Errorf("failed to open database: %w", err)
	}
	defer database.Close()

	clusterIDMap, created, existing, err := bulkCreateClusters(ctx, database, logger, locations, input.DatasetID)
	if err != nil {
		output.failOutput(output.Errors, startTime)
		return output, err
	}
	output.ClustersCreated = created
	output.ClustersExisting = existing

	// Phase 3: Import files
	logger.Log("=== Phase 2: Importing Files ===")
	fileStats, errs := bulkImportAllFiles(database, logger, locations, clusterIDMap, input.DatasetID)
	output.TotalFilesScanned = fileStats.TotalFiles
	output.FilesImported = fileStats.ImportedFiles
	output.FilesDuplicate = fileStats.DuplicateFiles
	output.FilesError = fileStats.ErrorFiles
	output.Errors = append(output.Errors, errs...)
	if len(errs) > 0 {
		output.ProcessingTime = time.Since(startTime).String()
		return output, fmt.Errorf("failed to import files: %s", errs[0])
	}

	logger.Log("=== Import Complete ===")
	logger.Log("Total files scanned: %d", fileStats.TotalFiles)
	logger.Log("Files imported: %d", fileStats.ImportedFiles)
	logger.Log("Duplicates skipped: %d", fileStats.DuplicateFiles)
	logger.Log("Errors: %d", fileStats.ErrorFiles)
	logger.Log("Processing time: %s", time.Since(startTime).Round(time.Second))

	output.ProcessingTime = time.Since(startTime).String()
	return output, nil
}

// bulkValidateInput validates input parameters
func bulkValidateInput(input BulkFileImportInput) error {
	// Validate the ID format first (fast fail before DB queries)
	if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
		return err
	}
	// Verify the CSV file exists
	if _, err := os.Stat(input.CSVPath); err != nil {
		return fmt.Errorf("CSV file not accessible: %w", err)
	}
	// Verify the log file path is writable
	logDir := filepath.Dir(input.LogFilePath)
	if _, err := os.Stat(logDir); err != nil {
		return fmt.Errorf("log file directory not accessible: %w", err)
	}
	// Open the database for validation queries
	database, err := db.OpenReadOnlyDB(db.ResolveDBPath(input.DBPath, ""))
	if err != nil {
		return fmt.Errorf("failed to open database: %w", err)
	}
	defer database.Close()
	// Verify the dataset exists and is structured
	if err := db.ValidateDatasetTypeForImport(database, input.DatasetID); err != nil {
		return err
	}
	return nil
}

// bulkValidateLocationsBelongToDataset validates that all unique location_ids in the CSV belong to the dataset
func bulkValidateLocationsBelongToDataset(dbConn *sql.DB, locations []bulkLocationData, datasetID string) []string {
	var errors []string
	// Collect unique location_ids
	uniqueLocations := make(map[string]bool)
	for _, loc := range locations {
		uniqueLocations[loc.LocationID] = true
	}
	// Validate each unique location_id
	for locationID := range uniqueLocations {
		if err := db.ValidateLocationBelongsToDataset(dbConn, locationID, datasetID); err != nil {
			errors = append(errors, err.Error())
		}
	}
	return errors
}

// bulkValidateLocations validates that all location_ids in the CSV belong to the dataset.
// Returns an error if validation fails.
func bulkValidateLocations(logger *progressLogger, locations []bulkLocationData, datasetID string, dbPath string) error {
	readDB, err := db.OpenReadOnlyDB(dbPath)
	if err != nil {
		logger.Log("ERROR: Failed to open database: %v", err)
		return fmt.Errorf("failed to open database: %w", err)
	}
	locationErrors := bulkValidateLocationsBelongToDataset(readDB, locations, datasetID)
	readDB.Close()
	if len(locationErrors) > 0 {
		for _, locErr := range locationErrors {
			logger.Log("ERROR: %s", locErr)
		}
		return fmt.Errorf("location validation failed: %d location(s) do not belong to dataset %s", len(locationErrors), datasetID)
	}
	return nil
}

// bulkCreateClusters creates or validates clusters for all locations.
// Returns the cluster ID map, counts of created/existing clusters, and any error.
func bulkCreateClusters(ctx context.Context, database *sql.DB, logger *progressLogger, locations []bulkLocationData, datasetID string) (map[string]string, int, int, error) {
	clusterIDMap := make(map[string]string)
	created := 0
	existing := 0

	for i, loc := range locations {
		logger.Log("[%d/%d] Processing location: %s", i+1, len(locations), loc.LocationName)

		var existingClusterID string
		err := database.QueryRow(`
			SELECT id FROM cluster
			WHERE location_id = ? AND name = ? AND active = true`, loc.LocationID, loc.DateRange).Scan(&existingClusterID)

		var clusterID string
		if err == sql.ErrNoRows {
			clusterID, err = bulkCreateCluster(ctx, database, datasetID, loc.LocationID, loc.DateRange, loc.SampleRate)
			if err != nil {
				logger.Log("ERROR: Failed to create cluster for location %s: %v", loc.LocationName, err)
				return nil, 0, 0, fmt.Errorf("failed to create cluster: %w", err)
			}
			logger.Log(" Created cluster: %s", clusterID)
			created++
		} else if err != nil {
			logger.Log("ERROR: Failed to check cluster for location %s: %v", loc.LocationName, err)
			return nil, 0, 0, fmt.Errorf("failed to check cluster: %w", err)
		} else {
			clusterID = existingClusterID
			logger.Log(" Using existing cluster: %s", clusterID)
			existing++
		}

		compositeKey := loc.LocationID + "|" + loc.DateRange
		clusterIDMap[compositeKey] = clusterID
	}
	return clusterIDMap, created, existing, nil
}

// bulkImportAllFiles imports files for all locations using the cluster ID map.
// Returns aggregate stats and any error messages.
func bulkImportAllFiles(database *sql.DB, logger *progressLogger, locations []bulkLocationData, clusterIDMap map[string]string, datasetID string) (bulkImportStats, []string) {
	var total bulkImportStats
	var errs []string

	for i, loc := range locations {
		compositeKey := loc.LocationID + "|" + loc.DateRange
		clusterID, ok := clusterIDMap[compositeKey]
		if !ok {
			continue
		}
		logger.Log("[%d/%d] Importing files for: %s", i+1, len(locations), loc.LocationName)
		logger.Log(" Directory: %s", loc.DirectoryPath)
		if _, err := os.Stat(loc.DirectoryPath); os.IsNotExist(err) {
			logger.Log(" WARNING: Directory not found, skipping")
			continue
		}
		stats, err := bulkImportFilesForCluster(database, logger, loc.DirectoryPath, datasetID, loc.LocationID, clusterID)
		if err != nil {
			errMsg := fmt.Sprintf("Failed to import files for location %s: %v", loc.LocationName, err)
			logger.Log("ERROR: %s", errMsg)
			return total, []string{errMsg}
		}
		logger.Log(" Scanned: %d files", stats.TotalFiles)
		logger.Log(" Imported: %d, Duplicates: %d", stats.ImportedFiles, stats.DuplicateFiles)
		if stats.ErrorFiles > 0 {
			logger.Log(" Errors: %d files", stats.ErrorFiles)
		}
		total.TotalFiles += stats.TotalFiles
		total.ImportedFiles += stats.ImportedFiles
		total.DuplicateFiles += stats.DuplicateFiles
		total.ErrorFiles += stats.ErrorFiles
	}
	return total, errs
}

// bulkReadCSV reads and validates the CSV specification file
func bulkReadCSV(path string) ([]bulkLocationData, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer func() { _ = file.Close() }()

	reader := csv.NewReader(file)
	records, err := reader.ReadAll()
	if err != nil {
		return nil, err
	}
	if len(records) == 0 {
		return nil, fmt.Errorf("CSV file is empty")
	}

	var locations []bulkLocationData
	for i, record := range records {
		if i == 0 {
			continue // Skip header
		}
		if len(record) < 6 {
			return nil, fmt.Errorf("CSV row %d has insufficient columns (expected 6, got %d)", i+1, len(record))
		}
		// Validate required string fields are non-empty
		locationName := strings.TrimSpace(record[0])
		if locationName == "" {
			return nil, fmt.Errorf("empty location_name in row %d", i+1)
		}
		directoryPath := strings.TrimSpace(record[2])
		if directoryPath == "" {
			return nil, fmt.Errorf("empty directory_path in row %d", i+1)
		}
		dateRange := strings.TrimSpace(record[3])
		if dateRange == "" {
			return nil, fmt.Errorf("empty date_range in row %d", i+1)
		}
		// Validate the location_id format (trimmed for consistency with the other columns)
		locationID := strings.TrimSpace(record[1])
		if err := utils.ValidateShortID(locationID, "location_id"); err != nil {
			return nil, fmt.Errorf("invalid location_id in row %d: %v", i+1, err)
		}
		sampleRate, err := strconv.Atoi(record[4])
		if err != nil {
			return nil, fmt.Errorf("invalid sample_rate in row %d: %v", i+1, err)
		}
		// Validate the sample rate is in a reasonable range
		if err := utils.ValidateSampleRate(sampleRate); err != nil {
			return nil, fmt.Errorf("invalid sample_rate in row %d: %v", i+1, err)
		}
		fileCount, err := strconv.Atoi(record[5])
		if err != nil {
			return nil, fmt.Errorf("invalid file_count in row %d: %v", i+1, err)
		}
		locations = append(locations, bulkLocationData{
			LocationName:  locationName,
			LocationID:    locationID,
			DirectoryPath: directoryPath,
			DateRange:     dateRange,
			SampleRate:    sampleRate,
			FileCount:     fileCount,
		})
	}
	return locations, nil
}

// bulkCreateCluster creates a new cluster in the database
func bulkCreateCluster(ctx context.Context, database *sql.DB, datasetID, locationID, name string, sampleRate int) (string, error) {
	// Generate a 12-character nanoid
	clusterID, err := utils.GenerateShortID()
	if err != nil {
		return "", fmt.Errorf("failed to generate cluster ID: %v", err)
	}
	now := time.Now().UTC()

	// Get the location name for the path
	var locationName string
	err = database.QueryRow("SELECT name FROM location WHERE id = ?", locationID).Scan(&locationName)
	if err != nil {
		return "", fmt.Errorf("failed to get location name: %v", err)
	}

	// Normalize the path: replace spaces and path separators
	path := strings.ReplaceAll(locationName, " ", "_")
	path = strings.ReplaceAll(path, "/", "_")

	tx, err := db.BeginLoggedTx(ctx, database, "bulk_file_import")
	if err != nil {
		return "", fmt.Errorf("failed to begin transaction: %w", err)
	}
	defer tx.Rollback()

	_, err = tx.ExecContext(ctx, `
		INSERT INTO cluster (id, dataset_id, location_id, name, path, sample_rate, active, created_at, last_modified)
		VALUES (?, ?, ?, ?, ?, ?, true, ?, ?)`, clusterID, datasetID, locationID, name, path, sampleRate, now, now)
	if err != nil {
		return "", fmt.Errorf("failed to insert cluster: %w", err)
	}
	if err = tx.Commit(); err != nil {
		return "", fmt.Errorf("failed to commit cluster creation: %w", err)
	}
	return clusterID, nil
}

// bulkImportFilesForCluster imports all WAV files for a single cluster
func bulkImportFilesForCluster(database *sql.DB, logger *progressLogger, folderPath, datasetID, locationID, clusterID string) (*bulkImportStats, error) {
	stats := &bulkImportStats{}

	// Check the directory exists
	if _, err := os.Stat(folderPath); os.IsNotExist(err) {
		logger.Log(" WARNING: Directory not found, skipping")
		return stats, nil
	}

	// Import the cluster (same logic as import_files.go)
	logger.Log(" Importing cluster %s", clusterID)
	ctx := context.Background()
	tx, err := db.BeginLoggedTx(ctx, database, "import_audio_files")
	if err != nil {
		return nil, fmt.Errorf("failed to begin transaction: %w", err)
	}
	clusterOutput, err := utils.ImportCluster(database, tx.UnderlyingTx(), utils.ClusterImportInput{
		FolderPath: folderPath,
		DatasetID:  datasetID,
		LocationID: locationID,
		ClusterID:  clusterID,
		Recursive:  true,
	})
	if err != nil {
		tx.Rollback()
		return nil, err
	}
	if err := tx.Commit(); err != nil {
		return nil, fmt.Errorf("transaction commit failed: %w", err)
	}

	// Map to bulk import stats
	stats.TotalFiles = clusterOutput.TotalFiles
	stats.ImportedFiles = clusterOutput.ImportedFiles
	stats.DuplicateFiles = clusterOutput.SkippedFiles
	stats.ErrorFiles = clusterOutput.FailedFiles

	// Log the first 5 file errors
	for i, fileErr := range clusterOutput.Errors {
		if i >= 5 {
			break
		}
		logger.Log(" ERROR: %s: %s", fileErr.FileName, fileErr.Error)
	}

	logger.Log(" Complete: %d imported, %d duplicates, %d errors", stats.ImportedFiles, stats.DuplicateFiles, stats.ErrorFiles)
	return stats, nil
}
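// Usage sketch for BulkFileImport (compiled, not run). The CSV column order
// below is inferred from bulkReadCSV (header row skipped, six columns:
// location_name, location_id, directory_path, date_range, sample_rate,
// file_count); the literal header text, import path, and values are
// assumptions.
package imp_test

import (
	"context"
	"fmt"

	"skraak/imp"
)

func ExampleBulkFileImport() {
	// Hypothetical CSV file contents:
	//
	//   location_name,location_id,directory_path,date_range,sample_rate,file_count
	//   Ponui Island,xyz789uvw012,/data/ponui/2023-10,2023-10-01_2023-10-31,16000,1440
	//
	out, err := imp.BulkFileImport(context.Background(), imp.BulkFileImportInput{
		DBPath:      "skraak.db",       // hypothetical
		DatasetID:   "abc123def456",    // hypothetical
		CSVPath:     "locations.csv",   // hypothetical
		LogFilePath: "bulk_import.log", // hypothetical
	})
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("%d/%d files imported across %d locations\n",
		out.FilesImported, out.TotalFilesScanned, out.TotalLocations)
}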
package calls

import (
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"sync/atomic"
)

// parallelResult is the common interface for birda/raven worker results.
type parallelResult interface {
	filePath() string
	getCalls() []ClusteredCall
	wasWritten() bool
	wasSkipped() bool
	getError() error
}

// aggregateStats holds the collected results from a parallel fan-out/fan-in.
type aggregateStats struct {
	calls            []ClusteredCall
	speciesCount     map[string]int
	dataFilesWritten int
	dataFilesSkipped int
	filesProcessed   int
	filesDeleted     int
	firstErr         error
}

// aggregateResults collects results from a channel of parallelResult values,
// handling error tracking, species counting, optional file deletion, and
// progress reporting. Returns the aggregated stats.
func aggregateResults(
	results <-chan parallelResult,
	total int,
	processed *atomic.Int32,
	deleteFiles bool,
	progressHandler func(int, int, string),
) aggregateStats {
	var stats aggregateStats
	stats.speciesCount = make(map[string]int)
	for result := range results {
		if err := result.getError(); err != nil && stats.firstErr == nil {
			stats.firstErr = err
		}
		if result.wasWritten() {
			stats.dataFilesWritten++
		}
		if result.wasSkipped() {
			stats.dataFilesSkipped++
		}
		for _, call := range result.getCalls() {
			stats.calls = append(stats.calls, call)
			stats.speciesCount[call.EbirdCode]++
		}
		stats.filesProcessed++
		stats.maybeDeleteFile(deleteFiles, result)
		if progressHandler != nil {
			current := int(processed.Add(1))
			progressHandler(current, total, filepath.Base(result.filePath()))
		}
	}
	return stats
}

// maybeDeleteFile deletes the source file if requested and it was successfully processed.
func (s *aggregateStats) maybeDeleteFile(deleteFiles bool, result parallelResult) {
	if !deleteFiles || !result.wasWritten() {
		return
	}
	if err := os.Remove(result.filePath()); err != nil {
		if s.firstErr == nil {
			s.firstErr = fmt.Errorf("failed to delete %s: %w", result.filePath(), err)
		}
	} else {
		s.filesDeleted++
	}
}

// sortCallsByFileAndTime sorts calls by filename, then start time.
func sortCallsByFileAndTime(calls []ClusteredCall) {
	sort.Slice(calls, func(i, j int) bool {
		if calls[i].File != calls[j].File {
			return calls[i].File < calls[j].File
		}
		return calls[i].StartTime < calls[j].StartTime
	})
}
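// Minimal sketch of a worker result type satisfying parallelResult, plus the
// fan-out/fan-in wiring into aggregateResults. sketchResult, runSketch, and
// the trivial worker body are hypothetical illustrations, not part of the
// real birda/raven workers.
package calls

import "sync/atomic"

// sketchResult is a hypothetical worker result implementing parallelResult.
type sketchResult struct {
	path    string
	calls   []ClusteredCall
	written bool
	skipped bool
	err     error
}

func (r sketchResult) filePath() string          { return r.path }
func (r sketchResult) getCalls() []ClusteredCall { return r.calls }
func (r sketchResult) wasWritten() bool          { return r.written }
func (r sketchResult) wasSkipped() bool          { return r.skipped }
func (r sketchResult) getError() error           { return r.err }

// runSketch fans worker results into aggregateResults over a channel.
func runSketch(paths []string) aggregateStats {
	results := make(chan parallelResult)
	var processed atomic.Int32
	go func() {
		defer close(results)
		for _, p := range paths {
			// A real worker would analyse p here; this sketch just marks it written.
			results <- sketchResult{path: p, written: true}
		}
	}()
	// No file deletion, no progress reporting in this sketch.
	return aggregateResults(results, len(paths), &processed, false, nil)
}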
package calls

import (
	"fmt"
	"strings"
	"time"

	"github.com/sixdouglas/suncalc"

	"skraak/utils"
)

// IsNightInput defines the input parameters for the isnight tool
type IsNightInput struct {
	FilePath string `json:"file_path"`
	Lat float64 `json:"lat"`
	Lng float64 `json:"lng"`
	Timezone string `json:"timezone,omitempty"`
}

// IsNightOutput defines the output structure for the isnight tool
type IsNightOutput struct {
	FilePath string `json:"file_path"`
	TimestampUTC string `json:"timestamp_utc"`
	SolarNight bool `json:"solar_night"`
	CivilNight bool `json:"civil_night"`
	DiurnalActive bool `json:"diurnal_active"`
	MoonPhase float64 `json:"moon_phase"`
	DurationSec float64 `json:"duration_seconds"`
	TimestampSrc string `json:"timestamp_source"`
	MidpointUTC string `json:"midpoint_utc"`
	SunriseUTC string `json:"sunrise_utc,omitempty"`
	SunsetUTC string `json:"sunset_utc,omitempty"`
	DawnUTC string `json:"dawn_utc,omitempty"`
	DuskUTC string `json:"dusk_utc,omitempty"`
}

// IsNight determines if a WAV file was recorded at night based on its
// metadata timestamp and the given GPS coordinates.
//
// Timestamp resolution order:
//  1. AudioMoth comment (timezone embedded)
//  2. Filename timestamp + timezone offset (requires --timezone)
//  3. File modification time (system local time)
func IsNight(input IsNightInput) (IsNightOutput, error) {
	var output IsNightOutput

	// Step 1: Parse WAV header
	metadata, err := utils.ParseWAVHeader(input.FilePath)
	if err != nil {
		return output, fmt.Errorf("WAV header parsing failed: %w", err)
	}
	output.DurationSec = metadata.Duration

	// Step 2: Resolve timestamp (use file mod time as fallback)
	tsResult, err := utils.ResolveTimestamp(metadata, input.FilePath, input.Timezone, true, nil)
	if err != nil {
		return output, fmt.Errorf("cannot determine recording timestamp: %w", err)
	}

	// Determine timestamp source label
	tsSource := "file_mod_time"
	if tsResult.IsAudioMoth {
		tsSource = "audiomoth_comment"
	} else if utils.HasTimestampFilename(input.FilePath) {
		tsSource = "filename"
	}

	// Step 3: Calculate astronomical data using recording midpoint
	astroData := utils.CalculateAstronomicalData(
		tsResult.Timestamp.UTC(),
		metadata.Duration,
		input.Lat,
		input.Lng,
	)

	// Step 4: Get sun event times for informational output
	midpoint := utils.CalculateMidpointTime(tsResult.Timestamp.UTC(), metadata.Duration)
	sunTimes := suncalc.GetTimes(midpoint, input.Lat, input.Lng)

	output.FilePath = input.FilePath
	output.TimestampUTC = tsResult.Timestamp.UTC().Format(time.RFC3339)
	output.SolarNight = astroData.SolarNight
	output.CivilNight = astroData.CivilNight
	output.MoonPhase = astroData.MoonPhase
	output.TimestampSrc = tsSource
	output.MidpointUTC = midpoint.Format(time.RFC3339)
	populateSunTimes(&output, sunTimes, midpoint)

	return output, nil
}

// sunTimeUTC returns the UTC RFC3339 string for a suncalc event, or "" if absent/zero.
func sunTimeUTC(sunTimes map[suncalc.DayTimeName]suncalc.DayTime, name suncalc.DayTimeName) string {
	if entry, ok := sunTimes[name]; ok && !entry.Value.IsZero() {
		return entry.Value.UTC().Format(time.RFC3339)
	}
	return ""
}

// populateSunTimes fills in sun event times and diurnal status from suncalc results.
func populateSunTimes(output *IsNightOutput, sunTimes map[suncalc.DayTimeName]suncalc.DayTime, midpoint time.Time) {
	// Diurnal: midpoint is between dawn and sunset
	if dawn, ok := sunTimes[suncalc.Dawn]; ok && !dawn.Value.IsZero() {
		if sunset, ok := sunTimes[suncalc.Sunset]; ok && !sunset.Value.IsZero() {
			output.DiurnalActive = !midpoint.Before(dawn.Value) && !midpoint.After(sunset.Value)
		}
	}
	output.SunriseUTC = sunTimeUTC(sunTimes, suncalc.Sunrise)
	output.SunsetUTC = sunTimeUTC(sunTimes, suncalc.Sunset)
	output.DawnUTC = sunTimeUTC(sunTimes, suncalc.Dawn)
	output.DuskUTC = sunTimeUTC(sunTimes, suncalc.Dusk)
}

// String returns a human-readable summary of the isnight result
func (o IsNightOutput) String() string {
	var sb strings.Builder
	fmt.Fprintf(&sb, "File: %s\n", o.FilePath)
	fmt.Fprintf(&sb, "Timestamp (UTC): %s\n", o.TimestampUTC)
	fmt.Fprintf(&sb, "Midpoint (UTC): %s\n", o.MidpointUTC)
	fmt.Fprintf(&sb, "Duration: %.1f seconds\n", o.DurationSec)
	fmt.Fprintf(&sb, "Source: %s\n", o.TimestampSrc)
	fmt.Fprintf(&sb, "Solar night: %v\n", o.SolarNight)
	fmt.Fprintf(&sb, "Civil night: %v\n", o.CivilNight)
	fmt.Fprintf(&sb, "Moon phase: %.2f\n", o.MoonPhase)
	if o.SunriseUTC != "" {
		fmt.Fprintf(&sb, "Sunrise (UTC): %s\n", o.SunriseUTC)
	}
	if o.SunsetUTC != "" {
		fmt.Fprintf(&sb, "Sunset (UTC): %s\n", o.SunsetUTC)
	}
	if o.DawnUTC != "" {
		fmt.Fprintf(&sb, "Dawn (UTC): %s\n", o.DawnUTC)
	}
	if o.DuskUTC != "" {
		fmt.Fprintf(&sb, "Dusk (UTC): %s\n", o.DuskUTC)
	}
	return sb.String()
}
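// Usage sketch (illustrative, not part of the original source): calling IsNight
// directly from Go. The file path and coordinates below are hypothetical
// placeholder values; only the API defined above is used.
//
//	out, err := IsNight(IsNightInput{
//		FilePath: "recordings/20260221_203004.WAV", // hypothetical path
//		Lat:      -45.3,
//		Lng:      167.4,
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Print(out) // human-readable summary via the String method above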
package calls

import (
	"sort"
	"strings"

	"skraak/utils"
)

// CallsSummariseInput defines the input for the calls-summarise tool
type CallsSummariseInput struct {
	Folder string `json:"folder"`
	Brief bool `json:"brief"`
	Filter string `json:"filter,omitempty"`
}

// CallsSummariseOutput defines the output for the calls-summarise tool
type CallsSummariseOutput struct {
	Segments []SegmentSummary `json:"segments"`
	Folder string `json:"folder"`
	DataFilesRead int `json:"data_files_read"`
	DataFilesSkipped []string `json:"data_files_skipped"`
	TotalSegments int `json:"total_segments"`
	Filters map[string]FilterStats `json:"filters"`
	ReviewStatus ReviewStatus `json:"review_status"`
	Operators []string `json:"operators"`
	Reviewers []string `json:"reviewers"`
	Error *string `json:"error,omitempty"`
}

// SegmentSummary represents a single segment in the output
type SegmentSummary struct {
	File string `json:"file"`
	StartTime float64 `json:"start_time"`
	EndTime float64 `json:"end_time"`
	Labels []LabelSummary `json:"labels"`
}

// LabelSummary represents a label in the output (omits empty fields)
type LabelSummary struct {
	Filter string `json:"filter"`
	Certainty int `json:"certainty"`
	Species string `json:"species"`
	CallType string `json:"calltype,omitempty"`
	Comment string `json:"comment,omitempty"`
	Bookmark bool `json:"bookmark,omitempty"`
}

// FilterStats contains per-filter statistics
type FilterStats struct {
	Segments int `json:"segments"`
	Species map[string]int `json:"species"`
	Calltypes map[string]map[string]int `json:"calltypes,omitempty"` // species -> calltype -> count
}

// ReviewStatus contains review progress statistics
type ReviewStatus struct {
	Unreviewed int `json:"unreviewed"` // certainty < 100
	Confirmed int `json:"confirmed"` // certainty = 100
	DontKnow int `json:"dont_know"` // certainty = 0
	WithCallType int `json:"with_calltype"`
	WithComments int `json:"with_comments"`
	Bookmarked int `json:"bookmarked"`
}

// CallsSummarise reads all .data files in a folder and produces a summary
func CallsSummarise(input CallsSummariseInput) (CallsSummariseOutput, error) {
	var output CallsSummariseOutput

	// Find all .data files
	filePaths, err := utils.FindDataFiles(input.Folder)
	if err != nil {
		errMsg := err.Error()
		output.Error = &errMsg
		return output, err
	}

	// Initialize empty slices/maps (avoid null in JSON)
	output.Segments = make([]SegmentSummary, 0)
	output.Folder = input.Folder
	output.Filters = make(map[string]FilterStats)
	output.Operators = make([]string, 0)
	output.Reviewers = make([]string, 0)
	output.DataFilesSkipped = make([]string, 0)

	if len(filePaths) == 0 {
		return output, nil
	}

	// Track unique operators and reviewers
	operatorSet := make(map[string]bool)
	reviewerSet := make(map[string]bool)

	summariseFiles(filePaths, input, &output, operatorSet, reviewerSet)

	// Count segments for total
	if input.Brief {
		for _, fs := range output.Filters {
			output.TotalSegments += fs.Segments
		}
	} else {
		output.TotalSegments = len(output.Segments)
	}

	finaliseSummary(&output, operatorSet, reviewerSet, input.Brief)
	return output, nil
}

// summariseFiles processes all data files, populating output stats
func summariseFiles(filePaths []string, input CallsSummariseInput, output *CallsSummariseOutput, operatorSet, reviewerSet map[string]bool) {
	for _, path := range filePaths {
		df, err := utils.ParseDataFile(path)
		if err != nil {
			output.DataFilesSkipped = append(output.DataFilesSkipped, path)
			continue
		}
		output.DataFilesRead++
		trackMeta(df.Meta, operatorSet, reviewerSet)

		var relPath string
		if !input.Brief {
			relPath = extractRelativePath(input.Folder, path)
		}

		for _, seg := range df.Segments {
			filteredLabels := filterLabels(seg.Labels, input.Filter)
			if input.Filter != "" && len(filteredLabels) == 0 {
				continue
			}
			updateStatsFromLabels(filteredLabels, output)
			if !input.Brief {
				output.Segments = append(output.Segments, SegmentSummary{
					File: relPath,
					StartTime: seg.StartTime,
					EndTime: seg.EndTime,
					Labels: buildLabelSummaries(filteredLabels),
				})
			}
		}
	}
}

// trackMeta records operator and reviewer from file metadata
func trackMeta(meta *utils.DataMeta, operatorSet, reviewerSet map[string]bool) {
	if meta == nil {
		return
	}
	if meta.Operator != "" {
		operatorSet[meta.Operator] = true
	}
	if meta.Reviewer != "" {
		reviewerSet[meta.Reviewer] = true
	}
}

// filterLabels returns labels matching the filter, or all labels if filter is empty
func filterLabels(labels []*utils.Label, filter string) []*utils.Label {
	if filter == "" {
		return labels
	}
	var filtered []*utils.Label
	for _, l := range labels {
		if l.Filter == filter {
			filtered = append(filtered, l)
		}
	}
	return filtered
}

// buildLabelSummaries converts labels to label summaries
func buildLabelSummaries(labels []*utils.Label) []LabelSummary {
	var summaries []LabelSummary
	for _, l := range labels {
		ls := LabelSummary{
			Filter: l.Filter,
			Certainty: l.Certainty,
			Species: l.Species,
		}
		if l.CallType != "" {
			ls.CallType = l.CallType
		}
		if l.Comment != "" {
			ls.Comment = l.Comment
		}
		if l.Bookmark {
			ls.Bookmark = true
		}
		summaries = append(summaries, ls)
	}
	return summaries
}

// updateStatsFromLabels updates filter stats and review status from a set of labels
func updateStatsFromLabels(labels []*utils.Label, output *CallsSummariseOutput) {
	for _, l := range labels {
		updateFilterStats(l, output)
		updateReviewStatus(l, output)
	}
}

// updateFilterStats increments filter-level statistics for a single label
func updateFilterStats(l *utils.Label, output *CallsSummariseOutput) {
	fs, exists := output.Filters[l.Filter]
	if !exists {
		fs = FilterStats{
			Segments: 0,
			Species: make(map[string]int),
			Calltypes: make(map[string]map[string]int),
		}
	}
	fs.Segments++
	fs.Species[l.Species]++
	if l.CallType != "" {
		if fs.Calltypes[l.Species] == nil {
			fs.Calltypes[l.Species] = make(map[string]int)
		}
		fs.Calltypes[l.Species][l.CallType]++
	}
	output.Filters[l.Filter] = fs
}

// updateReviewStatus increments review status counters for a single label
func updateReviewStatus(l *utils.Label, output *CallsSummariseOutput) {
	switch l.Certainty {
	case 100:
		output.ReviewStatus.Confirmed++
	case 0:
		output.ReviewStatus.DontKnow++
	default:
		output.ReviewStatus.Unreviewed++
	}
	if l.CallType != "" {
		output.ReviewStatus.WithCallType++
	}
	if l.Comment != "" {
		output.ReviewStatus.WithComments++
	}
	if l.Bookmark {
		output.ReviewStatus.Bookmarked++
	}
}

// finaliseSummary sorts output, cleans empty maps, and converts sets to sorted slices
func finaliseSummary(output *CallsSummariseOutput, operatorSet, reviewerSet map[string]bool, brief bool) {
	// Clean up empty calltypes maps
	for filter, fs := range output.Filters {
		if len(fs.Calltypes) == 0 {
			fs.Calltypes = nil
			output.Filters[filter] = fs
		}
	}

	// Convert sets to sorted slices
	for op := range operatorSet {
		output.Operators = append(output.Operators, op)
	}
	for r := range reviewerSet {
		output.Reviewers = append(output.Reviewers, r)
	}
	sort.Strings(output.Operators)
	sort.Strings(output.Reviewers)

	// Sort segments by file, then start time
	if !brief {
		sort.Slice(output.Segments, func(i, j int) bool {
			if output.Segments[i].File != output.Segments[j].File {
				return output.Segments[i].File < output.Segments[j].File
			}
			return output.Segments[i].StartTime < output.Segments[j].StartTime
		})
	}
}

// extractRelativePath extracts the audio filename from a .data file path,
// e.g. "/folder/tx51_LISTENING_20260221_203004.WAV.data" -> "tx51_LISTENING_20260221_203004.WAV".
// Preserves the original case of the extension as-is.
func extractRelativePath(folder, dataPath string) string {
	// Get the filename
	filename := dataPath
	if idx := strings.LastIndex(dataPath, "/"); idx >= 0 {
		filename = dataPath[idx+1:]
	}
	// Remove .data extension, preserve everything else
	return strings.TrimSuffix(filename, ".data")
}
package calls

import (
	"fmt"
	"os"
	"strings"

	"skraak/utils"
)

// CallsShowImagesInput defines the input for the show-images tool
type CallsShowImagesInput struct {
	DataFilePath string `json:"data_file_path"`
	Color bool `json:"color"`
	ImageSize int `json:"image_size"`
	Sixel bool `json:"sixel"`
	ITerm bool `json:"iterm"`
}

// CallsShowImagesOutput defines the output for the show-images tool
type CallsShowImagesOutput struct {
	SegmentsShown int `json:"segments_shown"`
	WavFile string `json:"wav_file"`
	Error string `json:"error,omitempty"`
}

// CallsShowImages reads a .data file and displays spectrogram images for each segment
func CallsShowImages(input CallsShowImagesInput) (CallsShowImagesOutput, error) {
	var output CallsShowImagesOutput

	// Validate file exists
	if _, err := os.Stat(input.DataFilePath); os.IsNotExist(err) {
		output.Error = fmt.Sprintf("File not found: %s", input.DataFilePath)
		return output, fmt.Errorf("%s", output.Error)
	}

	// Derive WAV file path (strip .data suffix)
	wavPath := strings.TrimSuffix(input.DataFilePath, ".data")
	output.WavFile = wavPath

	// Check WAV file exists
	if _, err := os.Stat(wavPath); os.IsNotExist(err) {
		output.Error = fmt.Sprintf("WAV file not found: %s", wavPath)
		return output, fmt.Errorf("%s", output.Error)
	}

	// Parse .data file (includes labels for future filtering)
	dataFile, err := utils.ParseDataFile(input.DataFilePath)
	if err != nil {
		output.Error = err.Error()
		return output, fmt.Errorf("%s", output.Error)
	}
	if len(dataFile.Segments) == 0 {
		output.Error = "No segments found in .data file"
		return output, fmt.Errorf("%s", output.Error)
	}

	// Resolve image size
	imgSize := input.ImageSize
	if imgSize == 0 {
		imgSize = utils.SpectrogramDisplaySize
	}

	// Select graphics protocol
	protocol := utils.ProtocolKitty
	if input.ITerm {
		protocol = utils.ProtocolITerm
	} else if input.Sixel {
		protocol = utils.ProtocolSixel
	}

	// Generate spectrogram for each segment and output
	for i, seg := range dataFile.Segments {
		// Generate spectrogram image
		img, err := utils.GenerateSegmentSpectrogram(input.DataFilePath, seg.StartTime, seg.EndTime, input.Color, imgSize)
		if err != nil || img == nil {
			continue
		}

		// Print segment info
		labelInfo := formatSegmentLabels(seg.Labels)
		fmt.Fprintf(os.Stderr, "Segment %d: %.1fs - %.1fs (%.1fs)%s\n",
			i+1, seg.StartTime, seg.EndTime, seg.EndTime-seg.StartTime, labelInfo)

		// Write to stdout via terminal graphics protocol
		if err := utils.WriteImage(img, os.Stdout, protocol); err != nil {
			output.Error = fmt.Sprintf("Failed to write image: %v", err)
			return output, fmt.Errorf("%s", output.Error)
		}
		fmt.Println() // Newline after image
	}

	output.SegmentsShown = len(dataFile.Segments)
	return output, nil
}

// formatSegmentLabels formats labels for display in segment info
func formatSegmentLabels(labels []*utils.Label) string {
	if len(labels) == 0 {
		return ""
	}
	var parts []string
	for _, l := range labels {
		part := l.Species
		if l.CallType != "" {
			part += "/" + l.CallType
		}
		if l.Filter != "" {
			part += " [" + l.Filter + "]"
		}
		parts = append(parts, part)
	}
	return " " + strings.Join(parts, ", ")
}
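// Usage sketch (illustrative, not part of the original source): displaying all
// segments of one recording. Kitty is the default protocol when both Sixel and
// ITerm are false, per the selection logic above. The .data path is a
// hypothetical placeholder.
//
//	out, err := CallsShowImages(CallsShowImagesInput{
//		DataFilePath: "recordings/20260221_203004.WAV.data",
//		Color:        true,
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Fprintf(os.Stderr, "shown: %d segments of %s\n", out.SegmentsShown, out.WavFile)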
package calls

import (
	"encoding/json"
	"os"
	"path/filepath"
	"testing"

	"skraak/utils"
)

func TestPushCertaintyPromotesMatchingLabels(t *testing.T) {
	tempDir := t.TempDir()

	// File with two Kiwi segments: certainty=90 and certainty=70
	file1 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]], [10, 20, 100, 1000, [{"species": "Kiwi", "certainty": 70}]]]`
	file1Path := filepath.Join(tempDir, "file1.data")
	if err := os.WriteFile(file1Path, []byte(file1), 0644); err != nil {
		t.Fatal(err)
	}

	// File with one Tomtit at certainty=90 (must not be promoted when species=Kiwi)
	file2 := `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Tomtit", "certainty": 90}]]]`
	file2Path := filepath.Join(tempDir, "file2.data")
	if err := os.WriteFile(file2Path, []byte(file2), 0644); err != nil {
		t.Fatal(err)
	}

	result, err := PushCertainty(PushCertaintyConfig{
		Folder: tempDir,
		Species: "Kiwi",
		Reviewer: "TestReviewer",
	})
	if err != nil {
		t.Fatal(err)
	}
	if result.SegmentsUpdated != 1 {
		t.Errorf("expected 1 segment updated, got %d", result.SegmentsUpdated)
	}
	if result.FilesUpdated != 1 {
		t.Errorf("expected 1 file updated, got %d", result.FilesUpdated)
	}

	// Verify file1: certainty=90 Kiwi → 100, certainty=70 Kiwi → unchanged
	df, err := utils.ParseDataFile(file1Path)
	if err != nil {
		t.Fatal(err)
	}
	if df.Segments[0].Labels[0].Certainty != 100 {
		t.Errorf("expected certainty=100, got %d", df.Segments[0].Labels[0].Certainty)
	}
	if df.Segments[1].Labels[0].Certainty != 70 {
		t.Errorf("expected certainty=70 unchanged, got %d", df.Segments[1].Labels[0].Certainty)
	}
	if df.Meta.Reviewer != "TestReviewer" {
		t.Errorf("expected reviewer=TestReviewer, got %q", df.Meta.Reviewer)
	}

	// Verify Tomtit file was not modified
	df2, err := utils.ParseDataFile(file2Path)
	if err != nil {
		t.Fatal(err)
	}
	if df2.Segments[0].Labels[0].Certainty != 90 {
		t.Errorf("Tomtit certainty should be unchanged at 90, got %d", df2.Segments[0].Labels[0].Certainty)
	}
}

func TestPushCertaintyFilterScope(t *testing.T) {
	tempDir := t.TempDir()

	// Segment has two labels from different filters, both Kiwi certainty=90
	data := []any{
		map[string]any{"Operator": "test"},
		[]any{0.0, 10.0, 100.0, 1000.0, []any{
			map[string]any{"species": "Kiwi", "certainty": 90, "filter": "model-a"},
			map[string]any{"species": "Kiwi", "certainty": 90, "filter": "model-b"},
		}},
	}
	raw, _ := json.Marshal(data)
	filePath := filepath.Join(tempDir, "file1.data")
	if err := os.WriteFile(filePath, raw, 0644); err != nil {
		t.Fatal(err)
	}

	// Push only model-a
	result, err := PushCertainty(PushCertaintyConfig{
		Folder: tempDir,
		Filter: "model-a",
		Species: "Kiwi",
		Reviewer: "TestReviewer",
	})
	if err != nil {
		t.Fatal(err)
	}
	if result.SegmentsUpdated != 1 {
		t.Errorf("expected 1 segment updated, got %d", result.SegmentsUpdated)
	}

	// Verify only model-a label was promoted; model-b stays at 90
	df, err := utils.ParseDataFile(filePath)
	if err != nil {
		t.Fatal(err)
	}
	for _, label := range df.Segments[0].Labels {
		if label.Filter == "model-a" && label.Certainty != 100 {
			t.Errorf("model-a label should be 100, got %d", label.Certainty)
		}
		if label.Filter == "model-b" && label.Certainty != 90 {
			t.Errorf("model-b label should be unchanged at 90, got %d", label.Certainty)
		}
	}
}
package calls

import (
	"fmt"

	"skraak/utils"
)

// PushCertaintyConfig holds the configuration for push-certainty
type PushCertaintyConfig struct {
	Folder string
	File string
	Filter string
	Species string
	CallType string
	Night bool
	Day bool
	Lat float64
	Lng float64
	Timezone string
	Reviewer string
}

// PushCertaintyResult holds the result of push-certainty
type PushCertaintyResult struct {
	SegmentsUpdated int `json:"segments_updated"`
	FilesUpdated int `json:"files_updated"`
	TimeFilteredCount int `json:"time_filtered_count"`
}

// PushCertainty promotes all certainty=90 segments matching the filter scope to certainty=100.
// Uses identical filtering logic to LoadDataFiles so the scope matches calls classify exactly.
func PushCertainty(config PushCertaintyConfig) (*PushCertaintyResult, error) {
	state, err := LoadDataFiles(ClassifyConfig{
		Folder: config.Folder,
		File: config.File,
		Filter: config.Filter,
		Species: config.Species,
		CallType: config.CallType,
		Certainty: 90,
		Sample: -1,
		Night: config.Night,
		Day: config.Day,
		Lat: config.Lat,
		Lng: config.Lng,
		Timezone: config.Timezone,
	})
	if err != nil {
		return nil, err
	}

	var segsUpdated, filesUpdated int
	for i, df := range state.DataFiles {
		changed := false
		for _, seg := range state.FilteredSegs()[i] {
			for _, label := range seg.Labels {
				if labelMatchesPush(label, config.Filter, config.Species, config.CallType) {
					label.Certainty = 100
					changed = true
					segsUpdated++
				}
			}
		}
		if changed {
			df.Meta.Reviewer = config.Reviewer
			if err := df.Write(df.FilePath); err != nil {
				return nil, fmt.Errorf("write %s: %w", df.FilePath, err)
			}
			filesUpdated++
		}
	}

	return &PushCertaintyResult{
		SegmentsUpdated: segsUpdated,
		FilesUpdated: filesUpdated,
		TimeFilteredCount: state.TimeFilteredCount,
	}, nil
}

// labelMatchesPush returns true if the label matches the push scope and has certainty=90.
// Certainty is already guaranteed by LoadDataFiles, but we re-check to target only the
// specific label that matched (a segment may carry labels from multiple filters).
func labelMatchesPush(label *utils.Label, filter, species, callType string) bool {
	if filter != "" && label.Filter != filter {
		return false
	}
	if species != "" && label.Species != species {
		return false
	}
	if callType != "" && label.CallType != callType {
		return false
	}
	return label.Certainty == 90
}
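// Usage sketch (illustrative, not part of the original source): promoting all
// certainty=90 Kiwi/Duet labels from one model filter within night-time
// recordings. The folder, filter name, and coordinates are hypothetical
// placeholder values.
//
//	res, err := PushCertainty(PushCertaintyConfig{
//		Folder:   "/data/2026-02", // hypothetical folder
//		Filter:   "opensoundscape-kiwi-1.5",
//		Species:  "Kiwi",
//		CallType: "Duet",
//		Night:    true,
//		Lat:      -45.3,
//		Lng:      167.4,
//		Reviewer: "David",
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Printf("%d labels promoted across %d files\n", res.SegmentsUpdated, res.FilesUpdated)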
package calls

import (
	"path/filepath"
	"testing"

	"skraak/utils"
)

// helpers

func seg(start, end float64, labels ...*utils.Label) *utils.Segment {
	return &utils.Segment{
		StartTime: start,
		EndTime: end,
		FreqLow: 100,
		FreqHigh: 8000,
		Labels: labels,
	}
}

func lbl(filter, species, calltype string, certainty int) *utils.Label {
	return &utils.Label{
		Filter: filter,
		Species: species,
		CallType: calltype,
		Certainty: certainty,
	}
}

func writeFile(t *testing.T, segs ...*utils.Segment) string {
	t.Helper()
	dir := t.TempDir()
	path := filepath.Join(dir, "test.data")
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "ML", Reviewer: "David", Duration: 3600},
		Segments: segs,
	}
	if err := df.Write(path); err != nil {
		t.Fatalf("write fixture: %v", err)
	}
	return path
}

func readFile(t *testing.T, path string) *utils.DataFile {
	t.Helper()
	df, err := utils.ParseDataFile(path)
	if err != nil {
		t.Fatalf("parse %s: %v", path, err)
	}
	return df
}

// findLabel returns the label with matching filter and time on the parsed file, or nil.
func findLabel(df *utils.DataFile, filter string, start, end float64) *utils.Label {
	for _, s := range df.Segments {
		if s.StartTime != start || s.EndTime != end {
			continue
		}
		for _, l := range s.Labels {
			if l.Filter == filter {
				return l
			}
		}
	}
	return nil
}

const (
	fFrom = "opensoundscape-kiwi-1.2"
	fTo = "opensoundscape-kiwi-1.5"
)

func TestPropagate_HappyPathSingle(t *testing.T) {
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v (%s)", err, out.Error)
	}
	if out.Propagated != 1 || out.TargetsExamined != 1 || out.SkippedConflict != 0 || out.SkippedNoOverlap != 0 {
		t.Fatalf("counts wrong: %+v", out)
	}
	df := readFile(t, path)
	target := findLabel(df, fTo, 100, 125)
	if target == nil {
		t.Fatal("target label missing")
	}
	if target.Species != "Kiwi" || target.CallType != "Male" || target.Certainty != 90 {
		t.Errorf("target not updated correctly: species=%q calltype=%q cert=%d", target.Species, target.CallType, target.Certainty)
	}
	if df.Meta.Reviewer != "Skraak" {
		t.Errorf("reviewer = %q, want Skraak", df.Meta.Reviewer)
	}
}

func TestPropagate_NoOverlap(t *testing.T) {
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(500, 525, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 0 || out.TargetsExamined != 1 || out.SkippedNoOverlap != 1 {
		t.Fatalf("counts wrong: %+v", out)
	}
	df := readFile(t, path)
	target := findLabel(df, fTo, 500, 525)
	if target.Certainty != 70 {
		t.Errorf("target should not be modified, cert=%d", target.Certainty)
	}
	if df.Meta.Reviewer != "David" {
		t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)
	}
}

func TestPropagate_SourceWrongSpecies_Ignored(t *testing.T) {
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Weka", "", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 0 || out.SkippedNoOverlap != 1 {
		t.Fatalf("counts wrong: %+v", out)
	}
}

func TestPropagate_SourceWrongCertainty_Ignored(t *testing.T) {
	// cert=70 and cert=0 source labels must NOT count as sources.
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 70)),
		seg(200, 225, lbl(fFrom, "Don't Know", "", 0)),
		seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
		seg(200, 225, lbl(fTo, "Kiwi", "Male", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 0 || out.SkippedNoOverlap != 2 {
		t.Fatalf("counts wrong: %+v", out)
	}
}

func TestPropagate_SourceWrongFilter_Ignored(t *testing.T) {
	path := writeFile(t,
		seg(100, 125, lbl("some-other-filter", "Kiwi", "Male", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !out.FiltersMissing || out.Propagated != 0 || out.TargetsExamined != 0 {
		t.Fatalf("expected FiltersMissing=true with zero counts, got: %+v", out)
	}
}

func TestPropagate_TargetCert100_NotTouched(t *testing.T) {
	// Target with cert=100 is human-verified — must NOT be overwritten.
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Male", 100)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.TargetsExamined != 0 || out.Propagated != 0 {
		t.Fatalf("cert=100 target must not be examined: %+v", out)
	}
	df := readFile(t, path)
	if df.Meta.Reviewer != "David" {
		t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)
	}
}

func TestPropagate_TargetCert90_NotTouched(t *testing.T) {
	// Target with cert=90 (already propagated earlier) must NOT be re-propagated.
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Female", 90)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.TargetsExamined != 0 || out.Propagated != 0 {
		t.Fatalf("cert=90 target must not be examined: %+v", out)
	}
	df := readFile(t, path)
	target := findLabel(df, fTo, 100, 125)
	if target.Certainty != 90 || target.CallType != "Female" {
		t.Errorf("cert=90 target was modified: %+v", target)
	}
}

func TestPropagate_TargetCert0_Propagated(t *testing.T) {
	// Target at cert=0 ("Don't Know" / "Noise") SHOULD be propagated when an
	// overlapping cert=100 source exists — rescues labels from the noise bucket
	// so they surface for review even if occasionally wrong.
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(100, 125, lbl(fTo, "Don't Know", "", 0)),
		seg(200, 225, lbl(fFrom, "Kiwi", "Female", 100)),
		seg(200, 225, lbl(fTo, "Noise", "", 0)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.TargetsExamined != 2 || out.Propagated != 2 {
		t.Fatalf("cert=0 targets must be propagated: %+v", out)
	}
	df := readFile(t, path)
	for _, c := range []struct {
		start, end float64
		calltype string
	}{{100, 125, "Male"}, {200, 225, "Female"}} {
		l := findLabel(df, fTo, c.start, c.end)
		if l == nil || l.Species != "Kiwi" || l.CallType != c.calltype || l.Certainty != 90 {
			t.Errorf("at %v-%v got %+v, want Kiwi+%s cert=90", c.start, c.end, l, c.calltype)
		}
	}
}

func TestPropagate_MultipleSourcesAgree(t *testing.T) {
	// Two overlapping sources with same calltype → propagate.
	path := writeFile(t,
		seg(100, 110, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(105, 120, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 1 || out.SkippedConflict != 0 {
		t.Fatalf("counts wrong: %+v", out)
	}
	df := readFile(t, path)
	target := findLabel(df, fTo, 100, 125)
	if target.CallType != "Male" {
		t.Errorf("calltype should be Male, got %q", target.CallType)
	}
}

func TestPropagate_MultipleSourcesConflict(t *testing.T) {
	// Two overlapping sources with different calltypes → conflict, skip, report.
	path := writeFile(t,
		seg(100, 110, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(115, 120, lbl(fFrom, "Kiwi", "Female", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 0 || out.SkippedConflict != 1 {
		t.Fatalf("expected 1 conflict skip: %+v", out)
	}
	if len(out.Conflicts) != 1 {
		t.Fatalf("expected 1 conflict report, got %d", len(out.Conflicts))
	}
	if out.Conflicts[0].TargetStart != 100 || out.Conflicts[0].TargetEnd != 125 {
		t.Errorf("conflict target wrong: %+v", out.Conflicts[0])
	}
	if len(out.Conflicts[0].SourceChoices) != 2 {
		t.Errorf("expected 2 source choices, got %d", len(out.Conflicts[0].SourceChoices))
	}
	// Target must NOT be modified.
	df := readFile(t, path)
	target := findLabel(df, fTo, 100, 125)
	if target.CallType != "Duet" || target.Certainty != 70 {
		t.Errorf("conflicted target was modified: %+v", target)
	}
	if df.Meta.Reviewer != "David" {
		t.Errorf("reviewer should stay David (no write), got %q", df.Meta.Reviewer)
	}
}

func TestPropagate_EmptyCallTypePropagates(t *testing.T) {
	// Source with empty calltype → target gets empty calltype.
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Male", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 1 {
		t.Fatalf("expected propagated=1: %+v", out)
	}
	df := readFile(t, path)
	target := findLabel(df, fTo, 100, 125)
	if target.CallType != "" {
		t.Errorf("calltype should be cleared, got %q", target.CallType)
	}
	if target.Species != "Kiwi" || target.Certainty != 90 {
		t.Errorf("target fields wrong: %+v", target)
	}
}

func TestPropagate_SpeciesOverride(t *testing.T) {
	// Target species was different from --species; must be overwritten.
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(100, 125, lbl(fTo, "Don't Know", "", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 1 {
		t.Fatalf("expected propagated=1: %+v", out)
	}
	df := readFile(t, path)
	target := findLabel(df, fTo, 100, 125)
	if target.Species != "Kiwi" || target.CallType != "Male" || target.Certainty != 90 {
		t.Errorf("target not overwritten correctly: %+v", target)
	}
}

func TestPropagate_OverlapBoundaryExclusive(t *testing.T) {
	// Segments touching at a point (src ends exactly where tgt starts) do NOT overlap.
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(125, 150, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 0 || out.SkippedNoOverlap != 1 {
		t.Fatalf("touching boundary must not count as overlap: %+v", out)
	}
}

func TestPropagate_OverlapPartial(t *testing.T) {
	// 1-second overlap is enough.
	path := writeFile(t,
		seg(100, 126, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(125, 150, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 1 {
		t.Fatalf("expected propagated=1: %+v", out)
	}
}

func TestPropagate_SupersetEitherDirection(t *testing.T) {
	// Source engulfs target.
	path1 := writeFile(t,
		seg(100, 200, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(110, 150, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	if out, _ := CallsPropagate(CallsPropagateInput{File: path1, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}); out.Propagated != 1 {
		t.Errorf("source-engulfs-target: %+v", out)
	}
	// Target engulfs source.
	path2 := writeFile(t,
		seg(110, 150, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(100, 200, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	if out, _ := CallsPropagate(CallsPropagateInput{File: path2, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}); out.Propagated != 1 {
		t.Errorf("target-engulfs-source: %+v", out)
	}
}

func TestPropagate_MissingFlags(t *testing.T) {
	cases := []struct {
		name string
		in CallsPropagateInput
	}{
		{"no file", CallsPropagateInput{FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"}},
		{"no from", CallsPropagateInput{File: "x", ToFilter: fTo, Species: "Kiwi"}},
		{"no to", CallsPropagateInput{File: "x", FromFilter: fFrom, Species: "Kiwi"}},
		{"no species", CallsPropagateInput{File: "x", FromFilter: fFrom, ToFilter: fTo}},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			_, err := CallsPropagate(c.in)
			if err == nil {
				t.Errorf("expected error")
			}
		})
	}
}

func TestPropagate_SameFromAndTo(t *testing.T) {
	_, err := CallsPropagate(CallsPropagateInput{
		File: "x", FromFilter: fFrom, ToFilter: fFrom, Species: "Kiwi",
	})
	if err == nil {
		t.Error("expected error when --from == --to")
	}
}

func TestPropagate_NonexistentFile(t *testing.T) {
	_, err := CallsPropagate(CallsPropagateInput{
		File: "/nonexistent/path.data", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err == nil {
		t.Error("expected error for nonexistent file")
	}
}

func TestPropagate_RealisticMixed(t *testing.T) {
	// Mimics the 20260228_211500.WAV.data case: cert=0 "Don't Know" and cert=100 Kiwi sources
	// coexist; only cert=100 Kiwi gets propagated.
	path := writeFile(t,
		// Sources (kiwi-1.2)
		seg(45, 52.5, lbl(fFrom, "Don't Know", "", 0)),
		seg(142.5, 177.5, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(195, 217.5, lbl(fFrom, "Don't Know", "", 0)),
		seg(647.5, 682.5, lbl(fFrom, "Kiwi", "Female", 100)),
		seg(815, 855, lbl(fFrom, "Kiwi", "Duet", 100)),
		// Targets (kiwi-1.5)
		seg(147.5, 167.5, lbl(fTo, "Kiwi", "Male", 70)),
		seg(647.5, 672.5, lbl(fTo, "Kiwi", "Female", 70)),
		seg(815, 852.5, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.TargetsExamined != 3 || out.Propagated != 3 || out.SkippedConflict != 0 {
		t.Fatalf("counts wrong: %+v", out)
	}
	df := readFile(t, path)
	expect := []struct {
		start, end float64
		calltype string
	}{
		{147.5, 167.5, "Male"},
		{647.5, 672.5, "Female"},
		{815, 852.5, "Duet"},
	}
	for _, e := range expect {
		l := findLabel(df, fTo, e.start, e.end)
		if l == nil || l.Certainty != 90 || l.CallType != e.calltype || l.Species != "Kiwi" {
			t.Errorf("at %v-%v got %+v, want Kiwi+%s cert=90", e.start, e.end, l, e.calltype)
		}
	}
}

func TestPropagate_NoWriteIfNothingChanged(t *testing.T) {
	// File with only non-target segments should not be rewritten (reviewer unchanged).
	path := writeFile(t,
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
	)
	out, err := CallsPropagate(CallsPropagateInput{
		File: path, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.Propagated != 0 || out.TargetsExamined != 0 {
		t.Fatalf("expected no activity: %+v", out)
	}
	df := readFile(t, path)
	if df.Meta.Reviewer != "David" {
		t.Errorf("reviewer should not be touched, got %q", df.Meta.Reviewer)
	}
}

// writeFileAt is like writeFile but puts the file inside an existing dir
// with a caller-provided basename (must end in .data).
func writeFileAt(t *testing.T, dir, base string, segs ...*utils.Segment) string {
	t.Helper()
	path := filepath.Join(dir, base)
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "ML", Reviewer: "David", Duration: 3600},
		Segments: segs,
	}
	if err := df.Write(path); err != nil {
		t.Fatalf("write fixture: %v", err)
	}
	return path
}

// assertPropagateStats checks output stats against expected values.
func assertPropagateStats(t *testing.T, got, want CallsPropagateFolderOutput) {
	t.Helper()
	checks := []struct {
		name string
		got int
		want int
	}{
		{"FilesTotal", got.FilesTotal, want.FilesTotal},
		{"FilesWithBothFilters", got.FilesWithBothFilters, want.FilesWithBothFilters},
		{"FilesSkippedNoFilter", got.FilesSkippedNoFilter, want.FilesSkippedNoFilter},
		{"FilesChanged", got.FilesChanged, want.FilesChanged},
		{"FilesErrored", got.FilesErrored, want.FilesErrored},
		{"TargetsExamined", got.TargetsExamined, want.TargetsExamined},
		{"Propagated", got.Propagated, want.Propagated},
		{"SkippedNoOverlap", got.SkippedNoOverlap, want.SkippedNoOverlap},
	}
	for _, c := range checks {
		if c.got != c.want {
			t.Errorf("%s: got %d, want %d", c.name, c.got, c.want)
		}
	}
}

func TestPropagateFolder_AggregatesAndSkipsMissing(t *testing.T) {
	dir := t.TempDir()
	// File A: both filters present, one clean propagation.
	aPath := writeFileAt(t, dir, "a.wav.data",
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(100, 125, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	// File B: only target filter — missing source, must be skipped silently.
	bPath := writeFileAt(t, dir, "b.wav.data",
		seg(200, 225, lbl(fTo, "Kiwi", "Duet", 70)),
	)
	// File C: only source filter — missing target, must be skipped silently.
	writeFileAt(t, dir, "c.wav.data",
		seg(300, 325, lbl(fFrom, "Kiwi", "Male", 100)),
	)
	// File D: both filters, but no overlap → targets examined, none propagated.
	dPath := writeFileAt(t, dir, "d.wav.data",
		seg(400, 425, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(500, 525, lbl(fTo, "Kiwi", "Duet", 70)),
	)

	out, err := CallsPropagateFolder(CallsPropagateFolderInput{
		Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	assertPropagateStats(t, out, CallsPropagateFolderOutput{
		FilesTotal: 4,
		FilesWithBothFilters: 2,
		FilesSkippedNoFilter: 2,
		FilesChanged: 1,
		FilesErrored: 0,
		TargetsExamined: 2,
		Propagated: 1,
		SkippedNoOverlap: 1,
	})

	t.Run("file_a_propagated", func(t *testing.T) {
		aDf := readFile(t, aPath)
		if aDf.Meta.Reviewer != "Skraak" {
			t.Errorf("reviewer: got %q, want Skraak", aDf.Meta.Reviewer)
		}
		if l := findLabel(aDf, fTo, 100, 125); l == nil || l.Certainty != 90 || l.CallType != "Male" {
			t.Errorf("target label: got %+v, want cert=90 calltype=Male", l)
		}
	})
	t.Run("file_b_skipped", func(t *testing.T) {
		bDf := readFile(t, bPath)
		if bDf.Meta.Reviewer != "David" {
			t.Errorf("reviewer should not be touched, got %q", bDf.Meta.Reviewer)
		}
	})
	t.Run("file_d_no_overlap", func(t *testing.T) {
		dDf := readFile(t, dPath)
		if dDf.Meta.Reviewer != "David" {
			t.Errorf("reviewer should not be touched, got %q", dDf.Meta.Reviewer)
		}
		if l := findLabel(dDf, fTo, 500, 525); l == nil || l.Certainty != 70 {
			t.Errorf("target label should be unchanged cert=70, got %+v", l)
		}
	})
}

func TestPropagateFolder_EmptyFolder(t *testing.T) {
	dir := t.TempDir()
	out, err := CallsPropagateFolder(CallsPropagateFolderInput{
		Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.FilesTotal != 0 || out.Propagated != 0 {
		t.Errorf("expected empty result, got %+v", out)
	}
}

func TestPropagateFolder_MissingRequiredFlags(t *testing.T) {
	dir := t.TempDir()
	cases := []CallsPropagateFolderInput{
		{Folder: "", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi"},
		{Folder: dir, FromFilter: "", ToFilter: fTo, Species: "Kiwi"},
		{Folder: dir, FromFilter: fFrom, ToFilter: "", Species: "Kiwi"},
		{Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: ""},
		{Folder: dir, FromFilter: fFrom, ToFilter: fFrom, Species: "Kiwi"},
	}
	for i, in := range cases {
		if _, err := CallsPropagateFolder(in); err == nil {
			t.Errorf("case %d: expected error for input %+v", i, in)
		}
	}
}

func TestPropagateFolder_NonexistentFolder(t *testing.T) {
	_, err := CallsPropagateFolder(CallsPropagateFolderInput{
		Folder: "/nonexistent/path/xyz", FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err == nil {
		t.Fatal("expected error for nonexistent folder")
	}
}

func TestPropagateFolder_ConflictsTaggedWithFile(t *testing.T) {
	dir := t.TempDir()
	// Two sources with different calltypes both overlapping one target.
	writeFileAt(t, dir, "conflict.wav.data",
		seg(100, 125, lbl(fFrom, "Kiwi", "Male", 100)),
		seg(110, 130, lbl(fFrom, "Kiwi", "Female", 100)),
		seg(100, 130, lbl(fTo, "Kiwi", "", 70)),
	)
	out, err := CallsPropagateFolder(CallsPropagateFolderInput{
		Folder: dir, FromFilter: fFrom, ToFilter: fTo, Species: "Kiwi",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if out.SkippedConflict != 1 || len(out.Conflicts) != 1 {
		t.Fatalf("expected one conflict, got %+v", out)
	}
	if out.Conflicts[0].File == "" {
		t.Errorf("conflict should be tagged with file path, got %+v", out.Conflicts[0])
	}
}
package calls

import (
	"fmt"
	"os"

	"skraak/utils"
)

type CallsPropagateInput struct {
	File string `json:"file"`
	FromFilter string `json:"from_filter"`
	ToFilter string `json:"to_filter"`
	Species string `json:"species"`
}

type CallsPropagateOutput struct {
	File string `json:"file"`
	FromFilter string `json:"from_filter"`
	ToFilter string `json:"to_filter"`
	Species string `json:"species"`
	FiltersMissing bool `json:"filters_missing,omitempty"`
	TargetsExamined int `json:"targets_examined"`
	Propagated int `json:"propagated"`
	SkippedNoOverlap int `json:"skipped_no_overlap"`
	SkippedConflict int `json:"skipped_conflict"`
	Conflicts []PropagateConflict `json:"conflicts,omitempty"`
	Changes []PropagateChange `json:"changes,omitempty"`
	Error string `json:"error,omitempty"`
}

type CallsPropagateFolderInput struct {
	Folder string `json:"folder"`
	FromFilter string `json:"from_filter"`
	ToFilter string `json:"to_filter"`
	Species string `json:"species"`
}

type CallsPropagateFolderOutput struct {
	Folder string `json:"folder"`
	FromFilter string `json:"from_filter"`
	ToFilter string `json:"to_filter"`
	Species string `json:"species"`
	FilesTotal int `json:"files_total"`
	FilesWithBothFilters int `json:"files_with_both_filters"`
	FilesSkippedNoFilter int `json:"files_skipped_no_filter"`
	FilesChanged int `json:"files_changed"`
	FilesErrored int `json:"files_errored"`
	TargetsExamined int `json:"targets_examined"`
	Propagated int `json:"propagated"`
	SkippedNoOverlap int `json:"skipped_no_overlap"`
	SkippedConflict int `json:"skipped_conflict"`
	Conflicts []PropagateConflict `json:"conflicts,omitempty"`
	Errors []CallsPropagateOutput `json:"errors,omitempty"`
	Error string `json:"error,omitempty"`
}

type PropagateConflict struct {
	File string `json:"file,omitempty"`
	TargetStart float64 `json:"target_start"`
	TargetEnd float64 `json:"target_end"`
	TargetCallType string `json:"target_calltype,omitempty"`
	SourceChoices []PropagateSourceChoice `json:"source_choices"`
}

type PropagateSourceChoice struct {
	Start float64 `json:"start"`
	End float64 `json:"end"`
	Species string `json:"species"`
	CallType string `json:"calltype,omitempty"`
}

type PropagateChange struct {
	TargetStart float64 `json:"target_start"`
	TargetEnd float64 `json:"target_end"`
	PrevSpecies string `json:"prev_species"`
	PrevCallType string `json:"prev_calltype,omitempty"`
	PrevCertainty int `json:"prev_certainty"`
	NewSpecies string `json:"new_species"`
	NewCallType string `json:"new_calltype,omitempty"`
	NewCertainty int `json:"new_certainty"`
}

// CallsPropagate copies verified classifications (certainty==100) from one filter's
// segments to overlapping target segments of another filter, within a single .data file.
// Target labels with certainty==70 (ML-unverified) or certainty==0 (Don't Know / Noise)
// are updated; targets at certainty==100 (human-verified) and certainty==90 (already
// propagated) are left alone. Only source labels matching --species are considered.
// Propagated target labels are set to certainty=90 and file reviewer is set to "Skraak".
func CallsPropagate(input CallsPropagateInput) (CallsPropagateOutput, error) {
	output := CallsPropagateOutput{
		File: input.File,
		FromFilter: input.FromFilter,
		ToFilter: input.ToFilter,
		Species: input.Species,
	}
	if err := validatePropagateInput(&output, input); err != nil {
		return output, err
	}

	df, err := utils.ParseDataFile(input.File)
	if err != nil {
		output.Error = fmt.Sprintf("parse %s: %v", input.File, err)
		return output, fmt.Errorf("%s", output.Error)
	}

	// Fast path: skip files that don't contain both filters at all.
	if !hasBothFilters(df, input.FromFilter, input.ToFilter) {
		output.FiltersMissing = true
		return output, nil
	}

	sources := collectPropagateSources(df, input.FromFilter, input.Species)
	propagateTargets(df, sources, input, &output)

	if output.Propagated > 0 {
		df.Meta.Reviewer = "Skraak"
		if err := df.Write(input.File); err != nil {
			output.Error = fmt.Sprintf("write %s: %v", input.File, err)
			return output, fmt.Errorf("%s", output.Error)
		}
	}
	return output, nil
}

// validatePropagateInput checks required fields and file existence
func validatePropagateInput(output *CallsPropagateOutput, input CallsPropagateInput) error {
	checks := []struct {
		val string
		msg string
	}{
		{input.File, "--file is required"},
		{input.FromFilter, "--from is required"},
		{input.ToFilter, "--to is required"},
		{input.Species, "--species is required"},
	}
	for _, c := range checks {
		if c.val == "" {
			output.Error = c.msg
			return fmt.Errorf("%s", c.msg)
		}
	}
	if input.FromFilter == input.ToFilter {
		output.Error = "--from and --to must differ"
		return fmt.Errorf("%s", output.Error)
	}
	if _, err := os.Stat(input.File); os.IsNotExist(err) {
		output.Error = fmt.Sprintf("file not found: %s", input.File)
		return fmt.Errorf("%s", output.Error)
	}
	return nil
}

// hasBothFilters checks whether the data file contains both from and to filters
func hasBothFilters(df *utils.DataFile, fromFilter, toFilter string) bool {
	hasFrom, hasTo := false, false
	for _, seg := range df.Segments {
		for _, lbl := range seg.Labels {
			if lbl.Filter == fromFilter {
				hasFrom = true
			}
			if lbl.Filter == toFilter {
				hasTo = true
			}
			if hasFrom && hasTo {
				return true
			}
		}
	}
	return false
}

// sourceRef pairs a segment with its matching source label
type sourceRef struct {
	seg *utils.Segment
	label *utils.Label
}

// collectPropagateSources gathers verified source labels (certainty==100) for the given filter/species
func collectPropagateSources(df *utils.DataFile, fromFilter, species string) []sourceRef {
	var sources []sourceRef
	for _, seg := range df.Segments {
		for _, lbl := range seg.Labels {
			if lbl.Filter == fromFilter && lbl.Species == species && lbl.Certainty == 100 {
				sources = append(sources, sourceRef{seg: seg, label: lbl})
				break
			}
		}
	}
	return sources
}

// propagateTargets iterates target segments, finds overlapping sources, and applies agreed classifications
func propagateTargets(df *utils.DataFile, sources []sourceRef, input CallsPropagateInput, output *CallsPropagateOutput) {
	for _, tSeg := range df.Segments {
		toLabel := findUpdatableTargetLabel(tSeg.Labels, input.ToFilter)
		if toLabel == nil {
			continue
		}
		output.TargetsExamined++
		overlaps := findOverlappingSources(sources, tSeg)
		if len(overlaps) == 0 {
			output.SkippedNoOverlap++
			continue
		}
		agreedCallType, conflict := resolveCallType(overlaps)
		if conflict {
			output.SkippedConflict++
			output.Conflicts = append(output.Conflicts, buildConflictRecord(tSeg, toLabel, overlaps))
			continue
		}
		applyPropagation(toLabel, input.Species, agreedCallType, tSeg, output)
	}
}

// findUpdatableTargetLabel finds a target label with certainty 70 or 0 for the given filter
func findUpdatableTargetLabel(labels []*utils.Label, toFilter string) *utils.Label {
	for _, lbl := range labels {
		if lbl.Filter == toFilter && (lbl.Certainty == 70 || lbl.Certainty == 0) {
			return lbl
		}
	}
	return nil
}

// findOverlappingSources returns sources whose segments overlap with the target segment
func findOverlappingSources(sources []sourceRef, tSeg *utils.Segment) []sourceRef {
	var overlaps []sourceRef
	for _, s := range sources {
		if s.seg.StartTime < tSeg.EndTime && tSeg.StartTime < s.seg.EndTime {
			overlaps = append(overlaps, s)
		}
	}
	return overlaps
}

// resolveCallType checks if all overlapping sources agree on a call type.
// Returns the agreed call type and whether there is a conflict.
func resolveCallType(overlaps []sourceRef) (string, bool) {
	agreedCallType := overlaps[0].label.CallType
	for _, s := range overlaps[1:] {
		if s.label.CallType != agreedCallType {
			return "", true
		}
	}
	return agreedCallType, false
}

// buildConflictRecord creates a PropagateConflict from overlapping disagreeing sources
func buildConflictRecord(tSeg *utils.Segment, toLabel *utils.Label, overlaps []sourceRef) PropagateConflict {
	choices := make([]PropagateSourceChoice, 0, len(overlaps))
	for _, s := range overlaps {
		choices = append(choices, PropagateSourceChoice{
			Start: s.seg.StartTime,
			End: s.seg.EndTime,
			Species: s.label.Species,
			CallType: s.label.CallType,
		})
	}
	return PropagateConflict{
		TargetStart: tSeg.StartTime,
		TargetEnd: tSeg.EndTime,
		TargetCallType: toLabel.CallType,
		SourceChoices: choices,
	}
}

// applyPropagation updates the target label and records the change
func applyPropagation(toLabel *utils.Label, species, callType string, tSeg *utils.Segment, output *CallsPropagateOutput) {
	change := PropagateChange{
		TargetStart: tSeg.StartTime,
		TargetEnd: tSeg.EndTime,
		PrevSpecies: toLabel.Species,
		PrevCallType: toLabel.CallType,
		PrevCertainty: toLabel.Certainty,
		NewSpecies: species,
		NewCallType: callType,
		NewCertainty: 90,
	}
	toLabel.Species = species
	toLabel.CallType = callType
	toLabel.Certainty = 90
	output.Propagated++
	output.Changes = append(output.Changes, change)
}

// CallsPropagateFolder runs CallsPropagate against every .data file in a folder,
// aggregating counts. Files that do not contain both --from and --to filters are
// skipped silently (counted as files_skipped_no_filter). Parse/write errors on
// individual files are collected in Errors; they don't abort the run.
func CallsPropagateFolder(input CallsPropagateFolderInput) (CallsPropagateFolderOutput, error) {
	output := CallsPropagateFolderOutput{
		Folder: input.Folder,
		FromFilter: input.FromFilter,
		ToFilter: input.ToFilter,
		Species: input.Species,
	}
	if input.Folder == "" {
		output.Error = "--folder is required"
		return output, fmt.Errorf("%s", output.Error)
	}
	if input.FromFilter == "" {
		output.Error = "--from is required"
		return output, fmt.Errorf("%s", output.Error)
	}
	if input.ToFilter == "" {
		output.Error = "--to is required"
		return output, fmt.Errorf("%s", output.Error)
	}
	if input.Species == "" {
		output.Error = "--species is required"
		return output, fmt.Errorf("%s", output.Error)
	}
	if input.FromFilter == input.ToFilter {
		output.Error = "--from and --to must differ"
		return output, fmt.Errorf("%s", output.Error)
	}

	info, err := os.Stat(input.Folder)
	if err != nil {
		output.Error = fmt.Sprintf("folder not found: %s", input.Folder)
		return output, fmt.Errorf("%s", output.Error)
	}
	if !info.IsDir() {
		output.Error = fmt.Sprintf("not a directory: %s", input.Folder)
		return output, fmt.Errorf("%s", output.Error)
	}

	files, err := utils.FindDataFiles(input.Folder)
	if err != nil {
		output.Error = fmt.Sprintf("list .data files: %v", err)
		return output, fmt.Errorf("%s", output.Error)
	}
	output.FilesTotal = len(files)

	for _, f := range files {
		fileOut, err := CallsPropagate(CallsPropagateInput{
			File: f,
			FromFilter: input.FromFilter,
			ToFilter: input.ToFilter,
			Species: input.Species,
		})
		if err != nil {
			output.FilesErrored++
			output.Errors = append(output.Errors, fileOut)
			continue
		}
		if fileOut.FiltersMissing {
			output.FilesSkippedNoFilter++
			continue
		}
		output.FilesWithBothFilters++
		output.TargetsExamined += fileOut.TargetsExamined
		output.Propagated += fileOut.Propagated
		output.SkippedNoOverlap += fileOut.SkippedNoOverlap
		output.SkippedConflict += fileOut.SkippedConflict
		if fileOut.Propagated > 0 {
			output.FilesChanged++
		}
		for _, c := range fileOut.Conflicts {
			c.File = f
			output.Conflicts = append(output.Conflicts, c)
		}
	}
	return output, nil
}
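// Worked example (illustrative, not from the original source) of the half-open
// overlap test used by findOverlappingSources: intervals [a0,a1) and [b0,b1)
// overlap iff a0 < b1 && b0 < a1, so segments that merely touch do not count.
//
//	// source 100-125 vs target 125-150: 100 < 150, but 125 < 125 is false → no overlap
//	// source 100-126 vs target 125-150: 100 < 150 && 125 < 126 → overlap
//
// This is the behaviour pinned down by TestPropagate_OverlapBoundaryExclusive
// and TestPropagate_OverlapPartial above.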
package calls

import (
	"path/filepath"
	"testing"

	"skraak/utils"
)

func TestCallsModifyBookmark(t *testing.T) {
	// Create a temp .data file with a bookmarked segment
	tmpDir := t.TempDir()
	tmpFile := filepath.Join(tmpDir, "test.data")
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "test", Duration: 60},
		Segments: []*utils.Segment{{
			StartTime: 10.0,
			EndTime: 15.0,
			FreqLow: 100,
			FreqHigh: 5000,
			Labels: []*utils.Label{
				{Species: "Kiwi", Certainty: 80, Filter: "myfilter", CallType: "Duet", Bookmark: true},
			},
		}},
	}
	if err := df.Write(tmpFile); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}

	// Test 1: Adding bookmark when already true should do nothing
	bookmark := true
	result, err := CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "10-15",
		Certainty: 80,
		Bookmark: &bookmark,
	})
	// Should return error "no changes needed"
	if err == nil {
		t.Errorf("expected error 'no changes needed' when bookmark already true, got nil")
	}
	if result.Error != "No changes needed: all values already match" {
		t.Errorf("expected 'no changes needed' error, got: %s", result.Error)
	}

	// Verify bookmark is still true in the file
	df2, err := utils.ParseDataFile(tmpFile)
	if err != nil {
		t.Fatalf("failed to parse file: %v", err)
	}
	if !df2.Segments[0].Labels[0].Bookmark {
		t.Errorf("bookmark should still be true, got false")
	}
}

func TestCallsModifyBookmarkFalse(t *testing.T) {
	// Create a temp .data file WITHOUT a bookmark
	tmpDir := t.TempDir()
	tmpFile := filepath.Join(tmpDir, "test.data")
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "test", Duration: 60},
		Segments: []*utils.Segment{{
			StartTime: 10.0,
			EndTime: 15.0,
			FreqLow: 100,
			FreqHigh: 5000,
			Labels: []*utils.Label{
				{Species: "Kiwi", Certainty: 80, Filter: "myfilter", CallType: "Duet", Bookmark: false},
			},
		}},
	}
	if err := df.Write(tmpFile); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}

	// Test: Adding bookmark when false should set it to true
	bookmark := true
	result, err := CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "10-15",
		Certainty: 80,
		Bookmark: &bookmark,
	})
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if result.Bookmark == nil || !*result.Bookmark {
		t.Errorf("expected bookmark=true in result, got %v", result.Bookmark)
	}

	// Verify bookmark is true in the file
	df2, err := utils.ParseDataFile(tmpFile)
	if err != nil {
		t.Fatalf("failed to parse file: %v", err)
	}
	if !df2.Segments[0].Labels[0].Bookmark {
		t.Errorf("bookmark should be true, got false")
	}
}

func TestCallsModifyCommentAdditive(t *testing.T) {
	// Create a temp .data file with an existing comment
	tmpDir := t.TempDir()
	tmpFile := filepath.Join(tmpDir, "test.data")
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "test", Duration: 60},
		Segments: []*utils.Segment{{
			StartTime: 10.0,
			EndTime: 15.0,
			FreqLow: 100,
			FreqHigh: 5000,
			Labels: []*utils.Label{
				{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Comment: "First observation"},
			},
		}},
	}
	if err := df.Write(tmpFile); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}

	// Test: Adding comment should be additive
	result, err := CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "10-15",
		Certainty: 80,
		Comment: "Good example",
	})
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	expectedComment := "First observation | Good example"
	if result.Comment != expectedComment {
		t.Errorf("expected comment=%q, got %q", expectedComment, result.Comment)
	}

	// Verify comment in file
	df2, err := utils.ParseDataFile(tmpFile)
	if err != nil {
		t.Fatalf("failed to parse file: %v", err)
	}
	if df2.Segments[0].Labels[0].Comment != expectedComment {
		t.Errorf("expected comment in file=%q, got %q", expectedComment, df2.Segments[0].Labels[0].Comment)
	}
}

func TestCallsModifyCommentAdditiveMultiple(t *testing.T) {
	// Create a temp .data file and add multiple comments
	tmpDir := t.TempDir()
	tmpFile := filepath.Join(tmpDir, "test.data")
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "test", Duration: 60},
		Segments: []*utils.Segment{{
			StartTime: 10.0,
			EndTime: 15.0,
			FreqLow: 100,
			FreqHigh: 5000,
			Labels: []*utils.Label{
				{Species: "Kiwi", Certainty: 80, Filter: "myfilter"},
			},
		}},
	}
	if err := df.Write(tmpFile); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}

	// Add first comment
	_, err := CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "10-15",
		Certainty: 80,
		Comment: "First",
	})
	if err != nil {
		t.Fatalf("unexpected error on first comment: %v", err)
	}

	// Add second comment
	_, err = CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "10-15",
		Certainty: 80,
		Comment: "Second",
	})
	if err != nil {
		t.Fatalf("unexpected error on second comment: %v", err)
	}

	// Add third comment
	result, err := CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "10-15",
		Certainty: 80,
		Comment: "Third",
	})
	if err != nil {
		t.Fatalf("unexpected error on third comment: %v", err)
	}
	expectedComment := "First | Second | Third"
	if result.Comment != expectedComment {
		t.Errorf("expected comment=%q, got %q", expectedComment, result.Comment)
	}
}

func TestCallsModifyCommentTooLong(t *testing.T) {
	// Create a temp .data file with an existing long comment
	tmpDir := t.TempDir()
	tmpFile := filepath.Join(tmpDir, "test.data")
	existingComment := "This is a fairly long existing comment that takes up space"
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "test", Duration: 60},
		Segments: []*utils.Segment{{
			StartTime: 10.0,
			EndTime: 15.0,
			FreqLow: 100,
			FreqHigh: 5000,
			Labels: []*utils.Label{
				{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Comment: existingComment},
			},
		}},
	}
	if err := df.Write(tmpFile); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}

	// Test: Adding a long comment that would exceed 140 chars should fail
	longNewComment := "This is another very long comment that when combined with the existing one will exceed the limit"
	result, err := CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "10-15",
		Certainty: 80,
		Comment: longNewComment,
	})
	if err == nil {
		t.Errorf("expected error for combined comment exceeding 140 chars, got nil")
	}
	if result.Error == "" {
		t.Errorf("expected error message, got empty")
	}

	// Verify original comment is preserved
	df2, err := utils.ParseDataFile(tmpFile)
	if err != nil {
		t.Fatalf("failed to parse file: %v", err)
	}
	if df2.Segments[0].Labels[0].Comment != existingComment {
		t.Errorf("original comment should be preserved, got %q", df2.Segments[0].Labels[0].Comment)
	}
}

func TestCallsModifyPreservesBookmarkOnOtherChange(t *testing.T) {
	// Create a temp .data file with a bookmark
	tmpDir := t.TempDir()
	tmpFile := filepath.Join(tmpDir, "test.data")
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "test", Duration: 60},
		Segments: []*utils.Segment{{
			StartTime: 10.0,
			EndTime: 15.0,
			FreqLow: 100,
			FreqHigh: 5000,
			Labels: []*utils.Label{
				{Species: "Kiwi", Certainty: 80, Filter: "myfilter", Bookmark: true},
			},
		}},
	}
	if err := df.Write(tmpFile); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}

	// Change certainty (without passing --bookmark) - bookmark should be preserved
	result, err := CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "10-15",
		Certainty: 100,
		// No Bookmark set
	})
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if result.Bookmark != nil {
		t.Errorf("bookmark should not be in output when not changed, got %v", result.Bookmark)
	}

	// Verify bookmark is still true in the file
	df2, err := utils.ParseDataFile(tmpFile)
	if err != nil {
		t.Fatalf("failed to parse file: %v", err)
	}
	if !df2.Segments[0].Labels[0].Bookmark {
		t.Errorf("bookmark should still be true after changing certainty, got false")
	}
}

func TestCallsModifyInvalidSegment(t *testing.T) {
	tmpDir := t.TempDir()
	tmpFile := filepath.Join(tmpDir, "test.data")
	df := &utils.DataFile{
		Meta: &utils.DataMeta{Operator: "test", Duration: 60},
		Segments: []*utils.Segment{{
			StartTime: 10.0,
			EndTime: 15.0,
			FreqLow: 100,
			FreqHigh: 5000,
			Labels: []*utils.Label{
				{Species: "Kiwi", Certainty: 80, Filter: "myfilter"},
			},
		}},
	}
	if err := df.Write(tmpFile); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}

	// Test: Non-existent segment should error
	result, err := CallsModify(CallsModifyInput{
		File: tmpFile,
		Reviewer: "tester",
		Filter: "myfilter",
		Segment: "99-100",
		Certainty: 80,
	})
	if err == nil {
		t.Errorf("expected error for non-existent segment, got nil")
	}
	if result.Error == "" {
		t.Errorf("expected error message, got empty")
	}
}
package calls

import (
	"fmt"
	"math"
	"os"
	"strings"

	"skraak/utils"
)

// CallsModifyInput defines the input for the modify tool
type CallsModifyInput struct {
	File      string `json:"file"`
	Reviewer  string `json:"reviewer"`
	Filter    string `json:"filter"`
	Segment   string `json:"segment"`
	Certainty int    `json:"certainty"`
	Species   string `json:"species"`
	Bookmark  *bool  `json:"bookmark"`
	Comment   string `json:"comment"`
}

// CallsModifyOutput defines the output for the modify tool
type CallsModifyOutput struct {
	File          string `json:"file"`
	SegmentStart  int    `json:"segment_start"`
	SegmentEnd    int    `json:"segment_end"`
	Species       string `json:"species,omitempty"`
	CallType      string `json:"calltype,omitempty"`
	Certainty     int    `json:"certainty,omitempty"`
	Bookmark      *bool  `json:"bookmark,omitempty"`
	Comment       string `json:"comment,omitempty"`
	PreviousValue string `json:"previous_value,omitempty"`
	Error         string `json:"error,omitempty"`
}

// validateModifyInput checks required fields and comment constraints.
func validateModifyInput(input CallsModifyInput) error {
	if input.File == "" {
		return fmt.Errorf("--file is required")
	}
	if input.Reviewer == "" {
		return fmt.Errorf("--reviewer is required")
	}
	if input.Filter == "" {
		return fmt.Errorf("--filter is required")
	}
	if input.Segment == "" {
		return fmt.Errorf("--segment is required")
	}
	if len(input.Comment) > 140 {
		return fmt.Errorf("--comment must be 140 characters or less")
	}
	for i, r := range input.Comment {
		if r > 127 {
			return fmt.Errorf("--comment must be ASCII only (non-ASCII at position %d)", i)
		}
	}
	return nil
}

// resolveSpecies parses species+calltype from the input species string.
// If input species is empty, keeps the existing label values.
func resolveSpecies(inputSpecies string, label *utils.Label) (species, callType string) {
	if inputSpecies == "" {
		return label.Species, label.CallType
	}
	if before, after, ok := strings.Cut(inputSpecies, "+"); ok {
		return before, after
	}
	return inputSpecies, ""
}

// hasModifyChanges checks whether any field would actually change.
func hasModifyChanges(newSpecies, newCallType string, input CallsModifyInput, label *utils.Label) bool {
	if newSpecies != label.Species || newCallType != label.CallType {
		return true
	}
	if input.Certainty != label.Certainty {
		return true
	}
	if input.Bookmark != nil && *input.Bookmark != label.Bookmark {
		return true
	}
	if input.Comment != "" {
		return true
	}
	return false
}

// applyLabelChanges updates the label and data file, populating the output.
func applyLabelChanges(label *utils.Label, dataFile *utils.DataFile, input CallsModifyInput, newSpecies, newCallType string, output *CallsModifyOutput) error {
	dataFile.Meta.Reviewer = input.Reviewer
	label.Species = newSpecies
	label.CallType = newCallType
	output.Species = newSpecies
	output.CallType = newCallType
	label.Certainty = input.Certainty
	output.Certainty = input.Certainty
	if input.Bookmark != nil && *input.Bookmark != label.Bookmark {
		label.Bookmark = *input.Bookmark
		output.Bookmark = input.Bookmark
	}
	if input.Comment != "" {
		var newComment string
		if label.Comment != "" {
			newComment = label.Comment + " | " + input.Comment
		} else {
			newComment = input.Comment
		}
		if len(newComment) > 140 {
			return fmt.Errorf("combined comment exceeds 140 characters (%d)", len(newComment))
		}
		label.Comment = newComment
		output.Comment = newComment
	}
	return nil
}

// CallsModify modifies a label in a .data file
func CallsModify(input CallsModifyInput) (CallsModifyOutput, error) {
	var output CallsModifyOutput
	if err := validateModifyInput(input); err != nil {
		output.Error = err.Error()
		return output, err
	}
	startTime, endTime, err := parseSegmentRange(input.Segment)
	if err != nil {
		output.Error = err.Error()
		return output, err
	}
	output.File = input.File
	output.SegmentStart = startTime
	output.SegmentEnd = endTime
	if _, err := os.Stat(input.File); os.IsNotExist(err) {
		output.Error = fmt.Sprintf("File not found: %s", input.File)
		return output, fmt.Errorf("%s", output.Error)
	}
	dataFile, err := utils.ParseDataFile(input.File)
	if err != nil {
		output.Error = fmt.Sprintf("Failed to parse file: %v", err)
		return output, fmt.Errorf("%s", output.Error)
	}
	segment := findSegment(dataFile.Segments, startTime, endTime, input.Filter)
	if segment == nil {
		output.Error = fmt.Sprintf("No segment found matching time range %d-%d", startTime, endTime)
		return output, fmt.Errorf("%s", output.Error)
	}
	targetLabel := findLabelByFilter(segment, input.Filter)
	if targetLabel == nil {
		output.Error = fmt.Sprintf("No label found with filter '%s' in segment %d-%d", input.Filter, startTime, endTime)
		return output, fmt.Errorf("%s", output.Error)
	}
	output.PreviousValue = formatLabel(targetLabel)
	newSpecies, newCallType := resolveSpecies(input.Species, targetLabel)
	if !hasModifyChanges(newSpecies, newCallType, input, targetLabel) {
		output.Error = "No changes needed: all values already match"
		return output, fmt.Errorf("%s", output.Error)
	}
	if err := applyLabelChanges(targetLabel, dataFile, input, newSpecies, newCallType, &output); err != nil {
		output.Error = err.Error()
		return output, err
	}
	if err := dataFile.Write(input.File); err != nil {
		output.Error = fmt.Sprintf("Failed to save file: %v", err)
		return output, fmt.Errorf("%s", output.Error)
	}
	return output, nil
}

// findLabelByFilter finds the first label matching the given filter in a segment.
func findLabelByFilter(segment *utils.Segment, filter string) *utils.Label {
	for _, label := range segment.Labels {
		if label.Filter == filter {
			return label
		}
	}
	return nil
}

// parseSegmentRange parses "12-15" format into start and end integers
func parseSegmentRange(s string) (int, int, error) {
	parts := strings.Split(s, "-")
	if len(parts) != 2 {
		return 0, 0, fmt.Errorf("invalid segment format: %s (expected start-end, e.g., 12-15)", s)
	}
	var start, end int
	if _, err := fmt.Sscanf(parts[0], "%d", &start); err != nil {
		return 0, 0, fmt.Errorf("invalid start time: %s", parts[0])
	}
	if _, err := fmt.Sscanf(parts[1], "%d", &end); err != nil {
		return 0, 0, fmt.Errorf("invalid end time: %s", parts[1])
	}
	if start < 0 || end < 0 {
		return 0, 0, fmt.Errorf("times must be non-negative")
	}
	if start >= end {
		return 0, 0, fmt.Errorf("start time must be less than end time")
	}
	return start, end, nil
}

// findSegment finds a segment matching the time range using floor/ceil matching.
// It also checks that the segment contains a label with the specified filter,
// so that duplicate segments (same time range, different filters) are resolved correctly.
func findSegment(segments []*utils.Segment, startTime, endTime int, filter string) *utils.Segment {
	for _, seg := range segments {
		segStart := int(math.Floor(seg.StartTime))
		segEnd := int(math.Ceil(seg.EndTime))
		if segEnd == segStart {
			segEnd = segStart + 1 // minimum 1 second
		}
		if segStart == startTime && segEnd == endTime {
			for _, label := range seg.Labels {
				if label.Filter == filter {
					return seg
				}
			}
		}
	}
	return nil
}

// formatLabel formats a label for display
func formatLabel(label *utils.Label) string {
	result := label.Species
	if label.CallType != "" {
		result += "+" + label.CallType
	}
	result += fmt.Sprintf(" (%d%%)", label.Certainty)
	return result
}
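// Illustrative sketch (not part of the original source): one way CallsModify
// might be invoked to bump certainty and bookmark the 10-15s segment labelled
// by a filter. The file path and field values here are assumptions, shown only
// to make the input/output shapes above concrete.
package calls

import "fmt"

func ExampleCallsModify() {
	bookmark := true
	out, err := CallsModify(CallsModifyInput{
		File:      "20230610_150000.wav.data", // hypothetical path
		Reviewer:  "tester",
		Filter:    "myfilter",
		Segment:   "10-15",
		Certainty: 100,
		Bookmark:  &bookmark,
	})
	if err != nil {
		fmt.Println("modify failed:", err)
		return
	}
	// PreviousValue records the label as it was before the change, e.g. "Kiwi (80%)".
	fmt.Println("previous value:", out.PreviousValue)
}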
package calls

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"skraak/utils"
)

// CallsFromRavenInput defines the input for the calls-from-raven tool
type CallsFromRavenInput struct {
	Folder          string          `json:"folder"`
	File            string          `json:"file"`
	Delete          bool            `json:"delete"`
	ProgressHandler ProgressHandler `json:"-"` // Optional progress callback
}

// CallsFromRavenOutput defines the output for the calls-from-raven tool
type CallsFromRavenOutput struct {
	Calls            []ClusteredCall `json:"calls"`
	TotalCalls       int             `json:"total_calls"`
	SpeciesCount     map[string]int  `json:"species_count"`
	DataFilesWritten int             `json:"data_files_written"`
	DataFilesSkipped int             `json:"data_files_skipped"`
	FilesProcessed   int             `json:"files_processed"`
	FilesDeleted     int             `json:"files_deleted"`
	Filter           string          `json:"filter"`
	Error            *string         `json:"error,omitempty"`
}

// ravenSource implements CallSource for Raven selection files
type ravenSource struct{}

func (ravenSource) Name() string { return "Raven" }

func (ravenSource) FindFiles(folder string) ([]string, error) {
	var files []string
	entries, err := os.ReadDir(folder)
	if err != nil {
		return nil, err
	}
	for _, entry := range entries {
		name := entry.Name()
		if strings.HasSuffix(name, ".selections.txt") {
			files = append(files, filepath.Join(folder, name))
		}
	}
	return files, nil
}

func (ravenSource) ProcessFile(ravenFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
	return processRavenFileCached(ravenFile, cache)
}

// CallsFromRaven processes Raven selection files and writes .data files
func CallsFromRaven(input CallsFromRavenInput) (CallsFromRavenOutput, error) {
	src := ravenSource{}
	commonInput := CallsFromSourceInput(input)
	commonOutput, err := callsFromSource(src, commonInput)

	// Convert to Raven-specific output type
	var output CallsFromRavenOutput
	output.Calls = commonOutput.Calls
	output.TotalCalls = commonOutput.TotalCalls
	output.SpeciesCount = commonOutput.SpeciesCount
	output.DataFilesWritten = commonOutput.DataFilesWritten
	output.DataFilesSkipped = commonOutput.DataFilesSkipped
	output.FilesProcessed = commonOutput.FilesProcessed
	output.FilesDeleted = commonOutput.FilesDeleted
	output.Filter = commonOutput.Filter
	output.Error = commonOutput.Error
	return output, err
}

// RavenSelection represents a single Raven selection
type RavenSelection struct {
	StartTime float64
	EndTime   float64
	FreqLow   float64
	FreqHigh  float64
	Species   string
}

// ravenColumnIndices holds the column index positions for a Raven file
type ravenColumnIndices struct {
	beginTimeIdx int
	endTimeIdx   int
	lowFreqIdx   int
	highFreqIdx  int
	speciesIdx   int
}

// parseRavenHeader finds column indices from a tab-separated header line
func parseRavenHeader(header []string) (ravenColumnIndices, error) {
	idx := ravenColumnIndices{beginTimeIdx: -1, endTimeIdx: -1, lowFreqIdx: -1, highFreqIdx: -1, speciesIdx: -1}
	for i, col := range header {
		switch col {
		case "Begin Time (s)":
			idx.beginTimeIdx = i
		case "End Time (s)":
			idx.endTimeIdx = i
		case "Low Freq (Hz)":
			idx.lowFreqIdx = i
		case "High Freq (Hz)":
			idx.highFreqIdx = i
		case "Species":
			idx.speciesIdx = i
		}
	}
	if idx.beginTimeIdx == -1 || idx.endTimeIdx == -1 || idx.speciesIdx == -1 {
		return idx, fmt.Errorf("missing required columns in Raven file")
	}
	return idx, nil
}

// parseRavenSelections reads all selection rows from a scanner and returns parsed selections
func parseRavenSelections(scanner *bufio.Scanner, idx ravenColumnIndices) ([]RavenSelection, error) {
	var selections []RavenSelection
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			continue
		}
		fields := strings.Split(line, "\t")
		if len(fields) <= idx.speciesIdx {
			continue
		}
		sel, err := parseRavenRow(fields, idx)
		if err != nil {
			return nil, err
		}
		selections = append(selections, sel)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading file: %w", err)
	}
	return selections, nil
}

// parseRavenRow parses a single tab-separated row into a RavenSelection
func parseRavenRow(fields []string, idx ravenColumnIndices) (RavenSelection, error) {
	var sel RavenSelection
	startTime, err := strconv.ParseFloat(fields[idx.beginTimeIdx], 64)
	if err != nil {
		return sel, fmt.Errorf("failed to parse begin time %q: %w", fields[idx.beginTimeIdx], err)
	}
	sel.StartTime = startTime
	endTime, err := strconv.ParseFloat(fields[idx.endTimeIdx], 64)
	if err != nil {
		return sel, fmt.Errorf("failed to parse end time %q: %w", fields[idx.endTimeIdx], err)
	}
	sel.EndTime = endTime
	if idx.lowFreqIdx >= 0 && idx.lowFreqIdx < len(fields) {
		freqLow, err := strconv.ParseFloat(fields[idx.lowFreqIdx], 64)
		if err != nil {
			return sel, fmt.Errorf("failed to parse low freq %q: %w", fields[idx.lowFreqIdx], err)
		}
		sel.FreqLow = freqLow
	}
	if idx.highFreqIdx >= 0 && idx.highFreqIdx < len(fields) {
		freqHigh, err := strconv.ParseFloat(fields[idx.highFreqIdx], 64)
		if err != nil {
			return sel, fmt.Errorf("failed to parse high freq %q: %w", fields[idx.highFreqIdx], err)
		}
		sel.FreqHigh = freqHigh
	}
	sel.Species = fields[idx.speciesIdx]
	return sel, nil
}

// deriveWAVBaseName extracts the base WAV filename from a Raven .selections.txt filename
func deriveWAVBaseName(ravenFile string) string {
	base := filepath.Base(ravenFile)
	nameWithoutSuffix := strings.TrimSuffix(base, ".selections.txt")
	idx := strings.Index(nameWithoutSuffix, ".Table.")
	if idx > 0 {
		nameWithoutSuffix = nameWithoutSuffix[:idx]
	}
	return nameWithoutSuffix
}

// processRavenFileCached processes a single Raven selection file using a DirCache for WAV lookup
func processRavenFileCached(ravenFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
	file, err := os.Open(ravenFile)
	if err != nil {
		return nil, false, false, fmt.Errorf("failed to open file: %w", err)
	}
	defer func() { _ = file.Close() }()

	scanner := bufio.NewScanner(file)
	if !scanner.Scan() {
		return nil, false, false, fmt.Errorf("empty file")
	}
	header := strings.Split(scanner.Text(), "\t")
	idx, err := parseRavenHeader(header)
	if err != nil {
		return nil, false, false, err
	}
	selections, err := parseRavenSelections(scanner, idx)
	if err != nil {
		return nil, false, false, err
	}
	if len(selections) == 0 {
		return nil, false, true, nil
	}

	// Find WAV file
	wavPath := resolveWAVPath(ravenFile, cache)
	if wavPath == "" {
		return nil, false, true, nil
	}
	sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
	if err != nil {
		return nil, false, true, nil
	}
	dataPath := wavPath + ".data"
	segments := buildRavenSegments(selections, sampleRate)
	meta := AviaNZMeta{Operator: "Raven", Duration: duration}
	reviewer := "None"
	meta.Reviewer = &reviewer
	if err := writeDotDataFileSafe(dataPath, segments, "Raven", meta); err != nil {
		return nil, false, false, err
	}
	var calls []ClusteredCall
	for _, sel := range selections {
		calls = append(calls, ClusteredCall{
			File:      wavPath,
			StartTime: sel.StartTime,
			EndTime:   sel.EndTime,
			EbirdCode: sel.Species,
			Segments:  1,
		})
	}
	return calls, true, false, nil
}

// resolveWAVPath finds the WAV file corresponding to a Raven file
func resolveWAVPath(ravenFile string, cache *DirCache) string {
	baseName := deriveWAVBaseName(ravenFile)
	if cache != nil {
		return cache.FindWAV(baseName)
	}
	return findWAVFile(filepath.Dir(ravenFile), baseName)
}

// buildRavenSegments converts Raven selections to AviaNZ segments
func buildRavenSegments(selections []RavenSelection, sampleRate int) []AviaNZSegment {
	var segments []AviaNZSegment
	for _, sel := range selections {
		labels := []AviaNZLabel{{
			Species:   sel.Species,
			Certainty: 70, // Default certainty for Raven (no confidence metric)
			Filter:    "Raven",
		}}
		// Use frequency range from Raven, or full band if not specified
		freqLow := sel.FreqLow
		freqHigh := sel.FreqHigh
		if freqLow == 0 && freqHigh == 0 {
			freqHigh = float64(sampleRate)
		}
		segment := AviaNZSegment{
			sel.StartTime,
			sel.EndTime,
			freqLow,
			freqHigh,
			labels,
		}
		segments = append(segments, segment)
	}
	return segments
}
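// Hedged sketch (not in the original source): parsing an in-memory Raven
// selection table with parseRavenHeader and parseRavenSelections, to make the
// expected tab-separated layout concrete. The table content is invented for
// illustration only.
package calls

import (
	"bufio"
	"fmt"
	"strings"
)

func ExampleParseRavenTable() {
	table := "Selection\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n" +
		"1\t0.0\t5.0\t1000\t5000\tKiwi\n"
	scanner := bufio.NewScanner(strings.NewReader(table))
	scanner.Scan() // consume the header line
	idx, err := parseRavenHeader(strings.Split(scanner.Text(), "\t"))
	if err != nil {
		fmt.Println(err)
		return
	}
	selections, err := parseRavenSelections(scanner, idx)
	if err != nil {
		fmt.Println(err)
		return
	}
	// Prints: Kiwi 0.0-5.0s
	fmt.Printf("%s %.1f-%.1fs\n", selections[0].Species, selections[0].StartTime, selections[0].EndTime)
}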
package calls

import (
	"os"
	"path/filepath"
	"testing"

	"skraak/utils"
)

func TestCallsFromPreds_EmptyFilterError(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "preds.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create a dummy WAV file (minimal valid WAV)
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)

	// Test with empty filter (should error)
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "",
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)

	// Should return error
	if err == nil {
		t.Error("expected error for empty filter, got nil")
	}
	if output.Error == nil || *output.Error == "" {
		t.Error("expected error message in output, got empty")
	}
}

func TestCallsFromPreds_NewDataFile(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predsST_test-filter_2025-01-01.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)

	// Test with filter parsed from filename
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "", // Will parse from filename
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	if output.Filter != "test-filter" {
		t.Errorf("expected filter 'test-filter', got '%s'", output.Filter)
	}

	// Verify .data file was created
	dataPath := wavPath + ".data"
	if _, err := os.Stat(dataPath); os.IsNotExist(err) {
		t.Error("expected .data file to be created")
	}

	// Verify content
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 1 {
		t.Errorf("expected 1 segment, got %d", len(df.Segments))
	}
	if len(df.Segments[0].Labels) != 1 {
		t.Errorf("expected 1 label, got %d", len(df.Segments[0].Labels))
	}
	if df.Segments[0].Labels[0].Filter != "test-filter" {
		t.Errorf("expected filter 'test-filter', got '%s'", df.Segments[0].Labels[0].Filter)
	}
}

func TestCallsFromPreds_ExistingDataFileSameFilter(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predsST_existing-filter_2025-01-01.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)

	// Create existing .data file with same filter
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Manual", "Reviewer": "David", "Duration": 10.0},[5.0, 8.0, 0, 44100, [{"species": "morepork", "certainty": 90, "filter": "existing-filter"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}

	// Test with same filter (should error)
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "", // Will parse from filename -> "existing-filter"
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)

	// Should return error
	if err == nil {
		t.Error("expected error for same filter, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}

	// Verify original .data file is unchanged
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 1 {
		t.Errorf("expected original 1 segment, got %d", len(df.Segments))
	}
	if df.Segments[0].Labels[0].Species != "morepork" {
		t.Errorf("expected original species 'morepork', got '%s'", df.Segments[0].Labels[0].Species)
	}
}

func TestCallsFromPreds_ExistingDataFileDifferentFilter(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predsST_new-filter_2025-01-01.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)

	// Create existing .data file with different filter
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Manual", "Reviewer": "David", "Duration": 10.0},[5.0, 8.0, 0, 44100, [{"species": "morepork", "certainty": 90, "filter": "old-filter"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}

	// Test with different filter (should merge)
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "", // Will parse from filename -> "new-filter"
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}

	// Verify .data file has merged content
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 2 {
		t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
	}

	// Check segments are sorted by start time
	if df.Segments[0].StartTime > df.Segments[1].StartTime {
		t.Error("expected segments to be sorted by start time")
	}

	// Check both filters are present
	filters := make(map[string]bool)
	for _, seg := range df.Segments {
		for _, label := range seg.Labels {
			filters[label.Filter] = true
		}
	}
	if !filters["old-filter"] {
		t.Error("expected 'old-filter' to be present")
	}
	if !filters["new-filter"] {
		t.Error("expected 'new-filter' to be present")
	}
}

func TestCallsFromPreds_ExistingDataFileParseError(t *testing.T) {
	// Create a temp CSV file
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predsST_test-filter_2025-01-01.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)

	// Create corrupted .data file
	dataPath := wavPath + ".data"
	corruptedData := `this is not valid json`
	if err := os.WriteFile(dataPath, []byte(corruptedData), 0644); err != nil {
		t.Fatal(err)
	}

	// Test (should error due to parse failure)
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "",
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)

	// Should return error
	if err == nil {
		t.Error("expected error for corrupted .data file, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}

	// Verify original file is unchanged
	content, err := os.ReadFile(dataPath)
	if err != nil {
		t.Fatal(err)
	}
	if string(content) != corruptedData {
		t.Error("expected corrupted file to remain unchanged")
	}
}

func TestCallsFromPreds_ExplicitFilter(t *testing.T) {
	// Create a temp CSV file with non-standard name
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "predictions.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)

	// Test with explicit filter
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "my-custom-filter",
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.Filter != "my-custom-filter" {
		t.Errorf("expected filter 'my-custom-filter', got '%s'", output.Filter)
	}

	// Verify .data file uses explicit filter
	dataPath := wavPath + ".data"
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if df.Segments[0].Labels[0].Filter != "my-custom-filter" {
		t.Errorf("expected filter 'my-custom-filter' in .data file, got '%s'", df.Segments[0].Labels[0].Filter)
	}
}

func TestCallsFromPreds_NonParsableFilenameNoFilter(t *testing.T) {
	// Create a temp CSV file with non-standard name that can't be parsed
	tmpDir := t.TempDir()
	csvPath := filepath.Join(tmpDir, "random_name.csv")
	csvContent := "file,start_time,end_time,kiwi\n./test.wav,0.0,3.0,1\n"
	if err := os.WriteFile(csvPath, []byte(csvContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create a dummy WAV file
	wavPath := filepath.Join(tmpDir, "test.wav")
	createMinimalWAV(t, wavPath, 44100, 10.0)

	// Test with no filter and non-parsable filename (should error)
	input := CallsFromPredsInput{
		CSVPath:         csvPath,
		Filter:          "",
		WriteDotData:    true,
		ProgressHandler: nil,
	}
	output, err := CallsFromPreds(input)

	// Should return error
	if err == nil {
		t.Error("expected error for unparsable filename with no filter, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}
}

// createMinimalWAV creates a minimal valid WAV file for testing
func createMinimalWAV(t *testing.T, path string, sampleRate int, duration float64) {
	t.Helper()
	numSamples := int(float64(sampleRate) * duration)
	dataSize := numSamples * 2 // 16-bit mono

	// WAV header (44 bytes)
	header := make([]byte, 44)

	// RIFF header
	copy(header[0:4], "RIFF")
	totalSize := uint32(36 + dataSize)
	header[4] = byte(totalSize)
	header[5] = byte(totalSize >> 8)
	header[6] = byte(totalSize >> 16)
	header[7] = byte(totalSize >> 24)
	copy(header[8:12], "WAVE")

	// fmt chunk
	copy(header[12:16], "fmt ")
	chunkSize := uint32(16)
	header[16] = byte(chunkSize)
	header[17] = byte(chunkSize >> 8)
	header[18] = byte(chunkSize >> 16)
	header[19] = byte(chunkSize >> 24)
	audioFormat := uint16(1) // PCM
	header[20] = byte(audioFormat)
	header[21] = byte(audioFormat >> 8)
	numChannels := uint16(1)
	header[22] = byte(numChannels)
	header[23] = byte(numChannels >> 8)
	header[24] = byte(sampleRate)
	header[25] = byte(sampleRate >> 8)
	header[26] = byte(sampleRate >> 16)
	header[27] = byte(sampleRate >> 24)
	byteRate := uint32(sampleRate * 2)
	header[28] = byte(byteRate)
	header[29] = byte(byteRate >> 8)
	header[30] = byte(byteRate >> 16)
	header[31] = byte(byteRate >> 24)
	blockAlign := uint16(2)
	header[32] = byte(blockAlign)
	header[33] = byte(blockAlign >> 8)
	bitsPerSample := uint16(16)
	header[34] = byte(bitsPerSample)
	header[35] = byte(bitsPerSample >> 8)

	// data chunk
	copy(header[36:40], "data")
	header[40] = byte(dataSize)
	header[41] = byte(dataSize >> 8)
	header[42] = byte(dataSize >> 16)
	header[43] = byte(dataSize >> 24)

	// Create file with header and silence
	file, err := os.Create(path)
	if err != nil {
		t.Fatal(err)
	}
	defer file.Close()
	if _, err := file.Write(header); err != nil {
		t.Fatal(err)
	}

	// Write silence (zeros)
	silence := make([]byte, dataSize)
	if _, err := file.Write(silence); err != nil {
		t.Fatal(err)
	}
}
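// Hedged aside (not in the original source): the manual byte-shifting in
// createMinimalWAV above can also be expressed with encoding/binary, the more
// common idiom for fixed little-endian headers. buildWAVHeader44 is a
// hypothetical helper that packs the same canonical 44-byte PCM header
// (16-bit mono) as the test helper; it is a sketch, not the project's code.
package calls

import "encoding/binary"

func buildWAVHeader44(sampleRate, dataSize int) []byte {
	h := make([]byte, 44)
	copy(h[0:4], "RIFF")
	binary.LittleEndian.PutUint32(h[4:8], uint32(36+dataSize)) // RIFF chunk size
	copy(h[8:12], "WAVE")
	copy(h[12:16], "fmt ")
	binary.LittleEndian.PutUint32(h[16:20], 16) // fmt chunk size
	binary.LittleEndian.PutUint16(h[20:22], 1)  // PCM
	binary.LittleEndian.PutUint16(h[22:24], 1)  // mono
	binary.LittleEndian.PutUint32(h[24:28], uint32(sampleRate))
	binary.LittleEndian.PutUint32(h[28:32], uint32(sampleRate*2)) // byte rate
	binary.LittleEndian.PutUint16(h[32:34], 2)                    // block align
	binary.LittleEndian.PutUint16(h[34:36], 16)                   // bits per sample
	copy(h[36:40], "data")
	binary.LittleEndian.PutUint32(h[40:44], uint32(dataSize))
	return h
}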
package calls

import (
	"encoding/csv"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"

	"skraak/utils"
)

// Constants for the clustering algorithm
const (
	CLUSTER_GAP_MULTIPLIER     = 2  // Gap threshold = CLUSTER_GAP_MULTIPLIER * clip_duration; 3 works well for kiwi
	MIN_DETECTIONS_PER_CLUSTER = 0  // 1 = filter out single detections (useful for kiwi, whose calls run ~30 s); 0 = let single detections pass through
	DEFAULT_CERTAINTY          = 70 // certainty written to .data labels
	DOT_DATA_WORKERS           = 8  // Number of parallel workers for .data file writing
)

// ClusteredCall represents a clustered bird call detection
type ClusteredCall struct {
	File      string  `json:"file"`
	StartTime float64 `json:"start_time"`
	EndTime   float64 `json:"end_time"`
	EbirdCode string  `json:"ebird_code"`
	Segments  int     `json:"segments"`
}

// CallsFromPredsInput defines the input for the calls-from-preds tool
type CallsFromPredsInput struct {
	CSVPath         string          `json:"csv_path"`
	Filter          string          `json:"filter"`
	WriteDotData    bool            `json:"write_dot_data"`
	GapMultiplier   int             `json:"gap_multiplier"`
	MinDetections   int             `json:"min_detections"`
	ProgressHandler ProgressHandler `json:"-"` // Optional progress callback (not serialized)
}

// ProgressHandler is a callback function for reporting progress during long operations.
// processed: number of items processed so far
// total: total number of items to process
// message: optional status message
type ProgressHandler func(processed, total int, message string)

// CallsFromPredsOutput defines the output for the calls-from-preds tool
type CallsFromPredsOutput struct {
	Calls            []ClusteredCall `json:"calls"`
	TotalCalls       int             `json:"total_calls"`
	ClipDuration     float64         `json:"clip_duration"`
	GapThreshold     float64         `json:"gap_threshold"`
	SpeciesCount     map[string]int  `json:"species_count"`
	DataFilesWritten int             `json:"data_files_written"`
	DataFilesSkipped int             `json:"data_files_skipped"`
	Filter           string          `json:"filter"`
	Error            *string         `json:"error,omitempty"`
}

// predFileSpeciesKey groups detections by file and ebird code
type predFileSpeciesKey struct {
	File      string
	EbirdCode string
}

// CallsFromPreds reads a predictions CSV and clusters detections into continuous bird calls
func CallsFromPreds(input CallsFromPredsInput) (CallsFromPredsOutput, error) {
	var output CallsFromPredsOutput

	// Determine filter: use provided filter, or parse from CSV filename
	filter := input.Filter
	if filter == "" {
		filter = ParseFilterFromFilename(input.CSVPath)
	}
	if filter == "" {
		errMsg := "Filter must be specified via --filter flag or parsable from CSV filename"
		output.Error = &errMsg
		return output, fmt.Errorf("%s", errMsg)
	}
	output.Filter = filter

	_, detections, clipDuration, err := readPredCSV(input.CSVPath)
	if err != nil {
		errMsg := err.Error()
		output.Error = &errMsg
		return output, err
	}
	output.ClipDuration = clipDuration

	gapMultiplier := CLUSTER_GAP_MULTIPLIER
	if input.GapMultiplier > 0 {
		gapMultiplier = input.GapMultiplier
	}
	minDetections := MIN_DETECTIONS_PER_CLUSTER
	if input.MinDetections >= 0 {
		minDetections = input.MinDetections
	}
	gapThreshold := float64(gapMultiplier) * clipDuration
	output.GapThreshold = gapThreshold

	allCalls, speciesCount := clusterDetections(detections, clipDuration, gapThreshold, minDetections)
	output.Calls = allCalls
	output.TotalCalls = len(allCalls)
	output.SpeciesCount = speciesCount

	if input.WriteDotData {
		dataFilesWritten, dataFilesSkipped, err := writeDotFiles(input.CSVPath, filter, allCalls, input.ProgressHandler)
		if err != nil {
			errMsg := fmt.Sprintf("Error writing .data files: %v", err)
			output.Error = &errMsg
			return output, fmt.Errorf("%s", errMsg)
		}
		output.DataFilesWritten = dataFilesWritten
		output.DataFilesSkipped = dataFilesSkipped
	}
	return output, nil
}

// readPredCSV opens and reads a predictions CSV, returning column mappings, detections, and clip duration
func readPredCSV(csvPath string) (predCSVColumns, map[predFileSpeciesKey][]float64, float64, error) {
	file, err := os.Open(csvPath)
	if err != nil {
		return predCSVColumns{}, nil, 0, fmt.Errorf("failed to open CSV file: %w", err)
	}
	defer func() { _ = file.Close() }()

	reader := csv.NewReader(file)
	reader.ReuseRecord = true
	header, err := reader.Read()
	if err != nil {
		return predCSVColumns{}, nil, 0, fmt.Errorf("failed to read CSV header: %w", err)
	}
	cols, err := findPredCSVColumns(header)
	if err != nil {
		return predCSVColumns{}, nil, 0, err
	}
	detections, clipDuration, err := readPredCSVRows(reader, cols)
	if err != nil {
		return predCSVColumns{}, nil, 0, err
	}
	return cols, detections, clipDuration, nil
}

// predCSVColumns holds the column indices for a predictions CSV
type predCSVColumns struct {
	fileIdx      int
	startTimeIdx int
	endTimeIdx   int
	ebirdCodes   []string
	ebirdIdx     []int
}

// findPredCSVColumns parses the CSV header to find column indices
func findPredCSVColumns(header []string) (predCSVColumns, error) {
	cols := predCSVColumns{
		fileIdx:      -1,
		startTimeIdx: -1,
		endTimeIdx:   -1,
	}
	ignoredColumns := map[string]bool{"NotKiwi": true, "0.0": true}
	for i, col := range header {
		switch col {
		case "file":
			cols.fileIdx = i
		case "start_time":
			cols.startTimeIdx = i
		case "end_time":
			cols.endTimeIdx = i
		default:
			if ignoredColumns[col] {
				continue
			}
			cols.ebirdCodes = append(cols.ebirdCodes, col)
			cols.ebirdIdx = append(cols.ebirdIdx, i)
		}
	}
	if cols.fileIdx == -1 || cols.startTimeIdx == -1 || cols.endTimeIdx == -1 {
		return cols, fmt.Errorf("CSV must have 'file', 'start_time', and 'end_time' columns")
	}
	if len(cols.ebirdCodes) == 0 {
		return cols, fmt.Errorf("CSV must have at least one ebird code column")
	}
	return cols, nil
}

// readPredCSVRows reads all CSV data rows and returns detections grouped by file+species, plus clip duration
func readPredCSVRows(reader *csv.Reader, cols predCSVColumns) (map[predFileSpeciesKey][]float64, float64, error) {
	detections := make(map[predFileSpeciesKey][]float64)
	clipDuration := 0.0
	record, err := reader.Read()
	if err == io.EOF {
		return detections, 0, nil
	}
	if err != nil {
		return nil, 0, fmt.Errorf("failed to read first CSV row: %w", err)
	}
	startTime, _ := strconv.ParseFloat(record[cols.startTimeIdx], 64)
	endTime, _ := strconv.ParseFloat(record[cols.endTimeIdx], 64)
	clipDuration = endTime - startTime
	addDetectionsFromRow(record, cols, startTime, detections)
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, 0, fmt.Errorf("failed to read CSV row: %w", err)
		}
		startTime, _ = strconv.ParseFloat(record[cols.startTimeIdx], 64)
		addDetectionsFromRow(record, cols, startTime, detections)
	}
	return detections, clipDuration, nil
}

// addDetectionsFromRow adds positive detections from a single CSV row
func addDetectionsFromRow(record []string, cols predCSVColumns, startTime float64, detections map[predFileSpeciesKey][]float64) {
	fileName := record[cols.fileIdx]
	for i, idx := range cols.ebirdIdx {
		if record[idx] == "1" {
			key := predFileSpeciesKey{File: fileName, EbirdCode: cols.ebirdCodes[i]}
			detections[key] = append(detections[key], startTime)
		}
	}
}

// clusterDetections groups detections into clusters and produces sorted ClusteredCalls
func clusterDetections(detections map[predFileSpeciesKey][]float64, clipDuration, gapThreshold float64, minDetections int) ([]ClusteredCall, map[string]int) {
	var allCalls []ClusteredCall
	speciesCount := make(map[string]int)
	for key, startTimes := range detections {
		sort.Float64s(startTimes)
		clusters := clusterStartTimes(startTimes, gapThreshold)
		for _, cluster := range clusters {
			if len(cluster) <= minDetections {
				continue
			}
			call := ClusteredCall{
				File:      key.File,
				StartTime: cluster[0],
				EndTime:   cluster[len(cluster)-1] + clipDuration,
				EbirdCode: key.EbirdCode,
				Segments:  len(cluster),
			}
			allCalls = append(allCalls, call)
			speciesCount[key.EbirdCode]++
		}
	}
	sort.Slice(allCalls, func(i, j int) bool {
		if allCalls[i].File != allCalls[j].File {
			return allCalls[i].File < allCalls[j].File
		}
		return allCalls[i].StartTime < allCalls[j].StartTime
	})
	return allCalls, speciesCount
}

// DirCache caches directory entries for fast WAV file lookup.
// Scans the directory once and builds a map from lowercased basename to full filename.
// Safe for concurrent read-only use after construction.
type DirCache struct {
	dir    string
	wavMap map[string]string // lowercase basename -> filename with original case (e.g. "20230610_150000" -> "20230610_150000.WAV")
	dirMap map[string]string // lowercase basename -> filename for any file (used by from-raven for .selections.txt etc.)
}

// NewDirCache creates a DirCache by scanning the directory once.
func NewDirCache(dir string) *DirCache {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return &DirCache{dir: dir, wavMap: make(map[string]string), dirMap: make(map[string]string)}
	}
	wavMap := make(map[string]string, len(entries))
	dirMap := make(map[string]string, len(entries))
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		name := entry.Name()
		ext := filepath.Ext(name)
		base := strings.TrimSuffix(name, ext)
		dirMap[strings.ToLower(base)] = name
		if strings.EqualFold(ext, ".wav") {
			wavMap[strings.ToLower(base)] = name
		}
	}
	return &DirCache{dir: dir, wavMap: wavMap, dirMap: dirMap}
}

// FindWAV looks up a WAV file by basename (case-insensitive).
// Returns the full path with correct case, or empty string if not found.
func (dc *DirCache) FindWAV(baseName string) string {
	if name, ok := dc.wavMap[strings.ToLower(baseName)]; ok {
		return filepath.Join(dc.dir, name)
	}
	return ""
}

// FindFile looks up any file by basename (case-insensitive).
// Returns the full path with correct case, or empty string if not found.
func (dc *DirCache) FindFile(baseName string) string {
	if name, ok := dc.dirMap[strings.ToLower(baseName)]; ok {
		return filepath.Join(dc.dir, name)
	}
	return ""
}

// findWAVFile finds a WAV file in the directory with case-insensitive matching.
// baseName is the filename without extension (e.g., "20230610_150000").
// Returns the full path with correct case, or empty string if not found.
// Deprecated: Use DirCache.FindWAV for batch operations to avoid repeated directory scans.
func findWAVFile(dir, baseName string) string {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return ""
	}
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		name := entry.Name()
		ext := filepath.Ext(name)
		nameNoExt := strings.TrimSuffix(name, ext)
		if nameNoExt == baseName && strings.EqualFold(ext, ".wav") {
			return filepath.Join(dir, name)
		}
	}
	return ""
}

// writeDotFiles writes AviaNZ .data files for each audio file with calls.
// Uses parallel workers for improved performance on large batches.
func writeDotFiles(csvPath, filter string, calls []ClusteredCall, progress ProgressHandler) (int, int, error) {
	// Base directory is the directory containing the CSV file
	csvDir := filepath.Dir(csvPath)

	// Group calls by file (using extracted filename)
	callsByFile := make(map[string][]ClusteredCall)
	for _, call := range calls {
		filename := filepath.Base(call.File)
		callsByFile[filename] = append(callsByFile[filename], call)
	}

	// Report initial progress
	if progress != nil {
		progress(0, len(callsByFile), "Processing WAV files")
	}

	// If small batch, process sequentially (avoid goroutine overhead)
	if len(callsByFile) < 10 {
		return writeDotFilesSequential(csvDir, filter, callsByFile, progress)
	}
	// Parallel processing for larger batches
	return writeDotFilesParallel(csvDir, filter, callsByFile, progress)
}

// dotDataJob represents a single file to process
type dotDataJob struct {
	filename  string
	fileCalls []ClusteredCall
}

// dotDataResult represents the result of processing a single file
type dotDataResult struct {
	filename string
	written  bool
	err      error
}

// writeDotFilesSequential processes files one at a time (for small batches)
func writeDotFilesSequential(csvDir, filter string, callsByFile map[string][]ClusteredCall, progress ProgressHandler) (int, int, error) {
	dataFilesWritten := 0
	dataFilesSkipped := 0
	total := len(callsByFile)
	processed := 0
	for filename, fileCalls := range callsByFile {
		// Find WAV file with correct case
		baseName := strings.TrimSuffix(filename, filepath.Ext(filename))
		wavPath := findWAVFile(csvDir, baseName)
		if wavPath == "" {
			dataFilesSkipped++
			processed++
			if progress != nil {
				progress(processed, total, "")
			}
			continue
		}
		dataPath := wavPath + ".data"
		sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
		if err != nil {
			dataFilesSkipped++
			processed++
			if progress != nil {
				progress(processed, total, "")
			}
			continue
		}

		// Build segments and metadata
		meta, segments := buildAviaNZMetaAndSegments(fileCalls, filter, duration, sampleRate)
		if err := writeDotDataFileSafe(dataPath, segments, filter, meta); err != nil {
			return dataFilesWritten, dataFilesSkipped, fmt.Errorf("failed to write %s: %w", dataPath, err)
		}
		dataFilesWritten++
		processed++
		if progress != nil {
			progress(processed, total, "")
		}
	}
	return dataFilesWritten, dataFilesSkipped, nil
}

// writeDotFilesParallel processes files concurrently using a worker pool
func writeDotFilesParallel(csvDir, filter string, callsByFile map[string][]ClusteredCall, progress ProgressHandler) (int, int, error) {
	total := len(callsByFile)
	var processed atomic.Int32

	// Create job and result channels
	jobs := make(chan dotDataJob, len(callsByFile))
	results := make(chan dotDataResult, len(callsByFile))

	// Start workers
	var wg sync.WaitGroup
	for range DOT_DATA_WORKERS {
		wg.Add(1)
		go dotDataWorker(csvDir, filter, jobs, results, &wg)
	}

	// Send jobs
	for filename, fileCalls := range callsByFile {
		jobs <- dotDataJob{filename: filename, fileCalls: fileCalls}
	}
	close(jobs)

	// Wait for workers to finish
	go func() {
		wg.Wait()
		close(results)
	}()

	// Collect results with progress reporting
	dataFilesWritten := 0
	dataFilesSkipped := 0
	var firstErr error
	for result := range results {
		if result.err != nil && firstErr == nil {
			firstErr = result.err
		}
		if result.written {
			dataFilesWritten++
		} else {
			dataFilesSkipped++
		}

		// Report progress
		if progress != nil {
			current := int(processed.Add(1))
			progress(current, total, "")
		}
	}
	return dataFilesWritten, dataFilesSkipped, firstErr
}

// dotDataWorker processes files from the jobs channel
func dotDataWorker(csvDir, filter string, jobs <-chan dotDataJob, results chan<- dotDataResult, wg *sync.WaitGroup) {
	defer wg.Done()
	for job := range jobs {
		// Find WAV file with correct case
		baseName := strings.TrimSuffix(job.filename, filepath.Ext(job.filename))
		wavPath := findWAVFile(csvDir, baseName)
		if wavPath == "" {
			results <- dotDataResult{filename: job.filename, written: false, err: nil}
			continue
		}
		dataPath := wavPath + ".data"
		sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
		if err != nil {
			results <- dotDataResult{filename: job.filename, written: false, err: nil}
			continue
		}

		// Build segments and metadata
		meta, segments := buildAviaNZMetaAndSegments(job.fileCalls, filter, duration, sampleRate)
		if err := writeDotDataFileSafe(dataPath, segments, filter, meta); err != nil {
			results <- dotDataResult{filename: job.filename, written: false, err: fmt.Errorf("failed to write %s: %w", dataPath, err)}
			continue
		}
		results <- dotDataResult{filename: job.filename, written: true, err: nil}
	}
}

// buildAviaNZMetaAndSegments creates metadata and segments for a .data file
func buildAviaNZMetaAndSegments(calls []ClusteredCall, filter string, duration float64, sampleRate int) (AviaNZMeta, []AviaNZSegment) {
	// Create metadata
	reviewer := "None"
	meta := AviaNZMeta{
		Operator: "Auto",
		Reviewer: &reviewer,
		Duration: duration,
	}

	// Build segments array
	var segments []AviaNZSegment
	for _, call := range calls {
		// Create labels for this segment
		labels := []AviaNZLabel{{
			Species:   call.EbirdCode,
			Certainty: DEFAULT_CERTAINTY,
			Filter:    filter,
		}}

		// Create segment: [start, end, freq_low, freq_high, labels]
		// freq_low=0, freq_high=sampleRate for full-band segments
		segment := AviaNZSegment{
			call.StartTime,
			call.EndTime,
			0,          // freq_low
			sampleRate, // freq_high (full band)
			labels,
		}
		segments = append(segments, segment)
	}
	return meta, segments
}

// writeAviaNZDataFile writes a new .data file to disk (does not check for existing files)
func writeAviaNZDataFile(path string, data []any) error {
	file, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("failed to create file: %w", err)
	}
	defer func() { _ = file.Close() }()

	encoder := json.NewEncoder(file)
	encoder.SetIndent("", "") // No indentation for compact output
	if err := encoder.Encode(data); err != nil {
		return fmt.Errorf("failed to encode JSON: %w", err)
	}
	return nil
}

// writeDotDataFileSafe safely writes or merges .data files:
//   - If the file doesn't exist: write a new file
//   - If the file exists with the same filter: return an error (refuse to clobber)
//   - If the file exists with a different filter: merge segments and write
//   - If the file exists but can't be parsed: return an error (refuse to clobber)
func writeDotDataFileSafe(path string, newSegments []AviaNZSegment, filter string, meta AviaNZMeta) error {
	// Check if file exists
	if _, err := os.Stat(path); err == nil {
		// File exists - parse and check
		existing, err := utils.ParseDataFile(path)
		if err != nil {
			return fmt.Errorf("cannot parse existing %s: %w (refusing to clobber)", path, err)
		}

		// Check for duplicate filter
		for _, seg := range existing.Segments {
			if seg.HasFilterLabel(filter) {
				return fmt.Errorf("%s already contains filter '%s' (refusing to clobber)", path, filter)
			}
		}

		// Append new segments (different filter - safe to merge)
		for _, newSeg := range newSegments {
			seg := convertAviaNZSegment(newSeg, filter)
			existing.Segments = append(existing.Segments, seg)
		}

		// Sort by start time
		sort.Slice(existing.Segments, func(i, j int) bool {
			return existing.Segments[i].StartTime < existing.Segments[j].StartTime
		})
		return existing.Write(path)
	}

	// File doesn't exist - write new
	data := buildDataFileFromSegments(meta, newSegments)
	return writeAviaNZDataFile(path, data)
}

// convertAviaNZSegment converts an AviaNZSegment to utils.Segment
func convertAviaNZSegment(seg AviaNZSegment, filter string) *utils.Segment {
	labels := seg[4].([]AviaNZLabel)
	utilsLabels := make([]*utils.Label, len(labels))
	for i, l := range labels {
		utilsLabels[i] = &utils.Label{
			Species:   l.Species,
			Certainty: l.Certainty,
			Filter:    filter,
		}
	}

	// Handle freq values (could be int or float64 depending on how they were created)
	var freqLow, freqHigh float64
	switch v := seg[2].(type) {
	case int:
		freqLow = float64(v)
	case float64:
		freqLow = v
	}
	switch v := seg[3].(type) {
	case int:
		freqHigh = float64(v)
	case float64:
		freqHigh = v
	}
	return &utils.Segment{
		StartTime: seg[0].(float64),
		EndTime:   seg[1].(float64),
		FreqLow:   freqLow,
		FreqHigh:  freqHigh,
		Labels:    utilsLabels,
	}
}

// buildDataFileFromSegments builds the data file structure from meta and segments
func buildDataFileFromSegments(meta AviaNZMeta, segments []AviaNZSegment) []any {
	result := make([]any, 0, 1+len(segments))
	result = append(result, meta)
	for _, seg := range segments {
		result = append(result, seg)
	}
	return result
}

// ParseFilterFromFilename extracts the filter name from a preds CSV filename:
// "predsST_opensoundscape-kiwi-1.2_2025-11-12.csv" -> "opensoundscape-kiwi-1.2".
// Returns an empty string if parsing fails.
func ParseFilterFromFilename(csvPath string) string {
	filename := filepath.Base(csvPath)

	// Remove .csv extension
	name := strings.TrimSuffix(filename, ".csv")

	// Split on underscore
	parts := strings.Split(name, "_")
	if len(parts) == 3 {
		return parts[1]
	}
	return ""
}

// clusterStartTimes groups consecutive start times into clusters
// where the gap between consecutive times is <= gapThreshold
func clusterStartTimes(startTimes []float64, gapThreshold float64) [][]float64 {
	if len(startTimes) == 0 {
		return nil
	}
	var clusters [][]float64
	currentCluster := []float64{startTimes[0]}
	for i := 1; i < len(startTimes); i++ {
		gap := startTimes[i] - startTimes[i-1]
		if gap <= gapThreshold {
			// Same cluster
			currentCluster = append(currentCluster, startTimes[i])
		} else {
			// New cluster
			clusters = append(clusters, currentCluster)
			currentCluster = []float64{startTimes[i]}
		}
	}
	// Don't forget the last cluster
	clusters = append(clusters, currentCluster)
	return clusters
}
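// Hedged worked example (not in the original source): how clusterStartTimes
// groups detections under the default gap threshold. With 3-second clips and
// CLUSTER_GAP_MULTIPLIER = 2, the threshold is 6 s, so detections starting at
// 0, 3, and 6 s form one call, while a detection at 30 s starts a new one.
// The start times below are invented for illustration.
package calls

import "fmt"

func ExampleClusterStartTimes() {
	clipDuration := 3.0
	gapThreshold := float64(CLUSTER_GAP_MULTIPLIER) * clipDuration // 6.0 s
	clusters := clusterStartTimes([]float64{0, 3, 6, 30, 33}, gapThreshold)
	for _, c := range clusters {
		// End time is the last detection's start plus the clip duration,
		// mirroring how clusterDetections builds ClusteredCall.EndTime.
		fmt.Printf("call %.0f-%.0fs (%d segments)\n", c[0], c[len(c)-1]+clipDuration, len(c))
	}
	// Prints:
	// call 0-9s (3 segments)
	// call 30-36s (2 segments)
}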
package calls

import (
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"sync/atomic"
)

// CallsFromSourceInput defines the common input for calls-from-source tools
type CallsFromSourceInput struct {
	Folder          string          `json:"folder"`
	File            string          `json:"file"`
	Delete          bool            `json:"delete"`
	ProgressHandler ProgressHandler `json:"-"` // Optional progress callback
}

// CallsFromSourceOutput defines the common output for calls-from-source tools
type CallsFromSourceOutput struct {
	Calls            []ClusteredCall `json:"calls"`
	TotalCalls       int             `json:"total_calls"`
	SpeciesCount     map[string]int  `json:"species_count"`
	DataFilesWritten int             `json:"data_files_written"`
	DataFilesSkipped int             `json:"data_files_skipped"`
	FilesProcessed   int             `json:"files_processed"`
	FilesDeleted     int             `json:"files_deleted"`
	Filter           string          `json:"filter"`
	Error            *string         `json:"error,omitempty"`
}

// CallSource abstracts a source of bird call data (Raven, BirdNET, etc.)
type CallSource interface {
	// Name returns the display name (e.g. "Raven", "BirdNET")
	Name() string
	// FindFiles discovers source files in the given folder
	FindFiles(folder string) ([]string, error)
	// ProcessFile processes a single source file and returns calls, write/skip status
	ProcessFile(path string, cache *DirCache) (calls []ClusteredCall, written, skipped bool, err error)
}

// callsFromSource is the shared entry point for all call source tools.
func callsFromSource(src CallSource, input CallsFromSourceInput) (CallsFromSourceOutput, error) {
	var output CallsFromSourceOutput
	output.Filter = src.Name()

	// Collect source files to process
	var files []string
	if input.File != "" {
		files = []string{input.File}
	} else if input.Folder != "" {
		var err error
		files, err = src.FindFiles(input.Folder)
		if err != nil {
			errMsg := fmt.Sprintf("Failed to find %s files: %v", src.Name(), err)
			output.Error = &errMsg
			return output, fmt.Errorf("%s", errMsg)
		}
	} else {
		errMsg := "Either --folder or --file must be specified"
		output.Error = &errMsg
		return output, fmt.Errorf("%s", errMsg)
	}
	if len(files) == 0 {
		errMsg := fmt.Sprintf("No %s files found", src.Name())
		output.Error = &errMsg
		return output, fmt.Errorf("%s", errMsg)
	}

	// Single file or small batch: process sequentially (avoid goroutine overhead)
	if len(files) < 10 {
		return callsFromSourceSequential(src, input, files)
	}
	// Large batch: parallel processing with DirCache
	return callsFromSourceParallel(src, input, files)
}

// callsFromSourceSequential processes source files one at a time (for small batches)
func callsFromSourceSequential(src CallSource, input CallsFromSourceInput, files []string) (CallsFromSourceOutput, error) {
	var output CallsFromSourceOutput
	output.Filter = src.Name()

	// Build DirCache once for the folder
	dirCaches := make(map[string]*DirCache)
	if input.Folder != "" {
		dirCaches[input.Folder] = NewDirCache(input.Folder)
	}

	speciesCount := make(map[string]int)
	var allCalls []ClusteredCall
	dataFilesWritten := 0
	dataFilesSkipped := 0
	filesProcessed := 0
	filesDeleted := 0
	for _, file := range files {
		dir := filepath.Dir(file)
		cache := dirCaches[dir]
		if cache == nil {
			cache = NewDirCache(dir)
			dirCaches[dir] = cache
		}
		calls, written, skipped, err := src.ProcessFile(file, cache)
		if err != nil {
			errMsg := fmt.Sprintf("Error processing %s: %v", file, err)
			output.Error = &errMsg
			return output, fmt.Errorf("%s", errMsg)
		}
		if written {
			dataFilesWritten++
		}
		if skipped {
			dataFilesSkipped++
		}
		for _, call := range calls {
			allCalls = append(allCalls, call)
			speciesCount[call.EbirdCode]++
		}
		filesProcessed++

		// Delete if requested and successfully processed
		if input.Delete && written {
			if err := os.Remove(file); err != nil {
				errMsg := fmt.Sprintf("Failed to delete %s: %v", file, err)
				output.Error = &errMsg
				return output, fmt.Errorf("%s", errMsg)
			}
			filesDeleted++
		}
		if input.ProgressHandler != nil {
			input.ProgressHandler(filesProcessed, len(files), filepath.Base(file))
		}
	}

	// Sort all calls by file, then start time
	sort.Slice(allCalls, func(i, j int) bool {
		if allCalls[i].File != allCalls[j].File {
			return allCalls[i].File < allCalls[j].File
		}
		return allCalls[i].StartTime < allCalls[j].StartTime
	})
	output.Calls = allCalls
	output.TotalCalls = len(allCalls)
	output.SpeciesCount = speciesCount
	output.DataFilesWritten = dataFilesWritten
	output.DataFilesSkipped = dataFilesSkipped
	output.FilesProcessed = filesProcessed
	output.FilesDeleted = filesDeleted
	return output, nil
}

// sourceJob represents a single file to process (generic over CallSource)
type sourceJob struct {
	filePath string
}

// sourceResult represents the result of processing a single source file
type sourceResult struct {
	path    string
	calls   []ClusteredCall
	written bool
	skipped bool
	err     error
}

func (r sourceResult) filePath() string          { return r.path }
func (r sourceResult) getCalls() []ClusteredCall { return r.calls }
func (r sourceResult) wasWritten() bool          { return r.written }
func (r sourceResult) wasSkipped() bool          { return r.skipped }
func (r sourceResult) getError() error           { return r.err }

// callsFromSourceParallel processes source files concurrently using a worker pool and DirCache
func callsFromSourceParallel(src CallSource, input CallsFromSourceInput, files []string) (CallsFromSourceOutput, error) {
	var output CallsFromSourceOutput
	output.Filter = src.Name()
	total := len(files)
	var processed atomic.Int32

	// Build DirCache for the folder
	dirCaches := &sync.Map{}
	if input.Folder != "" {
		cache := NewDirCache(input.Folder)
		dirCaches.Store(input.Folder, cache)
	}

	// Create job and result channels
	jobs := make(chan sourceJob, total)
	results := make(chan parallelResult, total)

	// Start workers
	var wg sync.WaitGroup
	for range DOT_DATA_WORKERS {
		wg.Add(1)
		go sourceWorker(src, dirCaches, jobs, results, &wg)
	}

	// Send jobs
	for _, file := range files {
		jobs <- sourceJob{filePath: file}
	}
	close(jobs)

	// Wait for workers to finish, then close results
	go func() {
		wg.Wait()
		close(results)
	}()

	// Collect results with progress reporting
	stats := aggregateResults(results, total, &processed, input.Delete, input.ProgressHandler)
	if stats.firstErr != nil {
		errMsg := stats.firstErr.Error()
		output.Error = &errMsg
		return output, stats.firstErr
	}
	sortCallsByFileAndTime(stats.calls)
	output.Calls = stats.calls
	output.TotalCalls = len(stats.calls)
	output.SpeciesCount = stats.speciesCount
	output.DataFilesWritten = stats.dataFilesWritten
	output.DataFilesSkipped = stats.dataFilesSkipped
	output.FilesProcessed = stats.filesProcessed
	output.FilesDeleted = stats.filesDeleted
	return output, nil
}

// sourceWorker processes source files from the jobs channel
func sourceWorker(src CallSource, dirCaches *sync.Map, jobs <-chan sourceJob, results chan<- parallelResult, wg *sync.WaitGroup) {
	defer wg.Done()
	for job := range jobs {
		dir := filepath.Dir(job.filePath)

		// Get or create DirCache for this directory
		var cache *DirCache
		if cached, ok := dirCaches.Load(dir); ok {
			cache = cached.(*DirCache)
		} else {
			cache = NewDirCache(dir)
			dirCaches.Store(dir, cache)
		}
		calls, written, skipped, err := src.ProcessFile(job.filePath, cache)
		results <- sourceResult{
			path:    job.filePath,
			calls:   calls,
			written: written,
			skipped: skipped,
			err:     err,
		}
	}
}
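// Hedged sketch (not in the original source): the CallSource interface above is
// what a new detector integration would implement. csvSource is a hypothetical
// source for a detector that emits *.dets.csv files, shown only to make the
// interface shape concrete; a real ProcessFile would mirror
// processRavenFileCached (parse rows, resolve the WAV via cache.FindWAV, then
// writeDotDataFileSafe).
package calls

import (
	"os"
	"path/filepath"
	"strings"
)

// csvSource is a hypothetical CallSource implementation.
type csvSource struct{}

func (csvSource) Name() string { return "CSVDetector" }

func (csvSource) FindFiles(folder string) ([]string, error) {
	entries, err := os.ReadDir(folder)
	if err != nil {
		return nil, err
	}
	var files []string
	for _, e := range entries {
		if strings.HasSuffix(e.Name(), ".dets.csv") {
			files = append(files, filepath.Join(folder, e.Name()))
		}
	}
	return files, nil
}

func (csvSource) ProcessFile(path string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
	// Parsing elided in this sketch; returning (nil, false, true, nil) marks
	// the file as skipped, matching how the real sources report no-op files.
	return nil, false, true, nil
}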
package calls

import (
	"os"
	"path/filepath"
	"testing"

	"skraak/utils"
)

// ============================================
// BirdNET Tests
// ============================================

func TestCallsFromBirda_NewDataFile(t *testing.T) {
	tmpDir := t.TempDir()

	// Create a minimal WAV file
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)

	// Create BirdNET results file
	birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
	birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Turdus migratorius,American Robin,0.85,/some/path/test.WAV\n"
	if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromBirdaInput{
		File: birdaPath,
	}
	output, err := CallsFromBirda(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	if output.Filter != "BirdNET" {
		t.Errorf("expected filter 'BirdNET', got '%s'", output.Filter)
	}
	if output.TotalCalls != 1 {
		t.Errorf("expected 1 call, got %d", output.TotalCalls)
	}

	// Verify .data file was created
	dataPath := wavPath + ".data"
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 1 {
		t.Errorf("expected 1 segment, got %d", len(df.Segments))
	}
	if df.Segments[0].Labels[0].Filter != "BirdNET" {
		t.Errorf("expected filter 'BirdNET', got '%s'", df.Segments[0].Labels[0].Filter)
	}
	if df.Segments[0].Labels[0].Certainty != 85 {
		t.Errorf("expected certainty 85, got %d", df.Segments[0].Labels[0].Certainty)
	}
}

func TestCallsFromBirda_ExistingSameFilter(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Existing Bird", "certainty": 90, "filter": "BirdNET"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
	birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,New Bird,New Bird,0.85,test.WAV\n"
	if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromBirdaInput{File: birdaPath}
	output, err := CallsFromBirda(input)
	if err == nil {
		t.Error("expected error for same filter, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}
}

func TestCallsFromBirda_ExistingDifferentFilter(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Kiwi", "certainty": 90, "filter": "Manual"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
	birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Robin,Robin,0.85,test.WAV\n"
	if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromBirdaInput{File: birdaPath}
	output, err := CallsFromBirda(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 2 {
		t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
	}
}

func TestCallsFromBirda_DeleteOption(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	birdaPath := filepath.Join(tmpDir, "test.BirdNET.results.csv")
	birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Robin,Robin,0.85,test.WAV\n"
	if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromBirdaInput{File: birdaPath, Delete: true}
	output, err := CallsFromBirda(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.FilesDeleted != 1 {
		t.Errorf("expected 1 file deleted, got %d", output.FilesDeleted)
	}
	if _, err := os.Stat(birdaPath); !os.IsNotExist(err) {
		t.Error("expected BirdNET file to be deleted")
	}
}

func TestCallsFromBirda_FolderMode(t *testing.T) {
	tmpDir := t.TempDir()
	for i := range 2 {
		wavPath := filepath.Join(tmpDir, "test"+string(rune('0'+i))+".WAV")
		createMinimalWAV(t, wavPath, 16000, 60.0)
		birdaPath := filepath.Join(tmpDir, "test"+string(rune('0'+i))+".BirdNET.results.csv")
		birdaContent := "\ufeffStart (s),End (s),Scientific name,Common name,Confidence,File\n0.0,3.0,Bird,Bird,0.85,test.WAV\n"
		if err := os.WriteFile(birdaPath, []byte(birdaContent), 0644); err != nil {
			t.Fatal(err)
		}
	}

	input := CallsFromBirdaInput{Folder: tmpDir}
	output, err := CallsFromBirda(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.FilesProcessed != 2 {
		t.Errorf("expected 2 files processed, got %d", output.FilesProcessed)
	}
	if output.DataFilesWritten != 2 {
		t.Errorf("expected 2 data files written, got %d", output.DataFilesWritten)
	}
}

// ============================================
// Raven Tests
// ============================================

func TestCallsFromRaven_NewDataFile(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromRavenInput{File: ravenPath}
	output, err := CallsFromRaven(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	if output.Filter != "Raven" {
		t.Errorf("expected filter 'Raven', got '%s'", output.Filter)
	}
	dataPath := wavPath + ".data"
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if df.Segments[0].FreqLow != 1000 {
		t.Errorf("expected freq_low 1000, got %f", df.Segments[0].FreqLow)
	}
	if df.Segments[0].FreqHigh != 5000 {
		t.Errorf("expected freq_high 5000, got %f", df.Segments[0].FreqHigh)
	}
}

func TestCallsFromRaven_ExistingSameFilter(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Existing", "certainty": 90, "filter": "Raven"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tNew\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromRavenInput{File: ravenPath}
	output, err := CallsFromRaven(input)
	if err == nil {
		t.Error("expected error for same filter, got nil")
	}
	if output.Error == nil {
		t.Error("expected error message in output")
	}
}

func TestCallsFromRaven_ExistingDifferentFilter(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	dataPath := wavPath + ".data"
	existingData := `[{"Operator": "Test", "Duration": 60.0}, [5.0, 10.0, 0, 16000, [{"species": "Kiwi", "certainty": 90, "filter": "BirdNET"}]]]`
	if err := os.WriteFile(dataPath, []byte(existingData), 0644); err != nil {
		t.Fatal(err)
	}
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tMorepork\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromRavenInput{File: ravenPath}
	output, err := CallsFromRaven(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.DataFilesWritten != 1 {
		t.Errorf("expected 1 data file written, got %d", output.DataFilesWritten)
	}
	df, err := utils.ParseDataFile(dataPath)
	if err != nil {
		t.Fatalf("failed to parse .data file: %v", err)
	}
	if len(df.Segments) != 2 {
		t.Errorf("expected 2 segments after merge, got %d", len(df.Segments))
	}
}

func TestCallsFromRaven_DeleteOption(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromRavenInput{File: ravenPath, Delete: true}
	output, err := CallsFromRaven(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.FilesDeleted != 1 {
		t.Errorf("expected 1 file deleted, got %d", output.FilesDeleted)
	}
	if _, err := os.Stat(ravenPath); !os.IsNotExist(err) {
		t.Error("expected Raven file to be deleted")
	}
}

func TestCallsFromRaven_MultipleSelections(t *testing.T) {
	tmpDir := t.TempDir()
	wavPath := filepath.Join(tmpDir, "test.WAV")
	createMinimalWAV(t, wavPath, 16000, 60.0)
	ravenPath := filepath.Join(tmpDir, "test.Table.1.selections.txt")
	ravenContent := "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tSpecies\n1\tSpectrogram 1\t1\t0.0\t5.0\t1000\t5000\tKiwi\n2\tSpectrogram 1\t1\t10.0\t15.0\t2000\t6000\tMorepork\n3\tSpectrogram 1\t1\t20.0\t25.0\t1500\t4500\tTui\n"
	if err := os.WriteFile(ravenPath, []byte(ravenContent), 0644); err != nil {
		t.Fatal(err)
	}

	input := CallsFromRavenInput{File: ravenPath}
	output, err := CallsFromRaven(input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if output.TotalCalls != 3 {
		t.Errorf("expected 3 calls, got %d", output.TotalCalls)
	}
	if output.SpeciesCount["Kiwi"] != 1 || output.SpeciesCount["Morepork"] != 1 || output.SpeciesCount["Tui"] != 1 {
		t.Errorf("unexpected species count: %v", output.SpeciesCount)
	}
}
package calls

import (
	"encoding/csv"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"skraak/utils"
)

// CallsFromBirdaInput defines the input for the calls-from-birda tool.
type CallsFromBirdaInput struct {
	Folder          string          `json:"folder"`
	File            string          `json:"file"`
	Delete          bool            `json:"delete"`
	ProgressHandler ProgressHandler `json:"-"` // Optional progress callback
}

// CallsFromBirdaOutput defines the output for the calls-from-birda tool.
type CallsFromBirdaOutput struct {
	Calls            []ClusteredCall `json:"calls"`
	TotalCalls       int             `json:"total_calls"`
	SpeciesCount     map[string]int  `json:"species_count"`
	DataFilesWritten int             `json:"data_files_written"`
	DataFilesSkipped int             `json:"data_files_skipped"`
	FilesProcessed   int             `json:"files_processed"`
	FilesDeleted     int             `json:"files_deleted"`
	Filter           string          `json:"filter"`
	Error            *string         `json:"error,omitempty"`
}

// birdaSource implements CallSource for BirdNET results files.
type birdaSource struct{}

func (birdaSource) Name() string { return "BirdNET" }

func (birdaSource) FindFiles(folder string) ([]string, error) {
	var files []string
	entries, err := os.ReadDir(folder)
	if err != nil {
		return nil, err
	}
	for _, entry := range entries {
		name := entry.Name()
		if strings.HasSuffix(name, ".BirdNET.results.csv") {
			files = append(files, filepath.Join(folder, name))
		}
	}
	return files, nil
}

func (birdaSource) ProcessFile(birdaFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
	return processBirdaFileCached(birdaFile, cache)
}

// CallsFromBirda processes BirdNET results files and writes .data files.
func CallsFromBirda(input CallsFromBirdaInput) (CallsFromBirdaOutput, error) {
	src := birdaSource{}
	commonInput := CallsFromSourceInput(input)
	commonOutput, err := callsFromSource(src, commonInput)

	// Convert to Birda-specific output type.
	var output CallsFromBirdaOutput
	output.Calls = commonOutput.Calls
	output.TotalCalls = commonOutput.TotalCalls
	output.SpeciesCount = commonOutput.SpeciesCount
	output.DataFilesWritten = commonOutput.DataFilesWritten
	output.DataFilesSkipped = commonOutput.DataFilesSkipped
	output.FilesProcessed = commonOutput.FilesProcessed
	output.FilesDeleted = commonOutput.FilesDeleted
	output.Filter = commonOutput.Filter
	output.Error = commonOutput.Error
	return output, err
}

// BirdNETDetection represents a single BirdNET detection.
type BirdNETDetection struct {
	StartTime      float64
	EndTime        float64
	ScientificName string
	CommonName     string
	Confidence     float64
	WAVPath        string
}

// birdaColumnIndices holds the parsed column positions from a BirdNET CSV header.
type birdaColumnIndices struct {
	startIdx      int
	endIdx        int
	commonNameIdx int
	confidenceIdx int
	fileIdx       int
}

// parseBirdaCSVHeader reads the CSV header row and returns column indices.
func parseBirdaCSVHeader(reader *csv.Reader) (birdaColumnIndices, error) {
	header, err := reader.Read()
	if err != nil {
		return birdaColumnIndices{}, fmt.Errorf("failed to read header: %w", err)
	}
	idx := birdaColumnIndices{startIdx: -1, endIdx: -1, commonNameIdx: -1, confidenceIdx: -1, fileIdx: -1}
	for i, col := range header {
		col = strings.TrimPrefix(col, "\ufeff")
		switch col {
		case "Start (s)":
			idx.startIdx = i
		case "End (s)":
			idx.endIdx = i
		case "Common name":
			idx.commonNameIdx = i
		case "Confidence":
			idx.confidenceIdx = i
		case "File":
			idx.fileIdx = i
		}
	}
	if idx.startIdx == -1 || idx.endIdx == -1 || idx.commonNameIdx == -1 || idx.confidenceIdx == -1 {
		return birdaColumnIndices{}, fmt.Errorf("missing required columns in BirdNET file")
	}
	return idx, nil
}

// readBirdaDetections reads all detection records from a BirdNET CSV.
func readBirdaDetections(reader *csv.Reader, idx birdaColumnIndices) ([]BirdNETDetection, error) {
	var detections []BirdNETDetection
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("failed to read record: %w", err)
		}
		var det BirdNETDetection
		startTime, perr := strconv.ParseFloat(record[idx.startIdx], 64)
		if perr != nil {
			return nil, fmt.Errorf("failed to parse start time %q: %w", record[idx.startIdx], perr)
		}
		det.StartTime = startTime
		endTime, perr := strconv.ParseFloat(record[idx.endIdx], 64)
		if perr != nil {
			return nil, fmt.Errorf("failed to parse end time %q: %w", record[idx.endIdx], perr)
		}
		det.EndTime = endTime
		det.CommonName = record[idx.commonNameIdx]
		confidence, perr := strconv.ParseFloat(record[idx.confidenceIdx], 64)
		if perr != nil {
			return nil, fmt.Errorf("failed to parse confidence %q: %w", record[idx.confidenceIdx], perr)
		}
		det.Confidence = confidence
		if idx.fileIdx >= 0 && idx.fileIdx < len(record) {
			det.WAVPath = record[idx.fileIdx]
		}
		detections = append(detections, det)
	}
	return detections, nil
}

// resolveBirdaWAVPath finds the WAV file associated with a BirdNET results file.
func resolveBirdaWAVPath(birdaFile string, firstWAVPath string, cache *DirCache) string {
	if firstWAVPath != "" {
		if _, err := os.Stat(firstWAVPath); err == nil {
			return firstWAVPath
		}
	}
	dir := filepath.Dir(birdaFile)
	base := filepath.Base(birdaFile)
	baseName := strings.TrimSuffix(base, ".BirdNET.results.csv")
	if cache != nil {
		return cache.FindWAV(baseName)
	}
	return findWAVFile(dir, baseName)
}

// processBirdaFileCached processes a single BirdNET results file using a DirCache for WAV lookup.
func processBirdaFileCached(birdaFile string, cache *DirCache) ([]ClusteredCall, bool, bool, error) {
	file, err := os.Open(birdaFile)
	if err != nil {
		return nil, false, false, fmt.Errorf("failed to open file: %w", err)
	}
	defer func() { _ = file.Close() }()
	reader := csv.NewReader(file)
	idx, err := parseBirdaCSVHeader(reader)
	if err != nil {
		return nil, false, false, err
	}
	detections, err := readBirdaDetections(reader, idx)
	if err != nil {
		return nil, false, false, err
	}
	if len(detections) == 0 {
		return nil, false, true, nil
	}
	wavPath := resolveBirdaWAVPath(birdaFile, detections[0].WAVPath, cache)
	if wavPath == "" {
		return nil, false, true, nil
	}
	sampleRate, duration, err := utils.ParseWAVHeaderMinimal(wavPath)
	if err != nil {
		return nil, false, true, nil
	}
	dataPath := wavPath + ".data"
	segments := buildBirdNETSegments(detections, sampleRate)
	meta := AviaNZMeta{Operator: "BirdNET", Duration: duration}
	reviewer := "None"
	meta.Reviewer = &reviewer
	if err := writeDotDataFileSafe(dataPath, segments, "BirdNET", meta); err != nil {
		return nil, false, false, err
	}
	var calls []ClusteredCall
	for _, det := range detections {
		calls = append(calls, ClusteredCall{
			File:      wavPath,
			StartTime: det.StartTime,
			EndTime:   det.EndTime,
			EbirdCode: det.CommonName,
			Segments:  1,
		})
	}
	return calls, true, false, nil
}

// buildBirdNETSegments converts BirdNET detections to AviaNZ segments.
func buildBirdNETSegments(detections []BirdNETDetection, sampleRate int) []AviaNZSegment {
	var segments []AviaNZSegment
	for _, det := range detections {
		// Convert confidence (0.0-1.0) to certainty (0-100).
		certainty := min(max(int(det.Confidence*100), 0), 100)
		labels := []AviaNZLabel{{
			Species:   det.CommonName,
			Certainty: certainty,
			Filter:    "BirdNET",
		}}
		segment := AviaNZSegment{
			det.StartTime,
			det.EndTime,
			0,          // freq_low
			sampleRate, // freq_high (full band)
			labels,
		}
		segments = append(segments, segment)
	}
	return segments
}
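// Editor's example (illustrative sketch, not part of the source tree): the
// confidence-to-certainty clamp used by buildBirdNETSegments above, isolated
// so the boundary behaviour is visible. It relies on the min/max builtins
// (Go 1.21+); the sample values are made up.
package main

import "fmt"

func toCertainty(confidence float64) int {
	// Scale 0.0-1.0 to 0-100 and clamp out-of-range inputs.
	return min(max(int(confidence*100), 0), 100)
}

func main() {
	for _, c := range []float64{0.85, 1.2, -0.1} {
		fmt.Printf("%.2f -> %d\n", c, toCertainty(c)) // 0.85 -> 85, 1.20 -> 100, -0.10 -> 0
	}
}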
package calls

import (
	"os"
	"path/filepath"
	"testing"
)

func TestDetectAnomalies_LabelMismatch(t *testing.T) {
	dir := t.TempDir()
	// Same time range, different calltypes across two models.
	data := `[{"Operator":"test"},` +
		`[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-a"},` +
		`{"species":"Kiwi","calltype":"Male","certainty":100,"filter":"model-b"}]]]`
	if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
		t.Fatal(err)
	}
	out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
	if err != nil {
		t.Fatal(err)
	}
	if out.LabelMismatches != 1 {
		t.Errorf("expected 1 label mismatch, got %d", out.LabelMismatches)
	}
	if out.CertaintyMismatches != 0 {
		t.Errorf("expected 0 certainty mismatches, got %d", out.CertaintyMismatches)
	}
	if out.Anomalies[0].Type != "label_mismatch" {
		t.Errorf("expected label_mismatch, got %s", out.Anomalies[0].Type)
	}
}

func TestDetectAnomalies_CertaintyMismatch(t *testing.T) {
	dir := t.TempDir()
	// Same time range, same labels, different certainty.
	data := `[{"Operator":"test"},` +
		`[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":90,"filter":"model-a"},` +
		`{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-b"}]]]`
	if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
		t.Fatal(err)
	}
	out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
	if err != nil {
		t.Fatal(err)
	}
	if out.CertaintyMismatches != 1 {
		t.Errorf("expected 1 certainty mismatch, got %d", out.CertaintyMismatches)
	}
	if out.LabelMismatches != 0 {
		t.Errorf("expected 0 label mismatches, got %d", out.LabelMismatches)
	}
}

func TestDetectAnomalies_NoAnomalyWhenAgreement(t *testing.T) {
	dir := t.TempDir()
	data := `[{"Operator":"test"},` +
		`[0,10,100,1000,[{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-a"},` +
		`{"species":"Kiwi","calltype":"Duet","certainty":100,"filter":"model-b"}]]]`
	if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
		t.Fatal(err)
	}
	out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
	if err != nil {
		t.Fatal(err)
	}
	if out.AnomaliesTotal != 0 {
		t.Errorf("expected 0 anomalies, got %d", out.AnomaliesTotal)
	}
}

func TestDetectAnomalies_LonelySegmentSkipped(t *testing.T) {
	dir := t.TempDir()
	// model-a has a segment, model-b has no segment in this file.
	data := `[{"Operator":"test"},` +
		`[0,10,100,1000,[{"species":"Kiwi","certainty":100,"filter":"model-a"}]]]`
	if err := os.WriteFile(filepath.Join(dir, "f1.data"), []byte(data), 0644); err != nil {
		t.Fatal(err)
	}
	out, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a", "model-b"}})
	if err != nil {
		t.Fatal(err)
	}
	if out.AnomaliesTotal != 0 {
		t.Errorf("lonely segment should be skipped, got %d anomalies", out.AnomaliesTotal)
	}
	if out.FilesWithAllModels != 0 {
		t.Errorf("file missing a model should not count as FilesWithAllModels")
	}
}

func TestDetectAnomalies_FailsWithOneModel(t *testing.T) {
	dir := t.TempDir()
	_, err := DetectAnomalies(DetectAnomaliesInput{Folder: dir, Models: []string{"model-a"}})
	if err == nil {
		t.Error("expected error with only 1 model")
	}
}
package calls

import (
	"fmt"
	"os"
	"path/filepath"

	"skraak/utils"
)

type DetectAnomaliesInput struct {
	Folder  string
	Models  []string // at least 2 filter names
	Species []string // optional scope; empty = all species
}

type DetectAnomaliesOutput struct {
	Folder              string    `json:"folder"`
	Models              []string  `json:"models"`
	FilesExamined       int       `json:"files_examined"`
	FilesWithAllModels  int       `json:"files_with_all_models"`
	AnomaliesTotal      int       `json:"anomalies_total"`
	LabelMismatches     int       `json:"label_mismatches"`
	CertaintyMismatches int       `json:"certainty_mismatches"`
	Anomalies           []Anomaly `json:"anomalies,omitempty"`
	Error               string    `json:"error,omitempty"`
}

type Anomaly struct {
	File     string           `json:"file"`
	Type     string           `json:"type"` // "label_mismatch" | "certainty_mismatch"
	Segments []AnomalySegment `json:"segments"`
}

type AnomalySegment struct {
	Model     string  `json:"model"`
	Start     float64 `json:"start"`
	End       float64 `json:"end"`
	Species   string  `json:"species"`
	CallType  string  `json:"calltype,omitempty"`
	Certainty int     `json:"certainty"`
}

// validateAnomalyInput validates the input parameters for DetectAnomalies.
func validateAnomalyInput(input DetectAnomaliesInput) error {
	if len(input.Models) < 2 {
		return fmt.Errorf("at least 2 --model values required")
	}
	for i, a := range input.Models {
		for j, b := range input.Models {
			if i != j && a == b {
				return fmt.Errorf("duplicate --model values are not allowed")
			}
		}
	}
	info, err := os.Stat(input.Folder)
	if err != nil {
		return fmt.Errorf("folder not found: %s", input.Folder)
	}
	if !info.IsDir() {
		return fmt.Errorf("not a directory: %s", input.Folder)
	}
	return nil
}

// DetectAnomalies compares corresponding segments across multiple ML model filters
// within each .data file. Segments are matched by time overlap (same logic as propagate).
// Lonely segments (no overlap in one or more models) are silently skipped.
// Anomalies are flagged when overlapping segments disagree on species+calltype,
// or when labels match but certainty values differ.
func DetectAnomalies(input DetectAnomaliesInput) (DetectAnomaliesOutput, error) {
	folder := filepath.Clean(input.Folder)
	output := DetectAnomaliesOutput{
		Folder: folder,
		Models: input.Models,
	}
	if err := validateAnomalyInput(input); err != nil {
		output.Error = err.Error()
		return output, err
	}
	files, err := utils.FindDataFiles(folder)
	if err != nil {
		output.Error = fmt.Sprintf("list .data files: %v", err)
		return output, fmt.Errorf("%s", output.Error)
	}
	scopeSet := make(map[string]bool, len(input.Species))
	for _, s := range input.Species {
		scopeSet[s] = true
	}
	for _, path := range files {
		df, err := utils.ParseDataFile(path)
		if err != nil {
			continue
		}
		output.FilesExamined++
		anomalies := detectAnomaliesInFile(df, path, input.Models, scopeSet)
		if anomalies == nil {
			// File didn't have all models present.
			continue
		}
		output.FilesWithAllModels++
		for _, a := range anomalies {
			if a.Type == "label_mismatch" {
				output.LabelMismatches++
			} else {
				output.CertaintyMismatches++
			}
		}
		output.Anomalies = append(output.Anomalies, anomalies...)
	}
	output.AnomaliesTotal = len(output.Anomalies)
	return output, nil
}

// labeledSeg pairs a segment with the specific label matching the model filter.
type labeledSeg struct {
	seg   *utils.Segment
	label *utils.Label
}

// detectAnomaliesInFile returns nil if the file doesn't contain all required
// models, otherwise a (possibly empty) slice of anomalies.
func detectAnomaliesInFile(df *utils.DataFile, path string, models []string, scope map[string]bool) []Anomaly {
	modelSegs := collectModelSegments(df, models)
	// Skip file if any model is entirely absent.
	for _, model := range models {
		if len(modelSegs[model]) == 0 {
			return nil
		}
	}
	// Start with a non-nil slice so that "all models present, no anomalies"
	// is distinguishable from the nil "model absent" return above; otherwise
	// the caller would under-count FilesWithAllModels for files in full agreement.
	anomalies := []Anomaly{}
	for _, anchor := range modelSegs[models[0]] {
		if !inScope(anchor, scope) {
			continue
		}
		matches := findOverlappingMatches(anchor, models, modelSegs)
		if matches == nil {
			continue
		}
		group := buildComparisonGroup(anchor, models, matches)
		if a := checkGroupAnomaly(group, path, models); a != nil {
			anomalies = append(anomalies, *a)
		}
	}
	return anomalies
}

// collectModelSegments groups labeled segments by model filter name.
func collectModelSegments(df *utils.DataFile, models []string) map[string][]labeledSeg {
	modelSegs := make(map[string][]labeledSeg, len(models))
	for _, seg := range df.Segments {
		for _, lbl := range seg.Labels {
			for _, model := range models {
				if lbl.Filter == model {
					modelSegs[model] = append(modelSegs[model], labeledSeg{seg: seg, label: lbl})
					break
				}
			}
		}
	}
	return modelSegs
}

// inScope returns true if the anchor's label is within the species scope filter.
func inScope(anchor labeledSeg, scope map[string]bool) bool {
	if len(scope) == 0 {
		return true
	}
	key := anchor.label.Species
	if anchor.label.CallType != "" {
		key += "+" + anchor.label.CallType
	}
	return scope[key] || scope[anchor.label.Species]
}

// findOverlappingMatches returns matches[model] = overlapping segments from that model,
// or nil if any model has no overlap (lonely anchor).
func findOverlappingMatches(anchor labeledSeg, models []string, modelSegs map[string][]labeledSeg) map[string][]labeledSeg {
	matches := make(map[string][]labeledSeg, len(models)-1)
	for _, model := range models[1:] {
		for _, candidate := range modelSegs[model] {
			if overlaps(anchor.seg, candidate.seg) {
				matches[model] = append(matches[model], candidate)
			}
		}
		if len(matches[model]) == 0 {
			return nil
		}
	}
	return matches
}

// buildComparisonGroup assembles anchor + first match per other model.
func buildComparisonGroup(anchor labeledSeg, models []string, matches map[string][]labeledSeg) []labeledSeg {
	group := []labeledSeg{anchor}
	for _, model := range models[1:] {
		group = append(group, matches[model][0])
	}
	return group
}

// checkGroupAnomaly checks a comparison group for label or certainty mismatches.
func checkGroupAnomaly(group []labeledSeg, path string, models []string) *Anomaly {
	refSpecies := group[0].label.Species
	refCallType := group[0].label.CallType
	for _, ls := range group[1:] {
		if ls.label.Species != refSpecies || ls.label.CallType != refCallType {
			a := Anomaly{File: path, Type: "label_mismatch", Segments: buildAnomalySegs(group, models)}
			return &a
		}
	}
	refCertainty := group[0].label.Certainty
	for _, ls := range group[1:] {
		if ls.label.Certainty != refCertainty {
			a := Anomaly{File: path, Type: "certainty_mismatch", Segments: buildAnomalySegs(group, models)}
			return &a
		}
	}
	return nil
}

func buildAnomalySegs(group []labeledSeg, models []string) []AnomalySegment {
	segs := make([]AnomalySegment, len(group))
	for i, ls := range group {
		segs[i] = AnomalySegment{
			Model:     models[i],
			Start:     ls.seg.StartTime,
			End:       ls.seg.EndTime,
			Species:   ls.label.Species,
			CallType:  ls.label.CallType,
			Certainty: ls.label.Certainty,
		}
	}
	return segs
}

// overlaps returns true if two segments share any time overlap.
func overlaps(a, b *utils.Segment) bool {
	return a.StartTime < b.EndTime && b.StartTime < a.EndTime
}
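// Editor's example (illustrative sketch, not part of the source tree): the
// overlap test DetectAnomalies uses to match segments across models. Two
// segments match only when they share a non-empty interior, so intervals
// that merely touch at an endpoint do not count. Types here are hypothetical
// stand-ins for the utils.Segment fields.
package main

import "fmt"

type span struct{ start, end float64 }

func overlaps(a, b span) bool {
	return a.start < b.end && b.start < a.end
}

func main() {
	a := span{0, 10}
	fmt.Println(overlaps(a, span{5, 15}))  // true: shares 5-10
	fmt.Println(overlaps(a, span{10, 20})) // false: only touches at 10
}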
package calls

import (
	"encoding/csv"
	"os"
	"path/filepath"
	"strings"
	"testing"

	"skraak/utils"
)

// --- test helpers (test file only) ---

func writeDataFile(t *testing.T, dir, name string, df *utils.DataFile) {
	t.Helper()
	if err := df.Write(filepath.Join(dir, name)); err != nil {
		t.Fatalf("write .data file %s: %v", name, err)
	}
}

func writeMapping(t *testing.T, dir, json string) {
	t.Helper()
	if err := os.WriteFile(filepath.Join(dir, "mapping.json"), []byte(json), 0644); err != nil {
		t.Fatalf("write mapping.json: %v", err)
	}
}

// parseCSV reads the output CSV, returning header and rows.
func parseCSV(t *testing.T, path string) ([]string, [][]string) {
	t.Helper()
	f, err := os.Open(path)
	if err != nil {
		t.Fatalf("open CSV %s: %v", path, err)
	}
	defer f.Close()
	r := csv.NewReader(f)
	header, err := r.Read()
	if err != nil {
		t.Fatalf("read header: %v", err)
	}
	rows, err := r.ReadAll()
	if err != nil {
		t.Fatalf("read rows: %v", err)
	}
	return header, rows
}

// clipLabels calls CallsClipLabels with standard test parameters.
func clipLabels(t *testing.T, dir string, extra ...func(*CallsClipLabelsInput)) CallsClipLabelsOutput {
	t.Helper()
	input := CallsClipLabelsInput{
		Folder:          dir,
		MappingPath:     filepath.Join(dir, "mapping.json"),
		OutputPath:      filepath.Join(dir, "clip_labels.csv"),
		ClipDuration:    5,
		ClipOverlap:     0,
		MinLabelOverlap: 0.25,
		FinalClip:       "full",
	}
	for _, fn := range extra {
		fn(&input)
	}
	out, err := CallsClipLabels(input)
	if err != nil {
		t.Fatalf("CallsClipLabels: %v", err)
	}
	return out
}

// --- tests ---

func TestClipLabels_RealClassTrue(t *testing.T) {
	dir := t.TempDir()
	writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 20},
		Segments: []*utils.Segment{
			{
				StartTime: 3, EndTime: 8, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
			},
		},
	})
	writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
	out := clipLabels(t, dir)
	header, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))
	// Header: file, start_time, end_time, Kiwi
	if len(header) != 4 || header[3] != "Kiwi" {
		t.Fatalf("header = %v, want [..., Kiwi]", header)
	}
	// Clip 0-5 overlaps segment 3-8 by 2s ≥ 0.25 → Kiwi=True
	// Clip 5-10 overlaps segment 3-8 by 3s ≥ 0.25 → Kiwi=True
	// Clips 10-15, 15-20 → Kiwi=False
	kiwiCol := 3
	for i, row := range rows {
		switch row[1] {
		case "0.0", "5.0":
			if row[kiwiCol] != "True" {
				t.Errorf("row %d (start=%s): Kiwi=%s, want True", i, row[1], row[kiwiCol])
			}
		case "10.0", "15.0":
			if row[kiwiCol] != "False" {
				t.Errorf("row %d (start=%s): Kiwi=%s, want False", i, row[1], row[kiwiCol])
			}
		}
	}
	if out.PerClassTrueCount["Kiwi"] != 2 {
		t.Errorf("PerClassTrueCount[Kiwi] = %d, want 2", out.PerClassTrueCount["Kiwi"])
	}
}

func TestClipLabels_GapClipsAllFalse(t *testing.T) {
	dir := t.TempDir()
	// 15s file, Kiwi segment 0-5 only → clips 5-10 and 10-15 are gaps
	writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 15},
		Segments: []*utils.Segment{
			{
				StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
			},
		},
	})
	writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
	out := clipLabels(t, dir)
	if out.ClipsAllFalseGap != 2 {
		t.Errorf("ClipsAllFalseGap = %d, want 2", out.ClipsAllFalseGap)
	}
	if out.PerClassTrueCount["Kiwi"] != 1 {
		t.Errorf("PerClassTrueCount[Kiwi] = %d, want 1", out.PerClassTrueCount["Kiwi"])
	}
	if out.RowsWritten != 3 {
		t.Errorf("RowsWritten = %d, want 3", out.RowsWritten)
	}
}

func TestClipLabels_NegativeOverridesPositive(t *testing.T) {
	dir := t.TempDir()
	// Kiwi segment 0-8, Not segment 0-4 → clip 0-5 overlaps both → __NEGATIVE__ wins
	// Clip 5-10 overlaps only Kiwi (3s) → True
	writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 10},
		Segments: []*utils.Segment{
			{
				StartTime: 0, EndTime: 8, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
			},
			{
				StartTime: 0, EndTime: 4, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Not", Certainty: 100, Filter: "f1"}},
			},
		},
	})
	writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Not":{"species":"__NEGATIVE__"}}`)
	out := clipLabels(t, dir)
	if out.ClipsNegative != 1 {
		t.Errorf("ClipsNegative = %d, want 1", out.ClipsNegative)
	}
	_, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))
	// Clip 0-5: negative hit → all-False (Not overlaps 0-4 by 4s)
	if rows[0][3] != "False" {
		t.Errorf("clip 0-5 Kiwi = %s, want False (overridden by __NEGATIVE__)", rows[0][3])
	}
	// Clip 5-10: only Kiwi overlaps (3s) → True
	if rows[1][3] != "True" {
		t.Errorf("clip 5-10 Kiwi = %s, want True", rows[1][3])
	}
}

func TestClipLabels_IgnoreExcludesClip(t *testing.T) {
	dir := t.TempDir()
	// Don't Know segment 0-5, Kiwi segment 6-10
	// Clip 0-5 overlaps __IGNORE__ → excluded
	// Clip 5-10 overlaps Kiwi → emitted with True
	writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 15},
		Segments: []*utils.Segment{
			{
				StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Don't Know", Certainty: 0, Filter: "f1"}},
			},
			{
				StartTime: 6, EndTime: 10, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
			},
		},
	})
	writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Don't Know":{"species":"__IGNORE__"}}`)
	out := clipLabels(t, dir)
	if out.ClipsIgnored != 1 {
		t.Errorf("ClipsIgnored = %d, want 1", out.ClipsIgnored)
	}
	if out.SegmentsIgnored != 1 {
		t.Errorf("SegmentsIgnored = %d, want 1", out.SegmentsIgnored)
	}
	// Only 2 rows: clip 5-10 (Kiwi=True) and clip 10-15 (gap)
	if out.RowsWritten != 2 {
		t.Errorf("RowsWritten = %d, want 2", out.RowsWritten)
	}
}

func TestClipLabels_FilterRestrictsLabels(t *testing.T) {
	dir := t.TempDir()
	// Same time range, two filters. Only "wanted" should contribute.
	writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 10},
		Segments: []*utils.Segment{
			{
				StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{
					{Species: "Kiwi", Certainty: 100, Filter: "wanted"},
					{Species: "Not", Certainty: 100, Filter: "unwanted"},
				},
			},
		},
	})
	writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"},"Not":{"species":"__NEGATIVE__"}}`)
	out := clipLabels(t, dir, func(in *CallsClipLabelsInput) { in.Filter = "wanted" })
	// Only Kiwi from "wanted" filter → clip 0-5 should be Kiwi=True
	// Not from "unwanted" filter should be ignored → no __NEGATIVE__ override
	if out.ClipsNegative != 0 {
		t.Errorf("ClipsNegative = %d, want 0 (Not filter excluded)", out.ClipsNegative)
	}
	if out.PerClassTrueCount["Kiwi"] != 1 {
		t.Errorf("PerClassTrueCount[Kiwi] = %d, want 1", out.PerClassTrueCount["Kiwi"])
	}
}

func TestClipLabels_MappingCoverageError(t *testing.T) {
	dir := t.TempDir()
	writeDataFile(t, dir, "rec.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 10},
		Segments: []*utils.Segment{
			{
				StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Mystery", Certainty: 100, Filter: "f1"}},
			},
		},
	})
	writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
	input := CallsClipLabelsInput{
		Folder:          dir,
		MappingPath:     filepath.Join(dir, "mapping.json"),
		OutputPath:      filepath.Join(dir, "clip_labels.csv"),
		ClipDuration:    5,
		ClipOverlap:     0,
		MinLabelOverlap: 0.25,
		FinalClip:       "full",
	}
	_, err := CallsClipLabels(input)
	if err == nil {
		t.Fatal("expected error for missing species in mapping")
	}
	if !strings.Contains(err.Error(), "Mystery") {
		t.Errorf("error should mention missing species, got: %v", err)
	}
}

func TestClipLabels_AppendMode(t *testing.T) {
	dir := t.TempDir()
	writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
	// First file
	writeDataFile(t, dir, "a.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 5},
		Segments: []*utils.Segment{
			{
				StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
			},
		},
	})
	out1 := clipLabels(t, dir)
	if out1.RowsWritten != 1 {
		t.Fatalf("first run: RowsWritten = %d, want 1", out1.RowsWritten)
	}
	// Second run on the same output file. Simulating append by running again
	// on the same folder should fail with a duplicate error.
	_, err := CallsClipLabels(CallsClipLabelsInput{
		Folder:          dir,
		MappingPath:     filepath.Join(dir, "mapping.json"),
		OutputPath:      filepath.Join(dir, "clip_labels.csv"),
		ClipDuration:    5,
		ClipOverlap:     0,
		MinLabelOverlap: 0.25,
		FinalClip:       "full",
	})
	if err == nil {
		t.Fatal("expected duplicate error on second run with same folder")
	}
	if !strings.Contains(err.Error(), "duplicate") {
		t.Errorf("error should mention duplicate, got: %v", err)
	}
}

func TestClipLabels_MultipleFiles(t *testing.T) {
	dir := t.TempDir()
	writeMapping(t, dir, `{"Kiwi":{"species":"Kiwi"}}`)
	writeDataFile(t, dir, "a.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 10},
		Segments: []*utils.Segment{
			{
				StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
			},
		},
	})
	writeDataFile(t, dir, "b.wav.data", &utils.DataFile{
		Meta: &utils.DataMeta{Duration: 5},
		Segments: []*utils.Segment{
			{
				StartTime: 0, EndTime: 5, FreqLow: 100, FreqHigh: 5000,
				Labels: []*utils.Label{{Species: "Kiwi", Certainty: 100, Filter: "f1"}},
			},
		},
	})
	out := clipLabels(t, dir)
	if out.DataFilesParsed != 2 {
		t.Errorf("DataFilesParsed = %d, want 2", out.DataFilesParsed)
	}
	// a: 2 clips (0-5, 5-10), b: 1 clip (0-5) = 3 total
	if out.RowsWritten != 3 {
		t.Errorf("RowsWritten = %d, want 3", out.RowsWritten)
	}
	_, rows := parseCSV(t, filepath.Join(dir, "clip_labels.csv"))
	files := map[string]int{}
	for _, r := range rows {
		files[r[0]]++
	}
	if len(files) != 2 {
		t.Errorf("expected 2 distinct files in CSV, got %d", len(files))
	}
}
package calls

import (
	"encoding/csv"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strconv"
	"strings"

	"skraak/utils"
)

// CallsClipLabelsInput configures the clip-labels exporter.
type CallsClipLabelsInput struct {
	Folder          string  `json:"folder"`
	MappingPath     string  `json:"mapping"`
	Filter          string  `json:"filter,omitempty"`
	OutputPath      string  `json:"output"`
	ClipDuration    float64 `json:"clip_duration"`
	ClipOverlap     float64 `json:"clip_overlap"`
	MinLabelOverlap float64 `json:"min_label_overlap"`
	FinalClip       string  `json:"final_clip"`
}

// CallsClipLabelsOutput summarises a run.
type CallsClipLabelsOutput struct {
	Folder            string         `json:"folder"`
	OutputPath        string         `json:"output"`
	Filter            string         `json:"filter,omitempty"`
	Classes           []string       `json:"classes"`
	DataFilesParsed   int            `json:"data_files_parsed"`
	ClipsNegative     int            `json:"clips_negative"`      // emitted, all-False because of __NEGATIVE__
	ClipsIgnored      int            `json:"clips_ignored"`       // excluded from output because of __IGNORE__ overlap
	SegmentsIgnored   int            `json:"segments_ignored"`    // segments whose species maps to __IGNORE__
	ClipsAllFalseGap  int            `json:"clips_all_false_gap"` // emitted, all-False because no overlap
	PerClassTrueCount map[string]int `json:"per_class_true_count"`
	AppendedToFile    bool           `json:"appended_to_file"`
	ExistingRowsFound int            `json:"existing_rows_found"`
	RowsWritten       int            `json:"rows_written"`
}

// resolvedSeg is a segment that has been classified by the mapping and is
// ready for overlap-checking against clip windows.
type resolvedSeg struct {
	start, end float64
	kind       utils.MappingKind
	classIdx   int // valid only when kind == utils.MappingReal
}

// clipDisposition describes the outcome for a single clip window.
type clipDisposition int

const (
	dispoLabelled clipDisposition = iota // at least one class column is True
	dispoNegative                        // __NEGATIVE__ hit, all class columns False
	dispoGap                             // no segment overlaps, all class columns False
	dispoIgnored                         // __IGNORE__ hit, clip excluded from output
)

// clipLabelsRow is one row of the output CSV.
type clipLabelsRow struct {
	file  string
	start float64
	end   float64
	flags []bool
}

// rowKey is used for duplicate detection.
type rowKey struct {
	file  string
	start string
	end   string
}

// parsedClipFile holds a parsed .data file for clip-labels processing.
type parsedClipFile struct {
	path string
	df   *utils.DataFile
}

// validateClipLabelsInput validates the input parameters and returns the parsed finalClipMode.
func validateClipLabelsInput(input CallsClipLabelsInput) (utils.FinalClipMode, error) {
	finalClipMode, err := utils.ParseFinalClipMode(input.FinalClip)
	if err != nil {
		return 0, err
	}
	if input.ClipDuration <= 0 {
		return 0, fmt.Errorf("--clip-duration must be > 0, got %v", input.ClipDuration)
	}
	if input.ClipOverlap < 0 || input.ClipOverlap >= input.ClipDuration {
		return 0, fmt.Errorf("--clip-overlap must be in [0, clip-duration), got %v", input.ClipOverlap)
	}
	if input.MinLabelOverlap <= 0 {
		return 0, fmt.Errorf("--min-label-overlap must be > 0, got %v", input.MinLabelOverlap)
	}
	return finalClipMode, nil
}

// parseClipLabelsDataFiles finds and parses .data files, collecting species seen.
func parseClipLabelsDataFiles(folder, filter string, mapping utils.MappingFile) ([]parsedClipFile, error) {
	dataPaths, err := utils.FindDataFiles(folder)
	if err != nil {
		return nil, fmt.Errorf("scan folder %s: %w", folder, err)
	}
	if len(dataPaths) == 0 {
		return nil, fmt.Errorf("no .data files found in %s", folder)
	}
	speciesSeen := map[string]bool{}
	parsed := make([]parsedClipFile, 0, len(dataPaths))
	for _, p := range dataPaths {
		df, err := utils.ParseDataFile(p)
		if err != nil {
			return nil, fmt.Errorf("parse %s: %w", p, err)
		}
		if df.Meta == nil || df.Meta.Duration <= 0 {
			return nil, fmt.Errorf("missing or non-positive Duration in %s (cannot generate clips)", p)
		}
		for _, seg := range df.Segments {
			for _, lbl := range seg.Labels {
				if filter != "" && lbl.Filter != filter {
					continue
				}
				speciesSeen[lbl.Species] = true
			}
		}
		parsed = append(parsed, parsedClipFile{path: p, df: df})
	}
	if missing := mapping.ValidateCoversSpecies(speciesSeen); len(missing) > 0 {
		return nil, fmt.Errorf("mapping.json is missing entries for species: %s\n(run /data-mapping to regenerate)", strings.Join(missing, ", "))
	}
	return parsed, nil
}

// dedupClipLabelsRows checks for duplicate rows within new rows and against existing CSV rows.
func dedupClipLabelsRows(rows []clipLabelsRow, existing map[rowKey]bool) error {
	dedup := make(map[rowKey]bool, len(existing)+len(rows))
	for k := range existing {
		dedup[k] = true
	}
	for _, r := range rows {
		k := rowKey{file: r.file, start: formatTime(r.start), end: formatTime(r.end)}
		if dedup[k] {
			return fmt.Errorf("duplicate clip detected: file=%s start=%s end=%s", k.file, k.start, k.end)
		}
		dedup[k] = true
	}
	return nil
}

// CallsClipLabels reads .data files from a single folder and writes a CSV in
// OpenSoundScape's clip_labels format: one row per clip per file, with one
// True/False column per class in the mapping.
//
// Mirrors BoxedAnnotations.clip_labels(): every clip window is emitted; a
// column is True when any annotation of that class overlaps the window by
// ≥ min_label_overlap seconds. Sentinel mappings (__NEGATIVE__, __IGNORE__)
// get no column and contribute no labels.
func CallsClipLabels(input CallsClipLabelsInput) (CallsClipLabelsOutput, error) {
	out := CallsClipLabelsOutput{
		Folder:            input.Folder,
		OutputPath:        input.OutputPath,
		PerClassTrueCount: map[string]int{},
	}
	finalClipMode, err := validateClipLabelsInput(input)
	if err != nil {
		return out, err
	}
	mapping, err := utils.LoadMappingFile(input.MappingPath)
	if err != nil {
		return out, fmt.Errorf("load mapping %s: %w", input.MappingPath, err)
	}
	classes := mapping.Classes()
	if len(classes) == 0 {
		return out, fmt.Errorf("mapping.json has no real (non-sentinel) classes")
	}
	out.Classes = classes
	out.Filter = input.Filter
	classIdx := map[string]int{}
	for i, c := range classes {
		classIdx[c] = i
	}
	parsed, err := parseClipLabelsDataFiles(input.Folder, input.Filter, mapping)
	if err != nil {
		return out, err
	}
	out.DataFilesParsed = len(parsed)
	expectedHeader := append([]string{"file", "start_time", "end_time"}, classes...)
	existing, appendMode, err := loadExistingRows(input.OutputPath, expectedHeader)
	if err != nil {
		return out, err
	}
	out.AppendedToFile = appendMode
	out.ExistingRowsFound = len(existing)
	cwd, err := os.Getwd()
	if err != nil {
		return out, fmt.Errorf("getwd: %w", err)
	}
	folderAbs, err := filepath.Abs(input.Folder)
	if err != nil {
		return out, fmt.Errorf("abs %s: %w", input.Folder, err)
	}
	rows := make([]clipLabelsRow, 0, 1024)
	for _, pf := range parsed {
		fileRows, err := processClipLabelsFile(pf.path, pf.df, mapping, classIdx, classes, input, finalClipMode, cwd, folderAbs, &out)
		if err != nil {
			return out, err
		}
		rows = append(rows, fileRows...)
	}
	if err := dedupClipLabelsRows(rows, existing); err != nil {
		return out, err
	}
	if err := writeRows(input.OutputPath, expectedHeader, rows, appendMode); err != nil {
		return out, err
	}
	out.RowsWritten = len(rows)
	sort.Strings(out.Classes)
	return out, nil
}

// processClipLabelsFile generates clip-labels rows for a single .data file.
func processClipLabelsFile(
	path string,
	df *utils.DataFile,
	mapping utils.MappingFile,
	classIdx map[string]int,
	classes []string,
	input CallsClipLabelsInput,
	finalClipMode utils.FinalClipMode,
	cwd, folderAbs string,
	out *CallsClipLabelsOutput,
) ([]clipLabelsRow, error) {
	windows, err := utils.GenerateClipTimes(
		df.Meta.Duration,
		input.ClipDuration,
		input.ClipOverlap,
		finalClipMode,
		10,
	)
	if err != nil {
		return nil, fmt.Errorf("generate clip windows for %s: %w", path, err)
	}
	if len(windows) == 0 {
		return nil, nil
	}
	segs := resolveSegments(df.Segments, input.Filter, input.MinLabelOverlap, mapping, classIdx, out)
	rel, err := computeWavRelPath(path, cwd, folderAbs)
	if err != nil {
		return nil, err
	}
	return labelClipWindows(windows, segs, rel, classes, input.MinLabelOverlap, out), nil
}

// resolveSegments maps segments to their classification and filters out mismatches.
func resolveSegments(
	segments []*utils.Segment,
	filter string,
	minLabelOverlap float64,
	mapping utils.MappingFile,
	classIdx map[string]int,
	out *CallsClipLabelsOutput,
) []resolvedSeg {
	segs := make([]resolvedSeg, 0, len(segments))
	for _, seg := range segments {
		if seg.EndTime-seg.StartTime < minLabelOverlap {
			continue
		}
		for _, lbl := range seg.Labels {
			if filter != "" && lbl.Filter != filter {
				continue
			}
			canon, kind, ok := mapping.Classify(lbl.Species)
			if !ok {
				continue
			}
			switch kind {
			case utils.MappingIgn:
				out.SegmentsIgnored++
				segs = append(segs, resolvedSeg{start: seg.StartTime, end: seg.EndTime, kind: kind})
			case utils.MappingNeg:
				segs = append(segs, resolvedSeg{start: seg.StartTime, end: seg.EndTime, kind: kind})
			case utils.MappingReal:
				idx, present := classIdx[canon]
				if !present {
					continue
				}
				segs = append(segs, resolvedSeg{start: seg.StartTime, end: seg.EndTime, kind: kind, classIdx: idx})
			}
		}
	}
	return segs
}

// computeWavRelPath computes the relative path from cwd to the WAV file corresponding to a .data file.
func computeWavRelPath(dataPath, cwd, folderAbs string) (string, error) {
	wavName := strings.TrimSuffix(filepath.Base(dataPath), ".data")
	wavAbs := filepath.Join(folderAbs, wavName)
	rel, err := filepath.Rel(cwd, wavAbs)
	if err != nil {
		rel = wavAbs
	}
	// Ensure relative paths start with ./ to match OPSO / pandas convention.
	if rel != "" && !filepath.IsAbs(rel) && !strings.HasPrefix(rel, "."+string(filepath.Separator)) {
		rel = "." + string(filepath.Separator) + rel
	}
	return rel, nil
}

// labelClipWindows classifies each clip window and builds the output rows.
func labelClipWindows(windows []utils.ClipWindow, segs []resolvedSeg, rel string, classes []string, minLabelOverlap float64, out *CallsClipLabelsOutput) []clipLabelsRow {
	var rows []clipLabelsRow
	for _, w := range windows {
		dispo, classHits := classifyClip(w, segs, minLabelOverlap, len(classes))
		if dispo == dispoIgnored {
			out.ClipsIgnored++
			continue
		}
		row := clipLabelsRow{
			file:  rel,
			start: w.Start,
			end:   w.End,
			flags: make([]bool, len(classes)),
		}
		switch dispo {
		case dispoNegative:
			out.ClipsNegative++
		case dispoGap:
			out.ClipsAllFalseGap++
		case dispoLabelled:
			for i, hit := range classHits {
				if hit {
					row.flags[i] = true
					out.PerClassTrueCount[classes[i]]++
				}
			}
		}
		rows = append(rows, row)
	}
	return rows
}

// classifyClip determines the disposition of a single clip window against
// the resolved segments. Priority: __IGNORE__ > __NEGATIVE__ > class labels.
func classifyClip(w utils.ClipWindow, segs []resolvedSeg, minLabelOverlap float64, nClasses int) (clipDisposition, []bool) {
	ignoreHit := false
	negativeHit := false
	classHits := make([]bool, nClasses)
	for _, s := range segs {
		if overlapSeconds(s.start, s.end, w.Start, w.End) < minLabelOverlap {
			continue
		}
		switch s.kind {
		case utils.MappingIgn:
			ignoreHit = true
		case utils.MappingNeg:
			negativeHit = true
		case utils.MappingReal:
			classHits[s.classIdx] = true
		}
	}
	if ignoreHit {
		return dispoIgnored, nil
	}
	if negativeHit {
		return dispoNegative, classHits
	}
	for _, hit := range classHits {
		if hit {
			return dispoLabelled, classHits
		}
	}
	return dispoGap, classHits
}

// loadExistingRows reads an existing output CSV and returns its row keys
// (for deduplication) and whether we're in append mode.
func loadExistingRows(outputPath string, expectedHeader []string) (map[rowKey]bool, bool, error) {
	fi, err := os.Stat(outputPath)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, false, nil
		}
		return nil, false, fmt.Errorf("stat %s: %w", outputPath, err)
	}
	if fi.Size() == 0 {
		return nil, false, nil
	}
	f, err := os.Open(outputPath)
	if err != nil {
		return nil, false, fmt.Errorf("open existing %s: %w", outputPath, err)
	}
	defer func() { _ = f.Close() }()
	r := csv.NewReader(f)
	r.FieldsPerRecord = -1
	header, err := r.Read()
	if err != nil {
		return nil, false, fmt.Errorf("read header of existing %s: %w", outputPath, err)
	}
	if !slices.Equal(header, expectedHeader) {
		return nil, false, fmt.Errorf("column-set mismatch in existing %s\n  existing: %s\n  new:      %s",
			outputPath, strings.Join(header, ","), strings.Join(expectedHeader, ","))
	}
	existing := map[rowKey]bool{}
	for {
		rec, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, false, fmt.Errorf("read row of existing %s: %w", outputPath, err)
		}
		if len(rec) < 3 {
			return nil, false, fmt.Errorf("malformed row in existing %s: %v", outputPath, rec)
		}
		existing[rowKey{file: rec[0], start: rec[1], end: rec[2]}] = true
	}
	return existing, true, nil
}

// overlapSeconds returns the duration of overlap between two half-open intervals.
func overlapSeconds(aStart, aEnd, bStart, bEnd float64) float64 {
	lo := max(aStart, bStart)
	hi := min(aEnd, bEnd)
	if hi <= lo {
		return 0
	}
	return hi - lo
}

// formatTime renders a float to match pandas' default float repr in to_csv:
// always at least one decimal place, no trailing zeros beyond what's needed.
// e.g. 5 → "5.0", 5.5 → "5.5", 3.5001250000 → "3.500125".
func formatTime(v float64) string {
	s := strconv.FormatFloat(v, 'f', -1, 64)
	if !strings.ContainsRune(s, '.') {
		s += ".0"
	}
	return s
}

// writeRows writes the clip-labels rows to a CSV file.
func writeRows(path string, header []string, rows []clipLabelsRow, appendMode bool) error {
	var f *os.File
	var err error
	if appendMode {
		f, err = os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0644)
	} else {
		f, err = os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	}
	if err != nil {
		return fmt.Errorf("open %s for write: %w", path, err)
	}
	defer func() { _ = f.Close() }()
	w := csv.NewWriter(f)
	if !appendMode {
		if err := w.Write(header); err != nil {
			return fmt.Errorf("write header: %w", err)
		}
	}
	if len(rows) == 0 {
		w.Flush()
		return w.Error()
	}
	rec := make([]string, 3+len(rows[0].flags))
	for _, r := range rows {
		rec[0] = r.file
		rec[1] = formatTime(r.start)
		rec[2] = formatTime(r.end)
		for i, b := range r.flags {
			if b {
				rec[3+i] = "True"
			} else {
				rec[3+i] = "False"
			}
		}
		if err := w.Write(rec); err != nil {
			return fmt.Errorf("write row: %w", err)
		}
	}
	w.Flush()
	return w.Error()
}
package calls

import (
	"encoding/binary"
	"math"
	"os"
	"testing"

	"skraak/utils"
)

const benchWAV = "../../audio/20211028_211500.WAV"

// ==================== WAV I/O ====================

func BenchmarkReadWAV(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, _, err := utils.ReadWAVSamples(benchWAV)
		if err != nil {
			b.Fatal(err)
		}
	}
}

func BenchmarkConvertToFloat64_16bit(b *testing.B) {
	// Simulate 16-bit mono WAV data (same size as test file: 14.32M samples).
	numSamples := 14320000
	data := make([]byte, numSamples*2)
	for i := range numSamples {
		binary.LittleEndian.PutUint16(data[i*2:], uint16(i%65536))
	}
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_ = convertToFloat64Bench(data, 16, 1)
	}
}

// convertToFloat64Bench duplicates convertToFloat64 for benchmarking (unexported in utils).
func convertToFloat64Bench(data []byte, bitsPerSample, channels int) []float64 {
	bytesPerSample := bitsPerSample / 8
	blockAlign := bytesPerSample * channels
	numSamples := len(data) / blockAlign
	samples := make([]float64, numSamples)
	for i := range numSamples {
		offset := i * blockAlign
		sample := int16(binary.LittleEndian.Uint16(data[offset : offset+2]))
		samples[i] = float64(sample) / 32768.0
	}
	return samples
}

func BenchmarkWriteWAV(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	b.Logf("segment samples=%d", len(segSamples))
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		f, _ := os.CreateTemp("", "bench_*.wav")
		utils.WriteWAVFile(f.Name(), segSamples, sr)
		f.Close()
		os.Remove(f.Name())
	}
}

// ==================== Resample ====================

func BenchmarkResampleRate_48k(b *testing.B) {
	samples, _, _ := utils.ReadWAVSamples(benchWAV)
	b.Logf("resampling %d samples 48000->16000", len(samples))
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		utils.ResampleRate(samples, 48000, 16000)
	}
}

func BenchmarkResampleRate_250k(b *testing.B) {
	samples, _, _ := utils.ReadWAVSamples(benchWAV)
	b.Logf("resampling %d samples 250000->16000", len(samples))
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		utils.ResampleRate(samples, 250000, 16000)
	}
}

// ==================== Spectrogram pipeline ====================

func BenchmarkExtractSegment(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	b.Logf("full file: %d samples, sr=%d", len(samples), sr)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		seg := utils.ExtractSegmentSamples(samples, sr, 872, 895)
		if len(seg) == 0 {
			b.Fatal("empty segment")
		}
	}
}

func BenchmarkPowerSpectrumFFT_512(b *testing.B) {
	n := 512
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	frameData := make([]float64, n)
	power := make([]float64, n/2+1)
	scratch := make([]complex128, n)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		// Simulate the windowing step (Hann) + FFT.
		for j := range n {
			frameData[j] = segSamples[j] * 0.5 * (1.0 - math.Cos(2.0*math.Pi*float64(j)/float64(n-1)))
		}
		utils.PowerSpectrumFFT(frameData, power, scratch)
	}
}

func BenchmarkSpectrogram_23s(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	cfg := utils.DefaultSpectrogramConfig(16000)
	b.Logf("segment samples=%d, windowSize=%d, hopSize=%d", len(segSamples), cfg.WindowSize, cfg.HopSize)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		spect := utils.GenerateSpectrogram(segSamples, cfg)
		if spect == nil {
			b.Fatal("nil spectrogram")
		}
	}
}

func BenchmarkSpectrogram_60s(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 0, 60)
	cfg := utils.DefaultSpectrogramConfig(16000)
	b.Logf("60s segment samples=%d", len(segSamples))
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		spect := utils.GenerateSpectrogram(segSamples, cfg)
		if spect == nil {
			b.Fatal("nil spectrogram")
		}
	}
}

// ==================== Image creation & resize ====================

func BenchmarkCreateGrayscaleImage(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	cfg := utils.DefaultSpectrogramConfig(16000)
	spect := utils.GenerateSpectrogram(segSamples, cfg)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		img := utils.CreateGrayscaleImage(spect)
		if img == nil {
			b.Fatal("nil image")
		}
	}
}

func BenchmarkCreateRGBImage(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	cfg := utils.DefaultSpectrogramConfig(16000)
	spect := utils.GenerateSpectrogram(segSamples, cfg)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		colorData := utils.ApplyL4Colormap(spect)
		img := utils.CreateRGBImage(colorData)
		if img == nil {
			b.Fatal("nil image")
		}
	}
}

func BenchmarkApplyL4Colormap(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	cfg := utils.DefaultSpectrogramConfig(16000)
	spect := utils.GenerateSpectrogram(segSamples, cfg)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		colorData := utils.ApplyL4Colormap(spect)
		if colorData == nil {
			b.Fatal("nil colormap")
		}
	}
}

func BenchmarkResizeGray224(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	cfg := utils.DefaultSpectrogramConfig(16000)
	spect := utils.GenerateSpectrogram(segSamples, cfg)
	img := utils.CreateGrayscaleImage(spect)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		resized := utils.ResizeImage(img, 224, 224)
		if resized == nil {
			b.Fatal("nil resize")
		}
	}
}

func BenchmarkResizeGray448(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	cfg := utils.DefaultSpectrogramConfig(16000)
	spect := utils.GenerateSpectrogram(segSamples, cfg)
	img := utils.CreateGrayscaleImage(spect)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		resized := utils.ResizeImage(img, 448, 448)
		if resized == nil {
			b.Fatal("nil resize")
		}
	}
}

// ==================== PNG write ====================

func BenchmarkWritePNG_224(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	cfg := utils.DefaultSpectrogramConfig(16000)
	spect := utils.GenerateSpectrogram(segSamples, cfg)
	img := utils.CreateGrayscaleImage(spect)
	resized := utils.ResizeImage(img, 224, 224)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		f, _ := os.CreateTemp("", "bench_*.png")
		utils.WritePNG(resized, f)
		f.Close()
		os.Remove(f.Name())
	}
}

// ==================== Full pipeline ====================

func BenchmarkFullPipelineGray224(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
		outputSR := sr
		if sr > 16000 {
			segSamples = utils.ResampleRate(segSamples, sr, 16000)
			outputSR = 16000
		}
		cfg := utils.DefaultSpectrogramConfig(outputSR)
		spect := utils.GenerateSpectrogram(segSamples, cfg)
		img := utils.CreateGrayscaleImage(spect)
		resized := utils.ResizeImage(img, 224, 224)
		f, _ := os.CreateTemp("", "bench_*.png")
		utils.WritePNG(resized, f)
		f.Close()
		os.Remove(f.Name())
		utils.WriteWAVFile(f.Name(), segSamples, outputSR)
		os.Remove(f.Name())
		_ = resized
	}
}

func BenchmarkFullPipelineColor448(b *testing.B) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
		outputSR := sr
		if sr > 16000 {
			segSamples = utils.ResampleRate(segSamples, sr, 16000)
			outputSR = 16000
		}
		cfg := utils.DefaultSpectrogramConfig(outputSR)
		spect := utils.GenerateSpectrogram(segSamples, cfg)
		colorData := utils.ApplyL4Colormap(spect)
		img := utils.CreateRGBImage(colorData)
		resized := utils.ResizeImage(img, 448, 448)
		f, _ := os.CreateTemp("", "bench_*.png")
		utils.WritePNG(resized, f)
		f.Close()
		os.Remove(f.Name())
		utils.WriteWAVFile(f.Name(), segSamples, outputSR)
		os.Remove(f.Name())
		_ = resized
	}
}

// ==================== Data dimension report ====================

func TestPipelineDimensions(t *testing.T) {
	samples, sr, _ := utils.ReadWAVSamples(benchWAV)
	segSamples := utils.ExtractSegmentSamples(samples, sr, 872, 895)
	t.Logf("Input: %d samples, sr=%d, segment=%d samples (%.1fs)",
		len(samples), sr, len(segSamples), float64(len(segSamples))/float64(sr))
	cfg := utils.DefaultSpectrogramConfig(16000)
	numFrames := (len(segSamples)-cfg.WindowSize)/cfg.HopSize + 1
	numBins := cfg.WindowSize/2 + 1
	t.Logf("Spectrogram: %d freq bins x %d time frames = %d values",
		numBins, numFrames, numBins*numFrames)
	spect := utils.GenerateSpectrogram(segSamples, cfg)
	t.Logf("Output: %d x %d (freq x time)", len(spect), len(spect[0]))
	img := utils.CreateGrayscaleImage(spect)
	t.Logf("Grayscale image: %dx%d pixels, %d bytes",
		img.Bounds().Dx(), img.Bounds().Dy(), img.Bounds().Dx()*img.Bounds().Dy())
	resized := utils.ResizeImage(img, 224, 224)
	t.Logf("Resized 224: %dx%d", resized.Bounds().Dx(), resized.Bounds().Dy())
	resized448 := utils.ResizeImage(img, 448, 448)
	t.Logf("Resized 448: %dx%d", resized448.Bounds().Dx(), resized448.Bounds().Dy())
}
package callsimport ("fmt""image""math""os""path/filepath""runtime""strings""sync""skraak/utils")// CallsClipInput defines the input for the clip tooltype CallsClipInput struct {File string `json:"file"`Folder string `json:"folder"`Output string `json:"output"`Prefix string `json:"prefix"`Filter string `json:"filter"`Species string `json:"species"`Certainty int `json:"certainty"`Size int `json:"size"`Color bool `json:"color"`Night bool `json:"night"`Day bool `json:"day"`Location string `json:"location,omitempty"`}// CallsClipOutput defines the output for the clip tooltype CallsClipOutput struct {FilesProcessed int `json:"files_processed"`SegmentsClipped int `json:"segments_clipped"`NightSkipped int `json:"night_skipped,omitempty"`DaySkipped int `json:"day_skipped,omitempty"`OutputFiles []string `json:"output_files"`Errors []string `json:"errors,omitempty"`}// CallsClip processes .data files and generates audio/image clips for matching segmentsfunc CallsClip(input CallsClipInput) (CallsClipOutput, error) {var output CallsClipOutput// Validate required flagsif err := validateClipInput(&output, input); err != nil {return output, err}// Parse species+calltypespeciesName, callType := utils.ParseSpeciesCallType(input.Species)// Get list of .data filesfilePaths, err := resolveClipFiles(&output, input)if err != nil {return output, err}// Create output folder if it doesn't existif err := os.MkdirAll(input.Output, 0755); err != nil {output.Errors = append(output.Errors, fmt.Sprintf("failed to create output folder: %v", err))return output, err}// Clamp image size to valid rangeimgSize := utils.ClampImageSize(input.Size)// Parse location into lat/lng/timezonevar lat, lng float64var timezone stringif input.Location != "" {var err errorlat, lng, timezone, err = utils.ParseLocation(input.Location)if err != nil {output.Errors = append(output.Errors, err.Error())return output, err}}// Process .data files (parallel for larger batches)if len(filePaths) <= 2 {processFilesSequential(&output, filePaths, input, speciesName, callType, imgSize, lat, lng, timezone)} else {processFilesParallel(&output, filePaths, input, speciesName, callType, imgSize, lat, lng, timezone)}return output, nil}// validateClipInput validates required flags for clip generation.func validateClipInput(output *CallsClipOutput, input CallsClipInput) error {if input.File == "" && input.Folder == "" {output.Errors = append(output.Errors, "either --file or --folder is required")return fmt.Errorf("missing required flag: --file or --folder")}if input.Output == "" {output.Errors = append(output.Errors, "--output is required")return fmt.Errorf("missing required flag: --output")}if input.Prefix == "" {output.Errors = append(output.Errors, "--prefix is required")return fmt.Errorf("missing required flag: --prefix")}return nil}// resolveClipFiles returns the list of .data file paths from input.func resolveClipFiles(output *CallsClipOutput, input CallsClipInput) ([]string, error) {if input.File != "" {return []string{input.File}, nil}filePaths, err := utils.FindDataFiles(input.Folder)if err != nil {output.Errors = append(output.Errors, fmt.Sprintf("failed to find .data files: %v", err))return nil, err}if len(filePaths) == 0 {output.Errors = append(output.Errors, "no .data files found")return nil, fmt.Errorf("no .data files found")}return filePaths, nil}// processFilesSequential processes .data files one at a time.func processFilesSequential(output *CallsClipOutput, filePaths []string, input CallsClipInput, speciesName, callType string, imgSize int, lat, 
	lng float64, timezone string) {
	for _, dataPath := range filePaths {
		clips, skipped, errs := processFile(dataPath, input.Output, input.Prefix, input.Filter, speciesName, callType, input.Certainty, imgSize, input.Color, input.Night, input.Day, lat, lng, timezone)
		accumulateFileResult(output, clips, skipped, errs, input.Night)
	}
}

// processFilesParallel processes .data files using worker goroutines.
func processFilesParallel(output *CallsClipOutput, filePaths []string, input CallsClipInput, speciesName, callType string, imgSize int, lat, lng float64, timezone string) {
	type fileResult struct {
		clips   []string
		skipped int
		errs    []string
	}
	workers := min(runtime.NumCPU(), 8, len(filePaths))
	jobs := make(chan string, len(filePaths))
	results := make(chan fileResult, len(filePaths))
	var wg sync.WaitGroup
	for range workers {
		wg.Go(func() {
			for dataPath := range jobs {
				clips, skipped, errs := processFile(dataPath, input.Output, input.Prefix, input.Filter, speciesName, callType, input.Certainty, imgSize, input.Color, input.Night, input.Day, lat, lng, timezone)
				results <- fileResult{clips: clips, skipped: skipped, errs: errs}
			}
		})
	}
	for _, dataPath := range filePaths {
		jobs <- dataPath
	}
	close(jobs)
	go func() {
		wg.Wait()
		close(results)
	}()
	for r := range results {
		accumulateFileResult(output, r.clips, r.skipped, r.errs, input.Night)
	}
}

// accumulateFileResult merges a single file's results into the output.
func accumulateFileResult(output *CallsClipOutput, clips []string, skipped int, errs []string, night bool) {
	output.SegmentsClipped += len(clips)
	if night {
		output.NightSkipped += skipped
	} else {
		output.DaySkipped += skipped
	}
	output.OutputFiles = append(output.OutputFiles, clips...)
	output.Errors = append(output.Errors, errs...)
	if len(clips) > 0 || len(errs) == 0 {
		output.FilesProcessed++
	}
}

// processFile processes a single .data file and returns generated clips, time-filter-skipped count, and errors
func processFile(dataPath, outputDir, prefix, filter, speciesName, callType string, certainty, imgSize int, color, night, day bool, lat, lng float64, timezone string) ([]string, int, []string) {
	var clips []string
	var errors []string

	// Parse .data file
	dataFile, err := utils.ParseDataFile(dataPath)
	if err != nil {
		errors = append(errors, fmt.Sprintf("%s: failed to parse: %v", dataPath, err))
		return nil, 0, errors
	}

	// Get WAV basename (without path and extensions)
	wavPath := filepath.Clean(strings.TrimSuffix(dataPath, ".data"))
	basename := filepath.Base(wavPath)
	basename = strings.TrimSuffix(basename, filepath.Ext(basename))

	// Filter segments
	matchingSegments := filterSegments(dataFile.Segments, filter, speciesName, callType, certainty)
	if len(matchingSegments) == 0 {
		return nil, 0, nil
	}

	// Day/night filter: check WAV header only (cheaper than reading full audio).
	if night || day {
		skipped, err := checkDayNightFilter(wavPath, night, day, lat, lng, timezone)
		if err != nil || skipped {
			if skipped {
				return nil, 1, nil
			}
			return nil, 0, nil
		}
	}

	// Read WAV samples once
	samples, sampleRate, err := utils.ReadWAVSamples(wavPath)
	if err != nil {
		errors = append(errors, fmt.Sprintf("%s: failed to read WAV: %v", dataPath, err))
		return nil, 0, errors
	}

	// Process matching segments
	clips, errors = processSegments(matchingSegments, dataPath, samples, sampleRate, outputDir, prefix, basename, imgSize, color)
	return clips, 0, errors
}

// filterSegments returns segments matching the given filter criteria.
func filterSegments(segments []*utils.Segment, filter, speciesName, callType string, certainty int) []*utils.Segment {
	var matching []*utils.Segment
	for _, seg := range segments {
		if seg.SegmentMatchesFilters(filter, speciesName, callType, certainty) {
			matching = append(matching, seg)
		}
	}
	return matching
}

// checkDayNightFilter applies day/night filtering. Returns (skipped=true, nil) if the
// recording should be skipped, (false, nil) if it passes, or (false, err) on failure.
func checkDayNightFilter(wavPath string, night, day bool, lat, lng float64, timezone string) (bool, error) {
	result, err := IsNight(IsNightInput{
		FilePath: wavPath,
		Lat:      lat,
		Lng:      lng,
		Timezone: timezone,
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "warning: skipping %s (isnight error: %v)\n", wavPath, err)
		return false, err
	}
	if night && !result.SolarNight {
		fmt.Fprintf(os.Stderr, "skipped (daytime): %s\n", wavPath)
		return true, nil
	}
	if day && !result.DiurnalActive {
		fmt.Fprintf(os.Stderr, "skipped (nighttime): %s\n", wavPath)
		return true, nil
	}
	return false, nil
}

// processSegments generates clips for matching segments, using parallel processing for larger batches.
func processSegments(segments []*utils.Segment, dataPath string, samples []float64, sampleRate int, outputDir, prefix, basename string, imgSize int, color bool) ([]string, []string) {
	var clips []string
	var errors []string
	if len(segments) <= 2 {
		for _, seg := range segments {
			clipFiles, err := generateClip(samples, sampleRate, outputDir, prefix, basename, seg.StartTime, seg.EndTime, imgSize, color)
			if err != nil {
				errors = append(errors, fmt.Sprintf("%s: segment %.0f-%.0f: %v", dataPath, seg.StartTime, seg.EndTime, err))
				continue
			}
			clips = append(clips, clipFiles...)
		}
	} else {
		clips, errors = processSegmentsParallel(segments, dataPath, samples, sampleRate, outputDir, prefix, basename, imgSize, color)
	}
	return clips, errors
}

// processSegmentsParallel generates clips for segments using worker goroutines.
func processSegmentsParallel(segments []*utils.Segment, dataPath string, samples []float64, sampleRate int, outputDir, prefix, basename string, imgSize int, color bool) ([]string, []string) {
	type segResult struct {
		clips []string
		err   string
	}
	workers := min(runtime.NumCPU(), len(segments))
	jobs := make(chan *utils.Segment, len(segments))
	results := make(chan segResult, len(segments))
	var wg sync.WaitGroup
	for range workers {
		wg.Go(func() {
			for seg := range jobs {
				clipFiles, err := generateClip(samples, sampleRate, outputDir, prefix, basename, seg.StartTime, seg.EndTime, imgSize, color)
				if err != nil {
					results <- segResult{err: fmt.Sprintf("%s: segment %.0f-%.0f: %v", dataPath, seg.StartTime, seg.EndTime, err)}
				} else {
					results <- segResult{clips: clipFiles}
				}
			}
		})
	}
	for _, seg := range segments {
		jobs <- seg
	}
	close(jobs)
	go func() {
		wg.Wait()
		close(results)
	}()
	var clips []string
	var errors []string
	for r := range results {
		if r.err != "" {
			errors = append(errors, r.err)
		} else {
			clips = append(clips, r.clips...)
		}
	}
	return clips, errors
}

// generateClip generates PNG and WAV files for a segment
func generateClip(samples []float64, sampleRate int, outputDir, prefix, basename string, startTime, endTime float64, imgSize int, color bool) ([]string, error) {
	var files []string

	// Calculate integer times for filename
	startInt := int(math.Floor(startTime))
	endInt := int(math.Ceil(endTime))

	// Build base filename
	baseName := fmt.Sprintf("%s_%s_%d_%d", prefix, basename, startInt, endInt)
	wavPath := filepath.Join(outputDir, baseName+".wav")

	// Extract segment samples
	segSamples := utils.ExtractSegmentSamples(samples, sampleRate, startTime, endTime)
	if len(segSamples) == 0 {
		return nil, fmt.Errorf("no samples in segment")
	}

	// Determine output sample rate (downsample if > 16kHz)
	outputSampleRate := sampleRate
	if sampleRate > utils.DefaultMaxSampleRate {
		segSamples = utils.ResampleRate(segSamples, sampleRate, utils.DefaultMaxSampleRate)
		outputSampleRate = utils.DefaultMaxSampleRate
	}

	pngPath := filepath.Join(outputDir, baseName+".png")
	spectSampleRate := outputSampleRate
	config := utils.DefaultSpectrogramConfig(spectSampleRate)
	spectrogram := utils.GenerateSpectrogram(segSamples, config)
	if spectrogram == nil {
		return nil, fmt.Errorf("failed to generate spectrogram")
	}

	// Create image (grayscale or color)
	var img image.Image
	if color {
		colorData := utils.ApplyL4Colormap(spectrogram)
		img = utils.CreateRGBImage(colorData)
	} else {
		img = utils.CreateGrayscaleImage(spectrogram)
	}
	if img == nil {
		return nil, fmt.Errorf("failed to create image")
	}
	resized := utils.ResizeImage(img, imgSize, imgSize)

	// Write PNG (O_EXCL fails atomically if file exists)
	pngFile, err := os.OpenFile(pngPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0644)
	if err != nil {
		if os.IsExist(err) {
			return nil, fmt.Errorf("file already exists: %s", pngPath)
		}
		return nil, fmt.Errorf("failed to create PNG: %w", err)
	}
	if err := utils.WritePNG(resized, pngFile); err != nil {
		_ = pngFile.Close()
		return nil, fmt.Errorf("failed to write PNG: %w", err)
	}
	if err := pngFile.Close(); err != nil {
		return nil, fmt.Errorf("failed to close PNG: %w", err)
	}
	files = append(files, pngPath)

	// Write WAV
	if err := utils.WriteWAVFile(wavPath, segSamples, outputSampleRate); err != nil {
		return nil, fmt.Errorf("failed to write WAV: %w", err)
	}
	files = append(files, wavPath)

	return files, nil
}
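The PNG write above relies on O_EXCL to make "create only if absent" atomic between concurrent workers. A minimal standalone sketch of the same pattern; writeIfAbsent and the demo filename are illustrative, not part of the package:

// Sketch of the O_EXCL create-then-write pattern used by generateClip.
// writeIfAbsent is a hypothetical helper for illustration only.
package main

import (
	"fmt"
	"os"
)

func writeIfAbsent(path string, data []byte) error {
	// O_EXCL makes creation fail if the file already exists, so two
	// concurrent workers can never both write the same output file.
	f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0644)
	if err != nil {
		if os.IsExist(err) {
			return fmt.Errorf("file already exists: %s", path)
		}
		return err
	}
	if _, err := f.Write(data); err != nil {
		_ = f.Close()
		return err
	}
	return f.Close()
}

func main() {
	if err := writeIfAbsent("clip.png", []byte("demo")); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}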
package calls

import (
	"testing"

	"skraak/utils"
)

// NewClassifyState builds a ClassifyState directly from in-memory data files,
// mirroring the segment filtering that LoadDataFiles performs (test helper).
func NewClassifyState(config ClassifyConfig, dataFiles []*utils.DataFile) *ClassifyState {
	hasFilter := config.Filter != "" || config.Species != "" || config.Certainty >= 0
	cached := make([][]*utils.Segment, len(dataFiles))
	for i, df := range dataFiles {
		if !hasFilter {
			cached[i] = df.Segments
		} else {
			for _, seg := range df.Segments {
				if seg.SegmentMatchesFilters(config.Filter, config.Species, config.CallType, config.Certainty) {
					cached[i] = append(cached[i], seg)
				}
			}
		}
	}
	total := 0
	for _, segs := range cached {
		total += len(segs)
	}
	return &ClassifyState{
		Config:       config,
		DataFiles:    dataFiles,
		filteredSegs: cached,
		totalSegs:    total,
	}
}

func TestParseKeyBuffer(t *testing.T) {
	bindings := []KeyBinding{
		{Key: "k", Species: "Kiwi"},
		{Key: "d", Species: "Kiwi", CallType: "Duet"},
		{Key: "n", Species: "Don't Know"},
		{Key: "p", Species: "Morepork"},
	}
	state := NewClassifyState(ClassifyConfig{Bindings: bindings, Certainty: -1}, nil)

	tests := []struct {
		key     string
		want    *BindingResult
		wantNil bool
	}{
		{"k", &BindingResult{Species: "Kiwi"}, false},
		{"d", &BindingResult{Species: "Kiwi", CallType: "Duet"}, false},
		{"n", &BindingResult{Species: "Don't Know"}, false},
		{"p", &BindingResult{Species: "Morepork"}, false},
		{"x", nil, true}, // unknown key
	}
	for _, tt := range tests {
		got := state.ParseKeyBuffer(tt.key)
		if tt.wantNil {
			if got != nil {
				t.Errorf("ParseKeyBuffer(%q) = %v, want nil", tt.key, got)
			}
		} else {
			if got == nil {
				t.Errorf("ParseKeyBuffer(%q) = nil, want %+v", tt.key, tt.want)
				continue
			}
			if got.Species != tt.want.Species {
				t.Errorf("ParseKeyBuffer(%q).Species = %q, want %q", tt.key, got.Species, tt.want.Species)
			}
			if got.CallType != tt.want.CallType {
				t.Errorf("ParseKeyBuffer(%q).CallType = %q, want %q", tt.key, got.CallType, tt.want.CallType)
			}
		}
	}
}

func TestApplyBinding(t *testing.T) {
	bindings := []KeyBinding{
		{Key: "k", Species: "Kiwi"},
		{Key: "n", Species: "Don't Know"},
		{Key: "d", Species: "Kiwi", CallType: "Duet"},
	}
	df := &utils.DataFile{
		Meta: &utils.DataMeta{},
		Segments: []*utils.Segment{
			{
				StartTime: 10.0,
				EndTime:   20.0,
				Labels: []*utils.Label{
					{Species: "Unknown", Certainty: 50, Filter: "test-filter", CallType: "OldType"},
				},
			},
		},
	}
	state := NewClassifyState(ClassifyConfig{
		Filter:    "test-filter",
		Reviewer:  "David",
		Bindings:  bindings,
		Certainty: -1,
	}, []*utils.DataFile{df})

	// Apply "k" = Kiwi (no calltype, should remove existing calltype)
	result := &BindingResult{Species: "Kiwi"}
	state.ApplyBinding(result)

	// Check label was updated
	if len(df.Segments[0].Labels) != 1 {
		t.Errorf("expected 1 label, got %d", len(df.Segments[0].Labels))
	}
	if df.Segments[0].Labels[0].Species != "Kiwi" {
		t.Errorf("expected Species=Kiwi, got %s", df.Segments[0].Labels[0].Species)
	}
	if df.Segments[0].Labels[0].Certainty != 100 {
		t.Errorf("expected Certainty=100, got %d", df.Segments[0].Labels[0].Certainty)
	}
	if df.Segments[0].Labels[0].CallType != "" {
		t.Errorf("expected CallType='', got %s (should be removed)", df.Segments[0].Labels[0].CallType)
	}
	if df.Meta.Reviewer != "David" {
		t.Errorf("expected Reviewer=David, got %s", df.Meta.Reviewer)
	}

	// Apply "d" = Kiwi/Duet (should set calltype)
	result = &BindingResult{Species: "Kiwi", CallType: "Duet"}
	state.ApplyBinding(result)
	if df.Segments[0].Labels[0].CallType != "Duet" {
		t.Errorf("expected CallType=Duet, got %s", df.Segments[0].Labels[0].CallType)
	}

	// Apply "n" = Don't Know (certainty should be 0)
	result = &BindingResult{Species: "Don't Know"}
	state.ApplyBinding(result)
	if df.Segments[0].Labels[0].Species != "Don't Know" {
		t.Errorf("expected Species=Don't Know, got %s", df.Segments[0].Labels[0].Species)
	}
	if df.Segments[0].Labels[0].Certainty != 0 {
		t.Errorf("expected Certainty=0 for Don't Know, got %d", df.Segments[0].Labels[0].Certainty)
	}
}

func TestApplyBindingCallTypeRemoval(t *testing.T) {
	bindings := []KeyBinding{
		{Key: "k", Species: "Kiwi"}, // no calltype
	}
	df := &utils.DataFile{
		Meta: &utils.DataMeta{},
		Segments: []*utils.Segment{
			{
				StartTime: 10.0,
				EndTime:   20.0,
				Labels: []*utils.Label{
					{Species: "Kiwi", Certainty: 100, Filter: "test-filter", CallType: "Male"},
				},
			},
		},
	}
	state := NewClassifyState(ClassifyConfig{
		Filter:    "test-filter",
		Reviewer:  "David",
		Bindings:  bindings,
		Certainty: -1,
	}, []*utils.DataFile{df})

	// Apply "k" = Kiwi (should remove Male calltype)
	result := &BindingResult{Species: "Kiwi"}
	state.ApplyBinding(result)
	if df.Segments[0].Labels[0].CallType != "" {
		t.Errorf("expected CallType='', got %s (should be removed)", df.Segments[0].Labels[0].CallType)
	}
}

func TestConfirmLabelDontKnow(t *testing.T) {
	df := &utils.DataFile{
		Meta: &utils.DataMeta{},
		Segments: []*utils.Segment{
			{
				StartTime: 10.0,
				EndTime:   20.0,
				Labels: []*utils.Label{
					{Species: "Don't Know", Certainty: 0, Filter: "test-filter"},
				},
			},
		},
	}
	state := NewClassifyState(ClassifyConfig{
		Filter:    "test-filter",
		Reviewer:  "David",
		Certainty: -1,
	}, []*utils.DataFile{df})

	// ConfirmLabel on Don't Know should be a no-op
	if state.ConfirmLabel() {
		t.Error("ConfirmLabel() should return false for Don't Know (certainty=0)")
	}
	label := df.Segments[0].Labels[0]
	if label.Species != "Don't Know" {
		t.Errorf("Species should remain Don't Know, got %s", label.Species)
	}
	if label.Certainty != 0 {
		t.Errorf("Certainty should remain 0, got %d", label.Certainty)
	}
	if state.Dirty {
		t.Error("State should not be dirty after confirming Don't Know")
	}
}
package calls

import (
	"os"
	"path/filepath"
	"testing"
)

// writeDataFileContent creates a .data file in dir with the given raw content.
func writeDataFileContent(t *testing.T, dir, name, content string) {
	t.Helper()
	if err := os.WriteFile(filepath.Join(dir, name), []byte(content), 0644); err != nil {
		t.Fatal(err)
	}
}

// mustLoadDataFiles is a test helper that calls LoadDataFiles and fatals on error.
func mustLoadDataFiles(t *testing.T, config ClassifyConfig) *ClassifyState {
	t.Helper()
	state, err := LoadDataFiles(config)
	if err != nil {
		t.Fatal(err)
	}
	return state
}

// assertFileSegCounts checks file count and total segment count match expected values.
func assertFileSegCounts(t *testing.T, state *ClassifyState, wantFiles, wantSegs int, label string) {
	t.Helper()
	if len(state.DataFiles) != wantFiles {
		t.Errorf("%s: expected %d files, got %d", label, wantFiles, len(state.DataFiles))
	}
	if state.TotalSegments() != wantSegs {
		t.Errorf("%s: expected %d segments total, got %d", label, wantSegs, state.TotalSegments())
	}
}

const (
	kiwiSeg   = `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]]]`
	tomtitSeg = `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Tomtit", "certainty": 90}]]]`
)

func TestLoadDataFilesFiltersFilesWithNoMatchingSegments(t *testing.T) {
	tempDir := t.TempDir()
	writeDataFileContent(t, tempDir, "file1.data", kiwiSeg)
	writeDataFileContent(t, tempDir, "file2.data", tomtitSeg)
	writeDataFileContent(t, tempDir, "file3.data", kiwiSeg)

	t.Run("no_filter", func(t *testing.T) {
		state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Certainty: -1})
		assertFileSegCounts(t, state, 3, 3, "No filter")
	})
	t.Run("species_kiwi", func(t *testing.T) {
		state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1})
		assertFileSegCounts(t, state, 2, 2, "Species=Kiwi")
	})
	t.Run("species_tomtit", func(t *testing.T) {
		state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "Tomtit", Certainty: -1})
		assertFileSegCounts(t, state, 1, 1, "Species=Tomtit")
	})
	t.Run("species_nonexistent", func(t *testing.T) {
		state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "NonExistent", Certainty: -1})
		assertFileSegCounts(t, state, 0, 0, "Species=NonExistent")
	})
}

func TestLoadDataFilesWithMixedSegments(t *testing.T) {
	tempDir := t.TempDir()
	file := `[{"Operator": "test"},[0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]],[10, 20, 100, 1000, [{"species": "Tomtit", "certainty": 80}]],[20, 30, 100, 1000, [{"species": "Kiwi", "certainty": 95}]]]`
	writeDataFileContent(t, tempDir, "mixed.data", file)
	state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1})
	if len(state.DataFiles) != 1 {
		t.Errorf("Expected 1 file, got %d", len(state.DataFiles))
	}
	if state.TotalSegments() != 2 {
		t.Errorf("Species=Kiwi: expected 2 segments, got %d", state.TotalSegments())
	}
	// The DataFile should still have all 3 segments internally
	// but cached filtered segments should return only the Kiwi ones
	if len(state.DataFiles[0].Segments) != 3 {
		t.Errorf("DataFile should have 3 segments internally, got %d", len(state.DataFiles[0].Segments))
	}
	// TotalSegments uses cached filtered segments
	if state.TotalSegments() != 2 {
		t.Errorf("TotalSegments should return 2 Kiwi segments, got %d", state.TotalSegments())
	}
}

// Test that the original DataFile segments are not modified (immutable filtering)
func TestFilteringDoesNotModifyOriginalSegments(t *testing.T) {
	tempDir := t.TempDir()
	file := `[{"Operator": "test"},[0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]],[10, 20, 100, 1000, [{"species": "Tomtit", "certainty": 80}]]]`
	writeDataFileContent(t, tempDir, "test.data", file)
	state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Species: "Kiwi", Certainty: -1})

	// Original segments should be untouched
	originalSegments := state.DataFiles[0].Segments
	if len(originalSegments) != 2 {
		t.Errorf("Original should have 2 segments, got %d", len(originalSegments))
	}
	// Verify all original segments are preserved
	species := []string{}
	for _, seg := range originalSegments {
		if len(seg.Labels) > 0 {
			species = append(species, seg.Labels[0].Species)
		}
	}
	if len(species) != 2 || species[0] != "Kiwi" || species[1] != "Tomtit" {
		t.Errorf("Original segments should have both species, got %v", species)
	}
}

func TestLoadDataFilesCertaintyPruning(t *testing.T) {
	tempDir := t.TempDir()
	writeDataFileContent(t, tempDir, "file1.data", `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 70}]]]`)
	writeDataFileContent(t, tempDir, "file2.data", `[{"Operator": "test"}, [0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 100}]]]`)
	state := mustLoadDataFiles(t, ClassifyConfig{Folder: tempDir, Certainty: 100})
	assertFileSegCounts(t, state, 1, 1, "Certainty=100")

	// CurrentSegment should work (not nil) because file1 was pruned
	seg := state.CurrentSegment()
	if seg == nil {
		t.Error("CurrentSegment should not be nil after pruning")
	}
}
package calls

import (
	"math/rand"
	"testing"

	"skraak/utils"
)

func TestTotalSegmentsRespectsFilters(t *testing.T) {
	// Create test data files with different species and filters
	df1 := &utils.DataFile{
		FilePath: "/test/file1.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0"},
				},
			},
			{
				StartTime: 10,
				EndTime:   20,
				Labels: []*utils.Label{
					{Species: "Tomtit", Filter: "model-1.0"},
				},
			},
		},
	}
	df2 := &utils.DataFile{
		FilePath: "/test/file2.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0"},
				},
			},
		},
	}

	// Test 1: No filters - should count all segments (3)
	state1 := NewClassifyState(ClassifyConfig{Certainty: -1}, []*utils.DataFile{df1, df2})
	if got := state1.TotalSegments(); got != 3 {
		t.Errorf("No filters: expected 3 segments, got %d", got)
	}
	// Test 2: Filter by species "Kiwi" - should count only Kiwi segments (2)
	state2 := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df1, df2})
	if got := state2.TotalSegments(); got != 2 {
		t.Errorf("Species=Kiwi: expected 2 segments, got %d", got)
	}
	// Test 3: Filter by species "Tomtit" - should count only Tomtit segments (1)
	state3 := NewClassifyState(ClassifyConfig{Species: "Tomtit", Certainty: -1}, []*utils.DataFile{df1, df2})
	if got := state3.TotalSegments(); got != 1 {
		t.Errorf("Species=Tomtit: expected 1 segment, got %d", got)
	}
	// Test 4: Filter by filter name "model-1.0" - should count all segments (3)
	state4 := NewClassifyState(ClassifyConfig{Filter: "model-1.0", Certainty: -1}, []*utils.DataFile{df1, df2})
	if got := state4.TotalSegments(); got != 3 {
		t.Errorf("Filter=model-1.0: expected 3 segments, got %d", got)
	}
	// Test 5: Filter by non-existent species - should count 0
	state5 := NewClassifyState(ClassifyConfig{Species: "NonExistent", Certainty: -1}, []*utils.DataFile{df1, df2})
	if got := state5.TotalSegments(); got != 0 {
		t.Errorf("Species=NonExistent: expected 0 segments, got %d", got)
	}
	// Test 6: Combined filter + species
	df3 := &utils.DataFile{
		FilePath: "/test/file3.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0", CallType: "Duet"},
				},
			},
			{
				StartTime: 10,
				EndTime:   20,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-2.0", CallType: "Male"},
				},
			},
		},
	}
	state6 := NewClassifyState(ClassifyConfig{Filter: "model-1.0", Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df3})
	if got := state6.TotalSegments(); got != 1 {
		t.Errorf("Filter=model-1.0 + Species=Kiwi: expected 1 segment, got %d", got)
	}
}

func TestCurrentSegmentNumberWithFilters(t *testing.T) {
	// Create test data files
	df1 := &utils.DataFile{
		FilePath: "/test/file1.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0"},
				},
			},
			{
				StartTime: 10,
				EndTime:   20,
				Labels: []*utils.Label{
					{Species: "Tomtit", Filter: "model-1.0"},
				},
			},
		},
	}
	df2 := &utils.DataFile{
		FilePath: "/test/file2.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0"},
				},
			},
		},
	}

	// Test: Filter by species "Kiwi", at file 2, segment 0
	// Should report current segment as 2 (first Kiwi in df1 + first Kiwi in df2)
	state := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df1, df2})
	state.FileIdx = 1 // at df2
	state.SegmentIdx = 0
	if got := state.CurrentSegmentNumber(); got != 2 {
		t.Errorf("Species=Kiwi, at file 2, seg 0: expected current segment 2, got %d", got)
	}
}

func TestCertaintyFiltering(t *testing.T) {
	// Create test data files with different certainty levels
	df := &utils.DataFile{
		FilePath: "/test/file1.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0", Certainty: 70},
				},
			},
			{
				StartTime: 10,
				EndTime:   20,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0", Certainty: 100},
				},
			},
			{
				StartTime: 20,
				EndTime:   30,
				Labels: []*utils.Label{
					{Species: "Tomtit", Filter: "model-1.0", Certainty: 70},
				},
			},
		},
	}
	// Test 1: Filter by certainty 70 - should get 2 segments
	state1 := NewClassifyState(ClassifyConfig{Certainty: 70}, []*utils.DataFile{df})
	if got := state1.TotalSegments(); got != 2 {
		t.Errorf("Certainty=70: expected 2 segments, got %d", got)
	}
	// Test 2: Filter by certainty 100 - should get 1 segment
	state2 := NewClassifyState(ClassifyConfig{Certainty: 100}, []*utils.DataFile{df})
	if got := state2.TotalSegments(); got != 1 {
		t.Errorf("Certainty=100: expected 1 segment, got %d", got)
	}
	// Test 3: Filter by certainty 0 - should get 0 segments
	state3 := NewClassifyState(ClassifyConfig{Certainty: 0}, []*utils.DataFile{df})
	if got := state3.TotalSegments(); got != 0 {
		t.Errorf("Certainty=0: expected 0 segments, got %d", got)
	}
	// Test 4: Combined species + certainty
	state4 := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: 70}, []*utils.DataFile{df})
	if got := state4.TotalSegments(); got != 1 {
		t.Errorf("Species=Kiwi + Certainty=70: expected 1 segment, got %d", got)
	}
}

func TestSampling(t *testing.T) {
	makeSegs := func(n int) []*utils.Segment {
		s := make([]*utils.Segment, n)
		for i := range s {
			s[i] = &utils.Segment{StartTime: float64(i), EndTime: float64(i + 1)}
		}
		return s
	}
	df1 := &utils.DataFile{FilePath: "/test/f1.data", Segments: makeSegs(6)}
	df2 := &utils.DataFile{FilePath: "/test/f2.data", Segments: makeSegs(4)}
	kept := []*utils.DataFile{df1, df2}
	cached := [][]*utils.Segment{df1.Segments, df2.Segments}
	countTotal := func(c [][]*utils.Segment) int {
		n := 0
		for _, s := range c {
			n += len(s)
		}
		return n
	}

	// 50% of 10 → 5
	k, c := applySampling(kept, cached, 50, rand.New(rand.NewSource(42)))
	if got := countTotal(c); got != 5 {
		t.Errorf("sample 50%%: expected 5, got %d", got)
	}
	// Files must be in original chronological order
	for i := 1; i < len(k); i++ {
		if k[i].FilePath < k[i-1].FilePath {
			t.Errorf("sample 50%%: files out of order at index %d", i)
		}
	}
	// 10% of 10 → 1
	_, c2 := applySampling(kept, cached, 10, rand.New(rand.NewSource(42)))
	if got := countTotal(c2); got != 1 {
		t.Errorf("sample 10%%: expected 1, got %d", got)
	}
	// 1% of 10 → clamp to 1
	_, c3 := applySampling(kept, cached, 1, rand.New(rand.NewSource(42)))
	if got := countTotal(c3); got != 1 {
		t.Errorf("sample 1%%: expected 1 (clamped), got %d", got)
	}
	// 99% of 10 → 9
	_, c4 := applySampling(kept, cached, 99, rand.New(rand.NewSource(42)))
	if got := countTotal(c4); got != 9 {
		t.Errorf("sample 99%%: expected 9, got %d", got)
	}
}

func TestCertaintyPruning(t *testing.T) {
	// Simulate the bug: first file has no matching certainty segments
	df1 := &utils.DataFile{
		FilePath: "/test/file1.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0", Certainty: 70},
				},
			},
		},
	}
	df2 := &utils.DataFile{
		FilePath: "/test/file2.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0", Certainty: 100},
				},
			},
		},
	}
	// Without pruning (old bug): file1 is first, has no certainty=100 segments
	// CurrentSegment() would return nil even though TotalSegments() > 0
	state := NewClassifyState(ClassifyConfig{Certainty: 100}, []*utils.DataFile{df1, df2})
	// TotalSegments should be 1 (only file2 has certainty 100)
	if got := state.TotalSegments(); got != 1 {
		t.Errorf("Certainty=100: expected 1 segment, got %d", got)
	}
	// CurrentSegment should work if files are properly pruned
	// Note: this test assumes LoadDataFiles does the pruning
	// Here we test the state after manual construction
}

func TestCallTypeNoneFiltering(t *testing.T) {
	// Create test data: Kiwi with calltype, Kiwi without, Tomtit without
	df := &utils.DataFile{
		FilePath: "/test/file1.data",
		Segments: []*utils.Segment{
			{
				StartTime: 0,
				EndTime:   10,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0", CallType: "Male"},
				},
			},
			{
				StartTime: 10,
				EndTime:   20,
				Labels: []*utils.Label{
					{Species: "Kiwi", Filter: "model-1.0"}, // no calltype
				},
			},
			{
				StartTime: 20,
				EndTime:   30,
				Labels: []*utils.Label{
					{Species: "Tomtit", Filter: "model-1.0"}, // no calltype, wrong species
				},
			},
		},
	}
	// Test 1: --species Kiwi+_ should match only Kiwi with no calltype (1 segment)
	state1 := NewClassifyState(ClassifyConfig{Species: "Kiwi", CallType: utils.CallTypeNone, Certainty: -1}, []*utils.DataFile{df})
	if got := state1.TotalSegments(); got != 1 {
		t.Errorf("Species=Kiwi+_: expected 1 segment, got %d", got)
	}
	// Test 2: --species Kiwi should still match all Kiwi (2 segments)
	state2 := NewClassifyState(ClassifyConfig{Species: "Kiwi", Certainty: -1}, []*utils.DataFile{df})
	if got := state2.TotalSegments(); got != 2 {
		t.Errorf("Species=Kiwi: expected 2 segments, got %d", got)
	}
	// Test 3: --species Kiwi+Male should still work as before (1 segment)
	state3 := NewClassifyState(ClassifyConfig{Species: "Kiwi", CallType: "Male", Certainty: -1}, []*utils.DataFile{df})
	if got := state3.TotalSegments(); got != 1 {
		t.Errorf("Species=Kiwi+Male: expected 1 segment, got %d", got)
	}
}
package calls

import (
	"fmt"
	"math/rand"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strings"
	"time"

	"skraak/utils"
)

// KeyBinding maps a key to a species/calltype
type KeyBinding struct {
	Key      string // single char: "k", "n", "p"
	Species  string // "Kiwi", "Don't Know", "Morepork"
	CallType string // "Duet", "Female", "Male" (optional)
}

// ClassifyConfig holds the configuration for classification
type ClassifyConfig struct {
	Folder    string
	File      string
	Filter    string
	Species   string // scope to this species (optional)
	CallType  string // scope to this calltype within species (optional)
	Certainty int    // scope to this certainty value, -1 = no filter (optional)
	Sample    int    // random sample percentage 1-99, -1 = no sampling, 100 = no-op
	Goto      string // goto this file on startup (optional, basename match)
	Reviewer  string
	Color     bool
	ImageSize int // spectrogram display size in pixels (0 = default)
	Sixel     bool
	ITerm     bool
	Bindings  []KeyBinding
	// SecondaryBindings maps a primary binding key to per-species calltype
	// keys. Invoked via Shift+primary-key: the species is labeled without
	// advancing, and the next key is interpreted as a calltype.
	SecondaryBindings map[string]map[string]string
	Night             bool
	Day               bool
	Lat               float64
	Lng               float64
	Timezone          string
}

// ClassifyState holds the current state for TUI
type ClassifyState struct {
	Config            ClassifyConfig
	DataFiles         []*utils.DataFile
	filteredSegs      [][]*utils.Segment // cached at load time, parallel to DataFiles
	totalSegs         int                // pre-computed total segment count
	FileIdx           int
	SegmentIdx        int
	Dirty             bool
	Player            *utils.AudioPlayer
	PlaybackSpeed     float64 // Current playback speed (1.0 = normal, 0.5 = half speed)
	TimeFilteredCount int     // files skipped by --night or --day filter
}

// BindingResult represents parsed key result
type BindingResult struct {
	Species  string
	CallType string // empty string = remove calltype
}

// findDataFilePaths resolves the list of .data file paths from config.
func findDataFilePaths(config ClassifyConfig) ([]string, error) {
	if config.File != "" {
		return []string{config.File}, nil
	}
	paths, err := utils.FindDataFiles(config.Folder)
	if err != nil {
		return nil, fmt.Errorf("find data files: %w", err)
	}
	return paths, nil
}

// filterDataFileSegments applies segment and day/night filters to a single data file.
// Returns the filtered segments and whether the file should be kept.
// If the file is filtered out (no matching segments, or time-of-day), returns nil, false.
func filterDataFileSegments(df *utils.DataFile, config ClassifyConfig) ([]*utils.Segment, bool, int) {
	segs := filterSegmentsByLabel(df.Segments, config)
	if segs == nil {
		return nil, false, 0
	}
	timeFiltered := 0
	if config.Night || config.Day {
		keep, tf := filterByTimeOfDay(df.FilePath, config)
		if !keep {
			return nil, false, tf
		}
	}
	return segs, true, timeFiltered
}

// filterSegmentsByLabel applies label/species/certainty filters, returning matching segments.
// Returns nil if no segments match (caller should skip the file).
func filterSegmentsByLabel(segments []*utils.Segment, config ClassifyConfig) []*utils.Segment {
	hasFilter := config.Filter != "" || config.Species != "" || config.Certainty >= 0
	if !hasFilter {
		return segments
	}
	var segs []*utils.Segment
	for _, seg := range segments {
		if seg.SegmentMatchesFilters(config.Filter, config.Species, config.CallType, config.Certainty) {
			segs = append(segs, seg)
		}
	}
	return segs // nil if empty, caller treats as "skip"
}

// filterByTimeOfDay checks --night/--day time-of-day filter for a .data file.
// Returns (keep, timeFilteredCount).
func filterByTimeOfDay(dataFilePath string, config ClassifyConfig) (bool, int) {
	wavPath := filepath.Clean(strings.TrimSuffix(dataFilePath, ".data"))
	result, err := IsNight(IsNightInput{
		FilePath: wavPath,
		Lat:      config.Lat,
		Lng:      config.Lng,
		Timezone: config.Timezone,
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "warning: skipping %s (isnight error: %v)\n", wavPath, err)
		return false, 1
	}
	if config.Night && !result.SolarNight {
		return false, 1
	}
	if config.Day && !result.DiurnalActive {
		return false, 1
	}
	return true, 0
}

// LoadDataFiles loads all .data files for classification.
func LoadDataFiles(config ClassifyConfig) (*ClassifyState, error) {
	dataFiles, err := parseAndSortDataFiles(config)
	if err != nil {
		return nil, err
	}
	kept, cachedSegs, timeFiltered := filterDataFiles(dataFiles, config)
	if config.Sample > 0 && config.Sample < 100 {
		rng := rand.New(rand.NewSource(time.Now().UnixNano()))
		kept, cachedSegs = applySampling(kept, cachedSegs, config.Sample, rng)
	}
	return buildClassifyState(config, kept, cachedSegs, timeFiltered)
}

// parseAndSortDataFiles finds, parses, and sorts .data files from the config.
func parseAndSortDataFiles(config ClassifyConfig) ([]*utils.DataFile, error) {
	filePaths, err := findDataFilePaths(config)
	if err != nil {
		return nil, err
	}
	if len(filePaths) == 0 {
		return nil, fmt.Errorf("no .data files found")
	}
	var dataFiles []*utils.DataFile
	for _, path := range filePaths {
		df, err := utils.ParseDataFile(path)
		if err != nil {
			continue
		}
		dataFiles = append(dataFiles, df)
	}
	if len(dataFiles) == 0 {
		return nil, fmt.Errorf("no valid .data files")
	}
	sort.Slice(dataFiles, func(i, j int) bool {
		return dataFiles[i].FilePath < dataFiles[j].FilePath
	})
	return dataFiles, nil
}

// filterDataFiles applies segment filters to each data file, returning kept files and their segments.
func filterDataFiles(dataFiles []*utils.DataFile, config ClassifyConfig) ([]*utils.DataFile, [][]*utils.Segment, int) {
	var kept []*utils.DataFile
	var cachedSegs [][]*utils.Segment
	var timeFiltered int
	for _, df := range dataFiles {
		segs, keep, tf := filterDataFileSegments(df, config)
		timeFiltered += tf
		if !keep {
			continue
		}
		kept = append(kept, df)
		cachedSegs = append(cachedSegs, segs)
	}
	return kept, cachedSegs, timeFiltered
}

// buildClassifyState constructs the ClassifyState, handling --goto file positioning.
func buildClassifyState(config ClassifyConfig, dataFiles []*utils.DataFile, filteredSegs [][]*utils.Segment, timeFiltered int) (*ClassifyState, error) {
	total := 0
	for _, segs := range filteredSegs {
		total += len(segs)
	}
	state := &ClassifyState{
		Config:            config,
		DataFiles:         dataFiles,
		filteredSegs:      filteredSegs,
		totalSegs:         total,
		TimeFilteredCount: timeFiltered,
	}
	if config.Goto == "" {
		return state, nil
	}
	for i, df := range state.DataFiles {
		base := df.FilePath[strings.LastIndex(df.FilePath, "/")+1:]
		if base == config.Goto {
			state.FileIdx = i
			return state, nil
		}
	}
	return nil, fmt.Errorf("goto file not found (or has no matching segments): %s", config.Goto)
}

// applySampling randomly selects sample% of segments from the filtered set.
// The returned files and segments preserve the original chronological order.
func applySampling(kept []*utils.DataFile, cachedSegs [][]*utils.Segment, sample int, rng *rand.Rand) ([]*utils.DataFile, [][]*utils.Segment) {
	flat := make([]struct{ fileIdx, segIdx int }, 0)
	for fi, segs := range cachedSegs {
		for si := range segs {
			flat = append(flat, struct{ fileIdx, segIdx int }{fi, si})
		}
	}
	targetCount := max(len(flat)*sample/100, 1)
	rng.Shuffle(len(flat), func(i, j int) { flat[i], flat[j] = flat[j], flat[i] })
	selected := flat[:targetCount]
	// Restore chronological order before rebuilding
	sort.Slice(selected, func(i, j int) bool {
		if selected[i].fileIdx != selected[j].fileIdx {
			return selected[i].fileIdx < selected[j].fileIdx
		}
		return selected[i].segIdx < selected[j].segIdx
	})
	newCached := make([][]*utils.Segment, len(cachedSegs))
	for _, ref := range selected {
		newCached[ref.fileIdx] = append(newCached[ref.fileIdx], cachedSegs[ref.fileIdx][ref.segIdx])
	}
	var newKept []*utils.DataFile
	var finalCached [][]*utils.Segment
	for i, segs := range newCached {
		if len(segs) > 0 {
			newKept = append(newKept, kept[i])
			finalCached = append(finalCached, segs)
		}
	}
	return newKept, finalCached
}

// FilteredSegs returns the cached filtered segments parallel to DataFiles.
func (s *ClassifyState) FilteredSegs() [][]*utils.Segment {
	return s.filteredSegs
}

// CurrentFile returns the current data file
func (s *ClassifyState) CurrentFile() *utils.DataFile {
	if s.FileIdx >= len(s.DataFiles) {
		return nil
	}
	return s.DataFiles[s.FileIdx]
}

// CurrentSegment returns the current segment
func (s *ClassifyState) CurrentSegment() *utils.Segment {
	if s.FileIdx >= len(s.filteredSegs) {
		return nil
	}
	segs := s.filteredSegs[s.FileIdx]
	if s.SegmentIdx >= len(segs) {
		return nil
	}
	return segs[s.SegmentIdx]
}

// TotalSegments returns total segments to review
func (s *ClassifyState) TotalSegments() int {
	return s.totalSegs
}

// CurrentSegmentNumber returns 1-based segment number
func (s *ClassifyState) CurrentSegmentNumber() int {
	count := 0
	for i := 0; i < s.FileIdx; i++ {
		count += len(s.filteredSegs[i])
	}
	return count + s.SegmentIdx + 1
}

// NextSegment moves to the next segment, returns false if at end
func (s *ClassifyState) NextSegment() bool {
	if s.FileIdx >= len(s.filteredSegs) {
		return false
	}
	segs := s.filteredSegs[s.FileIdx]
	if s.SegmentIdx+1 < len(segs) {
		s.SegmentIdx++
		return true
	}
	// Move to next file
	if s.FileIdx+1 < len(s.DataFiles) {
		s.FileIdx++
		s.SegmentIdx = 0
		return true
	}
	return false
}

// PrevSegment moves to the previous segment, returns false if at start
func (s *ClassifyState) PrevSegment() bool {
	if s.SegmentIdx > 0 {
		s.SegmentIdx--
		return true
	}
	// Move to previous file
	if s.FileIdx > 0 {
		s.FileIdx--
		segs := s.filteredSegs[s.FileIdx]
		s.SegmentIdx = max(len(segs)-1, 0)
		return true
	}
	return false
}

// ParseKeyBuffer parses a single key into binding result
func (s *ClassifyState) ParseKeyBuffer(key string) *BindingResult {
	for _, b := range s.Config.Bindings {
		if b.Key == key {
			return &BindingResult{
				Species:  b.Species,
				CallType: b.CallType,
			}
		}
	}
	return nil
}

// SetComment sets the comment on the current segment's filter label.
// Returns the previous comment (for undo) or empty string if none.
func (s *ClassifyState) SetComment(comment string) string {
	seg := s.CurrentSegment()
	if seg == nil {
		return ""
	}
	df := s.CurrentFile()
	if df == nil {
		return ""
	}
	// Set reviewer
	df.Meta.Reviewer = s.Config.Reviewer
	// Get labels matching filter
	filterLabels := seg.GetFilterLabels(s.Config.Filter)
	var oldComment string
	if len(filterLabels) == 0 {
		// No matching labels, add new one with comment
		label := &utils.Label{
			Species:   "Don't Know",
			Certainty: 0,
			Filter:    s.Config.Filter,
			Comment:   comment,
		}
		seg.Labels = append(seg.Labels, label)
	} else {
		// Set comment on first matching label
		oldComment = filterLabels[0].Comment
		filterLabels[0].Comment = comment
	}
	s.Dirty = true
	return oldComment
}

// GetCurrentComment returns the comment on the current segment's filter label.
func (s *ClassifyState) GetCurrentComment() string {
	seg := s.CurrentSegment()
	if seg == nil {
		return ""
	}
	filterLabels := seg.GetFilterLabels(s.Config.Filter)
	if len(filterLabels) == 0 {
		return ""
	}
	return filterLabels[0].Comment
}

// ApplyBinding applies a binding result to the current segment
func (s *ClassifyState) ApplyBinding(result *BindingResult) {
	seg := s.CurrentSegment()
	if seg == nil {
		return
	}
	df := s.CurrentFile()
	if df == nil {
		return
	}
	// Set reviewer
	df.Meta.Reviewer = s.Config.Reviewer
	// Get labels matching filter
	filterLabels := seg.GetFilterLabels(s.Config.Filter)
	// Determine certainty: 0 for Don't Know, 100 for others
	certainty := 100
	if result.Species == "Don't Know" {
		certainty = 0
	}
	if len(filterLabels) == 0 {
		// No matching labels, add new one
		seg.Labels = append(seg.Labels, &utils.Label{
			Species:   result.Species,
			Certainty: certainty,
			Filter:    s.Config.Filter,
			CallType:  result.CallType,
		})
	} else {
		// Edit first matching label, remove rest
		filterLabels[0].Species = result.Species
		filterLabels[0].Certainty = certainty
		filterLabels[0].CallType = result.CallType // always set (empty = remove)
		// Remove extra matching labels
		if len(filterLabels) > 1 {
			var newLabels []*utils.Label
			for _, l := range seg.Labels {
				keep := !slices.Contains(filterLabels[1:], l)
				if keep {
					newLabels = append(newLabels, l)
				}
			}
			seg.Labels = newLabels
		}
	}
	// Re-sort labels
	sort.Slice(seg.Labels, func(i, j int) bool {
		return seg.Labels[i].Species < seg.Labels[j].Species
	})
	s.Dirty = true
}

// ApplyCallTypeOnly sets the CallType on the current segment's first
// filter-matching label. Used after a Shift+primary keypress labeled the
// species and we now receive the secondary key for the calltype.
// No-op if there is no matching label to update.
func (s *ClassifyState) ApplyCallTypeOnly(callType string) {
	seg := s.CurrentSegment()
	if seg == nil {
		return
	}
	df := s.CurrentFile()
	if df == nil {
		return
	}
	filterLabels := seg.GetFilterLabels(s.Config.Filter)
	if len(filterLabels) == 0 {
		return
	}
	df.Meta.Reviewer = s.Config.Reviewer
	filterLabels[0].CallType = callType
	s.Dirty = true
}

// HasSecondary reports whether the given primary key has any secondary
// (calltype) bindings configured.
func (s *ClassifyState) HasSecondary(primaryKey string) bool {
	return len(s.Config.SecondaryBindings[primaryKey]) > 0
}

// ConfirmLabel upgrades the current segment's existing filter label certainty
// to 100. Returns true if a write is needed (label existed and was below 100).
// Returns false for Don't Know (certainty=0) — confirming a Don't Know is a no-op;
// the caller should just advance to the next segment.
func (s *ClassifyState) ConfirmLabel() bool {
	seg := s.CurrentSegment()
	if seg == nil {
		return false
	}
	filterLabels := seg.GetFilterLabels(s.Config.Filter)
	if len(filterLabels) == 0 {
		return false
	}
	if filterLabels[0].Certainty == 0 {
		return false
	}
	if filterLabels[0].Certainty == 100 {
		return false
	}
	df := s.CurrentFile()
	if df == nil {
		return false
	}
	df.Meta.Reviewer = s.Config.Reviewer
	filterLabels[0].Certainty = 100
	s.Dirty = true
	return true
}

// Save saves the current file
func (s *ClassifyState) Save() error {
	df := s.CurrentFile()
	if df == nil {
		return nil
	}
	if !s.Dirty {
		return nil
	}
	err := df.Write(df.FilePath)
	if err != nil {
		return err
	}
	s.Dirty = false
	return nil
}

// getFilterLabel returns the label matching the current filter, or first label if no filter.
func (s *ClassifyState) getFilterLabel(seg *utils.Segment) *utils.Label {
	if s.Config.Filter == "" {
		if len(seg.Labels) > 0 {
			return seg.Labels[0]
		}
		return nil
	}
	for _, label := range seg.Labels {
		if label.Filter == s.Config.Filter {
			return label
		}
	}
	return nil
}

// getOrCreateFilterLabel gets existing label or creates new one for the current filter.
func (s *ClassifyState) getOrCreateFilterLabel(seg *utils.Segment) *utils.Label {
	label := s.getFilterLabel(seg)
	if label != nil {
		return label
	}
	// Create new label
	label = &utils.Label{
		Species:   "Don't Know",
		Certainty: 0,
		Filter:    s.Config.Filter,
	}
	seg.Labels = append(seg.Labels, label)
	s.Dirty = true
	return label
}

// HasBookmark returns true if current segment has a bookmark on the filter label.
func (s *ClassifyState) HasBookmark() bool {
	seg := s.CurrentSegment()
	if seg == nil {
		return false
	}
	label := s.getFilterLabel(seg)
	return label != nil && label.Bookmark
}

// ToggleBookmark toggles the bookmark on the current segment's filter label.
func (s *ClassifyState) ToggleBookmark() {
	seg := s.CurrentSegment()
	if seg == nil {
		return
	}
	df := s.CurrentFile()
	if df == nil {
		return
	}
	// Set reviewer
	df.Meta.Reviewer = s.Config.Reviewer
	label := s.getOrCreateFilterLabel(seg)
	label.Bookmark = !label.Bookmark
	s.Dirty = true
}

// NextBookmark navigates to the next bookmark, wrapping around if needed.
// Returns false if no bookmarks found (back at start position).
func (s *ClassifyState) NextBookmark() bool {
	startFile := s.FileIdx
	startSeg := s.SegmentIdx
	first := true
	for {
		// Advance to next segment
		if !s.NextSegment() {
			// Wrap to start of folder
			s.FileIdx = 0
			s.SegmentIdx = 0
		}
		// Check if we've looped back to start
		if !first && s.FileIdx == startFile && s.SegmentIdx == startSeg {
			return false // full circle, no bookmark found
		}
		first = false
		// Check if current segment has bookmark
		if s.hasFilterBookmark() {
			return true
		}
	}
}

// PrevBookmark navigates to the previous bookmark, wrapping around if needed.
// Returns false if no bookmarks found (back at start position).
func (s *ClassifyState) PrevBookmark() bool {
	startFile := s.FileIdx
	startSeg := s.SegmentIdx
	first := true
	for {
		// Move to previous segment
		if !s.PrevSegment() {
			// Wrap to end of folder
			s.FileIdx = len(s.DataFiles) - 1
			segs := s.filteredSegs[s.FileIdx]
			s.SegmentIdx = max(len(segs)-1, 0)
		}
		// Check if we've looped back to start
		if !first && s.FileIdx == startFile && s.SegmentIdx == startSeg {
			return false // full circle, no bookmark found
		}
		first = false
		// Check if current segment has bookmark
		if s.hasFilterBookmark() {
			return true
		}
	}
}

// hasFilterBookmark checks if current segment has bookmark on filter-matching label.
func (s *ClassifyState) hasFilterBookmark() bool {
	seg := s.CurrentSegment()
	if seg == nil {
		return false
	}
	label := s.getFilterLabel(seg)
	return label != nil && label.Bookmark
}

// FormatLabels formats labels for display
func FormatLabels(labels []*utils.Label, filter string) string {
	var parts []string
	for _, l := range labels {
		if filter != "" && l.Filter != filter {
			continue
		}
		part := l.Species
		if l.CallType != "" {
			part += "/" + l.CallType
		}
		part += fmt.Sprintf(" (%d%%)", l.Certainty)
		if l.Filter != "" {
			part += " [" + l.Filter + "]"
		}
		if l.Comment != "" {
			part += fmt.Sprintf(" \"%s\"", l.Comment)
		}
		parts = append(parts, part)
	}
	return strings.Join(parts, ", ")
}
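For orientation, a hedged sketch of how the navigation API above composes. This is an illustrative helper in the same package, not part of the codebase:

// Sketch (illustrative only): visit every filtered segment once, printing
// its 1-based position and formatted labels.
package calls

import "fmt"

func walkSegments(state *ClassifyState) {
	for {
		seg := state.CurrentSegment()
		if seg == nil {
			return // empty state or index past the end
		}
		fmt.Printf("%d/%d  %s\n",
			state.CurrentSegmentNumber(), state.TotalSegments(),
			FormatLabels(seg.Labels, state.Config.Filter))
		if !state.NextSegment() {
			return // last segment of the last file
		}
	}
}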
package calls

// AviaNZMeta is the metadata element in a .data file
type AviaNZMeta struct {
	Operator string  `json:"Operator"`
	Reviewer *string `json:"Reviewer,omitempty"`
	Duration float64 `json:"Duration"`
}

// AviaNZLabel represents a species label in a segment
type AviaNZLabel struct {
	Species   string `json:"species"`
	Certainty int    `json:"certainty"`
	Filter    string `json:"filter"`
}

// AviaNZSegment represents a detection segment [start, end, freq_low, freq_high, labels]
type AviaNZSegment [5]any
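Because AviaNZSegment is a [5]any, decoding rides on encoding/json's defaults for untyped targets: numbers become float64 and the label list becomes []any. A self-contained sketch, assuming the same on-disk segment shape the tests above use; the real parsing lives in utils.ParseDataFile:

// Sketch: decoding one [start, end, freq_low, freq_high, labels] segment.
package main

import (
	"encoding/json"
	"fmt"
)

type AviaNZSegment [5]any

func main() {
	raw := `[0, 10, 100, 1000, [{"species": "Kiwi", "certainty": 90}]]`
	var seg AviaNZSegment
	if err := json.Unmarshal([]byte(raw), &seg); err != nil {
		panic(err)
	}
	// With an `any` target, JSON numbers decode as float64.
	start, end := seg[0].(float64), seg[1].(float64)
	fmt.Printf("segment %.0f-%.0fs, labels: %v\n", start, end, seg[4])
}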
package db

// ResolveDBPath returns the inputPath if non-empty, otherwise returns the
// fallback path. This is used by tools that accept an explicit DBPath in
// their Input struct but need a default when not provided.
func ResolveDBPath(inputPath, fallback string) string {
	if inputPath != "" {
		return inputPath
	}
	return fallback
}
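A hedged usage sketch; the input type and default constant are illustrative names, not the real tool's:

// Sketch: how a tool's entry point might resolve its database path.
// ExportInput and defaultDBPath are hypothetical names for illustration.
package db

const defaultDBPath = "skraak.db" // assumed default, illustration only

type ExportInput struct{ DBPath string }

func resolveExportDB(input ExportInput) string {
	// An explicit Input.DBPath wins; otherwise fall back to the default.
	return ResolveDBPath(input.DBPath, defaultDBPath)
}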
tools/*.go → CLI tools (one file per tool, defines input/output types)
tools/*.go → CLI tools: sql, export, cluster, dataset, location, pattern, time, prepend
tools/calls/ → Call processing (filesystem .data/WAV, NO database access)
tools/import/ → Import operations (bulk, file, files, segments, unstructured)
## [2026-05-12] Stream 7: tools/ package split + SetDBPath removal

Split tools/ into three packages to improve navigation and reduce coupling:

### tools/calls/ (13 source + 11 test + 3 utility files, 4563 lines)
- All calls_* processing — purely filesystem-based, NO database access
- avianz_types.go, parallel_aggregate.go, isnight.go
- Package name: `calls` (import: `skraak/tools/calls`)

### tools/import/ (5 source + 1 test file, 2078 lines)
- import_file, import_files, import_segments, import_unstructured, bulk_file_import
- Package name: `imp` (import: `imp "skraak/tools/import"`)
  (`import` is a Go keyword, so `imp` is used as the package identifier; a short import sketch follows this section)

### tools/ (8 source + 4 test files, remaining ~1700 lines)
- sql, export, cluster, dataset, location, pattern, time, prepend
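The `imp` alias in practice, as a minimal sketch (BulkFileImportInput is a hypothetical exported symbol, referenced only so the aliased import is used):

// Sketch: importing tools/import under the imp alias, since `import`
// cannot be used as a package identifier.
package main

import (
	imp "skraak/tools/import"
)

// Hypothetical type name, for illustration only.
var _ imp.BulkFileImportInput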
### SetDBPath removal
- Removed global `var dbPath string` and `SetDBPath()` from tools/sql.go
- All callers already pass `Input.DBPath` — the global was redundant
- Test files updated: `SetDBPath(testDB)` → `DBPath: testDB` in Input structs
- Added `db.ResolveDBPath()` helper for the resolveDBPath pattern

### depguard updates
- New rules for tools/calls/ and tools/import/ packages
- tui/ may import tools/calls but not tools
- tools/ may not import sub-packages
- tools/calls/ and tools/import/ may not import parent tools/ package

### Cross-boundary dependency resolution
- `resolveDBPath()` → each package calls `db.ResolveDBPath()` directly
- `calls_clip_bench_test.go` path fix: `../audio/` → `../../audio/`
- No unexported symbols cross package boundaries (verified by analysis)
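Illustrative before/after for the test-file change above, assuming the sql tool's function and input type are named Sql and SqlInput (the actual names may differ):

package tools

import (
	"path/filepath"
	"testing"
)

// Sketch of the test migration; Sql and SqlInput are assumed names.
func TestSqlPathSketch(t *testing.T) {
	testDB := filepath.Join(t.TempDir(), "test.db")
	// Before (removed): SetDBPath(testDB) followed by Sql(SqlInput{Query: q})
	// After: the DB path travels in the input struct.
	if _, err := Sql(SqlInput{Query: "SELECT 1", DBPath: testDB}); err != nil {
		t.Fatal(err)
	}
}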
- pkg: "skraak/tools$"desc: "tui must import from tools/calls, not tools"calls:files:- "**/tools/calls/*.go"deny:- pkg: "skraak/cmd"desc: "tools/calls must not import cmd"- pkg: "skraak/tools"desc: "tools/calls must not import parent package"- pkg: "skraak/tui"desc: "tools/calls must not import tui"import:files:- "**/tools/import/*.go"deny:- pkg: "skraak/cmd"desc: "tools/import must not import cmd"- pkg: "skraak/tools"desc: "tools/import must not import parent package"- pkg: "skraak/tui"desc: "tools/import must not import tui"