package tools
import (
"context"
"database/sql"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"time"
"skraak/db"
"skraak/utils"
)
// ImportUnstructuredInput defines the input parameters for importing WAV
// files into an unstructured dataset.
type ImportUnstructuredInput struct {
// DatasetID is the target dataset. Validation rejects the import unless the
// ID is well-formed and refers to an active dataset of 'unstructured' type.
DatasetID string `json:"dataset_id" jsonschema:"required,Dataset ID (12 characters) - must be 'unstructured' type"`
// FolderPath is the folder scanned for WAV files. It must exist and be a
// directory.
FolderPath string `json:"folder_path" jsonschema:"required,Absolute path to folder containing WAV files"`
// Recursive controls whether subfolders are scanned. It is a pointer so that
// an omitted value (nil) can be told apart from an explicit false; nil is
// treated as true.
Recursive *bool `json:"recursive,omitempty" jsonschema:"Scan subfolders recursively (default: true)"`
}
// ImportUnstructuredOutput summarizes the outcome of an ImportUnstructured
// call.
type ImportUnstructuredOutput struct {
// TotalFiles is the number of WAV files found by the folder scan.
TotalFiles int `json:"total_files"`
// ImportedFiles counts files that were processed without error and were not
// duplicates.
ImportedFiles int `json:"imported_files"`
// SkippedFiles counts files whose hash was already present (duplicates).
SkippedFiles int `json:"skipped_files"` // Duplicates
// FailedFiles counts files whose processing returned an error. Scan errors
// are reported in Errors but do not increment this counter.
FailedFiles int `json:"failed_files"`
// TotalDuration is the summed duration, in seconds, of imported files only;
// skipped duplicates are not included.
TotalDuration float64 `json:"total_duration_seconds"`
// ProcessingTime is the elapsed wall-clock time formatted with
// time.Duration.String. It is only set on the successful return paths.
ProcessingTime string `json:"processing_time"`
// Errors collects per-path problems from both the "scan" and "process"
// stages.
Errors []utils.FileImportError `json:"errors,omitempty"`
}
// ImportUnstructured imports the WAV files found under input.FolderPath into
// an unstructured dataset.
//
// Files are stored with minimal metadata: hash, duration, sample rate, and the
// file modification time as timestamp. There is no location/cluster hierarchy,
// no astronomical data, and no AudioMoth parsing.
//
// The input is validated first, then the folder is scanned (recursively unless
// input.Recursive is set to false). All files are processed inside a single
// transaction: a failure on one file is recorded in the output's Errors and
// FailedFiles and the loop moves on to the next file. The transaction is
// committed once after the loop.
//
// A non-nil error is returned when validation fails, the database cannot be
// opened, the transaction cannot be started or committed, or ctx is cancelled
// while files are being processed. In the latter cases the transaction is
// rolled back, so the counters in the returned output describe work that was
// attempted, not rows that were persisted.
func ImportUnstructured(
	ctx context.Context,
	input ImportUnstructuredInput,
) (ImportUnstructuredOutput, error) {
	startTime := time.Now()
	var output ImportUnstructuredOutput

	// Recursive defaults to true when the caller leaves it unset.
	recursive := true
	if input.Recursive != nil {
		recursive = *input.Recursive
	}

	if err := validateUnstructuredInput(input); err != nil {
		return output, fmt.Errorf("validation failed: %w", err)
	}

	database, err := db.OpenWriteableDB(dbPath)
	if err != nil {
		return output, fmt.Errorf("failed to open database: %w", err)
	}
	defer database.Close()

	files, scanErrors := scanWavFiles(input.FolderPath, recursive)
	output.Errors = append(output.Errors, scanErrors...)
	output.TotalFiles = len(files)

	// Nothing to import: report the scan result without opening a transaction.
	if len(files) == 0 {
		output.ProcessingTime = time.Since(startTime).String()
		return output, nil
	}

	tx, err := database.BeginTx(ctx, nil)
	if err != nil {
		return output, fmt.Errorf("failed to begin transaction: %w", err)
	}
	// Roll back on every exit path that did not commit, including panics.
	// After a successful Commit this call only reports that the transaction is
	// already done, so its error is intentionally ignored.
	defer func() { _ = tx.Rollback() }()

	for _, filePath := range files {
		// Stop early on cancellation instead of parsing and hashing every
		// remaining file against a transaction that can no longer commit.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return output, fmt.Errorf("import cancelled: %w", ctxErr)
		}

		fileResult, procErr := processUnstructuredFile(tx, filePath, input.DatasetID)
		if procErr != nil {
			output.FailedFiles++
			output.Errors = append(output.Errors, utils.FileImportError{
				FileName: filepath.Base(filePath),
				Error:    procErr.Error(),
				Stage:    "process",
			})
			continue
		}

		if fileResult.Skipped {
			output.SkippedFiles++
			continue
		}
		output.ImportedFiles++
		output.TotalDuration += fileResult.Duration
	}

	if err := tx.Commit(); err != nil {
		return output, fmt.Errorf("failed to commit transaction: %w", err)
	}

	output.ProcessingTime = time.Since(startTime).String()
	return output, nil
}
// unstructuredFileResult holds the result of processing a single file.
type unstructuredFileResult struct {
// Skipped is true when the file's hash already exists and nothing was
// inserted for it.
Skipped bool // True if duplicate
// Duration is the file's duration as read from the WAV header. It is set for
// skipped files as well as imported ones.
Duration float64 // Duration in seconds
}
// processUnstructuredFile imports one WAV file into the given dataset using
// the caller's transaction.
//
// It reads the WAV header, hashes the file, and asks utils.CheckDuplicateHash
// whether that hash is already known. A known hash yields a result with
// Skipped set and no rows written (the existing file is not linked to the
// dataset). Otherwise a row is inserted into file, with NULL location, cluster
// and astronomical columns and the file modification time as timestamp_local,
// followed by a row in file_dataset linking it to datasetID.
//
// On any failure it returns a nil result and an error naming the failed step.
//
// NOTE(review): if the file_dataset insert fails after the file insert has
// succeeded, the file row is still pending in tx; this function does not undo
// it, so whether it is persisted depends on what the caller does with tx.
func processUnstructuredFile(tx *sql.Tx, filePath, datasetID string) (*unstructuredFileResult, error) {
	const insertFileSQL = `
INSERT INTO file (
id, file_name, xxh64_hash, location_id, cluster_id,
timestamp_local, duration, sample_rate,
maybe_solar_night, maybe_civil_night, moon_phase,
active
) VALUES (?, ?, ?, NULL, NULL, ?, ?, ?, NULL, NULL, NULL, TRUE)
`
	const linkDatasetSQL = "INSERT INTO file_dataset (file_id, dataset_id) VALUES (?, ?)"

	// Header first, then hash: the step order decides which error is reported
	// for a file that is unreadable.
	wav, err := utils.ParseWAVHeader(filePath)
	if err != nil {
		return nil, fmt.Errorf("WAV header parsing failed: %w", err)
	}

	digest, err := utils.ComputeXXH64(filePath)
	if err != nil {
		return nil, fmt.Errorf("hash calculation failed: %w", err)
	}

	_, alreadyKnown, err := utils.CheckDuplicateHash(tx, digest)
	if err != nil {
		return nil, fmt.Errorf("duplicate check failed: %w", err)
	}
	if alreadyKnown {
		// Duplicate content: write nothing and do not link it to the dataset.
		return &unstructuredFileResult{Skipped: true, Duration: wav.Duration}, nil
	}

	newID, err := utils.GenerateLongID()
	if err != nil {
		return nil, fmt.Errorf("ID generation failed: %w", err)
	}

	// The modification time is stored as-is, with no timezone conversion.
	if _, err := tx.Exec(
		insertFileSQL,
		newID,
		filepath.Base(filePath),
		digest,
		wav.FileModTime,
		wav.Duration,
		wav.SampleRate,
	); err != nil {
		return nil, fmt.Errorf("file insert failed: %w", err)
	}

	if _, err := tx.Exec(linkDatasetSQL, newID, datasetID); err != nil {
		return nil, fmt.Errorf("file_dataset insert failed: %w", err)
	}

	return &unstructuredFileResult{Duration: wav.Duration}, nil
}
// validateUnstructuredInput checks that an import request can proceed.
//
// The checks run in this order and the first failure is returned:
//  1. input.DatasetID passes utils.ValidateShortID.
//  2. input.FolderPath can be stat'ed and is a directory.
//  3. The database can be opened read-only.
//  4. A dataset row with that ID exists and is active.
//  5. utils.ValidateDatasetTypeUnstructured accepts the dataset.
//
// It returns nil when every check passes.
func validateUnstructuredInput(input ImportUnstructuredInput) error {
	const activeDatasetQuery = "SELECT EXISTS(SELECT 1 FROM dataset WHERE id = ? AND active = true)"

	if err := utils.ValidateShortID(input.DatasetID, "dataset_id"); err != nil {
		return err
	}

	folderInfo, statErr := os.Stat(input.FolderPath)
	switch {
	case statErr != nil:
		return fmt.Errorf("folder not accessible: %w", statErr)
	case !folderInfo.IsDir():
		return fmt.Errorf("path is not a directory: %s", input.FolderPath)
	}

	// A separate read-only handle is used for the lookups below and released
	// when validation returns.
	conn, openErr := db.OpenReadOnlyDB(dbPath)
	if openErr != nil {
		return fmt.Errorf("failed to open database: %w", openErr)
	}
	defer conn.Close()

	var found bool
	if err := conn.QueryRow(activeDatasetQuery, input.DatasetID).Scan(&found); err != nil {
		return fmt.Errorf("failed to query dataset: %w", err)
	}
	if !found {
		return fmt.Errorf("dataset not found or inactive: %s", input.DatasetID)
	}

	if err := utils.ValidateDatasetTypeUnstructured(conn, input.DatasetID); err != nil {
		return err
	}
	return nil
}
// scanWavFiles collects the paths of WAV files under folderPath.
//
// A file qualifies when it is not a directory and its name ends in ".wav",
// compared case-insensitively. With recursive set, the whole tree is walked
// and paths that cannot be read are reported as "scan" errors while the walk
// continues. Without it, only the direct entries of folderPath are examined;
// if the folder itself cannot be read, no files are returned, only the error.
//
// It returns the matching paths and the scan-stage errors encountered.
func scanWavFiles(folderPath string, recursive bool) ([]string, []utils.FileImportError) {
	var (
		files    []string
		scanErrs []utils.FileImportError
	)

	recordErr := func(path string, err error) {
		scanErrs = append(scanErrs, utils.FileImportError{
			FileName: path,
			Error:    err.Error(),
			Stage:    "scan",
		})
	}
	isWav := func(name string) bool {
		return strings.HasSuffix(strings.ToLower(name), ".wav")
	}

	if !recursive {
		// Non-recursive: only the top-level entries are considered.
		entries, err := os.ReadDir(folderPath)
		if err != nil {
			recordErr(folderPath, err)
			return nil, scanErrs
		}
		for _, entry := range entries {
			if !entry.IsDir() && isWav(entry.Name()) {
				files = append(files, filepath.Join(folderPath, entry.Name()))
			}
		}
		return files, scanErrs
	}

	walkErr := filepath.WalkDir(folderPath, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			// Record the problem and keep walking the rest of the tree.
			recordErr(path, err)
			return nil
		}
		if !d.IsDir() && isWav(d.Name()) {
			files = append(files, path)
		}
		return nil
	})
	// The callback never returns an error that aborts the walk, so this is not
	// expected to trigger; it is surfaced rather than silently dropped.
	if walkErr != nil {
		recordErr(folderPath, walkErr)
	}

	return files, scanErrs
}