// import_unstructured.go
package imp
import (
"context"
"database/sql"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"time"
"skraak/db"
"skraak/utils"
"skraak/wav"
)
// ImportUnstructuredInput defines the input parameters for importing WAV files
// from a folder into an unstructured dataset.
type ImportUnstructuredInput struct {
// DBPath is the database path; it is passed through db.ResolveDBPath before use.
DBPath string `json:"db_path"`
// DatasetID identifies the target dataset. Validation checks the short-ID
// format and that the dataset exists, is active and is of the unstructured type.
DatasetID string `json:"dataset_id"`
// FolderPath is the folder scanned for .wav files. It must exist and be a directory.
FolderPath string `json:"folder_path"`
// Recursive selects whether subdirectories are scanned too. A nil value is
// treated as true.
Recursive *bool `json:"recursive,omitempty"`
}
// ImportUnstructuredOutput summarizes the result of an unstructured import.
type ImportUnstructuredOutput struct {
// TotalFiles is the number of WAV files found by the folder scan.
TotalFiles int `json:"total_files"`
// ImportedFiles counts files that were processed without error and not skipped.
ImportedFiles int `json:"imported_files"`
// SkippedFiles counts files skipped because their hash was already known (duplicates).
SkippedFiles int `json:"skipped_files"`
// FailedFiles counts files whose processing returned an error. Scan errors
// are recorded in Errors but are not counted here.
FailedFiles int `json:"failed_files"`
// TotalDuration is the summed duration, in seconds, of the imported files only.
TotalDuration float64 `json:"total_duration_seconds"`
// ProcessingTime is the elapsed wall-clock time formatted by time.Duration.String.
ProcessingTime string `json:"processing_time"`
// Errors collects the scan-stage and process-stage errors.
Errors []FileImportError `json:"errors,omitempty"`
}
// ImportUnstructured imports WAV files from a folder into an unstructured dataset.
//
// Files are stored with minimal metadata: hash, duration, sample_rate and the
// file modification time as timestamp. There is no location/cluster hierarchy,
// no astronomical data and no AudioMoth parsing.
//
// The input is validated first. The folder is then scanned (recursively unless
// input.Recursive points to false) and every WAV file found is processed inside
// one write transaction opened through db.WithWriteTx. A file that fails to
// process is counted in FailedFiles and recorded in Errors without stopping the
// import; scan problems are recorded in Errors only. When the scan finds no
// files, no transaction is opened.
//
// Cancellation: ctx is checked before each file. Once ctx is cancelled or past
// its deadline the loop stops and the wrapped ctx.Err() is returned from the
// transaction callback, rather than parsing and hashing every remaining file.
// NOTE(review): this assumes db.WithWriteTx rolls back when the callback
// returns an error — confirm.
//
// A non-nil error is returned when validation fails or when db.WithWriteTx
// reports an error. In the latter case the returned output still carries the
// counters accumulated so far and ProcessingTime is left empty.
func ImportUnstructured(
	ctx context.Context,
	input ImportUnstructuredInput,
) (ImportUnstructuredOutput, error) {
	startTime := time.Now()
	var output ImportUnstructuredOutput

	// Default recursive to true
	recursive := true
	if input.Recursive != nil {
		recursive = *input.Recursive
	}

	// Validate input
	if err := validateUnstructuredInput(input); err != nil {
		return output, fmt.Errorf("validation failed: %w", err)
	}

	// Scan for WAV files (no DB needed)
	files, scanErrors := scanWavFiles(input.FolderPath, recursive)
	output.Errors = append(output.Errors, scanErrors...)
	output.TotalFiles = len(files)
	if len(files) == 0 {
		output.ProcessingTime = time.Since(startTime).String()
		return output, nil
	}

	err := db.WithWriteTx(ctx, db.ResolveDBPath(input.DBPath, ""), "import_unstructured", func(database *sql.DB, tx *db.LoggedTx) error {
		// Process each file
		for _, filePath := range files {
			// Stop promptly once the caller has given up; each file costs a
			// header parse and a full-file hash.
			if ctxErr := ctx.Err(); ctxErr != nil {
				return fmt.Errorf("import interrupted: %w", ctxErr)
			}

			fileResult, procErr := processUnstructuredFile(tx, filePath, input.DatasetID)
			if procErr != nil {
				// Per-file failures are reported but do not abort the import.
				output.FailedFiles++
				output.Errors = append(output.Errors, FileImportError{
					FileName: filepath.Base(filePath),
					Error:    procErr.Error(),
					Stage:    StageProcess,
				})
				continue
			}

			if fileResult.Skipped {
				output.SkippedFiles++
			} else {
				output.ImportedFiles++
				output.TotalDuration += fileResult.Duration
			}
		}
		return nil
	})
	if err != nil {
		return output, err
	}

	output.ProcessingTime = time.Since(startTime).String()
	return output, nil
}
// unstructuredFileResult holds the result of processing a single file in
// processUnstructuredFile.
type unstructuredFileResult struct {
// Skipped is true when the file's hash was reported as a duplicate and
// nothing was inserted.
Skipped bool
// Duration is the file duration in seconds, taken from the parsed WAV
// metadata. It is set for skipped files as well.
Duration float64
}
// processUnstructuredFile handles one WAV file of an unstructured import.
//
// Steps, in order: parse the WAV header, hash the file with utils.ComputeXXH64,
// look the hash up with CheckDuplicateHash, and - unless the hash is already
// known - insert one row into file and one row into file_dataset.
//
// A duplicate is skipped entirely: no row is written and the file is not linked
// to the dataset. The result then has Skipped set and still carries the
// duration.
//
// The file's modification time from the parsed metadata is stored as
// timestamp_local without any timezone conversion. The location, cluster and
// astronomical columns are stored as NULL.
//
// Both statements run with context.Background(), so they do not observe a
// caller's cancellation. If the file_dataset insert fails, this function does
// not undo the preceding file insert.
//
// On any failure the result is nil and the error names the step that failed.
func processUnstructuredFile(tx *db.LoggedTx, filePath, datasetID string) (*unstructuredFileResult, error) {
	const insertFileSQL = `
INSERT INTO file (
id, file_name, xxh64_hash, location_id, cluster_id,
timestamp_local, duration, sample_rate,
maybe_solar_night, maybe_civil_night, moon_phase,
active
) VALUES (?, ?, ?, NULL, NULL, ?, ?, ?, NULL, NULL, NULL, TRUE)
`
	const linkDatasetSQL = "INSERT INTO file_dataset (file_id, dataset_id) VALUES (?, ?)"

	header, parseErr := wav.ParseWAVHeader(filePath)
	if parseErr != nil {
		return nil, fmt.Errorf("WAV header parsing failed: %w", parseErr)
	}

	digest, hashErr := utils.ComputeXXH64(filePath)
	if hashErr != nil {
		return nil, fmt.Errorf("hash calculation failed: %w", hashErr)
	}

	_, alreadyKnown, dupErr := CheckDuplicateHash(tx, digest)
	if dupErr != nil {
		return nil, fmt.Errorf("duplicate check failed: %w", dupErr)
	}
	if alreadyKnown {
		// Known content: report it as skipped without touching any table.
		return &unstructuredFileResult{Skipped: true, Duration: header.Duration}, nil
	}

	newID, idErr := utils.GenerateLongID()
	if idErr != nil {
		return nil, fmt.Errorf("ID generation failed: %w", idErr)
	}

	if _, insertErr := tx.ExecContext(context.Background(), insertFileSQL,
		newID,
		filepath.Base(filePath),
		digest,
		header.FileModTime, // modification time used as-is for timestamp_local
		header.Duration,
		header.SampleRate,
	); insertErr != nil {
		return nil, fmt.Errorf("file insert failed: %w", insertErr)
	}

	if _, linkErr := tx.ExecContext(context.Background(), linkDatasetSQL, newID, datasetID); linkErr != nil {
		return nil, fmt.Errorf("file_dataset insert failed: %w", linkErr)
	}

	return &unstructuredFileResult{Duration: header.Duration}, nil
}
// validateUnstructuredInput checks the import parameters before any file is touched.
//
// Checks run in this order, and the first failure is returned:
//  1. DatasetID passes utils.ValidateShortID.
//  2. FolderPath can be stat'ed and is a directory.
//  3. Using a read connection to the resolved database path, the dataset exists
//     and is active, and is of the 'unstructured' type.
func validateUnstructuredInput(input ImportUnstructuredInput) error {
	if idErr := utils.ValidateShortID(input.DatasetID, "dataset_id"); idErr != nil {
		return idErr
	}

	stat, statErr := os.Stat(input.FolderPath)
	switch {
	case statErr != nil:
		return fmt.Errorf("folder not accessible: %w", statErr)
	case !stat.IsDir():
		return fmt.Errorf("path is not a directory: %s", input.FolderPath)
	}

	// Dataset checks that need the database.
	checkDataset := func(conn *sql.DB) error {
		_, existsErr := db.DatasetExistsAndActive(conn, input.DatasetID)
		if existsErr != nil {
			return existsErr
		}
		if typeErr := db.ValidateDatasetTypeUnstructured(conn, input.DatasetID); typeErr != nil {
			return typeErr
		}
		return nil
	}
	return db.WithReadDB(db.ResolveDBPath(input.DBPath, ""), checkDataset)
}
// scanWavFiles collects the WAV files under folderPath.
// With recursive set the whole directory tree is walked; otherwise only the
// top-level directory is read. It returns the file paths found and any errors
// hit while scanning.
func scanWavFiles(folderPath string, recursive bool) ([]string, []FileImportError) {
	if !recursive {
		return scanWavFilesFlat(folderPath)
	}
	return scanWavFilesRecursive(folderPath)
}
// scanWavFilesRecursive walks the directory tree rooted at folderPath and
// returns the path of every non-directory entry whose name passes isWavFile.
//
// An error reported for an entry during the walk is recorded as a scan-stage
// FileImportError for that path and the walk continues. An error returned by
// the walk itself is recorded against folderPath.
func scanWavFilesRecursive(folderPath string) ([]string, []FileImportError) {
	var (
		wavPaths []string
		problems []FileImportError
	)

	// record appends one scan-stage error for the given path.
	record := func(name string, cause error) {
		problems = append(problems, FileImportError{
			FileName: name,
			Error:    cause.Error(),
			Stage:    StageScan,
		})
	}

	// visit never returns an error, so one bad entry does not stop the walk.
	visit := func(path string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			record(path, walkErr)
		case !entry.IsDir() && isWavFile(entry.Name()):
			wavPaths = append(wavPaths, path)
		}
		return nil
	}

	if walkErr := filepath.WalkDir(folderPath, visit); walkErr != nil {
		record(folderPath, walkErr)
	}
	return wavPaths, problems
}
// scanWavFilesFlat lists the WAV files directly inside folderPath without
// descending into subdirectories.
//
// If the directory cannot be read, it returns no files and a single scan-stage
// error recorded against folderPath. Otherwise it returns folderPath joined
// with the name of each non-directory entry that passes isWavFile.
func scanWavFilesFlat(folderPath string) ([]string, []FileImportError) {
	entries, readErr := os.ReadDir(folderPath)
	if readErr != nil {
		return nil, []FileImportError{{
			FileName: folderPath,
			Error:    readErr.Error(),
			Stage:    StageScan,
		}}
	}

	var wavPaths []string
	for _, entry := range entries {
		if entry.IsDir() || !isWavFile(entry.Name()) {
			continue
		}
		wavPaths = append(wavPaths, filepath.Join(folderPath, entry.Name()))
	}
	return wavPaths, nil
}
// isWavFile reports whether name ends in ".wav", ignoring case.
// Only the suffix is examined, so a name that is exactly ".wav" also matches.
func isWavFile(name string) bool {
	const ext = ".wav"
	if len(name) < len(ext) {
		return false
	}
	return strings.EqualFold(name[len(name)-len(ext):], ext)
}