// import_unstructured_test.go

package imp
import (
"context"
"database/sql"
"os"
"path/filepath"
"testing"
)
// TestImportUnstructured_HappyPath imports a folder holding a single WAV file
// into dataset "dstest000002" and then checks two things: the summary returned
// by ImportUnstructured, and the rows that are visible in the database once the
// import has finished.
func TestImportUnstructured_HappyPath(t *testing.T) {
	// Counts the file_dataset rows that tie the imported file to the dataset.
	const linkQuery = `
SELECT COUNT(*) FROM file_dataset fd
JOIN file f ON fd.file_id = f.id
WHERE f.xxh64_hash = ? AND fd.dataset_id = 'dstest000002'
`

	dbPath := setupFileBasedTestDB(t)

	// Fixture: a fresh temporary folder containing exactly one WAV file.
	folder := t.TempDir()
	wavHash := createTestWAV(t, filepath.Join(folder, "test_recording.wav"))

	result, err := ImportUnstructured(context.Background(), ImportUnstructuredInput{
		DBPath:     dbPath,
		DatasetID:  "dstest000002", // unstructured dataset
		FolderPath: folder,
		Recursive:  new(true),
	})
	if err != nil {
		t.Fatalf("ImportUnstructured failed: %v", err)
	}

	// The summary must report one discovered file, imported without skips or errors.
	if got := result.TotalFiles; got != 1 {
		t.Errorf("expected 1 total file, got %d", got)
	}
	if got := result.ImportedFiles; got != 1 {
		t.Errorf("expected 1 imported file, got %d", got)
	}
	if got := result.SkippedFiles; got != 0 {
		t.Errorf("expected 0 skipped files, got %d", got)
	}
	if len(result.Errors) > 0 {
		t.Errorf("unexpected errors: %v", result.Errors)
	}

	// Re-open the database independently to inspect what was persisted.
	db, err := sql.Open("duckdb", dbPath)
	if err != nil {
		t.Fatalf("failed to open database for verification: %v", err)
	}
	defer db.Close()

	// Exactly one active file row carries the hash of the WAV we created.
	var activeRows int
	if err := db.QueryRow("SELECT COUNT(*) FROM file WHERE xxh64_hash = ? AND active = true", wavHash).Scan(&activeRows); err != nil {
		t.Fatalf("failed to query file: %v", err)
	}
	if activeRows != 1 {
		t.Errorf("expected 1 file in database, got %d", activeRows)
	}

	// Exactly one file_dataset row links that file to the target dataset.
	var linkRows int
	if err := db.QueryRow(linkQuery, wavHash).Scan(&linkRows); err != nil {
		t.Fatalf("failed to query file_dataset: %v", err)
	}
	if linkRows != 1 {
		t.Errorf("expected 1 file_dataset link, got %d", linkRows)
	}

	// Unstructured imports must leave location_id and cluster_id as NULL.
	var locationID, clusterID sql.NullString
	if err := db.QueryRow("SELECT location_id, cluster_id FROM file WHERE xxh64_hash = ?", wavHash).Scan(&locationID, &clusterID); err != nil {
		t.Fatalf("failed to query file: %v", err)
	}
	if locationID.Valid {
		t.Errorf("expected NULL location_id for unstructured file, got %s", locationID.String)
	}
	if clusterID.Valid {
		t.Errorf("expected NULL cluster_id for unstructured file, got %s", clusterID.String)
	}
}
// TestImportUnstructured_DuplicateHandling runs the same import twice over a
// folder with one WAV file. The second run must count the file as skipped
// rather than imported, and the database must still hold a single active row
// for the file's hash.
func TestImportUnstructured_DuplicateHandling(t *testing.T) {
	dbPath := setupFileBasedTestDB(t)

	// Fixture: a fresh temporary folder containing exactly one WAV file.
	folder := t.TempDir()
	wavHash := createTestWAV(t, filepath.Join(folder, "test_recording.wav"))

	ctx := context.Background()

	// Both runs use identical arguments; each call gets its own input value
	// (and its own Recursive pointer), just like two separate literals would.
	request := func() ImportUnstructuredInput {
		return ImportUnstructuredInput{
			DBPath:     dbPath,
			DatasetID:  "dstest000002",
			FolderPath: folder,
			Recursive:  new(true),
		}
	}

	// First pass populates the database.
	if _, err := ImportUnstructured(ctx, request()); err != nil {
		t.Fatalf("first import failed: %v", err)
	}

	// Second pass sees the same file again and should skip it as a duplicate.
	second, err := ImportUnstructured(ctx, request())
	if err != nil {
		t.Fatalf("second import failed: %v", err)
	}

	if got := second.TotalFiles; got != 1 {
		t.Errorf("expected 1 total file, got %d", got)
	}
	if got := second.ImportedFiles; got != 0 {
		t.Errorf("expected 0 imported files (duplicate), got %d", got)
	}
	if got := second.SkippedFiles; got != 1 {
		t.Errorf("expected 1 skipped file (duplicate), got %d", got)
	}

	// Re-open the database and confirm the file row was not duplicated.
	db, err := sql.Open("duckdb", dbPath)
	if err != nil {
		t.Fatalf("failed to open database for verification: %v", err)
	}
	defer db.Close()

	var activeRows int
	if err := db.QueryRow("SELECT COUNT(*) FROM file WHERE xxh64_hash = ? AND active = true", wavHash).Scan(&activeRows); err != nil {
		t.Fatalf("failed to query file: %v", err)
	}
	if activeRows != 1 {
		t.Errorf("expected 1 file in database (not duplicated), got %d", activeRows)
	}
}
// TestImportUnstructured_EdgeCases covers the non-happy paths of
// ImportUnstructured: an empty folder (which succeeds with zero files) and
// three inputs that must be rejected with an error.
func TestImportUnstructured_EdgeCases(t *testing.T) {
	ctx := context.Background()

	t.Run("empty folder returns empty output", func(t *testing.T) {
		dbPath := setupFileBasedTestDB(t)
		result, err := ImportUnstructured(ctx, ImportUnstructuredInput{
			DBPath:     dbPath,
			DatasetID:  "dstest000002",
			FolderPath: t.TempDir(),
			Recursive:  new(true),
		})
		if err != nil {
			t.Fatalf("ImportUnstructured failed: %v", err)
		}
		if got := result.TotalFiles; got != 0 {
			t.Errorf("expected 0 total files, got %d", got)
		}
	})

	// Folder providers for the rejection cases below.
	emptyDir := func(t *testing.T) string { return t.TempDir() }
	dirWithWAV := func(t *testing.T) string {
		dir := t.TempDir()
		createTestWAV(t, filepath.Join(dir, "test.wav"))
		return dir
	}
	missingDir := func(*testing.T) string { return "/nonexistent/path" }

	// Every case here must make ImportUnstructured return a non-nil error.
	// Recursive is deliberately left unset in all of them.
	rejections := []struct {
		name      string
		datasetID string
		folder    func(t *testing.T) string
		failMsg   string
	}{
		{
			name:      "structured dataset rejected",
			datasetID: "dstest000001", // structured dataset
			folder:    dirWithWAV,
			failMsg:   "expected error for structured dataset",
		},
		{
			name:      "invalid dataset ID",
			datasetID: "invalid_id",
			folder:    emptyDir,
			failMsg:   "expected error for invalid dataset ID",
		},
		{
			name:      "folder does not exist",
			datasetID: "dstest000002",
			folder:    missingDir,
			failMsg:   "expected error for nonexistent folder",
		},
	}

	for _, tc := range rejections {
		t.Run(tc.name, func(t *testing.T) {
			dbPath := setupFileBasedTestDB(t)
			folder := tc.folder(t)
			if _, err := ImportUnstructured(ctx, ImportUnstructuredInput{
				DBPath:     dbPath,
				DatasetID:  tc.datasetID,
				FolderPath: folder,
			}); err == nil {
				t.Error(tc.failMsg)
			}
		})
	}
}
// TestScanWavFiles exercises scanWavFiles against small temporary directory
// layouts: a single lower-case .wav, a single upper-case .WAV, and a folder
// with one WAV at the root plus one in a subdirectory, scanned both
// non-recursively and recursively.
func TestScanWavFiles(t *testing.T) {
	// singleFile returns a populate step that writes one WAV called name
	// directly into the root folder.
	singleFile := func(name string) func(*testing.T, string) {
		return func(t *testing.T, root string) {
			createTestWAV(t, filepath.Join(root, name))
		}
	}

	// rootAndSubdir writes root.wav at the top level and sub.wav inside "subdir".
	rootAndSubdir := func(t *testing.T, root string) {
		nested := filepath.Join(root, "subdir")
		if err := os.Mkdir(nested, 0755); err != nil {
			t.Fatalf("failed to create subdir: %v", err)
		}
		// Create WAV in both directories
		createTestWAV(t, filepath.Join(root, "root.wav"))
		createTestWAV(t, filepath.Join(nested, "sub.wav"))
	}

	cases := []struct {
		name      string
		populate  func(t *testing.T, root string)
		recursive bool
		want      int
		countMsg  string // format used when the number of files found is wrong
	}{
		{
			name:     "finds WAV files",
			populate: singleFile("test.wav"),
			want:     1,
			countMsg: "expected 1 file, got %d",
		},
		{
			name:     "case insensitive extension",
			populate: singleFile("test.WAV"),
			want:     1,
			countMsg: "expected 1 file, got %d",
		},
		{
			name:     "non-recursive ignores subdirectories",
			populate: rootAndSubdir,
			want:     1,
			countMsg: "expected 1 file (non-recursive), got %d",
		},
		{
			name:      "recursive finds all files",
			populate:  rootAndSubdir,
			recursive: true,
			want:      2,
			countMsg:  "expected 2 files (recursive), got %d",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			root := t.TempDir()
			tc.populate(t, root)

			found, scanErrs := scanWavFiles(root, tc.recursive)
			if len(scanErrs) > 0 {
				t.Errorf("unexpected errors: %v", scanErrs)
			}
			if len(found) != tc.want {
				t.Errorf(tc.countMsg, len(found))
			}
		})
	}
}