J62FGJ3BGFTUWEOUON4ATYNDFBEUIR6FCIOZSHOVHAA7KGFYHW6AC WNTA72IKZG7LBQJ3CXHBNNBSOJVFWF5S2GOMBYGQR7TD7GVEYGUAC IFVRAERTCCDICNTYTG3TX2WASB6RXQQEJWWXQMQZJSQDQ3HLE5OQC 65G4H2V6262GLHTPQQ5H4NIPDJB7HRPBRVNAD2EL26N75YUM5PWQC NKQAT3RE4IBIWXVMI5LJUINDPHTANNMORZ5N2JFA4AN6UUB72KGAC RMWLXG5HGB44LH3CEA7FWTAFPSZQZQGH52OHVQJUDP6ASPVDVQJAC 2Y2ZW565SRONQ2UXPLX5SRP2HDFWMRF5KDXKSKVRCHBBGEGMTVIQC 7NS27QXZMVTZBK4VPMYL5IKGSTTAWR6NDG5SOVITNX44VNIRZPMAC 47GPFVLW7RWBBHHUZYMEEYWG3KBJBWELR7RDKMJRWMNRWYJUBR7QC 5LMYPB2QHNVDLYCRWLOMCPY35ZKHHPYVW5XHASE66L6PJZSOCXYQC OGLLBQQYE5KICDMI6EX7ZI4TZT5RB7UFHH7O2DUOZ44QQXVL5YAAC QQOATNCITSSIPKVUFNZEPN73TGU244GIAW6K37SILAGQWVQ4TCQQC W3A2EECCD23SVHJZN6MXPH2PAVFHH5CNFD2XHPQRRW6M4GUTG3FAC package toolsimport ("context""database/sql""fmt""os""path/filepath""sort""strings""skraak/db")// ExportDatasetInput defines the input parameters for the export dataset tooltype ExportDatasetInput struct {DatasetID string `json:"dataset_id" jsonschema:"required,Dataset ID to export"`Output string `json:"output" jsonschema:"required,Output database path"`DryRun bool `json:"dry_run" jsonschema:"Show what would be exported without creating file"`Force bool `json:"force" jsonschema:"Overwrite existing output file"`}// ExportDatasetOutput defines the output structuretype ExportDatasetOutput struct {DatasetID string `json:"dataset_id"`DatasetName string `json:"dataset_name"`OutputPath string `json:"output_path"`RowCounts map[string]int64 `json:"row_counts"`FileSizeMB float64 `json:"file_size_mb,omitempty"`DryRun bool `json:"dry_run"`Message string `json:"message"`}// TableRelationship defines how a table relates to a datasettype TableRelationship struct {Table string // table nameRelation string // "owned" | "owned-via" | "referenced" | "derived"FilterCol string // column to filter onViaTable string // for referenced: junction tableViaColumn string // for referenced: column in junction tableSourceTable string // for derived: source table}// Dataset tables manifest - defines how each 
table relates to a datasetvar datasetTables = []TableRelationship{// Owned directly{Table: "dataset", Relation: "owned", FilterCol: "id"},{Table: "location", Relation: "owned", FilterCol: "dataset_id"},{Table: "cluster", Relation: "owned", FilterCol: "dataset_id"},{Table: "selection", Relation: "owned", FilterCol: "dataset_id"},{Table: "file_dataset", Relation: "owned", FilterCol: "dataset_id"},{Table: "species_dataset", Relation: "owned", FilterCol: "dataset_id"},// Owned via FK chain{Table: "file", Relation: "owned-via", FilterCol: "cluster_id", ViaTable: "cluster"},{Table: "moth_metadata", Relation: "owned-via", FilterCol: "file_id", ViaTable: "file"},{Table: "file_metadata", Relation: "owned-via", FilterCol: "file_id", ViaTable: "file"},{Table: "selection_metadata", Relation: "owned-via", FilterCol: "selection_id", ViaTable: "selection"},{Table: "label", Relation: "owned-via", FilterCol: "selection_id", ViaTable: "selection"},{Table: "label_subtype", Relation: "owned-via", FilterCol: "label_id", ViaTable: "label"},// Referenced (subset extraction){Table: "cyclic_recording_pattern", Relation: "referenced", FilterCol: "id", ViaTable: "cluster", ViaColumn: "cyclic_recording_pattern_id"},{Table: "filter", Relation: "referenced", FilterCol: "id", ViaTable: "label", ViaColumn: "filter_id"},{Table: "species", Relation: "referenced", FilterCol: "id", ViaTable: "species_dataset", ViaColumn: "species_id"},{Table: "call_type", Relation: "referenced", FilterCol: "species_id", ViaTable: "species", ViaColumn: "id"},{Table: "ebird_taxonomy", Relation: "referenced", FilterCol: "species_code", ViaTable: "species", ViaColumn: "ebird_code"},// Copied as-is (no filtering){Table: "ebird_taxonomy_v2024", Relation: "copy"},}// ExportDataset exports a single dataset with all related data to a new databasefunc ExportDataset(ctx context.Context,input ExportDatasetInput,) (ExportDatasetOutput, error) {var output ExportDatasetOutputoutput.DatasetID = input.DatasetIDoutput.OutputPath = 
input.Outputoutput.DryRun = input.DryRunoutput.RowCounts = make(map[string]int64)// Open source database (read-only for safety)sourceDB, err := db.OpenReadOnlyDB(dbPath)if err != nil {return output, fmt.Errorf("failed to open source database: %w", err)}// Verify dataset exists and get namevar datasetName stringerr = sourceDB.QueryRowContext(ctx,"SELECT name FROM dataset WHERE id = ? AND active = true",input.DatasetID,).Scan(&datasetName)if err != nil {sourceDB.Close()return output, fmt.Errorf("dataset not found: %s", input.DatasetID)}output.DatasetName = datasetName// Check if output file existsif !input.DryRun {if _, err := os.Stat(input.Output); err == nil && !input.Force {sourceDB.Close()return output, fmt.Errorf("output file exists: %s (use --force to overwrite)", input.Output)}}// Get FK order for tablesfkOrder, err := db.GetFKOrder(sourceDB)if err != nil {sourceDB.Close()return output, fmt.Errorf("failed to compute table order: %w", err)}// Sort our manifest by FK orderorderedTables := orderByFKDependency(datasetTables, fkOrder)// Calculate row counts for each tablefor _, tr := range orderedTables {count, err := countTableRows(ctx, sourceDB, tr, input.DatasetID)if err != nil {sourceDB.Close()return output, fmt.Errorf("failed to count rows in %s: %w", tr.Table, err)}if count > 0 {output.RowCounts[tr.Table] = count}}// If dry-run, return nowif input.DryRun {sourceDB.Close()output.Message = fmt.Sprintf("Would export dataset '%s' (%s)", datasetName, input.DatasetID)return output, nil}// Close source DB before creating output (DuckDB can't attach same file twice)sourceDB.Close()// Create output directory if neededoutputDir := filepath.Dir(input.Output)if outputDir != "" && outputDir != "." 
{if err := os.MkdirAll(outputDir, 0755); err != nil {return output, fmt.Errorf("failed to create output directory: %w", err)}}// Create output databaseoutputDB, err := createOutputDatabase(input.Output)if err != nil {return output, fmt.Errorf("failed to create output database: %w", err)}defer outputDB.Close()// Attach source database_, err = outputDB.ExecContext(ctx, fmt.Sprintf("ATTACH '%s' AS source", dbPath))if err != nil {return output, fmt.Errorf("failed to attach source database: %w", err)}// Copy data in FK orderfor _, tr := range orderedTables {switch tr.Relation {case "copy":// Copy entire table as-iserr = copyTableAsIs(ctx, outputDB, tr.Table)case "derived":// Derived tables are populated from already-copied dataerr = populateDerivedTable(ctx, outputDB, tr)default:err = copyTableData(ctx, outputDB, tr, input.DatasetID)}if err != nil {return output, fmt.Errorf("failed to copy %s: %w", tr.Table, err)}}// Detach source_, err = outputDB.ExecContext(ctx, "DETACH source")if err != nil {return output, fmt.Errorf("failed to detach source database: %w", err)}// Close output DB before getting file sizeoutputDB.Close()outputDB = nil// Get file sizeif info, err := os.Stat(input.Output); err == nil {output.FileSizeMB = float64(info.Size()) / 1024 / 1024}// Create empty event log fileeventLogPath := input.Output + ".events.jsonl"eventFile, err := os.Create(eventLogPath)if err != nil {return output, fmt.Errorf("failed to create event log file: %w", err)}eventFile.Close()output.Message = fmt.Sprintf("Successfully exported dataset '%s' (%s) to %s",datasetName, input.DatasetID, input.Output)return output, nil}// createOutputDatabase creates a new database with the schemafunc createOutputDatabase(outputPath string) (*sql.DB, error) {// Remove existing file if anyos.Remove(outputPath)// Open new database connectionconnStr := outputPath + "?access_mode=read_write"database, err := sql.Open("duckdb", connStr)if err != nil {return nil, fmt.Errorf("failed to create output 
database: %w", err)}// Read and execute schemaschemaSQL, err := db.ReadSchemaSQL()if err != nil {database.Close()return nil, fmt.Errorf("failed to read schema: %w", err)}statements := db.ExtractDDLStatements(schemaSQL)for _, stmt := range statements {// For CREATE TABLE ... AS SELECT, create table structure manuallyif stmt.Type == "CREATE_TABLE_AS" {createStmt := buildDerivedTableCreate(stmt.TableName)if createStmt != "" {if _, err := database.Exec(createStmt); err != nil {database.Close()return nil, fmt.Errorf("failed to create table %s: %w", stmt.TableName, err)}}} else {if _, err := database.Exec(stmt.SQL); err != nil {// Ignore "already exists" errors for typesif !strings.Contains(err.Error(), "already exists") {database.Close()return nil, fmt.Errorf("failed to execute DDL for %s: %w", stmt.TableName, err)}}}}return database, nil}// buildDerivedTableCreate builds a CREATE TABLE statement for tables defined as CREATE TABLE AS SELECTfunc buildDerivedTableCreate(tableName string) string {switch tableName {case "ebird_taxonomy_v2024":return `CREATE TABLE ebird_taxonomy_v2024 (id VARCHAR(12),species_code VARCHAR(15),primary_com_name VARCHAR(100),sci_name VARCHAR(100),bird_order VARCHAR(30),family VARCHAR(100))`default:return ""}}// copyTableAsIs copies an entire table without filteringfunc copyTableAsIs(ctx context.Context, outputDB *sql.DB, table string) error {query := fmt.Sprintf("INSERT INTO %s SELECT * FROM source.%s", table, table)_, err := outputDB.ExecContext(ctx, query)return err}// copyTableData copies data from source to output databasefunc copyTableData(ctx context.Context, outputDB *sql.DB, tr TableRelationship, datasetID string) error {var query stringswitch tr.Relation {case "owned":// Direct filter on dataset_id (or id for dataset table)if tr.Table == "dataset" {query = fmt.Sprintf("INSERT INTO %s SELECT * FROM source.%s WHERE id = ?", tr.Table, tr.Table)} else {query = fmt.Sprintf("INSERT INTO %s SELECT * FROM source.%s WHERE dataset_id = ?", 
tr.Table, tr.Table)}case "owned-via":// Filter via FK chainquery = buildOwnedViaQuery(tr, datasetID)case "referenced":// Subset via junction tablequery = buildReferencedQuery(tr, datasetID)default:return fmt.Errorf("unknown relation type: %s", tr.Relation)}_, err := outputDB.ExecContext(ctx, query, datasetID)return err}// buildOwnedViaQuery builds a query for owned-via tablesfunc buildOwnedViaQuery(tr TableRelationship, datasetID string) string {switch tr.ViaTable {case "cluster":return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%sWHERE %s IN (SELECT id FROM source.cluster WHERE dataset_id = ?)`,tr.Table, tr.Table, tr.FilterCol)case "file":return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%sWHERE %s IN (SELECT id FROM source.file WHERE cluster_id IN(SELECT id FROM source.cluster WHERE dataset_id = ?))`,tr.Table, tr.Table, tr.FilterCol)case "selection":return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%sWHERE %s IN (SELECT id FROM source.selection WHERE dataset_id = ?)`,tr.Table, tr.Table, tr.FilterCol)case "label":return fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%sWHERE %s IN (SELECT id FROM source.label WHERE selection_id IN(SELECT id FROM source.selection WHERE dataset_id = ?))`,tr.Table, tr.Table, tr.FilterCol)default:// Generic fallbackreturn fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%s WHERE %s IN(SELECT id FROM source.%s WHERE dataset_id = ?)`,tr.Table, tr.Table, tr.FilterCol, tr.ViaTable)}}// buildReferencedQuery builds a query for referenced tablesfunc buildReferencedQuery(tr TableRelationship, datasetID string) string {switch tr.Table {case "ebird_taxonomy":// Join via species.ebird_code and filter by species_datasetreturn `INSERT INTO ebird_taxonomy SELECT DISTINCT t.*FROM source.ebird_taxonomy tINNER JOIN source.species s ON t.species_code = s.ebird_code AND t.taxonomy_version = s.taxonomy_versionWHERE s.id IN (SELECT species_id FROM source.species_dataset WHERE dataset_id = ?)`case "filter":// Filter via label -> 
selection -> dataset_idreturn `INSERT INTO filter SELECT DISTINCT f.*FROM source.filter fWHERE f.id IN (SELECT l.filter_id FROM source.label lINNER JOIN source.selection s ON l.selection_id = s.idWHERE s.dataset_id = ?)`case "call_type":// Call types for species in this datasetreturn fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%sWHERE %s IN (SELECT id FROM source.species WHERE id IN(SELECT species_id FROM source.species_dataset WHERE dataset_id = ?))`,tr.Table, tr.Table, tr.FilterCol)case "cyclic_recording_pattern":// Patterns via cluster.dataset_idreturn `INSERT INTO cyclic_recording_pattern SELECT DISTINCT p.*FROM source.cyclic_recording_pattern pWHERE p.id IN (SELECT c.cyclic_recording_pattern_id FROM source.cluster cWHERE c.dataset_id = ? AND c.cyclic_recording_pattern_id IS NOT NULL)`default:// Generic: filter via junction tablereturn fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%sWHERE %s IN (SELECT %s FROM source.%s WHERE dataset_id = ?)`,tr.Table, tr.Table, tr.FilterCol, tr.ViaColumn, tr.ViaTable)}}// populateDerivedTable populates a derived table from already-copied datafunc populateDerivedTable(ctx context.Context, outputDB *sql.DB, tr TableRelationship) error {switch tr.Table {case "ebird_taxonomy_v2024":// Insert from already-copied ebird_taxonomy_, err := outputDB.ExecContext(ctx, `INSERT INTO ebird_taxonomy_v2024SELECT id, species_code, primary_com_name, sci_name, bird_order, familyFROM ebird_taxonomyWHERE taxonomy_version = '2024'`)return errdefault:return fmt.Errorf("unknown derived table: %s", tr.Table)}}// countTableRows counts rows for a table relationshipfunc countTableRows(ctx context.Context, db *sql.DB, tr TableRelationship, datasetID string) (int64, error) {var query stringswitch tr.Relation {case "copy":// Count all rows in tablequery = "SELECT COUNT(*) FROM " + tr.Tablecase "owned":if tr.Table == "dataset" {query = "SELECT COUNT(*) FROM " + tr.Table + " WHERE id = ?"} else {query = "SELECT COUNT(*) FROM " + tr.Table + " WHERE 
dataset_id = ?"}case "owned-via":query = buildCountOwnedViaQuery(tr)case "referenced":query = buildCountReferencedQuery(tr)case "derived":// Count what would be derived based on already-referenced dataif tr.Table == "ebird_taxonomy_v2024" {query = `SELECT COUNT(*) FROM ebird_taxonomy WHERE taxonomy_version = '2024'AND species_code IN (SELECT ebird_code FROM species WHERE id IN(SELECT species_id FROM species_dataset WHERE dataset_id = ?))`} else {return 0, nil}default:return 0, nil}var count int64err := db.QueryRowContext(ctx, query, datasetID).Scan(&count)return count, err}// buildCountOwnedViaQuery builds a count query for owned-via tablesfunc buildCountOwnedViaQuery(tr TableRelationship) string {switch tr.ViaTable {case "cluster":return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT id FROM cluster WHERE dataset_id = ?)`, tr.Table, tr.FilterCol)case "file":return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT id FROM file WHERE cluster_id IN(SELECT id FROM cluster WHERE dataset_id = ?))`, tr.Table, tr.FilterCol)case "selection":return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT id FROM selection WHERE dataset_id = ?)`, tr.Table, tr.FilterCol)case "label":return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT id FROM label WHERE selection_id IN(SELECT id FROM selection WHERE dataset_id = ?))`, tr.Table, tr.FilterCol)default:return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT id FROM %s WHERE dataset_id = ?)`, tr.Table, tr.FilterCol, tr.ViaTable)}}// buildCountReferencedQuery builds a count query for referenced tablesfunc buildCountReferencedQuery(tr TableRelationship) string {switch tr.Table {case "ebird_taxonomy":return `SELECT COUNT(DISTINCT ebird_taxonomy.id) FROM ebird_taxonomyINNER JOIN species ON ebird_taxonomy.species_code = species.ebird_codeAND ebird_taxonomy.taxonomy_version = species.taxonomy_versionWHERE species.id IN (SELECT species_id FROM species_dataset WHERE dataset_id = ?)`case "filter":// Filter 
via label -> selection -> dataset_idreturn `SELECT COUNT(DISTINCT filter.id) FROM filterINNER JOIN label ON filter.id = label.filter_idINNER JOIN selection ON label.selection_id = selection.idWHERE selection.dataset_id = ?`case "call_type":return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT id FROM species WHERE id IN(SELECT species_id FROM species_dataset WHERE dataset_id = ?))`, tr.Table, tr.FilterCol)case "cyclic_recording_pattern":return `SELECT COUNT(DISTINCT cyclic_recording_pattern.id) FROM cyclic_recording_patternINNER JOIN cluster ON cyclic_recording_pattern.id = cluster.cyclic_recording_pattern_idWHERE cluster.dataset_id = ?`default:return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT %s FROM %s WHERE dataset_id = ?)`, tr.Table, tr.FilterCol, tr.ViaColumn, tr.ViaTable)}}// orderByFKDependency sorts tables by FK dependency orderfunc orderByFKDependency(tables []TableRelationship, fkOrder []string) []TableRelationship {// Create a map for quick order lookuporderMap := make(map[string]int)for i, table := range fkOrder {orderMap[table] = i}// Sort by FK order, with copy tables lastsorted := make([]TableRelationship, len(tables))copy(sorted, tables)sort.Slice(sorted, func(i, j int) bool {ti, tj := sorted[i], sorted[j]// Copy tables always come lastif ti.Relation == "copy" && tj.Relation != "copy" {return false}if tj.Relation == "copy" && ti.Relation != "copy" {return true}if ti.Relation == "copy" && tj.Relation == "copy" {return ti.Table < tj.Table}// Derived tables come after their source tablesif ti.Relation == "derived" && tj.Relation != "derived" {return false}if tj.Relation == "derived" && ti.Relation != "derived" {return true}if ti.Relation == "derived" && tj.Relation == "derived" {// Order derived tables by their source tablesi, _ := orderMap[ti.SourceTable]sj, _ := orderMap[tj.SourceTable]return si < sj}// Use FK order for non-derived tablesoi, _ := orderMap[ti.Table]oj, _ := orderMap[tj.Table]return oi < oj})return sorted}
#!/bin/bash
# Test export dataset functionality
# Usage: ./test_export.sh [db_path]
#
# Exercises the `skraak export dataset` command end-to-end against a real
# database: dry-run, actual export, row-count verification, and error paths.
# Requires jq. Exits non-zero on the first failed check (set -e).

set -e

# Resolve paths relative to this script's directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
SKRAAK="$PROJECT_DIR/skraak"
DB_PATH="${1:-$PROJECT_DIR/db/test.duckdb}"
# $$ (PID) keeps concurrent runs from clobbering each other's export file.
EXPORT_DB="/tmp/skraak_export_test_$$.duckdb"

echo "=== Testing Export Dataset ==="
echo "Database: $DB_PATH"
echo ""

# Clean up any existing export
rm -f "$EXPORT_DB" "$EXPORT_DB.events.jsonl"

# Get a dataset ID to export
echo "Test 1: Get dataset ID..."
DATASET_ID=$("$SKRAAK" sql --db "$DB_PATH" "SELECT id FROM dataset WHERE active = true LIMIT 1" | jq -r '.rows[0].id')
if [ -z "$DATASET_ID" ] || [ "$DATASET_ID" = "null" ]; then
    echo "ERROR: No active dataset found"
    exit 1
fi
echo " Dataset ID: $DATASET_ID"

# Test dry-run
echo ""
echo "Test 2: Dry-run export..."
OUTPUT=$("$SKRAAK" export dataset --db "$DB_PATH" --id "$DATASET_ID" --output "$EXPORT_DB" --dry-run)
echo "$OUTPUT" | jq -r '.message'
DRY_RUN=$(echo "$OUTPUT" | jq -r '.dry_run')
if [ "$DRY_RUN" != "true" ]; then
    echo "ERROR: dry_run should be true"
    exit 1
fi
echo " ✓ Dry-run works"

# Verify no file created
if [ -f "$EXPORT_DB" ]; then
    echo "ERROR: Export file should not exist after dry-run"
    exit 1
fi
echo " ✓ No file created in dry-run mode"

# Test actual export
echo ""
echo "Test 3: Export dataset..."
OUTPUT=$("$SKRAAK" export dataset --db "$DB_PATH" --id "$DATASET_ID" --output "$EXPORT_DB" --force)
echo "$OUTPUT" | jq -r '.message'

# Verify export file exists
if [ ! -f "$EXPORT_DB" ]; then
    echo "ERROR: Export file not created"
    exit 1
fi
echo " ✓ Export file created"

# Verify event log file exists
if [ ! -f "$EXPORT_DB.events.jsonl" ]; then
    echo "ERROR: Event log file not created"
    exit 1
fi
echo " ✓ Event log file created"

# Verify row counts: the count reported by the export command must match a
# direct COUNT(*) against the exported database.
echo ""
echo "Test 4: Verify row counts..."
FILE_COUNT=$(echo "$OUTPUT" | jq -r '.row_counts.file')
EXPORTED_COUNT=$("$SKRAAK" sql --db "$EXPORT_DB" "SELECT COUNT(*) as count FROM file" | jq -r '.rows[0].count')
if [ "$FILE_COUNT" != "$EXPORTED_COUNT" ]; then
    echo "ERROR: File count mismatch: expected $FILE_COUNT, got $EXPORTED_COUNT"
    exit 1
fi
echo " ✓ Row counts match ($FILE_COUNT files)"

# Verify dataset
echo ""
echo "Test 5: Verify dataset..."
DATASET_COUNT=$("$SKRAAK" sql --db "$EXPORT_DB" "SELECT COUNT(*) as count FROM dataset WHERE id = '$DATASET_ID'" | jq -r '.rows[0].count')
if [ "$DATASET_COUNT" != "1" ]; then
    echo "ERROR: Dataset not found in export"
    exit 1
fi
echo " ✓ Dataset found in export"

# Test error handling - dataset not found
# `|| true` keeps set -e from aborting on the expected failure.
echo ""
echo "Test 6: Test error handling..."
ERROR=$("$SKRAAK" export dataset --db "$DB_PATH" --id "NOTAREALID" --output "$EXPORT_DB" 2>&1 || true)
if [[ ! "$ERROR" =~ "dataset not found" ]]; then
    echo "ERROR: Should report dataset not found"
    echo "$ERROR"
    exit 1
fi
echo " ✓ Error handling works for missing dataset"

# Test --force overwrite
echo ""
echo "Test 7: Test --force overwrite..."
OUTPUT=$("$SKRAAK" export dataset --db "$DB_PATH" --id "$DATASET_ID" --output "$EXPORT_DB" --force 2>&1)
if [[ "$OUTPUT" =~ "error" ]]; then
    echo "ERROR: Should not error with --force"
    echo "$OUTPUT"
    exit 1
fi
echo " ✓ --force overwrite works"

# Test error without --force
echo ""
echo "Test 8: Test error without --force..."
ERROR=$("$SKRAAK" export dataset --db "$DB_PATH" --id "$DATASET_ID" --output "$EXPORT_DB" 2>&1 || true)
if [[ ! "$ERROR" =~ "file exists" ]]; then
    echo "ERROR: Should report file exists"
    echo "$ERROR"
    exit 1
fi
echo " ✓ Error handling works for existing file"

# Clean up
rm -f "$EXPORT_DB" "$EXPORT_DB.events.jsonl"

echo ""
echo "=== All tests passed ==="
package db

import (
	"database/sql"
	"strings"
	"testing"

	_ "github.com/duckdb/duckdb-go/v2"
)

// TestReadSchemaSQL verifies the embedded schema contains the key DDL
// constructs the export code depends on.
func TestReadSchemaSQL(t *testing.T) {
	schema, err := ReadSchemaSQL()
	if err != nil {
		t.Fatalf("ReadSchemaSQL() error = %v", err)
	}
	// Verify schema contains expected elements
	if !strings.Contains(schema, "CREATE TABLE dataset") {
		t.Error("schema missing CREATE TABLE dataset")
	}
	if !strings.Contains(schema, "CREATE TYPE dataset_type") {
		t.Error("schema missing CREATE TYPE dataset_type")
	}
	if !strings.Contains(schema, "CREATE INDEX") {
		t.Error("schema missing CREATE INDEX")
	}
	if !strings.Contains(schema, "ebird_taxonomy_v2024") {
		t.Error("schema missing ebird_taxonomy_v2024")
	}
}

// TestExtractDDLStatements runs the splitter over the real schema and checks
// statement-type counts and the presence of the core tables.
func TestExtractDDLStatements(t *testing.T) {
	schema, err := ReadSchemaSQL()
	if err != nil {
		t.Fatalf("ReadSchemaSQL() error = %v", err)
	}
	statements := ExtractDDLStatements(schema)
	if len(statements) == 0 {
		t.Fatal("ExtractDDLStatements returned no statements")
	}
	// Count statement types
	typeCounts := make(map[string]int)
	tableNames := make(map[string]bool)
	for _, stmt := range statements {
		typeCounts[stmt.Type]++
		if stmt.TableName != "" {
			tableNames[stmt.TableName] = true
		}
		t.Logf("Statement type=%s table=%s sql=%s", stmt.Type, stmt.TableName, stmt.SQL[:min(50, len(stmt.SQL))])
	}
	// Verify we have all expected types
	if typeCounts["CREATE_TYPE"] < 2 {
		t.Errorf("expected at least 2 CREATE_TYPE statements, got %d", typeCounts["CREATE_TYPE"])
	}
	if typeCounts["CREATE_TABLE"] < 10 {
		t.Errorf("expected at least 10 CREATE_TABLE statements, got %d", typeCounts["CREATE_TABLE"])
	}
	if typeCounts["CREATE_INDEX"] < 5 {
		t.Errorf("expected at least 5 CREATE_INDEX statements, got %d", typeCounts["CREATE_INDEX"])
	}
	// CREATE_TABLE_AS might be 0 if the extraction logic changes - that's OK
	// as long as we handle it correctly in the export code
	// Verify key tables are found
	expectedTables := []string{"dataset", "location", "cluster", "file", "selection", "label"}
	for _, expected := range expectedTables {
		if !tableNames[expected] {
			t.Errorf("missing table %s in extracted statements", expected)
		}
	}
}

// min returns the smaller of a and b (local helper for truncating log output).
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// TestExtractDDLStatement_Types table-drives parseDDLStatement over each
// DDL statement kind and checks the detected type and name.
func TestExtractDDLStatement_Types(t *testing.T) {
	tests := []struct {
		name      string
		sql       string
		wantType  string
		wantTable string
	}{
		{
			name:      "CREATE TYPE",
			sql:       "CREATE TYPE dataset_type AS ENUM ('structured', 'unstructured');",
			wantType:  "CREATE_TYPE",
			wantTable: "",
		},
		{
			name:      "CREATE TABLE simple",
			sql:       "CREATE TABLE dataset (id VARCHAR(12) PRIMARY KEY);",
			wantType:  "CREATE_TABLE",
			wantTable: "dataset",
		},
		{
			name:      "CREATE TABLE with newlines",
			sql:       "CREATE TABLE location\n(\n id VARCHAR(12) PRIMARY KEY\n);",
			wantType:  "CREATE_TABLE",
			wantTable: "location",
		},
		{
			name:      "CREATE TABLE AS SELECT",
			sql:       "CREATE TABLE ebird_taxonomy_v2024 AS SELECT id, species_code FROM ebird_taxonomy WHERE taxonomy_version = '2024';",
			wantType:  "CREATE_TABLE_AS",
			wantTable: "ebird_taxonomy_v2024",
		},
		{
			// Note: for indexes, TableName carries the index name.
			name:      "CREATE INDEX",
			sql:       "CREATE INDEX idx_file_location ON file(location_id);",
			wantType:  "CREATE_INDEX",
			wantTable: "idx_file_location",
		},
		{
			name:      "CREATE UNIQUE INDEX",
			sql:       "CREATE UNIQUE INDEX idx_species_label ON species(label);",
			wantType:  "CREATE_INDEX",
			wantTable: "idx_species_label",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			stmt := parseDDLStatement(tt.sql)
			if stmt.Type != tt.wantType {
				t.Errorf("parseDDLStatement().Type = %v, want %v", stmt.Type, tt.wantType)
			}
			if stmt.TableName != tt.wantTable {
				t.Errorf("parseDDLStatement().TableName = %v, want %v", stmt.TableName, tt.wantTable)
			}
		})
	}
}

// TestExtractTableName covers the spacing/newline variants of CREATE TABLE.
func TestExtractTableName(t *testing.T) {
	tests := []struct {
		name string
		sql  string
		want string
	}{
		{
			name: "simple table",
			sql:  "CREATE TABLE dataset (id VARCHAR(12) PRIMARY KEY",
			want: "dataset",
		},
		{
			name: "table with space before paren",
			sql:  "CREATE TABLE location (id VARCHAR(12)",
			want: "location",
		},
		{
			name: "table with newline",
			sql:  "CREATE TABLE cluster\n(\n id VARCHAR(12)",
			want: "cluster",
		},
		{
			name: "table with no space",
			sql:  "CREATE TABLE file(id VARCHAR(21)",
			want: "file",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := extractTableName(tt.sql)
			if got != tt.want {
				t.Errorf("extractTableName() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestExtractTableNameAsSelect covers CREATE TABLE ... AS SELECT name parsing.
func TestExtractTableNameAsSelect(t *testing.T) {
	tests := []struct {
		name string
		sql  string
		want string
	}{
		{
			name: "simple AS SELECT",
			sql:  "CREATE TABLE ebird_taxonomy_v2024 AS SELECT id FROM ebird_taxonomy",
			want: "ebird_taxonomy_v2024",
		},
		{
			name: "AS SELECT with newlines",
			sql:  "CREATE TABLE my_view AS\nSELECT id\nFROM source",
			want: "my_view",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := extractTableNameAsSelect(tt.sql)
			if got != tt.want {
				t.Errorf("extractTableNameAsSelect() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestExtractIndexName covers plain and UNIQUE index name parsing.
func TestExtractIndexName(t *testing.T) {
	tests := []struct {
		name string
		sql  string
		want string
	}{
		{
			name: "CREATE INDEX",
			sql:  "CREATE INDEX idx_file_location ON file(location_id)",
			want: "idx_file_location",
		},
		{
			name: "CREATE UNIQUE INDEX",
			sql:  "CREATE UNIQUE INDEX idx_species_label ON species(label)",
			want: "idx_species_label",
		},
		{
			name: "index with spaces",
			sql:  "CREATE INDEX idx_test ON table_name (column)",
			want: "idx_test",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := extractIndexName(tt.sql)
			if got != tt.want {
				t.Errorf("extractIndexName() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestExtractDDLStatements_SkipsComments checks that `--` comment lines are
// dropped rather than emitted as statements.
func TestExtractDDLStatements_SkipsComments(t *testing.T) {
	schema := `-- This is a comment
CREATE TABLE test (id INT);
-- Another comment
CREATE INDEX idx_test ON test(id);`
	statements := ExtractDDLStatements(schema)
	// Should have 2 statements, not 4
	if len(statements) != 2 {
		t.Errorf("expected 2 statements, got %d", len(statements))
	}
	for _, stmt := range statements {
		if strings.Contains(stmt.SQL, "--") {
			t.Errorf("statement should not contain comments: %s", stmt.SQL)
		}
	}
}

// TestGetFKOrder builds a small FK chain in an in-memory DuckDB and checks
// that parents sort before children in the computed order.
func TestGetFKOrder(t *testing.T) {
	// Use in-memory database
	db, err := sql.Open("duckdb", ":memory:")
	if err != nil {
		t.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()
	// Create tables with FK relationships
	schema := `CREATE TABLE parent (id VARCHAR(12) PRIMARY KEY);
CREATE TABLE child (id VARCHAR(12) PRIMARY KEY, parent_id VARCHAR(12), FOREIGN KEY (parent_id) REFERENCES parent(id));
CREATE TABLE grandchild (id VARCHAR(12) PRIMARY KEY, child_id VARCHAR(12), FOREIGN KEY (child_id) REFERENCES child(id));
CREATE TABLE independent (id VARCHAR(12) PRIMARY KEY);`
	_, err = db.Exec(schema)
	if err != nil {
		t.Fatalf("failed to create schema: %v", err)
	}
	order, err := GetFKOrder(db)
	if err != nil {
		t.Fatalf("GetFKOrder() error = %v", err)
	}
	// Build a map for quick lookup
	orderMap := make(map[string]int)
	for i, table := range order {
		orderMap[table] = i
	}
	// Verify order: parent must come before child, child before grandchild
	if orderMap["parent"] >= orderMap["child"] {
		t.Error("parent should come before child")
	}
	if orderMap["child"] >= orderMap["grandchild"] {
		t.Error("child should come before grandchild")
	}
	// Independent table can be anywhere
	if _, ok := orderMap["independent"]; !ok {
		t.Error("independent table missing from order")
	}
}

// TestGetTableRowCount checks the row-count helper against a table with a
// known number of rows.
func TestGetTableRowCount(t *testing.T) {
	// Use in-memory database
	db, err := sql.Open("duckdb", ":memory:")
	if err != nil {
		t.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()
	// Create and populate table
	_, err = db.Exec("CREATE TABLE test (id INT)")
	if err != nil {
		t.Fatalf("failed to create table: %v", err)
	}
	_, err = db.Exec("INSERT INTO test VALUES (1), (2), (3)")
	if err != nil {
		t.Fatalf("failed to insert: %v", err)
	}
	count, err := GetTableRowCount(db, "test")
	if err != nil {
		t.Fatalf("GetTableRowCount() error = %v", err)
	}
	if count != 3 {
		t.Errorf("GetTableRowCount() = %d, want 3", count)
	}
}
package db

import (
	"database/sql"
	"embed"
	"fmt"
	"sort"
	"strings"
)

//go:embed schema.sql
var schemaFS embed.FS

// ReadSchemaSQL reads the schema.sql file.
// Uses the embedded copy so distributed binaries are self-contained.
func ReadSchemaSQL() (string, error) {
	data, err := schemaFS.ReadFile("schema.sql")
	if err != nil {
		return "", fmt.Errorf("failed to read schema.sql: %w", err)
	}
	return string(data), nil
}

// DDLStatement represents a parsed DDL statement with metadata.
type DDLStatement struct {
	SQL       string
	Type      string // "CREATE_TYPE", "CREATE_TABLE", "CREATE_INDEX", "CREATE_TABLE_AS", "UNKNOWN"
	TableName string // table name for CREATE TABLE; index name for CREATE INDEX
}

// ExtractDDLStatements splits schema SQL into executable DDL statements.
// Statements are returned in their order of appearance in the schema; comment
// lines (leading "--") and blank lines are skipped. CREATE TABLE ... AS SELECT
// is detected and marked with its own Type but still included.
func ExtractDDLStatements(schemaSQL string) []DDLStatement {
	var statements []DDLStatement

	// Accumulate lines until a line ends with ";", then flush one statement.
	lines := strings.Split(schemaSQL, "\n")
	var currentStmt strings.Builder
	for _, line := range lines {
		trimmed := strings.TrimSpace(line)
		// Skip empty lines and comment-only lines.
		if trimmed == "" || strings.HasPrefix(trimmed, "--") {
			continue
		}
		currentStmt.WriteString(line)
		currentStmt.WriteString("\n")
		// Statement ends at semicolon.
		if strings.HasSuffix(trimmed, ";") {
			sql := strings.TrimSpace(currentStmt.String())
			if sql != "" {
				statements = append(statements, parseDDLStatement(sql))
			}
			currentStmt.Reset()
		}
	}
	// Include a final statement that lacks a trailing semicolon.
	// BUG FIX: this branch previously also required HasSuffix(sql, ";"),
	// which can never hold here (semicolon-terminated statements are flushed
	// inside the loop), so a trailing unterminated statement was dropped.
	if currentStmt.Len() > 0 {
		sql := strings.TrimSpace(currentStmt.String())
		if sql != "" {
			statements = append(statements, parseDDLStatement(sql))
		}
	}
	return statements
}

// parseDDLStatement determines the type and table/index name of a DDL statement.
func parseDDLStatement(sql string) DDLStatement {
	upper := strings.ToUpper(sql)
	switch {
	case strings.HasPrefix(upper, "CREATE TYPE"):
		return DDLStatement{SQL: sql, Type: "CREATE_TYPE"}
	case isCreateTableAsSelect(upper):
		// CREATE TABLE name AS SELECT ...
		return DDLStatement{SQL: sql, Type: "CREATE_TABLE_AS", TableName: extractTableNameAsSelect(sql)}
	case strings.HasPrefix(upper, "CREATE TABLE"):
		return DDLStatement{SQL: sql, Type: "CREATE_TABLE", TableName: extractTableName(sql)}
	case strings.HasPrefix(upper, "CREATE INDEX") || strings.HasPrefix(upper, "CREATE UNIQUE INDEX"):
		return DDLStatement{SQL: sql, Type: "CREATE_INDEX", TableName: extractIndexName(sql)}
	default:
		return DDLStatement{SQL: sql, Type: "UNKNOWN"}
	}
}

// isCreateTableAsSelect reports whether upper (an already upper-cased SQL
// string) is a CREATE TABLE ... AS SELECT statement.
func isCreateTableAsSelect(upper string) bool {
	if !strings.HasPrefix(upper, "CREATE TABLE") {
		return false
	}
	rest := upper[len("CREATE TABLE"):]
	// Find " AS" followed (after whitespace) by SELECT.
	asIdx := strings.Index(rest, " AS")
	if asIdx == -1 {
		return false
	}
	afterAs := strings.TrimSpace(rest[asIdx+len(" AS"):])
	return strings.HasPrefix(afterAs, "SELECT")
}

// extractTableName extracts the table name from a CREATE TABLE statement:
// the text between "CREATE TABLE" and the opening parenthesis (or end of
// string). Quoted identifiers are returned with quotes intact.
func extractTableName(sql string) string {
	upper := strings.ToUpper(sql)
	idx := strings.Index(upper, "CREATE TABLE")
	if idx == -1 {
		return ""
	}
	rest := strings.TrimSpace(sql[idx+len("CREATE TABLE"):])
	endIdx := strings.Index(rest, "(")
	if endIdx == -1 {
		endIdx = len(rest)
	}
	return strings.TrimSpace(rest[:endIdx])
}

// extractTableNameAsSelect extracts the table name from a
// CREATE TABLE ... AS SELECT statement (the AS may be followed by a newline).
func extractTableNameAsSelect(sql string) string {
	upper := strings.ToUpper(sql)
	idx := strings.Index(upper, "CREATE TABLE")
	if idx == -1 {
		return ""
	}
	rest := strings.TrimSpace(sql[idx+len("CREATE TABLE"):])
	asIdx := strings.Index(strings.ToUpper(rest), " AS")
	if asIdx == -1 {
		return ""
	}
	return strings.TrimSpace(rest[:asIdx])
}

// extractIndexName extracts the index name from a CREATE [UNIQUE] INDEX
// statement: the text between the INDEX keyword and " ON ".
func extractIndexName(sql string) string {
	upper := strings.ToUpper(sql)
	var rest string
	switch {
	case strings.HasPrefix(upper, "CREATE UNIQUE INDEX"):
		rest = sql[len("CREATE UNIQUE INDEX"):]
	case strings.HasPrefix(upper, "CREATE INDEX"):
		rest = sql[len("CREATE INDEX"):]
	default:
		return ""
	}
	rest = strings.TrimSpace(rest)
	onIdx := strings.Index(strings.ToUpper(rest), " ON ")
	if onIdx == -1 {
		return ""
	}
	return strings.TrimSpace(rest[:onIdx])
}

// FKRelation represents a foreign key relationship between tables.
type FKRelation struct {
	Table        string // table that has the FK
	Column       string // FK column
	ForeignTable string // referenced table
}

// GetFKOrder computes the order tables should be copied in based on FK
// dependencies: tables with no FKs first, then dependents in topological
// order (Kahn's algorithm). Tables in a FK cycle are appended at the end.
// The result is deterministic for a given schema (ties sorted by name).
func GetFKOrder(db *sql.DB) ([]string, error) {
	// Use DuckDB's duckdb_constraints() function for accurate FK info.
	query := `SELECT table_name, referenced_table
FROM duckdb_constraints()
WHERE constraint_type = 'FOREIGN KEY'
AND referenced_table IS NOT NULL`
	rows, err := db.Query(query)
	if err != nil {
		return nil, fmt.Errorf("failed to query FK relationships: %w", err)
	}
	defer rows.Close()

	// Reverse dependency graph: dependsOnMe[A] = [B, C] means B and C have
	// FKs referencing A.
	dependsOnMe := make(map[string][]string)
	tables := make(map[string]bool)
	for rows.Next() {
		var table, foreignTable string
		if err := rows.Scan(&table, &foreignTable); err != nil {
			return nil, fmt.Errorf("failed to scan FK row: %w", err)
		}
		tables[table] = true
		tables[foreignTable] = true
		dependsOnMe[foreignTable] = append(dependsOnMe[foreignTable], table)
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating FK rows: %w", err)
	}

	// Also pick up tables that have no FK relationships at all.
	tableRows, err := db.Query(`SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'main'
AND table_type = 'BASE TABLE'`)
	if err != nil {
		return nil, fmt.Errorf("failed to query tables: %w", err)
	}
	defer tableRows.Close()
	for tableRows.Next() {
		var name string
		if err := tableRows.Scan(&name); err != nil {
			return nil, fmt.Errorf("failed to scan table name: %w", err)
		}
		tables[name] = true
	}
	// BUG FIX: the iteration error of this second query was never checked.
	if err := tableRows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating table rows: %w", err)
	}

	// In-degree: number of FKs each table has (tables it depends on).
	fkCount := make(map[string]int, len(tables))
	for table := range tables {
		fkCount[table] = 0
	}
	for _, dependents := range dependsOnMe {
		for _, dependent := range dependents {
			fkCount[dependent]++
		}
	}

	// Kahn's algorithm. Sort the seeds and adjacency lists so the output is
	// deterministic (map iteration order is random).
	var queue []string
	for table := range tables {
		if fkCount[table] == 0 {
			queue = append(queue, table)
		}
	}
	sort.Strings(queue)
	for table := range dependsOnMe {
		sort.Strings(dependsOnMe[table])
	}

	result := make([]string, 0, len(tables))
	for len(queue) > 0 {
		current := queue[0]
		queue = queue[1:]
		result = append(result, current)
		// Releasing current may unblock its dependents.
		for _, dependent := range dependsOnMe[current] {
			fkCount[dependent]--
			if fkCount[dependent] == 0 {
				queue = append(queue, dependent)
			}
		}
	}

	// If some tables were never reached there is a FK cycle; append the
	// remainder (sorted) so every table is still present in the result.
	if len(result) != len(tables) {
		seen := make(map[string]bool, len(result))
		for _, r := range result {
			seen[r] = true
		}
		var remaining []string
		for table := range tables {
			if !seen[table] {
				remaining = append(remaining, table)
			}
		}
		sort.Strings(remaining)
		result = append(result, remaining...)
	}
	return result, nil
}

// GetTableRowCount returns the number of rows in a table.
// NOTE(review): the table name is interpolated directly into the SQL, so it
// must come from trusted schema metadata, never from user input.
func GetTableRowCount(db *sql.DB, table string) (int64, error) {
	var count int64
	err := db.QueryRow(fmt.Sprintf("SELECT COUNT(*) FROM %s", table)).Scan(&count)
	if err != nil {
		return 0, fmt.Errorf("failed to count rows in %s: %w", table, err)
	}
	return count, nil
}
package cmd

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"os"

	"skraak/tools"
)

// RunExport handles the "export" subcommand, dispatching to the matching
// sub-subcommand. Exits non-zero on unknown or missing subcommands.
func RunExport(args []string) {
	if len(args) < 1 {
		printExportUsage()
		os.Exit(1)
	}
	switch args[0] {
	case "dataset":
		runExportDataset(args[1:])
	default:
		fmt.Fprintf(os.Stderr, "Unknown export subcommand: %s\n\n", args[0])
		printExportUsage()
		os.Exit(1)
	}
}

// printExportUsage writes top-level usage for "skraak export" to stderr.
func printExportUsage() {
	fmt.Fprintf(os.Stderr, "Usage: skraak export <subcommand> [options]\n\n")
	fmt.Fprintf(os.Stderr, "Subcommands:\n")
	fmt.Fprintf(os.Stderr, " dataset Export a dataset with all related data\n")
	fmt.Fprintf(os.Stderr, "\nExamples:\n")
	fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb\n")
	fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --dry-run\n")
}

// runExportDataset parses flags for "skraak export dataset", invokes the
// export tool, and prints the result as indented JSON on stdout.
// Exits non-zero on missing flags or export failure.
func runExportDataset(args []string) {
	fs := flag.NewFlagSet("export dataset", flag.ExitOnError)
	dbPath := fs.String("db", "", "Path to source DuckDB database (required)")
	datasetID := fs.String("id", "", "Dataset ID to export (required)")
	output := fs.String("output", "", "Output database path (required)")
	dryRun := fs.Bool("dry-run", false, "Show what would be exported without creating file")
	force := fs.Bool("force", false, "Overwrite existing output file")
	fs.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: skraak export dataset --db <path> --id <dataset_id> --output <path> [options]\n\n")
		fmt.Fprintf(os.Stderr, "Export a dataset with all related data to a new DuckDB database.\n\n")
		fmt.Fprintf(os.Stderr, "Options:\n")
		fs.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nExamples:\n")
		fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb\n")
		fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --dry-run\n")
		fmt.Fprintf(os.Stderr, " skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --force\n")
	}
	// flag.ExitOnError makes Parse exit itself on failure; this check is
	// defensive only.
	if err := fs.Parse(args); err != nil {
		os.Exit(1)
	}

	// Validate required flags, reporting all missing ones at once.
	missing := []string{}
	if *dbPath == "" {
		missing = append(missing, "--db")
	}
	if *datasetID == "" {
		missing = append(missing, "--id")
	}
	if *output == "" {
		missing = append(missing, "--output")
	}
	if len(missing) > 0 {
		fmt.Fprintf(os.Stderr, "Error: missing required flags: %v\n\n", missing)
		fs.Usage()
		os.Exit(1)
	}

	tools.SetDBPath(*dbPath)
	input := tools.ExportDatasetInput{
		DatasetID: *datasetID,
		Output:    *output,
		DryRun:    *dryRun,
		Force:     *force,
	}
	outputResult, err := tools.ExportDataset(context.Background(), input)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
		os.Exit(1)
	}

	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", " ")
	// BUG FIX: the Encode error was previously ignored; a failed write to
	// stdout now surfaces as a non-zero exit instead of silent truncation.
	if err := enc.Encode(outputResult); err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
		os.Exit(1)
	}
}
## Dataset Export

Export a dataset with all related data to a new DuckDB database for collaboration, testing, or archival.

**Use cases:**

- **Collaboration:** Export, send to collaborator, they return event log for replay
- **Testing:** Create focused test database from production (100 MB vs 1.5 GB)
- **Archival:** Snapshot a dataset at a point in time
**Export:**

```bash
# Export dataset to new database
./skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb

# Preview without creating file
./skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --dry-run

# Overwrite existing export
./skraak export dataset --db ./db/skraak.duckdb --id abc123 --output export.duckdb --force
```

**What's exported:**

- All rows owned by dataset (via dataset_id foreign key traversal)
- Subset of reference data (species, patterns, filters used)
- Creates empty event log file for changes

**Re-import changes:**

```bash
# After collaborator returns event log, replay on backup
./skraak replay events --db ./backup.duckdb --log export.duckdb.events.jsonl
```
---

## Dataset Export

Export a dataset with all related data to a new DuckDB database.

```bash
skraak export dataset --db skraak.duckdb --id abc123 --output export.duckdb
skraak export dataset --db skraak.duckdb --id abc123 --output export.duckdb --dry-run
```

**Use cases:** Collaboration, testing (small DB vs 1.5 GB), archival

**Key files:**

- `tools/export.go` - Export logic with table manifest
- `db/schema.go` - Schema utilities (DDL extraction, FK ordering)
**CLI Commands:** `mcp`, `sql`, `dataset`, `location`, `cluster`, `pattern`, `import`, `replay`
**CLI Commands:** `mcp`, `sql`, `dataset`, `location`, `cluster`, `pattern`, `import`, `export`, `replay`
## [2026-02-19] Dataset Export for Collaboration and Testing

**New feature: Export a dataset with all related data to a new database**

**Purpose:** Enable dataset-level exports for collaboration (export, modify, replay changes), testing (small focused test DBs), and archival.

**Architecture:**

- Schema read from embedded `db/schema.sql` (DDL statements extracted dynamically)
- Table copy order computed from FK relationships using `duckdb_constraints()`
- ATTACH mechanism for efficient cross-database copying
- Declarative manifest defines table relationships

**Added:**

- `tools/export.go` — `ExportDataset()` with table manifest and copy logic
- `cmd/export.go` — `skraak export dataset` CLI command
- `db/schema.go` — Schema utilities: `ReadSchemaSQL()`, `ExtractDDLStatements()`, `GetFKOrder()`
- `shell_scripts/test_export.sh` — Integration test script

**Command:**

```bash
skraak export dataset --db skraak.duckdb --id abc123 --output export.duckdb
skraak export dataset --db skraak.duckdb --id abc123 --output export.duckdb --dry-run
skraak export dataset --db skraak.duckdb --id abc123 --output export.duckdb --force
```

**What's exported:**

- Dataset row and all owned data (locations, clusters, files, selections, labels)
- Subset of reference data (species, patterns, filters used by dataset)
- Reference tables copied as-is (`ebird_taxonomy_v2024`)
- Empty event log created for capturing changes

**Design decisions:**

- Schema from `schema.sql` ensures schema-resilience (new columns auto-included)
- FK order computed dynamically via `duckdb_constraints()` function
- Close source DB before output DB (DuckDB single-connection limit)
- `SELECT *` copies all columns without hard-coding