76377HW6NPWGB77BON4ZDDVMWD4QHQARAMUTJGZCIW7PE2MVTUAAC JHLW3BKOFX3R2YUVEHWL5KZHEGPI3I73FEOD65AZJQNDLHIB2RAAC POT4DNZ3HEX4TZWQLLI6WUZPRFLOFWRBJAFBM6MKEJG2GLVR3JZAC OGLLBQQYE5KICDMI6EX7ZI4TZT5RB7UFHH7O2DUOZ44QQXVL5YAAC J62FGJ3BGFTUWEOUON4ATYNDFBEUIR6FCIOZSHOVHAA7KGFYHW6AC IFVRAERTCCDICNTYTG3TX2WASB6RXQQEJWWXQMQZJSQDQ3HLE5OQC 5LMYPB2QHNVDLYCRWLOMCPY35ZKHHPYVW5XHASE66L6PJZSOCXYQC W3A2EECCD23SVHJZN6MXPH2PAVFHH5CNFD2XHPQRRW6M4GUTG3FAC RMWLXG5HGB44LH3CEA7FWTAFPSZQZQGH52OHVQJUDP6ASPVDVQJAC 5IIWZLQR5ZKOW4OUJCQMJDPWXZKDLGRTK4DQIELXHX7VUBTXUJ4AC Table string // table nameRelation string // "owned" | "owned-via" | "referenced" | "derived"FilterCol string // column to filter onViaTable string // for referenced: junction tableViaColumn string // for referenced: column in junction tableSourceTable string // for derived: source table
Table string // table nameRelation string // "owned" | "owned-via" | "copy"FilterCol string // column to filter onViaTable string // for owned-via: table to join through
// Referenced (subset extraction){Table: "species", Relation: "referenced", FilterCol: "id", ViaTable: "species_dataset", ViaColumn: "species_id"},{Table: "call_type", Relation: "referenced", FilterCol: "species_id", ViaTable: "species", ViaColumn: "id"},
// Referenced (subset extraction) - none remaining
createStmt := buildDerivedTableCreate(stmt.TableName)if createStmt != "" {if _, err := database.Exec(createStmt); err != nil {database.Close()return nil, fmt.Errorf("failed to create table %s: %w", stmt.TableName, err)}}} else {if _, err := database.Exec(stmt.SQL); err != nil {// Ignore "already exists" errors for typesif !strings.Contains(err.Error(), "already exists") {database.Close()return nil, fmt.Errorf("failed to execute DDL for %s: %w", stmt.TableName, err)}
continue}if _, err := database.Exec(stmt.SQL); err != nil {// Ignore "already exists" errors for typesif !strings.Contains(err.Error(), "already exists") {database.Close()return nil, fmt.Errorf("failed to execute DDL for %s: %w", stmt.TableName, err)
// buildDerivedTableCreate builds a CREATE TABLE statement for tables defined as CREATE TABLE AS SELECTfunc buildDerivedTableCreate(tableName string) string {switch tableName {case "ebird_taxonomy_v2024":return `CREATE TABLE ebird_taxonomy_v2024 (id VARCHAR(12),species_code VARCHAR(15),primary_com_name VARCHAR(100),sci_name VARCHAR(100),bird_order VARCHAR(30),family VARCHAR(100))`default:return ""}}
}}// buildReferencedQuery builds a query for referenced tablesfunc buildReferencedQuery(tr TableRelationship, datasetID string) string {switch tr.Table {case "ebird_taxonomy":// Join via species.ebird_code and filter by species_datasetreturn `INSERT INTO ebird_taxonomy SELECT DISTINCT t.*FROM source.ebird_taxonomy tINNER JOIN source.species s ON t.species_code = s.ebird_code AND t.taxonomy_version = s.taxonomy_versionWHERE s.id IN (SELECT species_id FROM source.species_dataset WHERE dataset_id = ?)`case "filter":// Filter via label -> selection -> dataset_idreturn `INSERT INTO filter SELECT DISTINCT f.*FROM source.filter fWHERE f.id IN (SELECT l.filter_id FROM source.label lINNER JOIN source.selection s ON l.selection_id = s.idWHERE s.dataset_id = ?)`case "call_type":// Call types for species in this datasetreturn fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%sWHERE %s IN (SELECT id FROM source.species WHERE id IN(SELECT species_id FROM source.species_dataset WHERE dataset_id = ?))`,tr.Table, tr.Table, tr.FilterCol)case "cyclic_recording_pattern":// Patterns via cluster.dataset_idreturn `INSERT INTO cyclic_recording_pattern SELECT DISTINCT p.*FROM source.cyclic_recording_pattern pWHERE p.id IN (SELECT c.cyclic_recording_pattern_id FROM source.cluster cWHERE c.dataset_id = ? AND c.cyclic_recording_pattern_id IS NOT NULL)`default:// Generic: filter via junction tablereturn fmt.Sprintf(`INSERT INTO %s SELECT * FROM source.%sWHERE %s IN (SELECT %s FROM source.%s WHERE dataset_id = ?)`,tr.Table, tr.Table, tr.FilterCol, tr.ViaColumn, tr.ViaTable)}}// populateDerivedTable populates a derived table from already-copied datafunc populateDerivedTable(ctx context.Context, outputDB *sql.DB, tr TableRelationship) error {switch tr.Table {case "ebird_taxonomy_v2024":// Insert from already-copied ebird_taxonomy_, err := outputDB.ExecContext(ctx, `INSERT INTO ebird_taxonomy_v2024SELECT id, species_code, primary_com_name, sci_name, bird_order, familyFROM ebird_taxonomyWHERE taxonomy_version = '2024'`)return errdefault:return fmt.Errorf("unknown derived table: %s", tr.Table)
case "referenced":query = buildCountReferencedQuery(tr)case "derived":// Count what would be derived based on already-referenced dataif tr.Table == "ebird_taxonomy_v2024" {query = `SELECT COUNT(*) FROM ebird_taxonomy WHERE taxonomy_version = '2024'AND species_code IN (SELECT ebird_code FROM species WHERE id IN(SELECT species_id FROM species_dataset WHERE dataset_id = ?))`} else {return 0, nil}
switch tr.Table {case "ebird_taxonomy":return `SELECT COUNT(DISTINCT ebird_taxonomy.id) FROM ebird_taxonomyINNER JOIN species ON ebird_taxonomy.species_code = species.ebird_codeAND ebird_taxonomy.taxonomy_version = species.taxonomy_versionWHERE species.id IN (SELECT species_id FROM species_dataset WHERE dataset_id = ?)`case "filter":// Filter via label -> selection -> dataset_idreturn `SELECT COUNT(DISTINCT filter.id) FROM filterINNER JOIN label ON filter.id = label.filter_idINNER JOIN selection ON label.selection_id = selection.idWHERE selection.dataset_id = ?`case "call_type":return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT id FROM species WHERE id IN(SELECT species_id FROM species_dataset WHERE dataset_id = ?))`, tr.Table, tr.FilterCol)case "cyclic_recording_pattern":return `SELECT COUNT(DISTINCT cyclic_recording_pattern.id) FROM cyclic_recording_patternINNER JOIN cluster ON cyclic_recording_pattern.id = cluster.cyclic_recording_pattern_idWHERE cluster.dataset_id = ?`default:return fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE %s IN(SELECT %s FROM %s WHERE dataset_id = ?)`, tr.Table, tr.FilterCol, tr.ViaColumn, tr.ViaTable)}
// Only filter table is referenced via label -> selection -> dataset_idreturn `SELECT COUNT(DISTINCT filter.id) FROM filterINNER JOIN label ON filter.id = label.filter_idINNER JOIN selection ON label.selection_id = selection.idWHERE selection.dataset_id = ?`
// Derived tables come after their source tablesif ti.Relation == "derived" && tj.Relation != "derived" {return false}if tj.Relation == "derived" && ti.Relation != "derived" {return true}if ti.Relation == "derived" && tj.Relation == "derived" {// Order derived tables by their source tablesi, _ := orderMap[ti.SourceTable]sj, _ := orderMap[tj.SourceTable]return si < sj}// Use FK order for non-derived tables
Description: "SQL schema for a specific table. Available tables: dataset, location, cyclic_recording_pattern, cluster, file, moth_metadata, file_metadata, file_dataset, selection, selection_metadata, ebird_taxonomy, species, call_type, filter, label, label_subtype, ebird_taxonomy_v2024, species_dataset",
Description: "SQL schema for a specific table. Available tables: dataset, location, cyclic_recording_pattern, cluster, file, moth_metadata, file_metadata, file_dataset, selection, selection_metadata, ebird_taxonomy, species, call_type, filter, label, label_subtype",
CREATE TABLE species_dataset (species_id VARCHAR(12) NOT NULL,dataset_id VARCHAR(12) NOT NULL,created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,PRIMARY KEY (species_id, dataset_id),FOREIGN KEY (species_id) REFERENCES species(id),FOREIGN KEY (dataset_id) REFERENCES dataset(id));CREATE TABLE ebird_taxonomy_v2024 ASSELECTid,species_code,primary_com_name,sci_name,bird_order, -- AS "order", order is reserved word in pgsqlfamilyFROM ebird_taxonomyWHERE taxonomy_version = '2024';
## [2026-02-19] Schema Simplification - Remove species_dataset and ebird_taxonomy_v2024**Database schema changes:**- Dropped `species_dataset` table — all species now available across all datasets- Dropped `ebird_taxonomy_v2024` table — use `WHERE taxonomy_version = '2024'` on `ebird_taxonomy` instead**Rationale:**- Simplifies species management (no duplicate species names across datasets)- Reduces schema complexity (one fewer join for species lookups)- `ebird_taxonomy_v2024` was redundant; filtering `ebird_taxonomy` directly is sufficient**Code changes:**- `tools/export.go` — Simplified manifest: `species` and `call_type` now "copy" (full table)- `tools/export.go` — Removed `buildDerivedTableCreate()`, `populateDerivedTable()`, simplified `buildReferencedQuery()`- `tools/import_ml_selections.go` — Species lookup no longer joins `species_dataset`- `resources/schema.go` — Removed tables from list- `db/schema_test.go` — Removed obsolete test cases- `prompts/examples.go` — Updated taxonomy schema description**Export manifest changes:**- `species_dataset` → removed (no longer exists)- `ebird_taxonomy_v2024` → removed (no longer exists)- `species` → changed from "referenced" to "copy"- `call_type` → changed from "referenced" to "copy"- `filter` → changed from "referenced" to "copy"- All "referenced" and "derived" handling code removed---
- Subset of reference data (species, call_types used by dataset via species_dataset)- Reference tables copied in full (`ebird_taxonomy`, `ebird_taxonomy_v2024`, `cyclic_recording_pattern`, `filter`)
- Reference tables copied in full (`ebird_taxonomy`, `species`, `call_type`, `cyclic_recording_pattern`, `filter`)
*.parquet