quietlight/skraak_mcp: db/schema.sql

-- NOTE: DBML does not like functions and materialised views
-- sql2dbml schema.sql --postgres -o schema.dbml
-- dbml-renderer -i schema.dbml -o schema.svg 

-- junction table for files-dataset relationship
-- (file can be a member of many datasets)
-- (selections apply only to 1 dataset)

-- use suncalc.js
-- could use a function for night, on client not db, but want to filter on night
-- in file table use mid point of file as time

-- all times must be zoned to utc 

-- dataset type enum
CREATE TYPE dataset_type AS ENUM ('organise', 'test', 'train');

-- Dataset Table
-- Add type column to the dataset table, so that I do not ever mix testing data with training data.
CREATE TABLE dataset (
    id VARCHAR(12) PRIMARY KEY, -- nanoid(12)
    name VARCHAR(255) NOT NULL,
    description VARCHAR(255),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    type dataset_type NOT NULL DEFAULT 'organise'
);

CREATE INDEX idx_dataset_name ON dataset(name);
CREATE INDEX idx_dataset_active ON dataset(active);
CREATE INDEX idx_dataset_public ON dataset(public);

-- Locations Table
CREATE TABLE location (
    id VARCHAR(12) PRIMARY KEY, -- nanoid(12)
    dataset_id VARCHAR(12) NOT NULL, -- nanoid, link to dataset
    name VARCHAR(140) NOT NULL,
    latitude DECIMAL(10, 7) NOT NULL CHECK (latitude BETWEEN -90.0 AND 90.0), -- -45.5027
    longitude DECIMAL(10, 7) NOT NULL CHECK (longitude BETWEEN -180.0 AND 180.0), -- 167.48406
    description VARCHAR(255), -- Limited to 255 characters for efficiency
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    timezone_id VARCHAR(40) NOT NULL, -- XXnot required as may need to auto generate cluster, can't always ask or check
    FOREIGN KEY (dataset_id) REFERENCES dataset(id)
);

CREATE INDEX idx_location_name ON location(name);
CREATE INDEX idx_location_dataset ON location(dataset_id); -- ??
CREATE INDEX idx_location_active ON location(active); -- ??
CREATE INDEX idx_location_dataset_active ON location(dataset_id, active);

-- Add recording pattern, mainly so it can be searchable,
-- this is an optional field, audio moth needs this, to help
-- with searching and filtering
-- i have 24/7: 1 in 30 and 1 in 40
CREATE TABLE cyclic_recording_pattern (
    id VARCHAR(12) PRIMARY KEY, -- nanoid(12)
    record_s INTEGER NOT NULL,
    sleep_s INTEGER NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE
);

CREATE INDEX idx_cyclic_recording_pattern_active ON cyclic_recording_pattern(active);

-- Cluster Table (think of a cluster of files as all the files on 1 SD Card)
-- a statistical unit with no major time gaps, to enable call rate stats
-- See changes below, added foreign key on location, added recording pattern
-- added timezone_id for iana timezone id
CREATE TABLE cluster (
    id VARCHAR(12) PRIMARY KEY, -- nanoid(12)
    dataset_id VARCHAR(12) NOT NULL, -- nanoid, link to dataset
    location_id VARCHAR(12) NOT NULL, -- A cluster must have a location, as well as a dataset
    name VARCHAR(140) NOT NULL,
    description VARCHAR(255), -- Limited to 255 characters for efficiency
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    cyclic_recording_pattern_id VARCHAR(12),
    sample_rate INTEGER NOT NULL,
    FOREIGN KEY (dataset_id) REFERENCES dataset(id),
    FOREIGN KEY (location_id) REFERENCES location(id),
    FOREIGN KEY (cyclic_recording_pattern_id) REFERENCES cyclic_recording_pattern(id)
);

CREATE INDEX idx_cluster_dataset ON cluster(dataset_id);
CREATE INDEX idx_cluster_active ON cluster(active);
CREATE INDEX idx_cluster_dataset_active ON cluster(dataset_id, active);
CREATE INDEX idx_cluster_recording_pattern ON cluster(cyclic_recording_pattern_id);
CREATE INDEX idx_cluster_location_id ON cluster(location_id);

-- values in my data is medium and medium-high
CREATE TYPE gain_level AS ENUM ('low', 'low-medium', 'medium', 'medium-high', 'high');

-- Files Table (removed dataset_id as now use junction table)
-- Note: timestamp_local should reflect local time, not timestamp in filename, this is important
-- duration must not have more than 3 decimal places, check in ui
CREATE TABLE file (
    id VARCHAR(21) PRIMARY KEY, -- nanoid
    file_name VARCHAR(255) NOT NULL,
    path VARCHAR(255) NULL, -- optional hint for local access
    xxh64_hash VARCHAR(16) NOT NULL, -- hash of original file cbe675a69a5fef1c
    location_id VARCHAR(12) NOT NULL, -- nanoid, from locations table
    timestamp_local TIMESTAMP WITH TIME ZONE NOT NULL, -- parsed from filename, adjust for daylight saving
    cluster_id VARCHAR(12), -- nanoid(12), optional if imported one by one
    duration DECIMAL(7, 3) NOT NULL CHECK (duration > 0), -- in seconds, allowing for millisecond precision (9999.999s)
    sample_rate INTEGER NOT NULL,
    description VARCHAR(255), -- Limited to 255 characters for efficiency
    maybe_solar_night BOOLEAN, --calculate with function on client. this is a more accurate value to file table
    maybe_civil_night BOOLEAN, --calculate with function on client. this is a more accurate value to file table
    moon_phase DECIMAL(3,2) CHECK (moon_phase BETWEEN 0.00 AND 1.00), -- 0.00 to 1.00 (new moon to full moon)
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (location_id) REFERENCES location(id),
    FOREIGN KEY (cluster_id) REFERENCES cluster(id)
);

CREATE INDEX idx_file_location ON file(location_id);
CREATE INDEX idx_file_active ON file(active);
CREATE INDEX idx_file_timestamp_local ON file(timestamp_local);
CREATE INDEX idx_file_cluster ON file(cluster_id);
CREATE INDEX idx_file_maybe_solar_night ON file(maybe_solar_night);
CREATE INDEX idx_file_maybe_civil_night ON file(maybe_civil_night);

-- UNIMPLEMENTED
-- Unique constraint on xxh64_hash to prevent duplicate file hashes
-- ALTER TABLE file ADD CONSTRAINT unique_xxh64_hash UNIQUE (xxh64_hash);

CREATE TABLE moth_metadata (
    file_id VARCHAR(21) PRIMARY KEY,
    timestamp TIMESTAMP WITH TIME ZONE NOT NULL,
    recorder_id VARCHAR(16), -- 24F31901603710CD (16)
    gain gain_level NULL, -- low, medium, high or null
    battery_v DECIMAL(2, 1) CHECK (battery_v >= 0), -- for values from 0 to 9.9
    temp_c DECIMAL(3, 1), -- e.g., 24.2
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (file_id) REFERENCES file(id)
);

CREATE INDEX idx_moth_metadata_active ON moth_metadata(active);

CREATE TABLE file_metadata (
    file_id VARCHAR(21) PRIMARY KEY,
    json JSON, -- For noise levels and other file-level metadata
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (file_id) REFERENCES file(id)
);

-- CREATE INDEX idx_file_metadata_json ON file_metadata USING gin(json); -- unimplemented
CREATE INDEX idx_file_metadata_active ON file_metadata(active);

-- Junction Table for Files to Dataset (many-to-many)
CREATE TABLE file_dataset (
    file_id VARCHAR(21) NOT NULL,
    dataset_id VARCHAR(12) NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (file_id, dataset_id),
    FOREIGN KEY (file_id) REFERENCES file(id),
    FOREIGN KEY (dataset_id) REFERENCES dataset(id)
);

-- indexes for the junction table
CREATE INDEX idx_file_dataset_file ON file_dataset(file_id);
CREATE INDEX idx_file_dataset_dataset ON file_dataset(dataset_id);

-- Selection Table
CREATE TABLE selection(
    id VARCHAR(21) PRIMARY KEY, -- nanoid
    file_id VARCHAR(21) NOT NULL, -- nanoid
    dataset_id VARCHAR(12) NOT NULL, -- nanoid, link to dataset
    start_time DECIMAL(7,3) NOT NULL, --up to 9999.999 seconds
    end_time DECIMAL(7,3) NOT NULL, -- up to 9999.999 seconds
    freq_low DECIMAL(9,3) CHECK (freq_low < 300000), -- LOOK AT CHECK
    freq_high DECIMAL(9,3) CHECK (freq_high < 300000), -- LOOK AT CHECK
    description VARCHAR(255), -- Limited to 255 characters for efficiency
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (file_id) REFERENCES file(id),
    FOREIGN KEY (dataset_id) REFERENCES dataset(id)
);

CREATE INDEX idx_selection_file ON selection(file_id);
CREATE INDEX idx_selection_dataset ON selection(dataset_id);
CREATE INDEX idx_selection_active ON selection(active);

CREATE TABLE selection_metadata (
    selection_id VARCHAR(21) PRIMARY KEY,
    json JSON, -- for loudness, noise, and other selection-level metadaata
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (selection_id) REFERENCES selection(id)
);

-- CREATE INDEX idx_selection_metadata_json ON selection_metadata USING gin(json); -- unimplemensed
CREATE INDEX idx_selection_metadata_active ON selection_metadata(active);

-- eBird Taxonomy Table
-- will need to update INDEX too when introducing a new version
-- see working with ebird taxonomies, aichat, deepseek, macbook
-- see materialised view and index on it
-- see alter table stuff below, modifications
CREATE TABLE ebird_taxonomy (
    id VARCHAR(12) PRIMARY KEY,  
    taxonomy_version VARCHAR(4) NOT NULL,  
    taxon_order INTEGER NOT NULL,
    category VARCHAR(15) NOT NULL,
    species_code VARCHAR(15) NOT NULL,
    taxon_concept_id VARCHAR(15),
    primary_com_name VARCHAR(100) NOT NULL,
    sci_name VARCHAR(100) NOT NULL,
    bird_order VARCHAR(30),  
    family VARCHAR(100),
    species_group VARCHAR(100),
    report_as VARCHAR(15),
    valid_from DATE NOT NULL,
    valid_to DATE,
    UNIQUE (species_code, taxonomy_version)
);

-- Species Table (mutable)
CREATE TABLE species (
    id VARCHAR(12) PRIMARY KEY, -- nanoid(12)
    label VARCHAR(100) NOT NULL, -- display label for the species
    ebird_code VARCHAR(12), -- link to ebird taxonomy
    taxonomy_version VARCHAR(4),    
    description VARCHAR(255),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (ebird_code, taxonomy_version) REFERENCES ebird_taxonomy(species_code, taxonomy_version)
);

CREATE INDEX idx_species_label ON species(label);
CREATE INDEX idx_species_ebird ON species(ebird_code);

-- Call Types Table (mutable)
CREATE TABLE call_type (
    id VARCHAR(12) PRIMARY KEY, -- nanoid(12)
    species_id VARCHAR(12) NOT NULL, -- link to parent species
    label VARCHAR(100) NOT NULL, -- display name like "male", "female", "duet"
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (species_id) REFERENCES species(id)
);

CREATE INDEX idx_call_type_species ON call_type(species_id);
CREATE INDEX idx_call_type_label ON call_type(label);

CREATE TABLE filter (
    id VARCHAR(12) PRIMARY KEY, -- nanoid
    name VARCHAR(140) NOT NULL,
    description VARCHAR(255),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN NOT NULL DEFAULT true
);

-- Label Table, many to 1 relationship with 
CREATE TABLE label (
    id VARCHAR(21) PRIMARY KEY, -- nanoid
    selection_id VARCHAR(21) NOT NULL, -- link to selection table
    species_id VARCHAR(12) NOT NULL, -- link to species table
    filter_id VARCHAR(12),
    certainty DECIMAL(5,2) CHECK (certainty <= 100 AND certainty >= 0),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN NOT NULL DEFAULT true,
    FOREIGN KEY (selection_id) REFERENCES selection(id),
    FOREIGN KEY (species_id) REFERENCES species(id),
    FOREIGN KEY (filter_id) REFERENCES filter(id)
);

CREATE INDEX idx_label_selection_id ON label(selection_id);
CREATE INDEX idx_label_species_id ON label(species_id);

-- Label Sub-type Table (optional 1:1 relationship with label)
CREATE TABLE label_subtype (
    id VARCHAR(21) PRIMARY KEY, -- nanoid
    label_id VARCHAR(21) NOT NULL, -- link to parent label
    calltype_id VARCHAR(12) NOT NULL, -- link to call_type table
    filter_id VARCHAR(12),
    certainty DECIMAL(5,2) CHECK (certainty <= 100 AND certainty >= 0),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    active BOOLEAN NOT NULL DEFAULT true,
    FOREIGN KEY (label_id) REFERENCES label(id),
    FOREIGN KEY (calltype_id) REFERENCES call_type(id),
    FOREIGN KEY (filter_id) REFERENCES filter(id)
    -- UNIQUE (label_id) -- ensures 1:1 relationship with label. how do i handle multiple call type filters? needs 1:many relation
);


-- UNIMPLEMENTED
-- file_dataset must exist first
-- Referential Integrity for Selections
-- To ensure `selections.dataset_id` is valid for the associated file:
-- Add composite foreign key (requires file_dataset to exist first)
-- ALTER TABLE selection ADD CONSTRAINT fk_selection_file_dataset
-- FOREIGN KEY (file_id, dataset_id) REFERENCES file_dataset(file_id, dataset_id);

-- 2024 Taxonomy View, after populating underlying table
-- see working with ebird taxonomies, aichat, deepseek, macbook
-- I think I still need this to display the options when someone creates a species
CREATE TABLE ebird_taxonomy_v2024 AS
SELECT
    id,
    species_code,
    primary_com_name,
    sci_name,
    bird_order, -- AS "order",  order is reserved word in pgsql
    family
FROM ebird_taxonomy
WHERE taxonomy_version = '2024';

-- to help with plain text search on common name and scientific name
CREATE INDEX idx_ebird_name_search ON ebird_taxonomy_v2024 USING gin(to_tsvector('english', primary_com_name || ' ' || sci_name));
CREATE INDEX ebird_taxonomy_species_code ON ebird_taxonomy_v2024(species_code);


-- Junction Table for Species to Dataset (many-to-many)
CREATE TABLE species_dataset (
    species_id VARCHAR(12) NOT NULL,  
    dataset_id VARCHAR(12) NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (species_id, dataset_id),
    FOREIGN KEY (species_id) REFERENCES species(id),
    FOREIGN KEY (dataset_id) REFERENCES dataset(id)
);

-- indexes for the junction table
CREATE INDEX idx_species_dataset_species ON species_dataset(species_id);
CREATE INDEX idx_species_dataset_dataset ON species_dataset(dataset_id);