CiteGeist/db/migrations/0001_multisource.sql

186 lines
7.4 KiB
SQL

-- Migration: Multi-source bibliographic schema
-- Description: Add multi-source support with works, identifiers, source records, citations, and embeddings
-- ============================================================================
-- WORKS TABLE - Canonical metadata for works
-- ============================================================================
-- Canonical record for a single bibliographic work.
-- `id` is the integer surrogate key; `work_id` is a separate, unique text key.
-- Column order is significant for any INSERT that omits a column list.
CREATE TABLE IF NOT EXISTS works (
    -- Keys
    id                    INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id               TEXT NOT NULL UNIQUE,

    -- Descriptive metadata
    title                 TEXT,
    abstract              TEXT,
    publication_year      INTEGER,
    publication_date      TEXT,
    journal_name          TEXT,
    publisher             TEXT,
    volume                TEXT,
    issue                 TEXT,
    pages                 TEXT,

    -- External identifiers stored directly on the work (all optional)
    doi                   TEXT,
    pmid                  TEXT,
    pmcid                 TEXT,
    arxiv_id              TEXT,
    dblp_key              TEXT,
    openalex_id           TEXT,
    isbn                  TEXT,
    issn                  TEXT,

    -- Classification; rows default to 'article' when no type is supplied
    entry_type            TEXT NOT NULL DEFAULT 'article',

    -- Citation metrics; all start at zero
    citation_count        INTEGER DEFAULT 0,
    cited_by_count        INTEGER DEFAULT 0,
    influential_citations INTEGER DEFAULT 0,

    -- Open-access status (0 = false) and an optional URL
    is_open_access        BOOLEAN DEFAULT 0,
    best_oa_url           TEXT,

    -- Row timestamps, filled in automatically on insert
    created_at            TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at            TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- ============================================================================
-- WORK IDENTIFIERS TABLE - Mapping scheme + value to works
-- ============================================================================
-- One row per (work, scheme, value) identifier attached to a work.
-- NOTE(review): work_id is declared TEXT but references works(id), which is an
-- INTEGER key; works also has a TEXT work_id column. Confirm which key callers
-- actually store here.
-- NOTE(review): SQLite only enforces REFERENCES / ON DELETE CASCADE when
-- PRAGMA foreign_keys is ON for the connection; verify the application sets it.
CREATE TABLE IF NOT EXISTS work_identifiers (
    id               INTEGER PRIMARY KEY AUTOINCREMENT,
    -- Owning work; identifier rows are removed together with the work
    work_id          TEXT NOT NULL
                     REFERENCES works (id) ON DELETE CASCADE,
    scheme           TEXT NOT NULL,
    value            TEXT NOT NULL,
    -- 0 = false by default
    is_primary       BOOLEAN DEFAULT 0,
    -- Optional; the normalization rules are not defined in this migration
    normalized_value TEXT,
    created_at       TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,

    -- The same identifier can be recorded only once per work
    UNIQUE (work_id, scheme, value)
);
-- ============================================================================
-- SOURCE RECORDS TABLE - Raw API responses with provenance
-- ============================================================================
-- Raw payload captured for a work from one upstream source.
-- At most one row is kept per (work, source_type, source_label).
-- NOTE(review): work_id is declared TEXT but references works(id), which is an
-- INTEGER key; confirm which key callers actually store here.
CREATE TABLE IF NOT EXISTS source_records (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,
    -- Owning work; source rows are removed together with the work
    work_id       TEXT NOT NULL
                  REFERENCES works (id) ON DELETE CASCADE,
    source_type   TEXT NOT NULL,
    source_label  TEXT NOT NULL,
    -- Payload stored as text; presumably JSON given the column name — not
    -- validated by this schema
    raw_data_json TEXT NOT NULL,
    -- Optional identifier of the record on the source side
    raw_record_id TEXT,
    created_at    TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,

    UNIQUE (work_id, source_type, source_label)
);
-- ============================================================================
-- CITATIONS TABLE - Citation graph with provenance
-- ============================================================================
-- Directed edge between two works, with the source that reported it.
-- NOTE(review): both endpoint columns are declared TEXT but reference
-- works(id), which is an INTEGER key; confirm which key callers store here.
-- NOTE(review): the uniqueness key omits source_type/source_label, so the same
-- edge reported by two sources collapses to one row; confirm this is intended.
CREATE TABLE IF NOT EXISTS citations (
    id             INTEGER PRIMARY KEY AUTOINCREMENT,
    -- Both endpoints cascade: deleting either work deletes the edge
    source_work_id TEXT NOT NULL
                   REFERENCES works (id) ON DELETE CASCADE,
    target_work_id TEXT NOT NULL
                   REFERENCES works (id) ON DELETE CASCADE,
    relation_type  TEXT NOT NULL,
    -- Provenance of the edge
    source_type    TEXT NOT NULL,
    source_label   TEXT NOT NULL,
    -- Defaults to 1.0; no range constraint is enforced here
    confidence     REAL DEFAULT 1.0,
    -- 0 = false by default
    is_verified    BOOLEAN DEFAULT 0,
    created_at     TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,

    UNIQUE (source_work_id, target_work_id, relation_type)
);
-- ============================================================================
-- WORK EMBEDDINGS TABLE - Vector storage for semantic search
-- ============================================================================
-- Stored embedding for a work, at most one per (work, model_name).
-- NOTE(review): work_id is declared TEXT but references works(id), which is an
-- INTEGER key; confirm which key callers actually store here.
CREATE TABLE IF NOT EXISTS work_embeddings (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,
    -- Owning work; embeddings are removed together with the work
    work_id       TEXT NOT NULL
                  REFERENCES works (id) ON DELETE CASCADE,
    -- Vector serialized as text; the encoding is not defined in this migration
    embedding     TEXT NOT NULL,
    model_name    TEXT NOT NULL,
    -- Optional, and not part of the uniqueness key below
    model_version TEXT,
    dimension     INTEGER NOT NULL,
    created_at    TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,

    UNIQUE (work_id, model_name)
);
-- ============================================================================
-- INDEXES - For performance optimization
-- ============================================================================
-- work_identifiers: single-column indexes for lookups by owning work, scheme,
-- raw value, and normalized value.
-- NOTE(review): UNIQUE(work_id, scheme, value) already provides an index led
-- by work_id, so idx_work_identifiers_work_id may be redundant.
CREATE INDEX IF NOT EXISTS idx_work_identifiers_work_id
    ON work_identifiers (work_id);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_scheme
    ON work_identifiers (scheme);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_value
    ON work_identifiers (value);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_normalized
    ON work_identifiers (normalized_value);
-- source_records: lookups by owning work and by source type.
-- NOTE(review): UNIQUE(work_id, source_type, source_label) already provides an
-- index led by work_id, so idx_source_records_work_id may be redundant.
CREATE INDEX IF NOT EXISTS idx_source_records_source_type
    ON source_records (source_type);
CREATE INDEX IF NOT EXISTS idx_source_records_work_id
    ON source_records (work_id);
-- citations: traverse the graph from either endpoint, and filter edges by
-- relation type or by the source type that reported them.
-- NOTE(review): UNIQUE(source_work_id, target_work_id, relation_type) already
-- provides an index led by source_work_id, so idx_citations_source_work may be
-- redundant.
CREATE INDEX IF NOT EXISTS idx_citations_target_work
    ON citations (target_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_source_work
    ON citations (source_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_source_type
    ON citations (source_type);
CREATE INDEX IF NOT EXISTS idx_citations_relation_type
    ON citations (relation_type);
-- works: lookups by the external identifiers stored on the row, plus the
-- open-access flag and the creation timestamp.
-- These are plain (non-unique) indexes, so duplicate identifier values across
-- works are not rejected by the schema.
CREATE INDEX IF NOT EXISTS idx_works_doi
    ON works (doi);
CREATE INDEX IF NOT EXISTS idx_works_pmid
    ON works (pmid);
CREATE INDEX IF NOT EXISTS idx_works_pmcid
    ON works (pmcid);
CREATE INDEX IF NOT EXISTS idx_works_arxiv
    ON works (arxiv_id);
CREATE INDEX IF NOT EXISTS idx_works_openalex
    ON works (openalex_id);
CREATE INDEX IF NOT EXISTS idx_works_created_at
    ON works (created_at);
CREATE INDEX IF NOT EXISTS idx_works_is_open_access
    ON works (is_open_access);
-- work_embeddings: lookups by owning work and by model name.
-- NOTE(review): UNIQUE(work_id, model_name) already provides an index led by
-- work_id, so idx_embeddings_work_id may be redundant.
CREATE INDEX IF NOT EXISTS idx_embeddings_model_name
    ON work_embeddings (model_name);
CREATE INDEX IF NOT EXISTS idx_embeddings_work_id
    ON work_embeddings (work_id);
-- ============================================================================
-- PostgreSQL-specific extensions and vector indexing
-- ============================================================================
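-- Note: the statements below are PostgreSQL-specific and apply only when the pgvector extension is installed.
-- The rest of this file uses SQLite syntax (e.g. AUTOINCREMENT), and work_embeddings.embedding is declared TEXT;
-- on PostgreSQL the column must be a pgvector `vector` type before the ivfflat index can be created.
-- Uncomment these only in that setup: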
-- CREATE EXTENSION IF NOT EXISTS vector;
-- CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON work_embeddings
-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
-- ============================================================================
-- TRIGGERS - For automatic timestamp updates
-- ============================================================================
-- Works table update trigger.
-- Refreshes works.updated_at after any UPDATE that left updated_at unchanged,
-- so a caller that sets updated_at explicitly keeps its own value.
-- The condition compares NEW with OLD rather than testing for NULL: updated_at
-- is declared NOT NULL, so an "IS NULL" test can never match and the trigger
-- would never run.
-- The second condition skips rows already stamped with the current second; it
-- also stops the trigger from re-firing on its own UPDATE when
-- PRAGMA recursive_triggers is enabled.
-- Because of IF NOT EXISTS, a database that already has a trigger with this
-- name keeps its existing definition until that trigger is dropped.
CREATE TRIGGER IF NOT EXISTS works_updated_at
AFTER UPDATE ON works
FOR EACH ROW
WHEN NEW.updated_at = OLD.updated_at
    AND NEW.updated_at <> CURRENT_TIMESTAMP
BEGIN
    -- Target NEW.id so the row is still found if the UPDATE changed its id.
    UPDATE works
    SET updated_at = CURRENT_TIMESTAMP
    WHERE id = NEW.id;
END;
-- Work identifiers update trigger.
-- After an UPDATE on work_identifiers, re-stamps created_at with the current
-- time when the updated row's created_at is NULL.
-- NOTE(review): created_at is declared NOT NULL in this migration, so the WHEN
-- condition cannot be satisfied and this trigger appears to be inert. The
-- table has no updated_at column despite the trigger's name; confirm whether
-- one was intended.
CREATE TRIGGER IF NOT EXISTS work_identifiers_updated_at
AFTER UPDATE ON work_identifiers
FOR EACH ROW
WHEN NEW.created_at IS NULL
BEGIN
    UPDATE work_identifiers
    SET created_at = CURRENT_TIMESTAMP
    WHERE id = OLD.id;
END;
-- ============================================================================
-- VIEWS - For simplified queries
-- ============================================================================
-- View joining works with their identifiers.
-- Returns one row per work, with every matching work_identifiers row folded
-- into a single ', '-separated list of "scheme:value" strings. The column is
-- NULL for a work that has no identifiers (LEFT JOIN).
-- DISTINCT is deliberately not used inside GROUP_CONCAT: SQLite rejects
-- DISTINCT on an aggregate called with more than one argument, and
-- UNIQUE(work_id, scheme, value) on work_identifiers already rules out
-- duplicate pairs within a work.
-- The order of entries within the list is not guaranteed.
-- NOTE(review): the join compares works.id (INTEGER) with
-- work_identifiers.work_id (TEXT), mirroring the declared foreign key; confirm
-- callers store works.id there rather than the TEXT works.work_id.
CREATE VIEW IF NOT EXISTS works_with_identifiers AS
SELECT
    w.id,
    w.work_id,
    w.title,
    w.abstract,
    w.publication_year,
    w.journal_name,
    w.publisher,
    w.doi,
    w.pmid,
    w.pmcid,
    w.arxiv_id,
    w.dblp_key,
    w.openalex_id,
    GROUP_CONCAT(wi.scheme || ':' || wi.value, ', ') AS identifiers
FROM works AS w
LEFT JOIN work_identifiers AS wi
    ON w.id = wi.work_id
GROUP BY
    w.id,
    w.work_id,
    w.title,
    w.abstract,
    w.publication_year,
    w.journal_name,
    w.publisher,
    w.doi,
    w.pmid,
    w.pmcid,
    w.arxiv_id,
    w.dblp_key,
    w.openalex_id;