-- Migration: Multi-source bibliographic schema
-- Description: Add multi-source support with works, identifiers, source records, citations, and embeddings

-- ============================================================================
-- WORKS TABLE - Canonical metadata for works
-- ============================================================================
-- One row per work. `id` is the integer surrogate key; `work_id` is a separate
-- TEXT key declared NOT NULL UNIQUE. Column order is kept stable so positional
-- INSERTs and SELECT * consumers are unaffected.
CREATE TABLE IF NOT EXISTS works (
    id                    INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id               TEXT    NOT NULL UNIQUE,

    -- Descriptive / bibliographic fields (all nullable).
    title                 TEXT,
    abstract              TEXT,
    publication_year      INTEGER,
    publication_date      TEXT,
    journal_name          TEXT,
    publisher             TEXT,
    volume                TEXT,
    issue                 TEXT,
    pages                 TEXT,

    -- External identifier columns, one per scheme (all nullable, not unique).
    doi                   TEXT,
    pmid                  TEXT,
    pmcid                 TEXT,
    arxiv_id              TEXT,
    dblp_key              TEXT,
    openalex_id           TEXT,
    isbn                  TEXT,
    issn                  TEXT,

    -- Entry type defaults to 'article' when the writer does not supply one.
    entry_type            TEXT    NOT NULL DEFAULT 'article',

    -- Counters default to 0 but are nullable: an explicit NULL is accepted.
    citation_count        INTEGER DEFAULT 0,
    cited_by_count        INTEGER DEFAULT 0,
    influential_citations INTEGER DEFAULT 0,

    -- Boolean flag stored as 0/1; defaults to 0.
    is_open_access        BOOLEAN DEFAULT 0,
    best_oa_url           TEXT,

    -- Timestamps are TEXT and default to CURRENT_TIMESTAMP at insert time.
    created_at            TEXT    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at            TEXT    NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- ============================================================================
-- WORK IDENTIFIERS TABLE - Mapping scheme + value to works
-- ============================================================================
-- Each row attaches one (scheme, value) identifier to a work. A given
-- (work_id, scheme, value) triple may appear only once.
--
-- work_id is TEXT, so the foreign key targets the UNIQUE TEXT key
-- works(work_id) rather than the INTEGER surrogate works(id); with the latter,
-- a textual work id could never match a parent row once foreign keys are
-- enforced. NOTE(review): assumes writers store works.work_id in this column -
-- confirm against the application code.
-- SQLite only enforces foreign keys when PRAGMA foreign_keys = ON.
CREATE TABLE IF NOT EXISTS work_identifiers (
    id               INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id          TEXT    NOT NULL,
    scheme           TEXT    NOT NULL,
    value            TEXT    NOT NULL,
    is_primary       BOOLEAN DEFAULT 0,   -- stored as 0/1; defaults to 0
    normalized_value TEXT,                -- nullable
    created_at       TEXT    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (work_id, scheme, value),
    -- Deleting a work removes its identifier rows.
    FOREIGN KEY (work_id) REFERENCES works (work_id) ON DELETE CASCADE
);

-- ============================================================================
-- SOURCE RECORDS TABLE - Raw API responses with provenance
-- ============================================================================
-- Stores the raw payload (raw_data_json) obtained for a work from a given
-- (source_type, source_label); at most one row per
-- (work_id, source_type, source_label).
--
-- work_id is TEXT, so the foreign key targets the UNIQUE TEXT key
-- works(work_id) rather than the INTEGER surrogate works(id).
-- NOTE(review): assumes writers store works.work_id in this column - confirm
-- against the application code.
-- SQLite only enforces foreign keys when PRAGMA foreign_keys = ON.
CREATE TABLE IF NOT EXISTS source_records (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id       TEXT    NOT NULL,
    source_type   TEXT    NOT NULL,
    source_label  TEXT    NOT NULL,
    raw_data_json TEXT    NOT NULL,
    raw_record_id TEXT,                  -- nullable
    created_at    TEXT    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (work_id, source_type, source_label),
    -- Deleting a work removes its source records.
    FOREIGN KEY (work_id) REFERENCES works (work_id) ON DELETE CASCADE
);

-- ============================================================================
-- CITATIONS TABLE - Citation graph with provenance
-- ============================================================================
-- One row per directed edge (source_work_id -> target_work_id) of a given
-- relation_type; that triple is unique. source_type / source_label record
-- where the edge came from.
--
-- Both endpoint columns are TEXT, so their foreign keys target the UNIQUE TEXT
-- key works(work_id) rather than the INTEGER surrogate works(id).
-- NOTE(review): assumes writers store works.work_id in these columns - confirm
-- against the application code.
-- SQLite only enforces foreign keys when PRAGMA foreign_keys = ON.
CREATE TABLE IF NOT EXISTS citations (
    id             INTEGER PRIMARY KEY AUTOINCREMENT,
    source_work_id TEXT    NOT NULL,
    target_work_id TEXT    NOT NULL,
    relation_type  TEXT    NOT NULL,
    source_type    TEXT    NOT NULL,
    source_label   TEXT    NOT NULL,
    confidence     REAL    DEFAULT 1.0,  -- nullable; defaults to 1.0
    is_verified    BOOLEAN DEFAULT 0,    -- stored as 0/1; defaults to 0
    created_at     TEXT    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (source_work_id, target_work_id, relation_type),
    -- Deleting either endpoint work removes the edge.
    FOREIGN KEY (source_work_id) REFERENCES works (work_id) ON DELETE CASCADE,
    FOREIGN KEY (target_work_id) REFERENCES works (work_id) ON DELETE CASCADE
);

-- ============================================================================
-- WORK EMBEDDINGS TABLE - Vector storage for semantic search
-- ============================================================================
-- Holds one embedding per (work_id, model_name). The vector itself is stored
-- in a TEXT column; `dimension` records its length as supplied by the writer.
--
-- work_id is TEXT, so the foreign key targets the UNIQUE TEXT key
-- works(work_id) rather than the INTEGER surrogate works(id).
-- NOTE(review): assumes writers store works.work_id in this column - confirm
-- against the application code.
-- SQLite only enforces foreign keys when PRAGMA foreign_keys = ON.
CREATE TABLE IF NOT EXISTS work_embeddings (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id       TEXT    NOT NULL,
    embedding     TEXT    NOT NULL,
    model_name    TEXT    NOT NULL,
    model_version TEXT,                  -- nullable; not part of the unique key
    dimension     INTEGER NOT NULL,
    created_at    TEXT    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (work_id, model_name),
    -- Deleting a work removes its embeddings.
    FOREIGN KEY (work_id) REFERENCES works (work_id) ON DELETE CASCADE
);

-- ============================================================================
-- INDEXES - For performance optimization
-- ============================================================================
-- All statements use IF NOT EXISTS so the migration can be re-run safely.
-- Index names are unchanged so later migrations can keep referring to them.

-- Work identifiers indexes
-- NOTE(review): idx_work_identifiers_work_id repeats the leading column of the
-- table's UNIQUE(work_id, scheme, value) constraint; kept for compatibility.
CREATE INDEX IF NOT EXISTS idx_work_identifiers_scheme     ON work_identifiers (scheme);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_value      ON work_identifiers (value);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_work_id    ON work_identifiers (work_id);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_normalized ON work_identifiers (normalized_value);

-- Source records indexes
CREATE INDEX IF NOT EXISTS idx_source_records_work_id     ON source_records (work_id);
CREATE INDEX IF NOT EXISTS idx_source_records_source_type ON source_records (source_type);

-- Citations indexes (both edge directions, plus relation and provenance)
CREATE INDEX IF NOT EXISTS idx_citations_source_work   ON citations (source_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_target_work   ON citations (target_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_relation_type ON citations (relation_type);
CREATE INDEX IF NOT EXISTS idx_citations_source_type   ON citations (source_type);

-- Works indexes (external identifier lookups, open-access flag, creation time)
CREATE INDEX IF NOT EXISTS idx_works_doi            ON works (doi);
CREATE INDEX IF NOT EXISTS idx_works_pmid           ON works (pmid);
CREATE INDEX IF NOT EXISTS idx_works_pmcid          ON works (pmcid);
CREATE INDEX IF NOT EXISTS idx_works_arxiv          ON works (arxiv_id);
CREATE INDEX IF NOT EXISTS idx_works_openalex       ON works (openalex_id);
CREATE INDEX IF NOT EXISTS idx_works_is_open_access ON works (is_open_access);
CREATE INDEX IF NOT EXISTS idx_works_created_at     ON works (created_at);

-- Embeddings indexes
CREATE INDEX IF NOT EXISTS idx_embeddings_work_id    ON work_embeddings (work_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_model_name ON work_embeddings (model_name);

-- ============================================================================
-- PostgreSQL-specific extensions and vector indexing
-- ============================================================================
-- Note: The statements below are PostgreSQL-specific and apply only when the
-- pgvector extension is in use. The rest of this file uses SQLite syntax
-- (AUTOINCREMENT, CREATE TRIGGER IF NOT EXISTS, GROUP_CONCAT), and
-- work_embeddings.embedding is declared as TEXT, so the index below presumably
-- also requires that column to use pgvector's vector type - confirm before
-- enabling.

-- Uncomment these when using PostgreSQL with the pgvector extension:
-- CREATE EXTENSION IF NOT EXISTS vector;
-- CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON work_embeddings
-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);

-- ============================================================================
-- TRIGGERS - For automatic timestamp updates
-- ============================================================================
-- Works table update trigger.
-- Stamps works.updated_at with CURRENT_TIMESTAMP after any UPDATE that did not
-- itself change updated_at. (The previous condition, new.updated_at IS NULL,
-- could never hold because the column is NOT NULL, so the trigger never ran.)
--
-- * new.updated_at = old.updated_at  -> the caller left the column alone; an
--   explicit caller-supplied timestamp is respected and not overwritten.
-- * old.updated_at <> CURRENT_TIMESTAMP -> nothing to do when the row already
--   carries the current timestamp; this also stops the trigger from re-firing
--   on its own UPDATE when PRAGMA recursive_triggers is enabled.
-- * The row is addressed by new.id so the stamp still lands if id was changed.
CREATE TRIGGER IF NOT EXISTS works_updated_at
AFTER UPDATE ON works
FOR EACH ROW
WHEN new.updated_at = old.updated_at
     AND old.updated_at <> CURRENT_TIMESTAMP
BEGIN
    UPDATE works
    SET updated_at = CURRENT_TIMESTAMP
    WHERE id = new.id;
END;

-- Work identifiers update trigger.
-- After an UPDATE on work_identifiers, resets created_at to CURRENT_TIMESTAMP
-- for the row when the new created_at is NULL.
-- NOTE(review): work_identifiers.created_at is declared NOT NULL in this
-- migration, so new.created_at IS NULL cannot be true for tables created here
-- and this trigger never fires. The table has no updated_at column despite the
-- trigger's name; the logic is left unchanged pending a decision on whether to
-- add such a column or drop the trigger.
CREATE TRIGGER IF NOT EXISTS work_identifiers_updated_at
AFTER UPDATE ON work_identifiers
FOR EACH ROW
WHEN (new.created_at IS NULL)
BEGIN
    UPDATE work_identifiers
    SET created_at = CURRENT_TIMESTAMP
    WHERE id = old.id;
END;

-- ============================================================================
-- VIEWS - For simplified queries
-- ============================================================================
-- View to join works with their identifiers.
-- Returns one row per work with its core columns plus `identifiers`, a
-- ', '-separated list of 'scheme:value' pairs from work_identifiers. Works with
-- no identifier rows are kept (LEFT JOIN) and get identifiers = NULL.
--
-- * The join uses the TEXT key works.work_id, which is what the TEXT column
--   work_identifiers.work_id is compared against; the INTEGER works.id is not
--   a textual work id. NOTE(review): assumes work_identifiers.work_id holds
--   works.work_id values - confirm against the application code.
-- * GROUP_CONCAT is called without DISTINCT: SQLite rejects DISTINCT on an
--   aggregate with more than one argument, and UNIQUE(work_id, scheme, value)
--   on work_identifiers already prevents duplicate pairs per work.
-- * The order of pairs inside `identifiers` is not specified.
CREATE VIEW IF NOT EXISTS works_with_identifiers AS
SELECT
    w.id,
    w.work_id,
    w.title,
    w.abstract,
    w.publication_year,
    w.journal_name,
    w.publisher,
    w.doi,
    w.pmid,
    w.pmcid,
    w.arxiv_id,
    w.dblp_key,
    w.openalex_id,
    GROUP_CONCAT(wi.scheme || ':' || wi.value, ', ') AS identifiers
FROM works AS w
LEFT JOIN work_identifiers AS wi
    ON wi.work_id = w.work_id
GROUP BY
    w.id,
    w.work_id,
    w.title,
    w.abstract,
    w.publication_year,
    w.journal_name,
    w.publisher,
    w.doi,
    w.pmid,
    w.pmcid,
    w.arxiv_id,
    w.dblp_key,
    w.openalex_id;