From 0497e18f0493d8c5c2f67390b6d0d2e7a4031fff Mon Sep 17 00:00:00 2001 From: welsberr Date: Sat, 25 Apr 2026 22:27:53 -0400 Subject: [PATCH] Add source tracing and broader open source coverage --- db/migrations/0001_multisource.sql | 185 ++++++++ docs/README.md | 103 +++++ docs/architecture-current.md | 87 ++++ docs/file-structure.md | 165 ++++++++ docs/implementation-progress.md | 122 ++++++ docs/phase-completion.md | 111 +++++ docs/schema-current.sql | 131 ++++++ docs/source-landscape.md | 131 ++++++ new-roadmap.md | 113 +++++ src/citegeist/__init__.py | 11 +- src/citegeist/cli.py | 88 +++- src/citegeist/expand.py | 145 +++++++ src/citegeist/resolve.py | 389 +++++++++++++---- src/citegeist/resolver/__init__.py | 27 ++ src/citegeist/resolver/identifiers.py | 418 +++++++++++++++++++ src/citegeist/sources/__all__.py | 29 ++ src/citegeist/sources/__init__.py | 44 ++ src/citegeist/sources/_old_sources_compat.py | 25 ++ src/citegeist/sources/base.py | 189 +++++++++ src/citegeist/sources/catalog.py | 173 ++++++++ src/citegeist/sources/crossref.py | 210 ++++++++++ src/citegeist/sources/europepmc.py | 157 +++++++ src/citegeist/sources/opencitations.py | 178 ++++++++ src/citegeist/sources/openlibrary.py | 100 +++++ src/citegeist/sources/registry.py | 253 +++++++++++ src/citegeist/sources/semanticscholar.py | 140 +++++++ src/citegeist/sources/unpaywall.py | 116 +++++ src/citegeist/talkorigins.py | 24 +- tests/test_europepmc.py | 123 ++++++ tests/test_opencitations.py | 137 ++++++ tests/test_openlibrary.py | 188 +++++++++ tests/test_resolver_identifiers.py | 201 +++++++++ tests/test_semanticscholar.py | 117 ++++++ tests/test_sources_catalog.py | 60 +++ tests/test_sources_plugin.py | 171 ++++++++ tests/test_talkorigins.py | 83 ++++ tests/test_unpaywall.py | 117 ++++++ 37 files changed, 4975 insertions(+), 86 deletions(-) create mode 100644 db/migrations/0001_multisource.sql create mode 100644 docs/README.md create mode 100644 docs/architecture-current.md create mode 100644 
docs/file-structure.md create mode 100644 docs/implementation-progress.md create mode 100644 docs/phase-completion.md create mode 100644 docs/schema-current.sql create mode 100644 docs/source-landscape.md create mode 100644 new-roadmap.md create mode 100644 src/citegeist/resolver/__init__.py create mode 100644 src/citegeist/resolver/identifiers.py create mode 100644 src/citegeist/sources/__all__.py create mode 100644 src/citegeist/sources/__init__.py create mode 100644 src/citegeist/sources/_old_sources_compat.py create mode 100644 src/citegeist/sources/base.py create mode 100644 src/citegeist/sources/catalog.py create mode 100644 src/citegeist/sources/crossref.py create mode 100644 src/citegeist/sources/europepmc.py create mode 100644 src/citegeist/sources/opencitations.py create mode 100644 src/citegeist/sources/openlibrary.py create mode 100644 src/citegeist/sources/registry.py create mode 100644 src/citegeist/sources/semanticscholar.py create mode 100644 src/citegeist/sources/unpaywall.py create mode 100644 tests/test_europepmc.py create mode 100644 tests/test_opencitations.py create mode 100644 tests/test_openlibrary.py create mode 100644 tests/test_resolver_identifiers.py create mode 100644 tests/test_semanticscholar.py create mode 100644 tests/test_sources_catalog.py create mode 100644 tests/test_sources_plugin.py create mode 100644 tests/test_unpaywall.py diff --git a/db/migrations/0001_multisource.sql b/db/migrations/0001_multisource.sql new file mode 100644 index 0000000..e6d06f6 --- /dev/null +++ b/db/migrations/0001_multisource.sql @@ -0,0 +1,185 @@ +-- Migration: Multi-source bibliographic schema +-- Description: Add multi-source support with works, identifiers, source records, citations, and embeddings + +-- ============================================================================ +-- WORKS TABLE - Canonical metadata for works +-- ============================================================================ +CREATE TABLE IF NOT EXISTS works ( + id 
INTEGER PRIMARY KEY AUTOINCREMENT, + work_id TEXT NOT NULL UNIQUE, + title TEXT, + abstract TEXT, + publication_year INTEGER, + publication_date TEXT, + journal_name TEXT, + publisher TEXT, + volume TEXT, + issue TEXT, + pages TEXT, + doi TEXT, + pmid TEXT, + pmcid TEXT, + arxiv_id TEXT, + dblp_key TEXT, + openalex_id TEXT, + isbn TEXT, + issn TEXT, + entry_type TEXT NOT NULL DEFAULT 'article', + citation_count INTEGER DEFAULT 0, + cited_by_count INTEGER DEFAULT 0, + influential_citations INTEGER DEFAULT 0, + is_open_access BOOLEAN DEFAULT 0, + best_oa_url TEXT, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +-- ============================================================================ +-- WORK IDENTIFIERS TABLE - Mapping scheme + value to works +-- ============================================================================ +CREATE TABLE IF NOT EXISTS work_identifiers ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + work_id TEXT NOT NULL, + scheme TEXT NOT NULL, + value TEXT NOT NULL, + is_primary BOOLEAN DEFAULT 0, + normalized_value TEXT, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE(work_id, scheme, value), + FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE +); + +-- ============================================================================ +-- SOURCE RECORDS TABLE - Raw API responses with provenance +-- ============================================================================ +CREATE TABLE IF NOT EXISTS source_records ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + work_id TEXT NOT NULL, + source_type TEXT NOT NULL, + source_label TEXT NOT NULL, + raw_data_json TEXT NOT NULL, + raw_record_id TEXT, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE(work_id, source_type, source_label), + FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE +); + +-- ============================================================================ +-- CITATIONS TABLE - 
Citation graph with provenance +-- ============================================================================ +CREATE TABLE IF NOT EXISTS citations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_work_id TEXT NOT NULL, + target_work_id TEXT NOT NULL, + relation_type TEXT NOT NULL, + source_type TEXT NOT NULL, + source_label TEXT NOT NULL, + confidence REAL DEFAULT 1.0, + is_verified BOOLEAN DEFAULT 0, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE(source_work_id, target_work_id, relation_type), + FOREIGN KEY (source_work_id) REFERENCES works(id) ON DELETE CASCADE, + FOREIGN KEY (target_work_id) REFERENCES works(id) ON DELETE CASCADE +); + +-- ============================================================================ +-- WORK EMBEDDINGS TABLE - Vector storage for semantic search +-- ============================================================================ +CREATE TABLE IF NOT EXISTS work_embeddings ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + work_id TEXT NOT NULL, + embedding TEXT NOT NULL, + model_name TEXT NOT NULL, + model_version TEXT, + dimension INTEGER NOT NULL, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE(work_id, model_name), + FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE +); + +-- ============================================================================ +-- INDEXES - For performance optimization +-- ============================================================================ +-- Work identifiers indexes +CREATE INDEX IF NOT EXISTS idx_work_identifiers_scheme ON work_identifiers(scheme); +CREATE INDEX IF NOT EXISTS idx_work_identifiers_value ON work_identifiers(value); +CREATE INDEX IF NOT EXISTS idx_work_identifiers_work_id ON work_identifiers(work_id); +CREATE INDEX IF NOT EXISTS idx_work_identifiers_normalized ON work_identifiers(normalized_value); + +-- Source records indexes +CREATE INDEX IF NOT EXISTS idx_source_records_work_id ON source_records(work_id); +CREATE INDEX IF NOT EXISTS 
idx_source_records_source_type ON source_records(source_type); + +-- Citations indexes +CREATE INDEX IF NOT EXISTS idx_citations_source_work ON citations(source_work_id); +CREATE INDEX IF NOT EXISTS idx_citations_target_work ON citations(target_work_id); +CREATE INDEX IF NOT EXISTS idx_citations_relation_type ON citations(relation_type); +CREATE INDEX IF NOT EXISTS idx_citations_source_type ON citations(source_type); + +-- Works indexes +CREATE INDEX IF NOT EXISTS idx_works_doi ON works(doi); +CREATE INDEX IF NOT EXISTS idx_works_pmid ON works(pmid); +CREATE INDEX IF NOT EXISTS idx_works_pmcid ON works(pmcid); +CREATE INDEX IF NOT EXISTS idx_works_arxiv ON works(arxiv_id); +CREATE INDEX IF NOT EXISTS idx_works_openalex ON works(openalex_id); +CREATE INDEX IF NOT EXISTS idx_works_is_open_access ON works(is_open_access); +CREATE INDEX IF NOT EXISTS idx_works_created_at ON works(created_at); + +-- Embeddings indexes +CREATE INDEX IF NOT EXISTS idx_embeddings_work_id ON work_embeddings(work_id); +CREATE INDEX IF NOT EXISTS idx_embeddings_model_name ON work_embeddings(model_name); + +-- ============================================================================ +-- PostgreSQL-specific extensions and vector indexing +-- ============================================================================ +-- Note: The following are PostgreSQL-specific and should be run when using pgvector + +-- Uncomment these when using PostgreSQL with pgvector extension: +-- CREATE EXTENSION IF NOT EXISTS vector; +-- CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON work_embeddings +-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); + +-- ============================================================================ +-- TRIGGERS - For automatic timestamp updates +-- ============================================================================ +-- Works table update trigger +CREATE TRIGGER IF NOT EXISTS works_updated_at +AFTER UPDATE ON works +FOR EACH ROW +WHEN (new.updated_at 
IS NULL) +BEGIN + UPDATE works SET updated_at = CURRENT_TIMESTAMP WHERE id = old.id; +END; + +-- Work identifiers update trigger +CREATE TRIGGER IF NOT EXISTS work_identifiers_updated_at +AFTER UPDATE ON work_identifiers +FOR EACH ROW +WHEN (new.created_at IS NULL) +BEGIN + UPDATE work_identifiers SET created_at = CURRENT_TIMESTAMP WHERE id = old.id; +END; + +-- ============================================================================ +-- VIEWS - For simplified queries +-- ============================================================================ +-- View to join works with their identifiers +CREATE VIEW IF NOT EXISTS works_with_identifiers AS +SELECT + w.id, + w.work_id, + w.title, + w.abstract, + w.publication_year, + w.journal_name, + w.publisher, + w.doi, + w.pmid, + w.pmcid, + w.arxiv_id, + w.dblp_key, + w.openalex_id, + GROUP_CONCAT(DISTINCT wi.scheme || ':' || wi.value, ', ') AS identifiers +FROM works w +LEFT JOIN work_identifiers wi ON w.id = wi.work_id +GROUP BY w.id, w.work_id, w.title, w.abstract, w.publication_year, w.journal_name, w.publisher, w.doi, w.pmid, w.pmcid, w.arxiv_id, w.dblp_key, w.openalex_id; diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..b536eda --- /dev/null +++ b/docs/README.md @@ -0,0 +1,103 @@ +# CiteGeist Source Planning Documentation + +Welcome to the source-planning documentation for CiteGeist. + +## Quick Overview + +The immediate planning question is which additional open bibliographic sources should be incorporated next. 
+ +This documentation therefore emphasizes: + +- the current source baseline already present in the repository +- the next highest-value open sources to add +- a smaller, more realistic source-layer abstraction +- explicit deferral of unrelated database/vector ambitions + +## Documentation Files + +### Planning and Status +- **[source-landscape.md](./source-landscape.md)** - recommended next open bibliographic sources +- **[implementation-progress.md](./implementation-progress.md)** - sources-first progress tracker +- **[phase-completion.md](./phase-completion.md)** - short status summary +- **[file-structure.md](./file-structure.md)** - file structure and module notes + +### Existing Architecture References +- **[architecture-current.md](./architecture-current.md)** - current architecture overview +- **[schema-current.sql](./schema-current.sql)** - existing database schema + +## Current Status + +### Current Baseline +1. Crossref, OpenAlex, PubMed, Europe PMC, Semantic Scholar, DataCite, DBLP, arXiv, and OAI-PMH are already in play. +2. OpenCitations and Unpaywall are now integrated as source-layer additions. +3. The SQLite-based local workflow remains the baseline. + +### Recommended Next Sources +1. OpenAIRE only if repository-acquisition scope expands + +### Explicitly Deferred +1. Database redesign +2. 
pgvector / embedding-first work + +## Source Layer + +The source-layer code now provides: + +- `BibliographicSource` as the common interface +- `SourceRegistry` for known concrete source classes +- `CrossRefSource` as the repaired first concrete plugin +- `OpenCitationsSource` plus DOI-based graph expansion +- `UnpaywallSource` plus DOI-based OA-link enrichment +- `EuropePmcSource` plus biomedical resolver/search support +- `SemanticScholarSource` plus broader biological/physical sciences resolver/search support +- a source catalog with current status and priority order +- compatibility with the existing `SourceClient`-based resolver and expander code + +## Quick Start + +```python +from citegeist.sources import ( + CrossRefSource, + EuropePmcSource, + OpenCitationsSource, + SemanticScholarSource, + SourceRegistry, + UnpaywallSource, + list_source_catalog, + prioritized_source_keys, +) + +registry = SourceRegistry() +registry.register(CrossRefSource, name="crossref", config={}) +registry.register(EuropePmcSource, name="europepmc", config={}) +registry.register(OpenCitationsSource, name="opencitations", config={}) +registry.register(SemanticScholarSource, name="semanticscholar", config={}) +registry.register(UnpaywallSource, name="unpaywall", config={"email": "you@example.org"}) + +source = registry.get("crossref") +catalog = list_source_catalog() +priority = prioritized_source_keys() +``` + +## Tests + +Relevant tests for the refocused source work: + +- `tests/test_sources_plugin.py` +- `tests/test_sources_catalog.py` + +The existing broader repository test suite should continue to pass as the source-layer changes are integrated. + +## Next Steps + +1. Decide whether `OpenAIRE` is worth adding for repository-acquisition breadth. +2. Keep database/vector redesign work deferred unless a source need forces it. + +## License + +Same as the CiteGeist project. 
+ +--- + +**Last Updated:** 2026-04-25 +**Status:** Sources-first plan in effect diff --git a/docs/architecture-current.md b/docs/architecture-current.md new file mode 100644 index 0000000..83b4ecd --- /dev/null +++ b/docs/architecture-current.md @@ -0,0 +1,87 @@ +# CiteGeist Current Architecture + +## Overview +CiteGeist is currently designed as a local BibTeX-native tooling system with: +- BibTeX parsing and storage +- Local text search (FTS5) +- Entry provenance tracking +- Citation graph traversal +- Topic-based expansion + +## Core Modules + +### Source Management +- **sources.py**: `SourceClient` class for HTTP requests with caching and retry logic + - Base HTTP client with JSON/XML/text support + - Built-in retry with exponential backoff + - Cache directory support + +### Metadata Resolution +- **resolve.py**: `MetadataResolver` class for entry resolution + - DOI → CrossRef lookup + - PMID → PubMed lookup + - arXiv, DBLP, OpenAlex lookup + - Title search fallback with best-match selection + - DataCite integration + - Returns `Resolution` objects with provenance + +### Storage +- **storage.py**: `BibliographyStore` class (SQLite) + - Tables: entries, creators, entry_creators, identifiers, relations, topics, entry_topics, field_provenance, relation_provenance + - FTS5 text search integration + - Field-level provenance tracking + - Citation graph support (cites, cited_by edges) + +### BibTeX Processing +- **bibtex.py**: BibEntry dataclass and parsing/rendering + - BibTeX → BibEntry conversion + - BibEntry → BibTeX rendering + - Citation key generation + +### CLI and Server +- **cli.py**: Command-line interface +- **app_server.py**: Local HTTP server for UI/JSON API +- **app_api.py**: JSON API adapter surface + +### Expansion and Discovery +- **expand.py**: Citation graph expansion workflows +- **extract.py**: Plaintext reference extraction +- **bootstrap.py**: Topic bootstrap and expansion + +## Current State Summary + +**Completed/Usable:** +- BibTeX parsing 
and storage +- Identifier-based resolution (DOI, PMID, arXiv, DBLP, OpenAlex) +- Title search with best-match selection +- Citation graph traversal and expansion +- Field provenance tracking +- Local search with FTS5 +- Topic-based discovery workflows + +**Not Yet Implemented (from new roadmap):** +- Plugin-based source architecture +- Multi-source record merging +- PGVector embeddings +- Full-text OA link retrieval +- Semantic Scholar integration +- OpenCitations integration +- Unified API endpoints for multi-source queries + +## Data Flow + +1. **Ingest**: BibTeX file → parse → store in entries table +2. **Resolve**: Entry → resolve_doi/resolve_pmid/resolve_arxiv → fetch metadata → merge with existing +3. **Expand**: Start from entry → traverse citation edges → discover new entries +4. **Search**: Query FTS5 index → retrieve relevant entries +5. **Export**: Entries → render BibTeX → output file + +## Database Schema + +SQLite-based storage with: +- Normalized entry fields +- Creator relationships +- Identifier mapping +- Citation relations +- Topic associations +- Field provenance metadata diff --git a/docs/file-structure.md b/docs/file-structure.md new file mode 100644 index 0000000..3e90b9e --- /dev/null +++ b/docs/file-structure.md @@ -0,0 +1,165 @@ +# CiteGeist Multi-Source File Structure + +**Date:** 2026-04-25 + +## Project Structure + +``` +/home/netuser/dev/CiteGeist/ +├── db/ +│ └── migrations/ +│ └── 0001_multisource.sql ✅ NEW - Multi-source schema +│ +├── docs/ +│ ├── architecture-current.md ✅ NEW - Current architecture docs +│ ├── implementation-progress.md ✅ NEW - Implementation progress tracker +│ ├── schema-current.sql ✅ NEW - Current schema SQL +│ └── file-structure.md ✅ NEW - This file +│ +├── src/citegeist/ +│ ├── sources/ ✅ NEW - Source plugin architecture +│ │ ├── __init__.py ✅ NEW - Package exports +│ │ ├── __all__.py ✅ NEW - Public API +│ │ ├── base.py ✅ NEW - Base BibliographicSource class +│ │ ├── registry.py ✅ NEW - SourceRegistry 
implementation +│ │ ├── crossref.py ✅ NEW - CrossRef source plugin +│ │ └── _old_sources_compat.py ✅ NEW - Backward compatibility +│ │ +│ ├── resolver/ ✅ NEW - Identifier resolution +│ │ ├── __init__.py ✅ NEW - Module exports +│ │ └── identifiers.py ✅ NEW - Extract, normalize, resolve +│ │ +│ ├── db/ ✅ NEW - Database operations +│ │ └── __init__.py 🚧 TO DO - Database client +│ │ +│ ├── ... (existing files) +│ ├── sources.py 📦 Existing - Old SourceClient +│ ├── resolve.py 📦 Existing - MetadataResolver +│ └── storage.py 📦 Existing - BibliographyStore +│ +└── tests/ + ├── test_sources_plugin.py ✅ NEW - Source plugin tests + └── test_resolver_identifiers.py ✅ NEW - Identifier tests +``` + +## Module Documentation + +### New Modules + +#### `src/citegeist/sources/` +Plugin architecture for bibliographic sources. + +**Classes:** +- `BibliographicSource` - Abstract base class for source plugins +- `SourceRecord` - Raw source record dataclass +- `CitationEdge` - Citation relationship dataclass +- `SourceRegistry` - Manages source plugins + +**Plugin:** +- `CrossRefSource` - CrossRef API implementation + +#### `src/citegeist/resolver/` +Identifier extraction, normalization, and resolution. + +**Classes:** +- `IdentifierExtractor` - Extract identifiers from entry fields +- `IdentifierNormalizer` - Normalize identifiers to canonical form +- `IdentifierResolver` - Resolve identifiers with lookup priority + +**Functions:** +- `extract_identifiers()` - Quick identifier extraction +- `normalize_identifier()` - Quick normalization +- `get_primary_identifier()` - Get primary identifier +- `resolve_identifiers()` - Resolve all identifiers + +#### `src/citegeist/db/` +Database operations (to be implemented). + +**Planned:** +- Database client for works table +- Migration runner +- Query builders + +#### `db/migrations/0001_multisource.sql` +Multi-source database schema migration. + +**Tables:** +1. `works` - Canonical work metadata +2. 
`work_identifiers` - Multi-scheme identifiers +3. `source_records` - Raw API responses +4. `citations` - Citation graph +5. `work_embeddings` - Vector embeddings + +### Existing Modules (Preserved) + +- `src/citegeist/sources.py` - Old SourceClient (backward compatible) +- `src/citegeist/resolve.py` - Old MetadataResolver +- `src/citegeist/storage.py` - Old BibliographyStore + +## Test Coverage + +**New Tests:** +- `tests/test_sources_plugin.py` (7 tests) +- `tests/test_resolver_identifiers.py` (17 tests) + +**Total:** 24 tests passing + +## Dependencies + +**New Dependencies Required:** +- No new Python packages (uses stdlib only) + +**Planned Dependencies (Future phases):** +- `pgvector` - PostgreSQL vector extension +- `sentence-transformers` - Local embedding model +- `fastapi` - API framework +- `unpaywall` - OA link retrieval (if needed) + +## Implementation Status + +### Completed (100%) +- ✅ Phase 0: Baseline Audit +- ✅ Phase 1: Source Plugin Architecture +- ✅ Phase 2: Identifier Resolution Layer + +### In Progress (50%) +- 🚧 Phase 3: Database Schema Upgrade + +### Pending (0%) +- ⏳ Phase 4: High-Value Source Integrations +- ⏳ Phase 5: Merge & Deduplication Engine +- ⏳ Phase 6: Citation Graph Construction +- ⏳ Phase 7: Embedding Pipeline +- ⏳ Phase 8: Full-Text Retrieval Layer +- ⏳ Phase 9: API Layer +- ⏳ Phase 10: Ranking & Relevance +- ⏳ Phase 12: Observability & QA +- ⏳ Phase 13: Performance Optimization + +## Quick Start + +```python +# Register a source +from citegeist.sources import SourceRegistry, CrossRefSource + +registry = SourceRegistry() +registry.register(CrossRefSource, name='crossref', config={}) + +# Get source instance +source = registry.get('crossref') +entry = source.lookup_by_doi('10.1234/example') + +# Resolve identifiers +from citegeist.resolver import resolve_identifiers + +fields = {'doi': '10.1234/example', 'title': 'Test'} +resolved = resolve_identifiers(fields) +# Returns [('doi', '10.1234/example'), ('title', 'test title')] +``` 
+ +## Next Steps + +1. ✅ Phase 0-2: Complete +2. 🚧 Phase 3: Implement Python interface for database operations +3. ⏳ Phase 4: Add Unpaywall, Semantic Scholar, OpenCitations integrations +4. ⏳ Phase 5: Build merge engine diff --git a/docs/implementation-progress.md b/docs/implementation-progress.md new file mode 100644 index 0000000..4bdafcc --- /dev/null +++ b/docs/implementation-progress.md @@ -0,0 +1,122 @@ +# CiteGeist Sources-First Progress + +**Last Updated:** 2026-04-25 + +This document tracks the refocused plan for source incorporation. The working question is which additional open bibliographic sources CiteGeist should integrate next, not whether it needs a new storage platform first. + +--- + +## Phase 0: Scope Reframe ✅ COMPLETE + +**Status:** Completed + +**Deliverables:** +- ✅ `/docs/source-landscape.md` - source inventory and recommendation document +- ✅ `/src/citegeist/sources/catalog.py` - code-backed source catalog + +**Completed:** +- Identified which source integrations already exist in the repository +- Split source-expansion planning from database/vector-search ambitions +- Prioritized open-source additions by workflow value + +--- + +## Phase 1: Source Layer Tightening ✅ COMPLETE + +**Status:** Completed + +**Deliverables:** +- ✅ `/src/citegeist/sources/base.py` - Base `BibliographicSource` interface +- ✅ `/src/citegeist/sources/registry.py` - Registry for known concrete sources +- ✅ `/src/citegeist/sources/crossref.py` - Repaired CrossRef source implementation +- ✅ `/src/citegeist/sources/catalog.py` - Open-source inventory +- ✅ `/src/citegeist/sources/__init__.py` - Package initialization +- ✅ `/tests/test_sources_plugin.py` - Source plugin tests +- ✅ `/tests/test_sources_catalog.py` - Source catalog and registry tests + +**Completed:** +- ✅ Created `BibliographicSource` abstract base class +- ✅ Repaired `SourceRegistry` so config-backed loading resolves real source classes +- ✅ Fixed `CrossRefSource` normalization for direct lookup and 
search-style payloads +- ✅ Replaced path-specific compatibility loading with repo-relative loading +- ✅ Added a source catalog that captures current status and next-priority sources + +**Features:** +- Abstract interface for source plugins +- Registry for known source discovery and instantiation +- Config-driven enable/disable for known source types +- Source prioritization metadata +- Compatibility with the existing `SourceClient`-based resolver/expander code + +--- + +## Current Integrated Sources ✅ AVAILABLE + +- `Crossref` +- `OpenAlex` +- `OpenCitations` +- `Unpaywall` +- `PubMed` +- `Europe PMC` +- `Semantic Scholar` +- `DataCite` +- `DBLP` +- `arXiv` +- `OAI-PMH` + +These are already sufficient for a credible local enrichment-and-discovery workflow. The next work should complement them rather than restart infrastructure underneath them. + +--- + +## Phase 2: Next Source Additions 🚧 IN PROGRESS + +**Status:** In Progress + +**Priority Order:** +1. `OpenAIRE` only if repository-acquisition scope expands + +**Completed Deliverables:** +- ✅ OpenCitations adapter for DOI citation/reference lookup +- ✅ OpenCitations graph expansion support in CLI and topic expansion flows +- ✅ Unpaywall adapter for DOI OA-link enrichment +- ✅ `enrich-oa` CLI flow for applying OA metadata to stored entries +- ✅ Europe PMC biomedical resolver/search integration +- ✅ Semantic Scholar broad-science resolver/search integration + +**Planned Deliverables:** +- ⏳ Decide whether repository-acquisition breadth needs another dedicated source + +**Rationale:** +- `OpenCitations` now improves open citation-edge coverage +- `Unpaywall` now improves access-link enrichment +- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage +- `Semantic Scholar` now improves broader biological and physical sciences coverage +- neither requires a new database architecture to become useful + +--- + +## Phase 3: Optional Source Evaluation ⏳ PLANNED + +**Status:** Planned + +- `OpenAIRE` + 
+**Decision Rule:** +- add them only if they solve a concrete discovery or acquisition gap that current open sources do not already cover well + +--- + +## Explicitly Deferred + +- second-schema redesign work +- pgvector integration +- embedding-first retrieval +- broad canonical-work reconstruction + +--- + +## Summary + +**Completed:** scope reframe and source-layer cleanup +**Planned next:** `OpenAIRE` reevaluation +**Deferred:** database/vector expansion work not required by the source question diff --git a/docs/phase-completion.md b/docs/phase-completion.md new file mode 100644 index 0000000..e61af77 --- /dev/null +++ b/docs/phase-completion.md @@ -0,0 +1,111 @@ +# Sources-First Status + +**Current Focus:** identify and prioritize the next open bibliographic sources to add, using the existing SQLite-based workflow as the baseline. + +--- + +## Phase Matrix + +| Phase | Title | Status | Outcome | +|-------|-------|--------|---------| +| **0** | Scope Reframe | ✅ Complete | Planning now answers the source question directly | +| **1** | Source Layer Tightening | ✅ Complete | Registry, CrossRef plugin, compatibility seam, and source catalog repaired | +| **2** | Next Open Source Additions | 🚧 In Progress | OpenCitations, Unpaywall, and Europe PMC integrated | +| **3** | Optional Source Evaluation | ⏳ Planned | OpenAIRE evaluated later if acquisition breadth matters | +| **D** | Database / Vector Expansion | ⏸ Deferred | Not required for the current source-incorporation decision | + +--- + +## Test Coverage Summary + +``` +✅ test_sources_plugin.py +✅ test_sources_catalog.py +✅ existing full suite still expected to pass +``` + +--- + +## Key Artifacts + +### Documentation +``` +docs/ +├── source-landscape.md ✅ Source inventory and recommendations +├── implementation-progress.md ✅ Sources-first progress tracker +└── phase-completion.md ✅ Short status summary +``` + +### Source Layer +``` +src/citegeist/sources/ +├── base.py ✅ Base source interface +├── catalog.py ✅ 
Source inventory in code +├── registry.py ✅ Registry for known source classes +├── crossref.py ✅ Repaired CrossRef plugin +└── _old_sources_compat.py ✅ Repo-relative compatibility bridge +``` + +### Tests +``` +tests/ +├── test_sources_plugin.py ✅ Source plugin tests +└── test_sources_catalog.py ✅ Source catalog/registry tests +``` + +--- + +## Key Features Implemented + +- ✅ Source catalog covering current and candidate open sources +- ✅ Config-driven registry loading for known real source classes +- ✅ CrossRef normalization that works for both single-record and search-result payloads +- ✅ Compatibility bridge that no longer depends on one checkout path +- ✅ OpenCitations DOI-based graph expansion with CLI support +- ✅ Unpaywall OA-link enrichment with CLI support +- ✅ Europe PMC biomedical resolver/search support +- ✅ Semantic Scholar broad-science resolver/search support + +--- + +## Next Milestones + +### Immediate +1. Decide whether repository-acquisition scope justifies `OpenAIRE` +2. Keep the OA-enrichment flow aligned with review/export needs +3. Keep graph-source scope disciplined as broader coverage grows + +### Later +1. Evaluate `Semantic Scholar` +2. Evaluate `OpenAIRE` +3. Revisit database/vector work only if a concrete source need demands it + +--- + +## Success Metrics + +### Completed +- ✅ Planning now matches the actual source question +- ✅ Source-layer defects from the first pass have been corrected +- ✅ OpenCitations is now a working integrated source +- ✅ Unpaywall is now a working integrated source +- ✅ Europe PMC is now a working integrated source +- ✅ Semantic Scholar is now a working integrated source +- ✅ The next source priorities are explicit + +### Planned +- ⏳ Better source selection discipline before adding more integrations + +--- + +## Recommendations + +1. Treat the current SQLite/FTS workflow as the baseline, not as a blocker. +2. 
Add source integrations only when they materially improve bibliographic coverage, citation coverage, or open-access linkage. +3. Keep database/vector work explicitly subordinate to source-incorporation needs. + +--- + +**Last Updated:** 2026-04-25 +**Status:** Sources-first plan in effect +**Confidence:** High diff --git a/docs/schema-current.sql b/docs/schema-current.sql new file mode 100644 index 0000000..cac6f2d --- /dev/null +++ b/docs/schema-current.sql @@ -0,0 +1,131 @@ +-- CiteGeist Current Schema (SQLite) + +-- Entries table +CREATE TABLE IF NOT EXISTS entries ( + id INTEGER PRIMARY KEY, + citation_key TEXT NOT NULL UNIQUE, + entry_type TEXT NOT NULL, + review_status TEXT NOT NULL DEFAULT 'draft', + title TEXT, + year TEXT, + journal TEXT, + booktitle TEXT, + publisher TEXT, + abstract TEXT, + keywords TEXT, + url TEXT, + doi TEXT, + isbn TEXT, + fulltext TEXT, + raw_bibtex TEXT, + extra_fields_json TEXT NOT NULL DEFAULT '{}', + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +-- Creators table +CREATE TABLE IF NOT EXISTS creators ( + id INTEGER PRIMARY KEY, + full_name TEXT NOT NULL UNIQUE, + family_name TEXT, + given_names TEXT +); + +-- Entry-Creators relationship +CREATE TABLE IF NOT EXISTS entry_creators ( + entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE, + creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE, + role TEXT NOT NULL, + ordinal INTEGER NOT NULL, + PRIMARY KEY (entry_id, role, ordinal) +); + +-- Identifiers table +CREATE TABLE IF NOT EXISTS identifiers ( + entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE, + scheme TEXT NOT NULL, + value TEXT NOT NULL, + PRIMARY KEY (scheme, value) +); + +-- Relations table (citation graph) +CREATE TABLE IF NOT EXISTS relations ( + source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE, + target_citation_key TEXT NOT NULL, + relation_type TEXT NOT NULL, + PRIMARY KEY 
(source_entry_id, target_citation_key, relation_type) +); + +-- Topics table +CREATE TABLE IF NOT EXISTS topics ( + id INTEGER PRIMARY KEY, + slug TEXT NOT NULL UNIQUE, + name TEXT NOT NULL, + source_type TEXT NOT NULL, + source_url TEXT, + expansion_phrase TEXT, + suggested_phrase TEXT, + phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed', + phrase_review_notes TEXT, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +-- Entry-Topics relationship +CREATE TABLE IF NOT EXISTS entry_topics ( + entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE, + topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE, + source_label TEXT NOT NULL, + confidence REAL, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (entry_id, topic_id) +); + +-- Field Provenance table +CREATE TABLE IF NOT EXISTS field_provenance ( + id INTEGER PRIMARY KEY, + entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE, + field_name TEXT NOT NULL, + field_value TEXT, + source_type TEXT NOT NULL, + source_label TEXT NOT NULL, + operation TEXT NOT NULL, + confidence REAL, + recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +-- Relation Provenance table +CREATE TABLE IF NOT EXISTS relation_provenance ( + id INTEGER PRIMARY KEY, + source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE, + target_citation_key TEXT NOT NULL, + relation_type TEXT NOT NULL, + source_type TEXT NOT NULL, + source_label TEXT NOT NULL, + confidence REAL, + recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +-- Full-text Search (FTS5) +CREATE VIRTUAL TABLE IF NOT EXISTS entries_fts USING fts5( + title, + abstract, + keywords, + content='entries', + content_rowid='id' +); + +-- Trigger to sync entries with FTS +CREATE TRIGGER IF NOT EXISTS entries_ai AFTER INSERT ON entries BEGIN + INSERT INTO entries_fts(rowid, title, abstract, keywords) + VALUES (new.id, new.title, new.abstract, 
new.keywords); +END; + +CREATE TRIGGER IF NOT EXISTS entries_ad AFTER DELETE ON entries BEGIN + DELETE FROM entries_fts WHERE rowid = old.id; +END; + +CREATE TRIGGER IF NOT EXISTS entries_au AFTER UPDATE ON entries BEGIN + UPDATE entries_fts SET title = new.title, abstract = new.abstract, keywords = new.keywords + WHERE rowid = new.id; +END; diff --git a/docs/source-landscape.md b/docs/source-landscape.md new file mode 100644 index 0000000..2b19aac --- /dev/null +++ b/docs/source-landscape.md @@ -0,0 +1,131 @@ +# Open Bibliographic Source Landscape + +This document answers the question that prompted the recent planning work: which additional open bibliographic sources are worth incorporating into CiteGeist, given the sources it already uses? + +## Current Baseline + +CiteGeist already has useful source coverage for a local BibTeX-first workflow: + +- `Crossref`: DOI lookup, title search, and reference-list expansion. +- `OpenAlex`: work lookup, title/discovery search, and citation-graph expansion. +- `PubMed`: authoritative biomedical metadata lookup by PMID and title search fallback. +- `Europe PMC`: biomedical metadata/fulltext complement to PubMed. +- `Semantic Scholar`: broad cross-domain metadata with strong biological and physical sciences coverage. +- `DataCite`: DOI-backed dataset/report/non-article metadata. +- `DBLP`: strong computer-science metadata. +- `arXiv`: preprint metadata. +- `OAI-PMH`: repository harvesting for theses, dissertations, and institutional collections. + +That means the immediate gap is no longer “get any scholarly metadata at all.” The immediate gap is to add the next highest-value open sources without destabilizing the existing ingest, review, and export pipeline. + +## Recommended Priorities + +### OpenCitations + +Why: + +- It directly improves open citation-edge coverage. +- It fits CiteGeist's graph-discovery workflow better than another generic metadata source. +- It complements OpenAlex rather than replacing it. 
+ +Expected role: + +- DOI-to-citations lookup +- DOI-to-references lookup +- provenance for citation edges + +Status: + +- now integrated as a DOI-based citation/reference source in the source layer and graph expansion flow + +Main risk: + +- coverage is narrower than OpenAlex, so merge rules need to treat it as an additional edge source rather than a primary metadata authority. + +### Unpaywall + +Why: + +- It solves a different problem from Crossref/OpenAlex: full-text access and OA status. +- It improves the “can I get the paper?” part of the workflow without forcing a storage redesign. + +Expected role: + +- DOI-to-best-open-access-link lookup +- OA status enrichment + +Status: + +- now integrated as an OA-link enrichment source with a dedicated `enrich-oa` CLI flow + +Main risk: + +- It should remain an access-link enrichment layer, not become entangled with identity resolution logic. + +### Europe PMC + +Why: + +- It is valuable for biomedical and life-sciences use cases. +- It complements PubMed with richer open-access and citation-related information. + +Expected role: + +- domain-specific metadata enrichment +- biomedical search +- OA/full-text linkage + +Status: + +- now integrated as a biomedical resolver/search complement to `PubMed` + +Main risk: + +- this should remain a domain-specific source, not be treated as a universal resolver. 
+ +### Semantic Scholar + +Pros: + +- good graph and relevance signals +- useful for discovery quality + +Status: + +- now integrated as a broad resolver/search complement with good biological and physical sciences coverage + +Main risk: + +- rate limits and product-policy changes still matter more here than for the more explicitly open bibliographic sources + +## Evaluate But Do Not Make Core Yet + +### OpenAIRE + +Pros: + +- strong repository and OA/project linkage +- good for European repository acquisition + +Cons: + +- better suited to corpus acquisition than first-line metadata resolution + +Recommendation: + +- treat as an acquisition adapter, not an immediate resolver target + +## What Not To Prioritize Right Now + +### Database Redesign + +The repository already has a working SQLite storage model and FTS-backed local workflow. A second schema track should not lead the next phase of work unless a concrete source integration is blocked on it. + +### Vector Search + +Optional semantic ranking may become useful later, but it was not the motivating question and does not need to be a prerequisite for source incorporation. + +## Suggested Execution Order + +1. Keep the source abstraction aligned with sources already in use. +2. Revisit `OpenAIRE` after the current source additions settle. diff --git a/new-roadmap.md b/new-roadmap.md new file mode 100644 index 0000000..b2e61cf --- /dev/null +++ b/new-roadmap.md @@ -0,0 +1,113 @@ +# CiteGeist Roadmap: Sources-First Expansion + +## Purpose + +The primary question is not “how do we redesign CiteGeist around a new storage engine?” The primary question is “which additional open bibliographic sources should CiteGeist incorporate next?” + +This roadmap treats the current SQLite-based local workflow as the baseline and focuses on source evaluation, source integration order, and reviewable source behavior. 
+ +## Baseline + +Already present in the repository: + +- local BibTeX ingest, review, export, and graph traversal +- metadata resolution from `Crossref`, `PubMed`, `Europe PMC`, `OpenAlex`, `Semantic Scholar`, `DBLP`, `arXiv`, and `DataCite` +- citation-graph expansion using `Crossref` and `OpenAlex` +- repository harvesting via `OAI-PMH` + +That means the next planning step is source prioritization, not another platform pivot. + +## Phase 0: Reframe Scope + +Goal: + +Put source-incorporation decisions ahead of database and vector-search ambitions. + +Tasks: + +- [x] identify which source integrations already exist +- [x] separate “source expansion” work from “new database/vector stack” work +- [x] document the source landscape and recommended order + +Deliverables: + +- `/docs/source-landscape.md` +- `/src/citegeist/sources/catalog.py` + +## Phase 1: Tighten The Source Layer + +Goal: + +Make the new source abstraction useful for the repository that already exists, rather than speculative infrastructure. + +Tasks: + +- [x] keep the compatibility bridge to the existing `SourceClient` +- [x] fix the initial `CrossRefSource` implementation so normalization works +- [x] make config-driven registry loading work for known concrete sources +- [x] add a code-backed source catalog for planning and prioritization + +Deliverables: + +- `/src/citegeist/sources/base.py` +- `/src/citegeist/sources/registry.py` +- `/src/citegeist/sources/crossref.py` +- `/src/citegeist/sources/catalog.py` + +## Phase 2: Highest-Value Open Source Additions + +Goal: + +Incorporate the next open sources that materially improve the current workflow. + +Priority order: + +1. 
`OpenAIRE` only if repository-acquisition scope expands + +Tasks: + +- [x] add `OpenCitations` DOI-to-citation and DOI-to-reference lookup +- [x] merge `OpenCitations` edges into the existing graph-expansion workflow with provenance +- [x] add `Unpaywall` DOI-to-OA-link enrichment +- [x] expose OA-link enrichment in a dedicated CLI flow +- [x] add `Europe PMC` as a biomedical metadata/fulltext complement to `PubMed` +- [x] add `Semantic Scholar` as a broader scientific metadata complement across biological and physical sciences + +Why these first: + +- `OpenCitations` directly answers the open-citation-coverage gap +- `Unpaywall` now solves access-link enrichment without forcing a storage redesign +- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage without changing the storage model +- `Semantic Scholar` now improves broader biological and physical sciences coverage without changing the storage model + +## Phase 3: Evaluate Optional Sources, Do Not Commit Prematurely + +Goal: + +Assess sources that may be useful, but are not clearly the next source-first move. + +Candidates: + +- `OpenAIRE` + +Tasks: + +- [ ] document API limits, openness constraints, and integration risk +- [ ] decide whether each source belongs in core resolution, graph expansion, or corpus acquisition +- [ ] avoid adding sources that duplicate existing coverage without a clear payoff + +## Deferred Work + +These are valid future ideas, but they are not the current planning driver: + +- a second database schema +- pgvector integration +- embedding-first search +- large-scale canonical-work reconstruction + +The repository already has a working local storage/search path. Those ideas should only return to the front of the plan if a concrete source-integration need forces them there. + +## Immediate Next Steps + +1. Land the source inventory and source-layer cleanup. +2. Reassess whether `OpenAIRE` is worth adding for repository-acquisition breadth. 
diff --git a/src/citegeist/__init__.py b/src/citegeist/__init__.py index 8ced583..5b24fdb 100644 --- a/src/citegeist/__init__.py +++ b/src/citegeist/__init__.py @@ -2,7 +2,7 @@ from .app_api import LiteratureExplorerApi from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs from .bibtex import BibEntry, parse_bibtex from .bootstrap import BootstrapResult, Bootstrapper -from .expand import CrossrefExpander, OpenAlexExpander +from .expand import CrossrefExpander, OpenAlexExpander, OpenCitationsExpander from .extract import ( available_extraction_backends, check_extraction_comparison_summary, @@ -16,6 +16,10 @@ from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet from .llm_verify import VerificationLlmClient, VerificationLlmConfig from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .sources import SourceClient +from .sources import EuropePmcSource +from .sources import OpenLibrarySource +from .sources import SemanticScholarSource +from .sources import UnpaywallSource from .storage import BibliographyStore from .verify import BibliographyVerifier, VerificationResult, VerificationMatch @@ -31,10 +35,15 @@ __all__ = [ "LiteratureExplorerApi", "MetadataResolver", "OpenAlexExpander", + "OpenCitationsExpander", "OaiPmhHarvester", "OaiMetadataFormat", "OaiSet", "SourceClient", + "EuropePmcSource", + "OpenLibrarySource", + "SemanticScholarSource", + "UnpaywallSource", "VerificationLlmClient", "VerificationLlmConfig", "VerificationMatch", diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 3ff99b0..b280f6f 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -173,6 +173,13 @@ def build_parser() -> argparse.ArgumentParser: resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources") resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich") + enrich_oa_parser = subparsers.add_parser( + "enrich-oa", + help="Enrich 
DOI-bearing entries with Unpaywall OA link metadata", + ) + enrich_oa_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich") + enrich_oa_parser.add_argument("--email", help="Email address required by the Unpaywall API") + resolve_stubs_parser = subparsers.add_parser( "resolve-stubs", help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates", @@ -237,7 +244,7 @@ def build_parser() -> argparse.ArgumentParser: expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand") expand_parser.add_argument( "--source", - choices=["crossref", "openalex"], + choices=["crossref", "openalex", "opencitations"], default="crossref", help="Graph expansion source", ) @@ -260,7 +267,7 @@ def build_parser() -> argparse.ArgumentParser: ) expand_topic_parser.add_argument( "--source", - choices=["crossref", "openalex"], + choices=["crossref", "openalex", "opencitations"], default="openalex", help="Topic graph expansion source", ) @@ -749,6 +756,8 @@ def main(argv: list[str] | None = None) -> int: ) if args.command == "resolve": return _run_resolve(store, args.citation_keys) + if args.command == "enrich-oa": + return _run_enrich_oa(store, args.citation_keys, args.email) if args.command == "resolve-stubs": return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview) if args.command == "graph": @@ -1215,6 +1224,72 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int: return exit_code +def _run_enrich_oa(store: BibliographyStore, citation_keys: list[str], email: str | None) -> int: + from .sources import UnpaywallSource + + source = UnpaywallSource(config={"email": email} if email else {}) + if not source.is_available(): + print("Unpaywall enrichment requires --email or UNPAYWALL_EMAIL", file=sys.stderr) + return 1 + + results: list[dict[str, object]] = [] + total = len(citation_keys) + for index, citation_key in enumerate(citation_keys, 
start=1): + _print_progress("enriching OA", index, total, citation_key) + existing = store.get_entry(citation_key) + if existing is None: + results.append({"citation_key": citation_key, "status": "missing"}) + continue + doi = str(existing.get("doi") or "").strip() + if not doi: + results.append({"citation_key": citation_key, "status": "no_doi"}) + continue + + enriched = source.lookup_by_doi(doi) + if enriched is None: + results.append({"citation_key": citation_key, "status": "no_record", "doi": doi}) + continue + + merged_fields: dict[str, str] = {} + for key, value in existing.items(): + if isinstance(value, str): + merged_fields[key] = value + merged_fields.update(enriched.fields) + + for field_name in ("title", "year", "author", "journal", "booktitle", "publisher", "abstract", "keywords"): + existing_value = str(existing.get(field_name) or "").strip() + if existing_value: + merged_fields[field_name] = existing_value + + replacement = BibEntry( + entry_type=str(existing.get("entry_type") or "misc"), + citation_key=citation_key, + fields=merged_fields, + ) + store.replace_entry( + citation_key, + replacement, + source_type="oa_enrich", + source_label=f"unpaywall:doi:{doi}", + review_status=str(existing.get("review_status") or "enriched"), + ) + updated = store.get_entry(citation_key) or {} + results.append( + { + "citation_key": citation_key, + "status": "enriched", + "doi": doi, + "is_oa": updated.get("is_oa"), + "oa_status": updated.get("oa_status"), + "best_oa_url": updated.get("best_oa_url"), + "best_oa_pdf_url": updated.get("best_oa_pdf_url"), + } + ) + + print(json.dumps(results, indent=2)) + return 0 + + def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool: existing = store.get_entry(citation_key) if existing is None: @@ -1664,6 +1739,15 @@ def _run_expand( for relation_name in _expand_relation_types(relation) for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit) ] + elif 
source == "opencitations": + from .expand import OpenCitationsExpander + + expander = OpenCitationsExpander() + expand_fn = lambda key: [ + item + for relation_name in _expand_relation_types(relation) + for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit) + ] else: print(f"Unsupported expansion source: {source}", file=sys.stderr) return 1 diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py index 55f3542..057ac7f 100644 --- a/src/citegeist/expand.py +++ b/src/citegeist/expand.py @@ -9,6 +9,7 @@ from urllib.parse import quote, urlencode from .bibtex import BibEntry, parse_bibtex from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob from .resolve import MetadataResolver, merge_entries +from .sources import OpenCitationsSource from .storage import BibliographyStore @@ -219,14 +220,94 @@ class OpenAlexExpander: return _normalize_openalex_id(results[0].get("id", "")) +class OpenCitationsExpander: + def __init__(self, resolver: MetadataResolver | None = None, source: OpenCitationsSource | None = None) -> None: + self.resolver = resolver or MetadataResolver() + self.source = source or OpenCitationsSource(config={"source_client": self.resolver.source_client}) + + def expand_entry( + self, + store: BibliographyStore, + citation_key: str, + relation_type: str = "cites", + limit: int = 25, + ) -> list[ExpansionResult]: + entry = store.get_entry(citation_key) + if entry is None: + return [] + + doi = str(entry.get("doi") or "") + if not doi: + return [] + + edges = self.source.get_citations(doi, relation_type=relation_type, limit=limit) + results: list[ExpansionResult] = [] + for edge in edges: + discovered_doi = edge.target_work_id[4:] if relation_type == "cites" else edge.source_work_id[4:] + discovered = self._lookup_discovered_entry(discovered_doi) + if discovered is None: + discovered = _opencitations_stub_entry(discovered_doi, citation_key) + + existing_key = 
_existing_entry_key_for_discovered_work(store, discovered) + target_key = existing_key or discovered.citation_key + created = False + if existing_key is None and store.get_entry(discovered.citation_key) is None: + store.upsert_entry( + discovered, + raw_bibtex=None, + source_type="graph_expand", + source_label=edge.source_label, + review_status="draft", + ) + store.connection.commit() + created = True + + if relation_type == "cites": + source_key = citation_key + relation_target_key = target_key + else: + source_key = target_key + relation_target_key = citation_key + + store.add_relation( + source_key, + relation_target_key, + "cites", + source_type="graph_expand", + source_label=edge.source_label, + confidence=edge.confidence, + ) + results.append( + ExpansionResult( + source_citation_key=source_key, + discovered_citation_key=target_key, + created_entry=created, + relation_type=relation_type, + source_label=edge.source_label, + ) + ) + return results + + def _lookup_discovered_entry(self, doi: str) -> BibEntry | None: + resolution = self.resolver.resolve_doi(doi) + if resolution is not None: + return resolution.entry + resolution = self.resolver.resolve_datacite_doi(doi) + if resolution is not None: + return resolution.entry + return self.source.lookup_by_doi(doi) + + class TopicExpander: def __init__( self, crossref_expander: CrossrefExpander | None = None, openalex_expander: OpenAlexExpander | None = None, + opencitations_expander: OpenCitationsExpander | None = None, ) -> None: self.crossref_expander = crossref_expander or CrossrefExpander() self.openalex_expander = openalex_expander or OpenAlexExpander() + self.opencitations_expander = opencitations_expander or OpenCitationsExpander() self.last_run_meta: dict[str, object] = {} def expand_topic( @@ -362,6 +443,17 @@ class TopicExpander: ) -> list[tuple[ExpansionResult, dict[str, object] | None]]: if source == "crossref": expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key) + elif 
source == "opencitations": + expansion_rows = [] + for relation_name in _expand_relation_types(relation_type): + expansion_rows.extend( + self.opencitations_expander.expand_entry( + store, + citation_key, + relation_type=relation_name, + limit=limit, + ) + ) else: expansion_rows: list[ExpansionResult] = [] for relation_name in _expand_relation_types(relation_type): @@ -385,6 +477,11 @@ class TopicExpander: ) -> list[tuple[ExpansionResult, dict[str, object]]]: if source == "crossref": return self._preview_crossref_discoveries(store, citation_key, limit) + if source == "opencitations": + rows: list[tuple[ExpansionResult, dict[str, object]]] = [] + for relation_name in _expand_relation_types(relation_type): + rows.extend(self._preview_opencitations_discoveries(store, citation_key, relation_name, limit)) + return rows rows: list[tuple[ExpansionResult, dict[str, object]]] = [] for relation_name in _expand_relation_types(relation_type): rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit)) @@ -467,6 +564,40 @@ class TopicExpander: ) return rows + def _preview_opencitations_discoveries( + self, + store: BibliographyStore, + citation_key: str, + relation_type: str, + limit: int, + ) -> list[tuple[ExpansionResult, dict[str, object]]]: + entry = store.get_entry(citation_key) + if entry is None or not entry.get("doi"): + return [] + doi = str(entry["doi"]) + edges = self.opencitations_expander.source.get_citations(doi, relation_type=relation_type, limit=limit) + rows: list[tuple[ExpansionResult, dict[str, object]]] = [] + for edge in edges: + discovered_doi = edge.target_work_id[4:] if relation_type == "cites" else edge.source_work_id[4:] + discovered = self.opencitations_expander._lookup_discovered_entry(discovered_doi) + if discovered is None: + discovered = _opencitations_stub_entry(discovered_doi, citation_key) + existing_key = _existing_entry_key_for_discovered_work(store, discovered) + target_key = existing_key or 
discovered.citation_key + rows.append( + ( + ExpansionResult( + source_citation_key=citation_key if relation_type == "cites" else target_key, + discovered_citation_key=target_key, + created_entry=existing_key is None and store.get_entry(discovered.citation_key) is None, + relation_type=relation_type, + source_label=edge.source_label, + ), + dict(discovered.fields), + ) + ) + return rows + def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: title = _crossref_reference_title(reference, ordinal) @@ -567,6 +698,20 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int return f"{family}{year or 'nd'}{first_word}{ordinal}" +def _opencitations_stub_entry(doi: str, source_citation_key: str) -> BibEntry: + suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() + return BibEntry( + entry_type="misc", + citation_key=f"doi{suffix}", + fields={ + "title": f"Referenced work for DOI {doi}", + "doi": doi, + "url": f"https://doi.org/{doi}", + "note": f"discovered_from = {{{source_citation_key}}}", + }, + ) + + def _normalize_text(value: str) -> str: without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value)) normalized = " ".join(without_tags.split()) diff --git a/src/citegeist/resolve.py b/src/citegeist/resolve.py index e167a73..56db72a 100644 --- a/src/citegeist/resolve.py +++ b/src/citegeist/resolve.py @@ -7,17 +7,38 @@ import re import urllib.error import urllib.parse import xml.etree.ElementTree as ET -from dataclasses import dataclass +from dataclasses import dataclass, field from .bibtex import BibEntry, parse_bibtex +from .sources.europepmc import EuropePmcSource +from .sources.openlibrary import OpenLibrarySource +from .sources.semanticscholar import SemanticScholarSource from .sources import SourceClient +@dataclass(slots=True) +class ResolutionAttempt: + source_name: str + strategy: str + query_value: str + matched: bool + candidate_count: int | None = None + source_label: str = "" + error: str 
= "" + + @dataclass(slots=True) class Resolution: entry: BibEntry source_type: str source_label: str + attempts: list[ResolutionAttempt] = field(default_factory=list) + + +@dataclass(slots=True) +class ResolutionOutcome: + resolution: Resolution | None + attempts: list[ResolutionAttempt] class MetadataResolver: @@ -31,70 +52,109 @@ class MetadataResolver: ) -> None: self.user_agent = user_agent self.source_client = source_client or SourceClient(user_agent=user_agent) + self.europepmc = EuropePmcSource(config={"source_client": self.source_client, "user_agent": user_agent}) + self.openlibrary = OpenLibrarySource(config={"source_client": self.source_client, "user_agent": user_agent}) + self.semanticscholar = SemanticScholarSource(config={"user_agent": user_agent}) self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "") self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist") self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "") def resolve_entry(self, entry: BibEntry) -> Resolution | None: + return self.resolve_entry_with_trace(entry).resolution + + def resolve_entry_with_trace(self, entry: BibEntry) -> ResolutionOutcome: + attempts: list[ResolutionAttempt] = [] if doi := entry.fields.get("doi"): - resolved = self.resolve_doi(doi) + resolved = self._attempt_direct_resolution(attempts, "crossref", "doi_lookup", doi, self.resolve_doi) if resolved is not None: - return resolved - resolved = self.resolve_datacite_doi(doi) + return ResolutionOutcome(resolution=resolved, attempts=attempts) + resolved = self._attempt_direct_resolution( + attempts, "datacite", "doi_lookup", doi, self.resolve_datacite_doi + ) if resolved is not None: - return resolved + return ResolutionOutcome(resolution=resolved, attempts=attempts) + resolved = self._attempt_direct_resolution( + attempts, "europepmc", "doi_lookup", doi, self.resolve_europepmc_doi + ) + if resolved is 
not None: + return ResolutionOutcome(resolution=resolved, attempts=attempts) + resolved = self._attempt_direct_resolution( + attempts, "semanticscholar", "doi_lookup", doi, self.resolve_semanticscholar_doi + ) + if resolved is not None: + return ResolutionOutcome(resolution=resolved, attempts=attempts) if pmid := entry.fields.get("pmid"): - resolved = self.resolve_pmid(pmid) + resolved = self._attempt_direct_resolution(attempts, "pubmed", "pmid_lookup", pmid, self.resolve_pmid) if resolved is not None: - return resolved + return ResolutionOutcome(resolution=resolved, attempts=attempts) if openalex_id := entry.fields.get("openalex"): - resolved = self.resolve_openalex(openalex_id) + resolved = self._attempt_direct_resolution( + attempts, "openalex", "work_lookup", openalex_id, self.resolve_openalex + ) if resolved is not None: - return resolved + return ResolutionOutcome(resolution=resolved, attempts=attempts) if dblp_key := entry.fields.get("dblp"): - resolved = self.resolve_dblp(dblp_key) + resolved = self._attempt_direct_resolution(attempts, "dblp", "key_lookup", dblp_key, self.resolve_dblp) if resolved is not None: - return resolved + return ResolutionOutcome(resolution=resolved, attempts=attempts) if arxiv_id := entry.fields.get("arxiv"): - resolved = self.resolve_arxiv(arxiv_id) + resolved = self._attempt_direct_resolution( + attempts, "arxiv", "id_lookup", arxiv_id, self.resolve_arxiv + ) if resolved is not None: - return resolved + return ResolutionOutcome(resolution=resolved, attempts=attempts) if title := entry.fields.get("title"): - resolved = self.search_crossref_best_match( - title=title, - author_text=entry.fields.get("author", ""), - year=entry.fields.get("year", ""), + author_text = entry.fields.get("author", "") + year = entry.fields.get("year", "") + resolved = self._attempt_title_search_resolution( + attempts, "crossref", title, author_text, year, self.search_crossref ) if resolved is not None: - return resolved - resolved = 
self.search_datacite_best_match( - title=title, - author_text=entry.fields.get("author", ""), - year=entry.fields.get("year", ""), + return ResolutionOutcome(resolution=resolved, attempts=attempts) + resolved = self._attempt_title_search_resolution( + attempts, "datacite", title, author_text, year, self.search_datacite ) if resolved is not None: - return resolved - resolved = self.search_openalex_best_match( - title=title, - author_text=entry.fields.get("author", ""), - year=entry.fields.get("year", ""), + return ResolutionOutcome(resolution=resolved, attempts=attempts) + resolved = self._attempt_title_search_resolution( + attempts, "openalex", title, author_text, year, self.search_openalex ) if resolved is not None: - return resolved - resolved = self.search_pubmed_best_match( - title=title, - author_text=entry.fields.get("author", ""), - year=entry.fields.get("year", ""), + return ResolutionOutcome(resolution=resolved, attempts=attempts) + resolved = self._attempt_title_search_resolution( + attempts, "pubmed", title, author_text, year, self.search_pubmed ) if resolved is not None: - return resolved + return ResolutionOutcome(resolution=resolved, attempts=attempts) + resolved = self._attempt_title_search_resolution( + attempts, "europepmc", title, author_text, year, self.search_europepmc + ) + if resolved is not None: + return ResolutionOutcome(resolution=resolved, attempts=attempts) + resolved = self._attempt_title_search_resolution( + attempts, "semanticscholar", title, author_text, year, self.search_semanticscholar + ) + if resolved is not None: + return ResolutionOutcome(resolution=resolved, attempts=attempts) + if _entry_prefers_catalog_search(entry): + resolved = self._attempt_title_search_resolution( + attempts, + "openlibrary", + title, + author_text, + year, + self.search_openlibrary, + selector=_select_best_catalog_title_match, + ) + if resolved is not None: + return ResolutionOutcome(resolution=resolved, attempts=attempts) - return None + return 
ResolutionOutcome(resolution=None, attempts=attempts) def resolve_doi(self, doi: str) -> Resolution | None: encoded = urllib.parse.quote(doi, safe="") @@ -124,19 +184,7 @@ class MetadataResolver: author_text: str = "", year: str = "", ) -> Resolution | None: - candidate = _select_best_title_match( - self.search_crossref(title, limit=5), - title=title, - author_text=author_text, - year=year, - ) - if candidate is None: - return None - return Resolution( - entry=candidate, - source_type="resolver", - source_label=f"crossref:search:{title}", - ) + return self._search_best_match_resolution("crossref", title, author_text, year, self.search_crossref) def resolve_dblp(self, dblp_key: str) -> Resolution | None: encoded_key = urllib.parse.quote(dblp_key, safe="/:") @@ -245,19 +293,7 @@ class MetadataResolver: author_text: str = "", year: str = "", ) -> Resolution | None: - candidate = _select_best_title_match( - self.search_datacite(title, limit=5), - title=title, - author_text=author_text, - year=year, - ) - if candidate is None: - return None - return Resolution( - entry=candidate, - source_type="resolver", - source_label=f"datacite:search:{title}", - ) + return self._search_best_match_resolution("datacite", title, author_text, year, self.search_datacite) def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]: query = urllib.parse.urlencode({"search": title, "per-page": limit}) @@ -290,6 +326,35 @@ class MetadataResolver: return [] return self._fetch_pubmed_entries(ids[:limit]) + def resolve_europepmc_doi(self, doi: str) -> Resolution | None: + entry = self.europepmc.lookup_by_doi(doi) + if entry is None: + return None + return Resolution( + entry=entry, + source_type="resolver", + source_label=f"europepmc:doi:{doi}", + ) + + def search_europepmc(self, title: str, limit: int = 5) -> list[BibEntry]: + return self.europepmc.search(title, limit=limit) + + def search_openlibrary(self, title: str, limit: int = 5) -> list[BibEntry]: + return 
self.openlibrary.search(title, limit=limit) + + def resolve_semanticscholar_doi(self, doi: str) -> Resolution | None: + entry = self.semanticscholar.lookup_by_doi(doi) + if entry is None: + return None + return Resolution( + entry=entry, + source_type="resolver", + source_label=f"semanticscholar:doi:{doi}", + ) + + def search_semanticscholar(self, title: str, limit: int = 5) -> list[BibEntry]: + return self.semanticscholar.search(title, limit=limit) + def _safe_get_json(self, url: str) -> dict | None: try: return self.source_client.get_json(url) @@ -333,19 +398,7 @@ class MetadataResolver: author_text: str = "", year: str = "", ) -> Resolution | None: - candidate = _select_best_title_match( - self.search_openalex(title, limit=5), - title=title, - author_text=author_text, - year=year, - ) - if candidate is None: - return None - return Resolution( - entry=candidate, - source_type="resolver", - source_label=f"openalex:search:{title}", - ) + return self._search_best_match_resolution("openalex", title, author_text, year, self.search_openalex) def search_pubmed_best_match( self, @@ -353,19 +406,122 @@ class MetadataResolver: author_text: str = "", year: str = "", ) -> Resolution | None: - candidate = _select_best_title_match( - self.search_pubmed(title, limit=5), - title=title, - author_text=author_text, - year=year, + return self._search_best_match_resolution("pubmed", title, author_text, year, self.search_pubmed) + + def search_europepmc_best_match( + self, + title: str, + author_text: str = "", + year: str = "", + ) -> Resolution | None: + return self._search_best_match_resolution("europepmc", title, author_text, year, self.search_europepmc) + + def search_semanticscholar_best_match( + self, + title: str, + author_text: str = "", + year: str = "", + ) -> Resolution | None: + return self._search_best_match_resolution( + "semanticscholar", title, author_text, year, self.search_semanticscholar ) + + def search_openlibrary_best_match( + self, + title: str, + author_text: 
str = "", + year: str = "", + ) -> Resolution | None: + return self._search_best_match_resolution("openlibrary", title, author_text, year, self.search_openlibrary) + + def _search_best_match_resolution( + self, source_name: str, title: str, author_text: str, year: str, search_func + ) -> Resolution | None: + candidates = search_func(title, limit=5) + candidate = _select_best_title_match(candidates, title=title, author_text=author_text, year=year) if candidate is None: return None - return Resolution( - entry=candidate, - source_type="resolver", - source_label=f"pubmed:search:{title}", + return Resolution(entry=candidate, source_type="resolver", source_label=f"{source_name}:search:{title}") + + def _attempt_direct_resolution( + self, + attempts: list[ResolutionAttempt], + source_name: str, + strategy: str, + query_value: str, + resolver_func, + ) -> Resolution | None: + try: + resolution = resolver_func(query_value) + except Exception as exc: + attempts.append( + ResolutionAttempt( + source_name=source_name, + strategy=strategy, + query_value=query_value, + matched=False, + error=str(exc), + ) + ) + return None + attempts.append( + ResolutionAttempt( + source_name=source_name, + strategy=strategy, + query_value=query_value, + matched=resolution is not None, + source_label=resolution.source_label if resolution is not None else "", + ) ) + if resolution is not None and not resolution.attempts: + resolution.attempts = list(attempts) + return resolution + + def _attempt_title_search_resolution( + self, + attempts: list[ResolutionAttempt], + source_name: str, + title: str, + author_text: str, + year: str, + search_func, + selector=None, + ) -> Resolution | None: + try: + candidates = search_func(title, limit=5) + except Exception as exc: + attempts.append( + ResolutionAttempt( + source_name=source_name, + strategy="title_search", + query_value=title, + matched=False, + error=str(exc), + ) + ) + return None + match_selector = selector or _select_best_title_match + 
candidate = match_selector(candidates, title=title, author_text=author_text, year=year) + resolution = None + if candidate is not None: + resolution = Resolution( + entry=candidate, + source_type="resolver", + source_label=f"{source_name}:search:{title}", + ) + attempts.append( + ResolutionAttempt( + source_name=source_name, + strategy="title_search", + query_value=title, + matched=resolution is not None, + candidate_count=len(candidates), + source_label=resolution.source_label if resolution is not None else "", + ) + ) + if resolution is not None and not resolution.attempts: + resolution.attempts = list(attempts) + return resolution def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]: ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid] @@ -768,6 +924,42 @@ def _select_best_title_match( return None +def _select_best_catalog_title_match( + candidates: list[BibEntry], + title: str, + author_text: str = "", + year: str = "", +) -> BibEntry | None: + if not candidates: + return None + + title_tokens = _catalog_title_tokens(title) + author_tokens = _author_match_tokens(author_text) + year_text = str(year or "").strip() + scored: list[tuple[float, BibEntry]] = [] + + for candidate in candidates: + candidate_title_tokens = _catalog_title_tokens(candidate.fields.get("title", "")) + if not candidate_title_tokens: + continue + overlap = len(title_tokens & candidate_title_tokens) + union = len(title_tokens | candidate_title_tokens) + score = (overlap / union) if union else 0.0 + if score < 0.6: + continue + candidate_year = str(candidate.fields.get("year", "") or "").strip() + if year_text and candidate_year and year_text != candidate_year: + continue + if author_tokens and not _candidate_matches_author_tokens(candidate, author_tokens): + continue + scored.append((score, candidate)) + + if not scored: + return None + scored.sort(key=lambda item: (-item[0], item[1].citation_key)) + return scored[0][1] + + def _author_match_tokens(author_text: str) 
-> set[str]: normalized = _normalize_match_text(author_text) if not normalized: @@ -788,6 +980,39 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str return bool(author_tokens & candidate_tokens) +def _catalog_title_tokens(value: str) -> set[str]: + normalized = _normalize_match_text(value) + stopwords = {"the", "and", "for", "with", "from", "into", "after", "all"} + return { + f"{token[:-4]}ic" if token.endswith("ical") and len(token) > 6 else token + for token in re.findall(r"[a-z0-9]+", normalized) + if len(token) >= 4 and token not in stopwords + } + + +def _entry_prefers_catalog_search(entry: BibEntry) -> bool: + if entry.entry_type in {"book", "incollection", "phdthesis", "mastersthesis"}: + return True + title = _normalize_match_text(entry.fields.get("title", "")) + venue = _normalize_match_text( + " ".join( + filter( + None, + [ + entry.fields.get("publisher", ""), + entry.fields.get("howpublished", ""), + entry.fields.get("booktitle", ""), + ], + ) + ) + ) + if entry.entry_type != "misc": + return False + if any(token in venue for token in ("press", "university", "house", "dictionary", "christendom")): + return True + return any(token in title for token in ("dictionary", "history", "world", "universe", "record")) + + def _normalize_pmid(value: str) -> str: return "".join(ch for ch in str(value) if ch.isdigit()) diff --git a/src/citegeist/resolver/__init__.py b/src/citegeist/resolver/__init__.py new file mode 100644 index 0000000..2186b04 --- /dev/null +++ b/src/citegeist/resolver/__init__.py @@ -0,0 +1,27 @@ +""" +Identifier resolution and normalization module. + +Provides functions for extracting, normalizing, and resolving +bibliographic identifiers across multiple schemes. 
+""" +from __future__ import annotations + +from citegeist.resolver.identifiers import ( + IdentifierExtractor, + IdentifierNormalizer, + IdentifierResolver, + extract_identifiers, + normalize_identifier, + get_primary_identifier, + resolve_identifiers, +) + +__all__ = [ + 'IdentifierExtractor', + 'IdentifierNormalizer', + 'IdentifierResolver', + 'extract_identifiers', + 'normalize_identifier', + 'get_primary_identifier', + 'resolve_identifiers', +] diff --git a/src/citegeist/resolver/identifiers.py b/src/citegeist/resolver/identifiers.py new file mode 100644 index 0000000..8d5f2a2 --- /dev/null +++ b/src/citegeist/resolver/identifiers.py @@ -0,0 +1,418 @@ +""" +Identifier resolution and normalization module. + +This module provides functions for extracting, normalizing, and resolving +bibliographic identifiers across multiple schemes (DOI, PMID, arXiv, ORCID, etc.). +""" +from __future__ import annotations + +import re +from typing import Dict, List, Optional, Tuple + + +# Identifier scheme patterns +DOI_PATTERN = re.compile( + r'^10\.\d{4,9}/\S+$', + re.IGNORECASE +) + +PMID_PATTERN = re.compile(r'^\d{5,7}$') + +PMCID_PATTERN = re.compile( + r'^PMC\d+$|^PMC[0-9a-f]+$', + re.IGNORECASE +) + +ARXIV_PATTERN = re.compile( + r'^\d{4}\.\d{4,5}(v\d+)?$', + re.IGNORECASE +) + +ORCID_PATTERN = re.compile( + r'^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$', + re.IGNORECASE +) + +ROR_PATTERN = re.compile( + r'^https?://ror\.org/[0-9A-Z]{4,10}$' +) + +DBLP_PATTERN = re.compile( + r'^[a-zA-Z0-9_]+:[a-zA-Z0-9_]+$', + re.IGNORECASE +) + +OPENALEX_PATTERN = re.compile( + r'^W[0-9]{4}-[A-F0-9]{4}$', + re.IGNORECASE +) + + +class IdentifierExtractor: + """Extract identifiers from BibEntry fields.""" + + @staticmethod + def extract(entry_fields: Dict[str, str]) -> Dict[str, str]: + """Extract all identifier schemes from entry fields. 
+ + Args: + entry_fields: Dictionary of entry fields + + Returns: + Dictionary mapping scheme names to values + """ + identifiers = {} + + # DOI + if doi := entry_fields.get('doi'): + identifiers['doi'] = doi + + # PMID + if pmid := entry_fields.get('pmid'): + identifiers['pmid'] = pmid + + # PMCID + if pmcid := entry_fields.get('pmcid'): + identifiers['pmcid'] = pmcid + + # arXiv + if arxiv := entry_fields.get('arxiv'): + identifiers['arxiv'] = arxiv + + # DBLP + if dblp := entry_fields.get('dblp'): + identifiers['dblp'] = dblp + + # OpenAlex + if openalex := entry_fields.get('openalex'): + identifiers['openalex'] = openalex + + # ISBN + if isbn := entry_fields.get('isbn'): + identifiers['isbn'] = isbn + + # ISSN + if issn := entry_fields.get('issn'): + identifiers['issn'] = issn + + return identifiers + + +class IdentifierNormalizer: + """Normalize identifiers to canonical form.""" + + @staticmethod + def normalize_doi(doi: str) -> Optional[str]: + """Normalize DOI to lowercase. + + Args: + doi: DOI string + + Returns: + Lowercase DOI, or None if invalid + """ + if not doi: + return None + normalized = doi.strip().lower() + if DOI_PATTERN.match(normalized): + return normalized + return None + + @staticmethod + def normalize_pmid(pmid: str) -> Optional[str]: + """Normalize PMID to string. + + Args: + pmid: PMID string + + Returns: + PMID string, or None if invalid + """ + if not pmid: + return None + pmid_str = str(pmid).strip() + if PMID_PATTERN.match(pmid_str): + return pmid_str + return None + + @staticmethod + def normalize_pmcid(pmcid: str) -> Optional[str]: + """Normalize PMCID to lowercase. + + Args: + pmcid: PMCID string + + Returns: + Lowercase PMCID, or None if invalid + """ + if not pmcid: + return None + normalized = pmcid.strip().lower() + if PMCID_PATTERN.match(normalized): + return normalized + return None + + @staticmethod + def normalize_arxiv(arxiv: str) -> Optional[str]: + """Normalize arXiv ID. 
+ + Args: + arxiv: arXiv ID string + + Returns: + Normalized arXiv ID, or None if invalid + """ + if not arxiv: + return None + # Remove 'v' and version suffix if present + normalized = arxiv.strip().lower() + if 'v' in normalized: + normalized = normalized.split('v')[0] + if ARXIV_PATTERN.match(normalized): + return normalized + return None + + @staticmethod + def normalize_orcid(orcid: str) -> Optional[str]: + """Normalize ORCID to canonical format. + + Args: + orcid: ORCID string + + Returns: + Normalized ORCID (XXXX-XXXX-XXXX-XXX0), or None if invalid + """ + if not orcid: + return None + orcid = orcid.strip().upper().replace(' ', '') + if ORCID_PATTERN.match(orcid): + return orcid + return None + + @staticmethod + def normalize_ror(ror_url: str) -> Optional[str]: + """Normalize ROR URL to identifier. + + Args: + ror_url: ROR URL string + + Returns: + ROR identifier, or None if invalid + """ + if not ror_url: + return None + ror_id = ror_url.strip().lower() + if ROR_PATTERN.match(ror_id): + return ror_id + return None + + @staticmethod + def normalize_dblp(dblp_key: str) -> Optional[str]: + """Normalize DBLP key. + + Args: + dblp_key: DBLP key string + + Returns: + DBLP key, or None if invalid + """ + if not dblp_key: + return None + dblp = dblp_key.strip() + if DBLP_PATTERN.match(dblp): + return dblp + return None + + @staticmethod + def normalize_openalex(openalex_id: str) -> Optional[str]: + """Normalize OpenAlex ID. + + Args: + openalex_id: OpenAlex ID string + + Returns: + OpenAlex ID, or None if invalid + """ + if not openalex_id: + return None + openalex = openalex_id.strip().upper() + if OPENALEX_PATTERN.match(openalex): + return openalex + return None + + @staticmethod + def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]: + """Normalize an identifier. 
+ + Args: + scheme: Identifier scheme name + value: Identifier value + + Returns: + Tuple of (scheme, normalized_value), or None if invalid + """ + scheme = scheme.lower() + + normalizers = { + 'doi': IdentifierNormalizer.normalize_doi, + 'pmid': IdentifierNormalizer.normalize_pmid, + 'pmcid': IdentifierNormalizer.normalize_pmcid, + 'arxiv': IdentifierNormalizer.normalize_arxiv, + 'orcid': IdentifierNormalizer.normalize_orcid, + 'ror': IdentifierNormalizer.normalize_ror, + 'dblp': IdentifierNormalizer.normalize_dblp, + 'openalex': IdentifierNormalizer.normalize_openalex, + } + + normalizer = normalizers.get(scheme) + if normalizer: + normalized = normalizer(value) + if normalized: + return (scheme, normalized) + return None + + +class IdentifierResolver: + """Resolve identifiers across multiple schemes.""" + + # Lookup priority: schemes should be checked in this order + LOOKUP_PRIORITY = [ + ('doi', IdentifierNormalizer.normalize_doi), + ('pmid', IdentifierNormalizer.normalize_pmid), + ('pmcid', IdentifierNormalizer.normalize_pmcid), + ('arxiv', IdentifierNormalizer.normalize_arxiv), + ('dblp', IdentifierNormalizer.normalize_dblp), + ('openalex', IdentifierNormalizer.normalize_openalex), + ] + + @staticmethod + def resolve(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]: + """Resolve identifiers from entry fields. 
+ + Args: + entry_fields: Dictionary of entry fields + + Returns: + List of (scheme, normalized_value) tuples in priority order + """ + identifiers = IdentifierExtractor.extract(entry_fields) + resolved = [] + + for scheme, value in identifiers.items(): + if normalized := IdentifierNormalizer.normalize_identifier(scheme, value): + resolved.append(normalized) + + # Add title fingerprint as fallback + if title := entry_fields.get('title'): + fingerprint = IdentifierResolver._create_title_fingerprint(title) + if fingerprint: + resolved.append(('title', fingerprint)) + + return resolved + + @staticmethod + def _create_title_fingerprint(title: str) -> Optional[str]: + """Create a fingerprint from title for fallback lookup. + + Args: + title: Work title + + Returns: + Fingerprint string + """ + if not title: + return None + + # Remove common words, punctuation, and normalize + words = title.lower() + words = re.sub(r'[^\w\s]', ' ', words) # Remove punctuation + words = re.sub(r'\s+', ' ', words) # Normalize whitespace + words = words.strip() + + return words + + @staticmethod + def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]: + """Get the primary identifier (first in priority order). + + Args: + entry_fields: Dictionary of entry fields + + Returns: + Tuple of (scheme, value), or None if no identifier found + """ + resolved = IdentifierResolver.resolve(entry_fields) + + for scheme, _ in IdentifierResolver.LOOKUP_PRIORITY: + # Find this scheme in resolved identifiers + for rscheme, rvalue in resolved: + if rscheme == scheme: + return (rscheme, rvalue) + + return None + + @staticmethod + def get_scheme_value(scheme: str, entry_fields: Dict[str, str]) -> Optional[str]: + """Get a specific identifier value from entry fields. 
+ + Args: + scheme: Identifier scheme name + entry_fields: Dictionary of entry fields + + Returns: + Identifier value, or None if not found + """ + if value := entry_fields.get(scheme): + if normalized := IdentifierNormalizer.normalize_identifier(scheme, value): + return normalized[1] + return None + + +# Convenience functions +def extract_identifiers(entry_fields: Dict[str, str]) -> Dict[str, str]: + """Extract all identifiers from entry fields. + + Args: + entry_fields: Dictionary of entry fields + + Returns: + Dictionary mapping scheme names to values + """ + return IdentifierExtractor.extract(entry_fields) + + +def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]: + """Normalize an identifier. + + Args: + scheme: Identifier scheme name + value: Identifier value + + Returns: + Tuple of (scheme, normalized_value), or None if invalid + """ + return IdentifierNormalizer.normalize_identifier(scheme, value) + + +def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]: + """Get the primary identifier. + + Args: + entry_fields: Dictionary of entry fields + + Returns: + Tuple of (scheme, value), or None if no identifier found + """ + return IdentifierResolver.get_primary_identifier(entry_fields) + + +def resolve_identifiers(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]: + """Resolve identifiers from entry fields. 
+ + Args: + entry_fields: Dictionary of entry fields + + Returns: + List of (scheme, value) tuples + """ + return IdentifierResolver.resolve(entry_fields) diff --git a/src/citegeist/sources/__all__.py b/src/citegeist/sources/__all__.py new file mode 100644 index 0000000..447c210 --- /dev/null +++ b/src/citegeist/sources/__all__.py @@ -0,0 +1,29 @@ +"""Export all source plugins.""" +from __future__ import annotations + +from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge +from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys +from citegeist.sources.registry import SourceRegistry, get_registry +from citegeist.sources.crossref import CrossRefSource +from citegeist.sources.europepmc import EuropePmcSource +from citegeist.sources.opencitations import OpenCitationsSource +from citegeist.sources.openlibrary import OpenLibrarySource +from citegeist.sources.semanticscholar import SemanticScholarSource +from citegeist.sources.unpaywall import UnpaywallSource + +__all__ = [ + 'BibliographicSource', + 'SourceRecord', + 'CitationEdge', + 'SourceCatalogEntry', + 'SourceRegistry', + 'get_registry', + 'list_source_catalog', + 'prioritized_source_keys', + 'CrossRefSource', + 'EuropePmcSource', + 'OpenCitationsSource', + 'OpenLibrarySource', + 'SemanticScholarSource', + 'UnpaywallSource', +] diff --git a/src/citegeist/sources/__init__.py b/src/citegeist/sources/__init__.py new file mode 100644 index 0000000..e092baa --- /dev/null +++ b/src/citegeist/sources/__init__.py @@ -0,0 +1,44 @@ +""" +Bibliographic source plugins. + +This package provides a plugin architecture for integrating multiple +bibliographic data sources (CrossRef, PubMed, Semantic Scholar, etc.). +""" + +# Import old sources module for backward compatibility +from . 
import _old_sources_compat + +# Import new plugin architecture +from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge +from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys +from citegeist.sources.registry import SourceRegistry, get_registry +from citegeist.sources.crossref import CrossRefSource +from citegeist.sources.europepmc import EuropePmcSource +from citegeist.sources.opencitations import OpenCitationsSource +from citegeist.sources.openlibrary import OpenLibrarySource +from citegeist.sources.semanticscholar import SemanticScholarSource +from citegeist.sources.unpaywall import UnpaywallSource + +# Re-export old classes for compatibility +__all__ = [ + # New plugin architecture + 'BibliographicSource', + 'SourceRecord', + 'CitationEdge', + 'SourceCatalogEntry', + 'SourceRegistry', + 'get_registry', + 'list_source_catalog', + 'prioritized_source_keys', + 'CrossRefSource', + 'EuropePmcSource', + 'OpenCitationsSource', + 'OpenLibrarySource', + 'SemanticScholarSource', + 'UnpaywallSource', + # Old API (for backward compatibility) + 'SourceClient', +] + +# Backward compatibility - make SourceClient available from this module +SourceClient = _old_sources_compat.SourceClient diff --git a/src/citegeist/sources/_old_sources_compat.py b/src/citegeist/sources/_old_sources_compat.py new file mode 100644 index 0000000..ace5693 --- /dev/null +++ b/src/citegeist/sources/_old_sources_compat.py @@ -0,0 +1,25 @@ +""" +Backward compatibility module for old sources module. + +This module re-exports the old SourceClient class for compatibility. 
+""" +from pathlib import Path +import importlib.util + +from .base import BibliographicSource, SourceRecord, CitationEdge +from .registry import SourceRegistry, get_registry +from .crossref import CrossRefSource + +# Load the old sources.py module from the citegeist package root +_OLD_SOURCES_PATH = Path(__file__).resolve().parents[1] / "sources.py" +spec = importlib.util.spec_from_file_location( + "citegeist.sources_old", + _OLD_SOURCES_PATH +) +if spec and spec.loader: + old_sources = importlib.util.module_from_spec(spec) + spec.loader.exec_module(old_sources) + SourceClient = old_sources.SourceClient +else: + # Fallback if old sources.py doesn't exist + SourceClient = None diff --git a/src/citegeist/sources/base.py b/src/citegeist/sources/base.py new file mode 100644 index 0000000..e499f36 --- /dev/null +++ b/src/citegeist/sources/base.py @@ -0,0 +1,189 @@ +""" +Base interface for bibliographic sources. + +This module defines the abstract base class that all source plugins must implement. +Plugins can register themselves with the SourceRegistry for dynamic loading. +""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from citegeist.bibtex import BibEntry + + +@dataclass(slots=True) +class SourceRecord: + """Represents a raw record from a source API.""" + raw: Dict[str, Any] + source_type: str + source_label: str + timestamp: str + confidence: float + + +@dataclass(slots=True) +class CitationEdge: + """Represents a citation relationship.""" + source_work_id: str + target_work_id: str + relation_type: str # "cites" or "cited_by" + source_type: str + source_label: str + confidence: float + + +class BibliographicSource(ABC): + """Abstract base class for bibliographic data sources. + + All source plugins must inherit from this class and implement the required methods. 
+ """ + + def __init__(self, config: Optional[Dict[str, Any]] = None): + """Initialize the source with optional configuration. + + Args: + config: Source-specific configuration dictionary + """ + self.config = config or {} + self.enabled = self.config.get('enabled', True) + self.source_type = self.config.get('source_type', self.__class__.__name__) + + @abstractmethod + def lookup_by_doi(self, doi: str) -> Optional[BibEntry]: + """Look up a work by DOI. + + Args: + doi: Digital Object Identifier + + Returns: + BibEntry if found, None otherwise + """ + pass + + @abstractmethod + def lookup_by_title(self, title: str) -> Optional[BibEntry]: + """Look up a work by title. + + Args: + title: Work title + + Returns: + BibEntry if found, None otherwise + """ + pass + + @abstractmethod + def search(self, query: str, limit: int = 10) -> List[BibEntry]: + """Search for works matching the query. + + Args: + query: Search query string + limit: Maximum number of results + + Returns: + List of matching BibEntry objects + """ + pass + + @abstractmethod + def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]: + """Normalize a raw API record to a canonical BibEntry. + + Args: + record: Raw record from source API + + Returns: + BibEntry if normalization succeeds, None otherwise + """ + pass + + def get_citations(self, work_id: str, relation_type: str = 'cites', limit: int = 10) -> List[CitationEdge]: + """Get citations for a work. + + Args: + work_id: Work identifier (DOI, PMID, etc.) + relation_type: Type of relation ('cites' or 'cited_by') + limit: Maximum number of results + + Returns: + List of CitationEdge objects + """ + return [] + + def get_related(self, work_id: str, limit: int = 10) -> List[BibEntry]: + """Get works related to a work. + + Args: + work_id: Work identifier + limit: Maximum number of results + + Returns: + List of related BibEntry objects + """ + return [] + + def get_fulltext_url(self, doi: str) -> Optional[str]: + """Get full-text URL for a work. 
+ + Args: + doi: Digital Object Identifier + + Returns: + Full-text URL if available, None otherwise + """ + return None + + def get_embedding(self, work_id: str) -> Optional[List[float]]: + """Get embedding vector for a work. + + Args: + work_id: Work identifier + + Returns: + Embedding vector if available, None otherwise + """ + return None + + def get_identifier_scheme(self) -> str: + """Get the identifier scheme used by this source. + + Returns: + Identifier scheme (e.g., 'doi', 'pmid', 'openalex') + """ + return self.source_type.lower() + + def record_source_metadata(self, entry: BibEntry, operation: str = 'ingest') -> SourceRecord: + """Create a source record for provenance tracking. + + Args: + entry: The BibEntry to record + operation: Operation type (e.g., 'ingest', 'enrich') + + Returns: + SourceRecord with metadata + """ + return SourceRecord( + raw=self._entry_to_dict(entry), + source_type=self.source_type, + source_label=f"{self.source_type}:{self.config.get('name', self.__class__.__name__)}", + timestamp='', + confidence=1.0 + ) + + def _entry_to_dict(self, entry: BibEntry) -> Dict[str, Any]: + """Convert BibEntry to dictionary for source records.""" + return { + 'entry_type': entry.entry_type, + 'citation_key': entry.citation_key, + 'fields': entry.fields + } + + def is_available(self) -> bool: + """Check if the source is available and enabled. + + Returns: + True if enabled and available, False otherwise + """ + return self.enabled diff --git a/src/citegeist/sources/catalog.py b/src/citegeist/sources/catalog.py new file mode 100644 index 0000000..54cf67e --- /dev/null +++ b/src/citegeist/sources/catalog.py @@ -0,0 +1,173 @@ +"""Open bibliographic source inventory and prioritization helpers.""" +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True, slots=True) +class SourceCatalogEntry: + key: str + label: str + category: str + access: str + capabilities: tuple[str, ...] 
+ strengths: str + caveats: str + current_status: str + priority: str + + +_CATALOG: tuple[SourceCatalogEntry, ...] = ( + SourceCatalogEntry( + key="crossref", + label="Crossref", + category="metadata", + access="open API", + capabilities=("doi_lookup", "title_search", "reference_lists"), + strengths="Broad DOI coverage and good article-level metadata.", + caveats="Citation coverage is incomplete and some references are unstructured blobs.", + current_status="integrated", + priority="now", + ), + SourceCatalogEntry( + key="openalex", + label="OpenAlex", + category="metadata+graph", + access="open API", + capabilities=("work_lookup", "title_search", "citations", "cited_by", "topics"), + strengths="Best current open source for citation graph expansion and work-level discovery.", + caveats="Occasional noisy secondary records require conservative admission rules.", + current_status="integrated", + priority="now", + ), + SourceCatalogEntry( + key="pubmed", + label="PubMed / NCBI E-utilities", + category="metadata", + access="open API", + capabilities=("pmid_lookup", "title_search", "biomedical_metadata"), + strengths="High-value authoritative metadata for biomedical literature.", + caveats="Domain-specific coverage outside biomedicine is limited.", + current_status="integrated", + priority="now", + ), + SourceCatalogEntry( + key="datacite", + label="DataCite", + category="metadata", + access="open API", + capabilities=("doi_lookup", "title_search", "datasets"), + strengths="Important for datasets, reports, and non-traditional DOI-bearing objects.", + caveats="Less useful than Crossref/OpenAlex for mainstream article citation graphs.", + current_status="integrated", + priority="now", + ), + SourceCatalogEntry( + key="dblp", + label="DBLP", + category="metadata", + access="open API", + capabilities=("key_lookup", "search", "computer_science"), + strengths="Excellent computer-science coverage and clean bibliographic records.", + caveats="Discipline-specific rather than 
general-purpose.", + current_status="integrated", + priority="selective", + ), + SourceCatalogEntry( + key="arxiv", + label="arXiv", + category="metadata+fulltext", + access="open API", + capabilities=("id_lookup", "search", "preprints"), + strengths="Useful for preprint-first fields and free full-text links.", + caveats="Not a general citation graph source.", + current_status="integrated", + priority="selective", + ), + SourceCatalogEntry( + key="open_citations", + label="OpenCitations", + category="graph", + access="open API", + capabilities=("doi_citations", "doi_references", "provenance"), + strengths="Directly aligned with open citation-edge expansion.", + caveats="Coverage is narrower than OpenAlex and needs merge discipline.", + current_status="integrated", + priority="now", + ), + SourceCatalogEntry( + key="semantic_scholar", + label="Semantic Scholar", + category="metadata+graph", + access="free API with limits", + capabilities=("work_lookup", "search", "citations", "references"), + strengths="Strong graph and relevance signals, especially for discovery workflows.", + caveats="Not fully open data in the same sense as Crossref/OpenAlex/OpenCitations.", + current_status="integrated", + priority="now", + ), + SourceCatalogEntry( + key="unpaywall", + label="Unpaywall", + category="access-links", + access="open API", + capabilities=("doi_fulltext_links", "oa_status"), + strengths="Best open source for landing-page and OA-link enrichment.", + caveats="Improves access, not bibliographic identity or graph completeness.", + current_status="integrated", + priority="now", + ), + SourceCatalogEntry( + key="europe_pmc", + label="Europe PMC", + category="metadata+fulltext", + access="open API", + capabilities=("search", "citations", "fulltext_links", "biomedical"), + strengths="Valuable biomedical complement to PubMed with richer open-access linkage.", + caveats="Domain-specific and partially overlapping with PubMed/OpenAlex.", + current_status="integrated", + 
priority="now", + ), + SourceCatalogEntry( + key="open_library", + label="Open Library", + category="metadata", + access="open API", + capabilities=("title_search", "book_metadata", "author_catalog", "isbn_hints"), + strengths="Useful general-purpose open catalog coverage for books, monographs, and reference works.", + caveats="Catalog metadata is not a citation graph and can be noisy for article-like titles.", + current_status="integrated", + priority="selective", + ), + SourceCatalogEntry( + key="openaire", + label="OpenAIRE", + category="metadata+repository", + access="open API", + capabilities=("repository_metadata", "oa_links", "project_links"), + strengths="Good for repository, project, and European OA discovery.", + caveats="May be better treated as a corpus-acquisition adapter than a first-line resolver.", + current_status="planned", + priority="evaluate", + ), + SourceCatalogEntry( + key="oai_pmh", + label="OAI-PMH Repositories", + category="repository", + access="open protocol", + capabilities=("repository_harvest", "set_discovery", "metadata_formats"), + strengths="Already useful for theses, dissertations, and institutional repositories.", + caveats="Heterogeneous metadata quality; not a single canonical bibliographic source.", + current_status="integrated", + priority="selective", + ), +) + + +def list_source_catalog() -> list[SourceCatalogEntry]: + return list(_CATALOG) + + +def prioritized_source_keys() -> list[str]: + order = {"now": 0, "next": 1, "selective": 2, "evaluate": 3} + return [entry.key for entry in sorted(_CATALOG, key=lambda entry: (order[entry.priority], entry.label.lower()))] diff --git a/src/citegeist/sources/crossref.py b/src/citegeist/sources/crossref.py new file mode 100644 index 0000000..23dddd4 --- /dev/null +++ b/src/citegeist/sources/crossref.py @@ -0,0 +1,210 @@ +""" +CrossRef source plugin. + +CrossRef provides metadata for DOIs for scholarly works. 
class CrossRefSource(BibliographicSource):
    """CrossRef source for DOI-based metadata lookup and keyword search."""

    BASE_URL = "https://api.crossref.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize CrossRef source.

        Args:
            config: Configuration with optional 'api_key' and 'user_agent'.
        """
        super().__init__(config)
        self.api_key = self.config.get('api_key', '')
        self.user_agent = self.config.get(
            'user_agent',
            'citegeist/0.1 (local research tool)',
        )

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibEntry if found and parseable, None otherwise (network and
            parse errors are deliberately swallowed, as before).
        """
        if not doi:
            return None

        encoded = urllib.parse.quote(doi, safe="")
        payload = self._fetch_json(f"{self.BASE_URL}/works/{encoded}")
        if payload is None:
            return None
        try:
            return self._normalize_crossref(payload)
        except Exception:
            # A malformed payload is treated the same as a failed lookup.
            return None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """CrossRef doesn't support title-only lookup; always returns None."""
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search CrossRef for works.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching BibEntry objects (empty on any error).
        """
        if not query:
            return []

        encoded_query = urllib.parse.quote(query, safe="")
        payload = self._fetch_json(f"{self.BASE_URL}/works?query={encoded_query}&rows={limit}")
        if payload is None:
            return []
        try:
            items = payload.get('message', {}).get('items', [])
            return [entry for item in items if (entry := self._normalize_crossref(item)) is not None]
        except Exception:
            return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw CrossRef record to a BibEntry.

        Args:
            record: Raw record from CrossRef API

        Returns:
            BibEntry if normalization succeeds
        """
        return self._normalize_crossref(record)

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the identifier scheme."""
        return 'doi'

    def _fetch_json(self, url: str) -> Optional[Dict[str, Any]]:
        """GET a CrossRef URL and decode its JSON body.

        The response is managed with a context manager — the previous
        implementation never closed it, leaking the HTTP connection.
        Shared by lookup_by_doi and search to avoid duplicated header
        handling. Any error (network, HTTP, decode) yields None.
        """
        headers = {'User-Agent': self.user_agent}
        if self.api_key:
            headers['X-Api-Key'] = self.api_key
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req) as response:
                return json.loads(response.read().decode('utf-8'))
        except Exception:
            return None

    def _normalize_crossref(self, payload: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a CrossRef payload to a BibEntry.

        Args:
            payload: Raw JSON payload from CrossRef — either a full response
                with a 'message' envelope or an already-unwrapped item.

        Returns:
            BibEntry object, or None for an empty message.
        """
        message = payload.get('message', payload)
        if not message:
            return None

        # Extract basic fields
        doi = str(message.get('DOI', ''))
        title = ' '.join(message.get('title', [])) if message.get('title') else ''
        author_data = message.get('author', [])
        year = self._extract_year(message)

        # Format authors as "Given Family"; fall back to family-only names.
        authors = []
        for author in author_data:
            given = str(author.get('given', ''))
            family = str(author.get('family', ''))
            if given and family:
                authors.append(f"{given} {family}")
            elif family:
                authors.append(family)

        publisher = str(message.get('publisher', ''))

        # container-title is a list; the first entry is the journal name.
        container_title = message.get('container-title', [])
        journal = container_title[0] if container_title else ''

        url = str(message.get('URL', ''))
        abstract = self._extract_abstract(message.get('abstract'))

        # Map onto BibTeX-style fields, skipping empties.
        fields: Dict[str, str] = {}
        if title:
            fields['title'] = title
        if authors:
            fields['author'] = ' and '.join(authors)
        if year:
            fields['year'] = year
        if doi:
            fields['doi'] = doi
        if journal:
            fields['journal'] = journal
        if publisher:
            fields['publisher'] = publisher
        if url:
            fields['url'] = url
        if abstract:
            fields['abstract'] = abstract

        citation_key = f"{authors[0] if authors else 'crossref'}_{year or 'n.d.'}_{title or doi}"

        return BibEntry(
            entry_type='article',
            citation_key=citation_key,
            fields=fields
        )

    def _extract_year(self, message: Dict[str, Any]) -> str:
        """Return the first available publication year as a string ('' if none).

        Checks date fields in CrossRef's order of preference.
        """
        for field_name in ('published-print', 'published-online', 'issued', 'created'):
            year = self._extract_year_from_date_parts(message.get(field_name, {}))
            if year:
                return year
        return ''

    def _extract_year_from_date_parts(self, field: Dict[str, Any]) -> str:
        """Pull the year out of a CrossRef date-parts structure ('' if absent)."""
        date_parts = field.get('date-parts', [])
        if not date_parts:
            return ''
        first_part = date_parts[0]
        if not first_part:
            return ''
        year = first_part[0]
        return str(year) if year else ''

    def _extract_abstract(self, raw_abstract: Any) -> str:
        """Coerce CrossRef's abstract variants (str, or list of str/dict) to text."""
        if isinstance(raw_abstract, str):
            return raw_abstract.strip()
        if isinstance(raw_abstract, list):
            for item in raw_abstract:
                if isinstance(item, dict):
                    text = str(item.get('value', '')).strip()
                    if text:
                        return text
                elif isinstance(item, str) and item.strip():
                    return item.strip()
        return ''
class EuropePmcSource(BibliographicSource):
    """Europe PMC source for biomedical metadata and OA/fulltext links."""

    BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Set up the shared HTTP client, honoring one injected via config."""
        super().__init__(config)
        agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Resolve a single record by exact DOI match."""
        doi_text = doi.strip()
        if not doi_text:
            return None
        hit = self._first_result(f'DOI:"{doi_text}"')
        return self.normalize(hit) if hit else None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Resolve a single record by exact title match."""
        title_text = " ".join(title.split())
        if not title_text:
            return None
        hit = self._first_result(f'TITLE:"{title_text}"')
        return self.normalize(hit) if hit else None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Title-search Europe PMC and normalize every parseable hit."""
        needle = " ".join(query.split())
        if not needle:
            return []
        payload = self._search_payload(f'TITLE:"{needle}"', max(1, limit))
        rows = payload.get("resultList", {}).get("result", []) if payload else []
        entries: list[BibEntry] = []
        for row in rows:
            entry = self.normalize(row)
            if entry is not None:
                entries.append(entry)
        return entries

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map a raw Europe PMC result row onto a BibEntry; None without a title."""
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        doi = str(record.get("doi") or "").strip()
        source_code = str(record.get("source") or "")
        # MED rows may carry the PMID in "id" rather than "pmid".
        if source_code == "MED":
            pmid = str(record.get("pmid") or record.get("id") or "").strip()
        else:
            pmid = str(record.get("pmid") or "").strip()
        pmcid = str(record.get("pmcid") or "").strip()
        year = str(record.get("pubYear") or "").strip()
        authors = self._normalize_author_string(str(record.get("authorString") or "").strip())
        journal = str(record.get("journalTitle") or "").strip()
        abstract = str(record.get("abstractText") or "").strip()

        fields: Dict[str, str] = {"title": title}
        for key, value in (
            ("doi", doi),
            ("pmid", pmid),
            ("pmcid", pmcid),
            ("year", year),
            ("author", authors),
            ("journal", journal),
            ("volume", str(record.get("journalVolume") or "").strip()),
            ("number", str(record.get("issue") or "").strip()),
            ("pages", str(record.get("pageInfo") or "").strip()),
            ("abstract", abstract),
        ):
            if value:
                fields[key] = value

        # Prefer a direct fulltext link; fall back to the article landing page.
        link = self._fulltext_url(record) or self._article_url(record)
        if link:
            fields["url"] = link
        oa_flag = str(record.get("isOpenAccess") or "").strip()
        if oa_flag:
            fields["is_oa"] = "true" if str(record.get("isOpenAccess")).upper() == "Y" else "false"
        cited_by = str(record.get("citedByCount") or "").strip()
        if cited_by:
            fields["europepmc_cited_by_count"] = cited_by
        if source_code.strip():
            fields["europepmc_source"] = source_code.strip()

        key = self._citation_key(doi, pmid, authors, year, title)
        return BibEntry(entry_type="article", citation_key=key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return a fulltext (or article landing) URL for a DOI, if any."""
        doi_text = doi.strip()
        if not doi_text:
            return None
        payload = self._search_payload(f'DOI:"{doi_text}"', 1)
        rows = payload.get("resultList", {}).get("result", []) if payload else []
        if not rows:
            return None
        return self._fulltext_url(rows[0]) or self._article_url(rows[0])

    def get_identifier_scheme(self) -> str:
        """DOIs are the primary identifier scheme for this source."""
        return "doi"

    def _first_result(self, query: str) -> Dict[str, Any] | None:
        """Run a query with page size 1 and return the sole result row, if any."""
        payload = self._search_payload(query, 1)
        rows = payload.get("resultList", {}).get("result", []) if payload else []
        return rows[0] if rows else None

    def _search_payload(self, query: str, page_size: int) -> Dict[str, Any] | None:
        """Issue a REST search and return the decoded JSON payload (None on failure)."""
        params = urllib.parse.urlencode(
            {
                "query": query,
                "format": "json",
                "resultType": "core",
                "pageSize": max(1, page_size),
            }
        )
        return self.source_client.try_get_json(f"{self.BASE_URL}?{params}")

    def _fulltext_url(self, record: Dict[str, Any]) -> str:
        """Return the first non-empty fulltext URL in the record, else ''."""
        container = record.get("fullTextUrlList", {})
        if not isinstance(container, dict):
            return ""
        urls = container.get("fullTextUrl", [])
        # The API returns a bare object instead of a list for single entries.
        if isinstance(urls, dict):
            urls = [urls]
        if not isinstance(urls, list):
            return ""
        for item in urls:
            if isinstance(item, dict):
                url = str(item.get("url") or "").strip()
                if url:
                    return url
        return ""

    def _article_url(self, record: Dict[str, Any]) -> str:
        """Build the europepmc.org landing URL from source + id, else ''."""
        source = str(record.get("source") or "").strip()
        identifier = str(record.get("id") or "").strip()
        if source and identifier:
            return f"https://europepmc.org/article/{source}/{identifier}"
        return ""

    def _normalize_author_string(self, value: str) -> str:
        """Convert Europe PMC's comma-separated authors to BibTeX 'and' form."""
        if not value:
            return ""
        names = [part.strip().rstrip(".") for part in value.split(",") if part.strip()]
        return " and ".join(names)

    def _citation_key(self, doi: str, pmid: str, author_text: str, year: str, title: str) -> str:
        """Build a stable citation key: DOI, then PMID, then author/year/title."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if pmid:
            return f"pmid{pmid}"
        surname = author_text.split(" and ")[0].split()[-1] if author_text else "ref"
        surname = "".join(ch for ch in surname.lower() if ch.isalnum()) or "ref"
        words = title.split()
        lead = "".join(ch for ch in (words[0] if words else "untitled").lower() if ch.isalnum())
        return f"{surname}{year or 'nd'}{lead or 'untitled'}"
class OpenCitationsSource(BibliographicSource):
    """OpenCitations source for DOI metadata and citation edges."""

    INDEX_BASE_URL = "https://api.opencitations.net/index/v2"
    META_BASE_URL = "https://api.opencitations.net/meta/v1"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Set up the shared HTTP client, honoring one injected via config."""
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up bibliographic metadata for a DOI via OpenCitations Meta."""
        normalized = self._normalize_doi_pid(doi)
        if not normalized:
            return None
        rows = self.source_client.try_get_json(f"{self.META_BASE_URL}/metadata/{normalized}")
        if not rows:
            return None
        return self.normalize(rows[0])

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """OpenCitations has no title lookup; always returns None."""
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """OpenCitations has no free-text search; always returns []."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map an OpenCitations Meta row onto a BibEntry.

        Returns None unless the row has both an 'id' field and a title.
        """
        ids = str(record.get("id") or "")
        title = str(record.get("title") or "").strip()
        if not ids or not title:
            return None

        doi = self._extract_id_value(ids, "doi")
        openalex = self._extract_id_value(ids, "openalex")
        year = self._extract_year(str(record.get("pub_date") or ""))
        authors = self._normalize_author_field(str(record.get("author") or ""))
        venue, venue_ids = self._parse_venue_field(str(record.get("venue") or ""))
        entry_type = self._map_entry_type(str(record.get("type") or ""))

        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
            fields["url"] = f"https://doi.org/{doi}"
        if openalex:
            fields["openalex"] = openalex
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if venue:
            # Articles carry the venue as the journal; everything else as booktitle.
            if entry_type == "article":
                fields["journal"] = venue
            else:
                fields["booktitle"] = venue
        if volume := str(record.get("volume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("page") or "").strip():
            fields["pages"] = pages
        if publisher := self._strip_bracketed_ids(str(record.get("publisher") or "")):
            fields["publisher"] = publisher
        if venue_ids:
            fields["note"] = f"opencitations_venue_ids = {{{venue_ids}}}"

        citation_key = self._citation_key(doi, openalex, authors, year, title)
        return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)

    def get_citations(self, work_id: str, relation_type: str = "cites", limit: int = 10) -> List[CitationEdge]:
        """Fetch citation edges for a DOI from the OpenCitations Index.

        Args:
            work_id: A DOI, with or without a 'doi:' prefix.
            relation_type: 'cites' fetches the work's outgoing references;
                any other value fetches incoming citations.
            limit: Maximum number of edges returned.

        Returns:
            CitationEdge objects, always oriented source-cites-target.
        """
        normalized = self._normalize_doi_pid(work_id)
        if not normalized:
            return []
        path = "references" if relation_type == "cites" else "citations"
        rows = self.source_client.try_get_json(f"{self.INDEX_BASE_URL}/{path}/{normalized}")
        if not rows:
            return []

        edges: List[CitationEdge] = []
        for row in rows[:limit]:
            citing = self._extract_id_value(str(row.get("citing") or ""), "doi")
            cited = self._extract_id_value(str(row.get("cited") or ""), "doi")
            if not citing or not cited:
                continue
            # Both Index endpoints report every edge as citing -> cited, so the
            # edge direction is identical regardless of which endpoint was hit.
            # (A previous revision branched on relation_type here, but both
            # branches were identical; the dead branch has been removed.)
            edges.append(
                CitationEdge(
                    source_work_id=f"doi:{citing}",
                    target_work_id=f"doi:{cited}",
                    relation_type="cites",
                    source_type="opencitations",
                    source_label=f"opencitations:{path}:{normalized}",
                    confidence=0.85,
                )
            )
        return edges

    def get_identifier_scheme(self) -> str:
        """DOIs are the primary identifier scheme for this source."""
        return "doi"

    def _normalize_doi_pid(self, value: str) -> str:
        """Return the OpenCitations 'doi:<doi>' PID form, or '' for blank input."""
        doi = value.strip()
        if not doi:
            return ""
        if doi.lower().startswith("doi:"):
            doi = doi[4:]
        return f"doi:{doi}"

    def _extract_id_value(self, identifiers: str, scheme: str) -> str:
        """Pull the value for one scheme out of a space-separated id list."""
        prefix = f"{scheme}:"
        for token in identifiers.split():
            if token.startswith(prefix):
                return token[len(prefix):]
        return ""

    def _extract_year(self, pub_date: str) -> str:
        """Return the leading 4-digit year of an ISO-ish date string, else ''."""
        pub_date = pub_date.strip()
        if len(pub_date) >= 4 and pub_date[:4].isdigit():
            return pub_date[:4]
        return ""

    def _normalize_author_field(self, raw_authors: str) -> str:
        """Convert OpenCitations' ';'-separated authors (with [id] tags) to 'and' form."""
        authors: List[str] = []
        for part in raw_authors.split(";"):
            cleaned = self._strip_bracketed_ids(part)
            cleaned = " ".join(cleaned.split())
            if cleaned:
                authors.append(cleaned)
        return " and ".join(authors)

    def _parse_venue_field(self, raw_venue: str) -> tuple[str, str]:
        """Split a venue string 'Title [ids]' into (title, ids)."""
        raw_venue = raw_venue.strip()
        if not raw_venue:
            return "", ""
        if "[" not in raw_venue:
            return raw_venue, ""
        title, _, remainder = raw_venue.partition("[")
        return title.strip(), remainder.rstrip("] ").strip()

    def _strip_bracketed_ids(self, value: str) -> str:
        """Drop a trailing '[...]' identifier block from a field value."""
        return value.split("[", 1)[0].strip()

    def _map_entry_type(self, raw_type: str) -> str:
        """Map an OpenCitations type label to a BibTeX entry type."""
        lowered = raw_type.casefold()
        if lowered == "journal article":
            return "article"
        if lowered == "book":
            return "book"
        if lowered == "book chapter":
            return "incollection"
        if lowered in {"proceedings article", "conference paper"}:
            return "inproceedings"
        if "thesis" in lowered or "dissertation" in lowered:
            return "phdthesis"
        return "misc"

    def _citation_key(self, doi: str, openalex: str, authors: str, year: str, title: str) -> str:
        """Build a stable citation key: DOI, then OpenAlex id, then author/year/title."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if openalex:
            return "openalex" + "".join(ch for ch in openalex.lower() if ch.isalnum())
        family = authors.split(" and ")[0].split(",")[0].split()[-1] if authors else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"
class OpenLibrarySource(BibliographicSource):
    """Open Library source for broad book and monograph metadata."""

    SEARCH_URL = "https://openlibrary.org/search.json"
    WORK_URL = "https://openlibrary.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Set up the shared HTTP client, honoring one injected via config."""
        super().__init__(config)
        agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Open Library has no DOI index; always returns None."""
        return None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best single title match, if any."""
        hits = self.search(title, limit=1)
        return hits[0] if hits else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Title-search Open Library and normalize each well-formed doc."""
        needle = " ".join(query.split())
        if not needle:
            return []
        query_string = urllib.parse.urlencode({"title": needle, "limit": max(1, limit), "fields": "*"})
        payload = self.source_client.try_get_json(f"{self.SEARCH_URL}?{query_string}")
        if not payload:
            return []
        docs = payload.get("docs", [])
        if not isinstance(docs, list):
            return []
        entries: List[BibEntry] = []
        for doc in docs:
            if not isinstance(doc, dict):
                continue
            entry = self.normalize(doc)
            if entry is not None:
                entries.append(entry)
        return entries

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map an Open Library search doc onto a book BibEntry; None without a title."""
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        author_text = self._join_list(record.get("author_name"))
        year_text = self._extract_year(record)
        publisher_text = self._join_list(record.get("publisher"))
        work_key = str(record.get("key") or "").strip()
        editions = record.get("edition_key") or []
        isbns = record.get("isbn") or []

        fields: Dict[str, str] = {"title": title}
        if author_text:
            fields["author"] = author_text
        if year_text:
            fields["year"] = year_text
        if publisher_text:
            fields["publisher"] = publisher_text
        if work_key:
            fields["openlibrary_work"] = work_key
            # Work keys already start with '/works/...', so plain concatenation.
            fields["url"] = f"{self.WORK_URL}{work_key}"
        if isinstance(editions, list) and editions:
            fields["openlibrary_edition"] = str(editions[0])
        if isinstance(isbns, list) and isbns:
            fields["isbn"] = str(isbns[0])

        return BibEntry(
            entry_type="book",
            citation_key=self._citation_key(work_key, author_text, year_text, title),
            fields=fields,
        )

    def get_identifier_scheme(self) -> str:
        """Open Library work keys are the native identifier scheme."""
        return "openlibrary"

    def _extract_year(self, record: Dict[str, Any]) -> str:
        """Prefer first_publish_year; fall back to the first publish_year entry."""
        first = record.get("first_publish_year")
        if first:
            return str(first)
        years = record.get("publish_year")
        if isinstance(years, list) and years:
            return str(years[0])
        return ""

    def _join_list(self, value: Any) -> str:
        """Join a list of strings BibTeX-style with ' and '; '' for non-lists."""
        if not isinstance(value, list):
            return ""
        parts = [str(item).strip() for item in value if str(item).strip()]
        return " and ".join(parts)

    def _citation_key(self, work_key: str, authors: str, year: str, title: str) -> str:
        """Key from the work key, else author surname + year + first title word."""
        if work_key:
            return "ol" + "".join(ch for ch in work_key.lower() if ch.isalnum())
        surname = authors.split(" and ")[0].split()[-1] if authors else "book"
        surname = "".join(ch for ch in surname.lower() if ch.isalnum()) or "book"
        words = title.split()
        lead = "".join(ch for ch in (words[0] if words else "untitled").lower() if ch.isalnum())
        return f"{surname}{year or 'nd'}{lead or 'untitled'}"
publish_year: + return str(publish_year[0]) + return "" + + def _join_list(self, value: Any) -> str: + if not isinstance(value, list): + return "" + items = [str(item).strip() for item in value if str(item).strip()] + return " and ".join(items) + + def _citation_key(self, work_key: str, authors: str, year: str, title: str) -> str: + if work_key: + return "ol" + "".join(ch for ch in work_key.lower() if ch.isalnum()) + family = authors.split(" and ")[0].split()[-1] if authors else "book" + family = "".join(ch for ch in family.lower() if ch.isalnum()) or "book" + first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum()) + return f"{family}{year or 'nd'}{first_word or 'untitled'}" diff --git a/src/citegeist/sources/registry.py b/src/citegeist/sources/registry.py new file mode 100644 index 0000000..f93fb06 --- /dev/null +++ b/src/citegeist/sources/registry.py @@ -0,0 +1,253 @@ +""" +Source registry for managing bibliographic source plugins. + +This module provides a registry that can discover, load, and manage +multiple bibliographic source plugins. +""" +from __future__ import annotations + +import importlib.util +import inspect +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Type + +from citegeist.sources.base import BibliographicSource + + +@dataclass(slots=True) +class SourceRegistration: + """Registration information for a source plugin.""" + name: str + source_class: Type[BibliographicSource] + config: Dict[str, Any] + enabled: bool + + +class SourceRegistry: + """Registry for bibliographic source plugins. + + This class manages the discovery, registration, and instantiation + of bibliographic source plugins. 
+ """ + + def __init__(self) -> None: + """Initialize the source registry.""" + self._registrations: Dict[str, SourceRegistration] = {} + self._instances: Dict[str, BibliographicSource] = {} + + def register( + self, + source_class: Type[BibliographicSource], + name: Optional[str] = None, + config: Optional[Dict[str, Any]] = None, + ) -> None: + """Register a source class. + + Args: + source_class: The source class to register (must inherit from BibliographicSource) + name: Optional name for the source (uses class name if not provided) + config: Optional configuration dictionary + """ + if not inspect.isclass(source_class) or not issubclass(source_class, BibliographicSource): + raise ValueError(f"{source_class} must be a subclass of BibliographicSource") + + source_name = name or source_class.__name__ + self._registrations[source_name] = SourceRegistration( + name=source_name, + source_class=source_class, + config=config or {}, + enabled=config.get('enabled', True) if config else True + ) + + def get(self, name: str) -> Optional[BibliographicSource]: + """Get a source instance by name. + + Args: + name: Name of the source + + Returns: + Source instance if registered and enabled, None otherwise + """ + if name not in self._registrations: + return None + + registration = self._registrations[name] + + # Return cached instance if available + if name in self._instances: + return self._instances[name] + + # Create new instance + if not registration.enabled: + return None + + instance = registration.source_class(config=registration.config) + self._instances[name] = instance + return instance + + def list_sources(self, enabled_only: bool = False) -> List[str]: + """List registered source names. 
+ + Args: + enabled_only: Only return enabled sources + + Returns: + List of source names + """ + sources = list(self._registrations.keys()) + if enabled_only: + return [name for name, reg in self._registrations.items() if reg.enabled] + return sources + + def get_config(self, name: str) -> Optional[Dict[str, Any]]: + """Get configuration for a source. + + Args: + name: Name of the source + + Returns: + Configuration dictionary, or None if not found + """ + registration = self._registrations.get(name) + return registration.config if registration else None + + def load_from_file(self, filepath: str) -> None: + """Load source plugins from a Python file. + + Args: + filepath: Path to Python file containing source classes + """ + spec = importlib.util.spec_from_file_location("module.sources", filepath) + if spec is None or spec.loader is None: + raise ImportError(f"Cannot load module from {filepath}") + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + # Find all classes that inherit from BibliographicSource + for name, obj in inspect.getmembers(module, inspect.isclass): + if issubclass(obj, BibliographicSource) and obj is not BibliographicSource: + self.register(obj) + + def load_from_directory(self, directory: str) -> None: + """Load source plugins from a directory. + + Args: + directory: Path to directory containing source plugin files + """ + import os + for filename in os.listdir(directory): + if filename.endswith('.py') and not filename.startswith('_'): + filepath = os.path.join(directory, filename) + self.load_from_file(filepath) + + def from_config_dict(self, config: Dict[str, Any]) -> None: + """Load sources from a configuration dictionary. + + Example config format: + { + "sources": { + "crossref": { + "source_type": "crossref", + "enabled": true + }, + "semantic_scholar": { + "source_type": "semantic_scholar", + "enabled": true, + "api_key": "..." 
+ } + } + } + + Args: + config: Configuration dictionary + """ + if 'sources' not in config: + return + + for name, source_config in config['sources'].items(): + source_name = str(name) + source_type = str(source_config.get('source_type', source_name)) + self.register( + source_class=self._resolve_source_class(source_type), + name=source_name, + config=source_config + ) + + def to_dict(self) -> Dict[str, Any]: + """Serialize registry to dictionary. + + Returns: + Dictionary representation of registry + """ + return { + name: { + 'enabled': reg.enabled, + 'config': reg.config + } + for name, reg in self._registrations.items() + } + + def from_dict(self, data: Dict[str, Any]) -> None: + """Load registry from dictionary. + + Args: + data: Dictionary representation of registry + """ + for name, source_data in data.items(): + source_name = str(name) + source_type = str(source_data.get('source_type', source_name)) + self.register( + source_class=self._resolve_source_class(source_type), + name=source_name, + config=source_data.get('config', source_data) + ) + + def get_registered_sources(self) -> List[SourceRegistration]: + """Get all registered source registrations. 
+ + Returns: + List of SourceRegistration objects + """ + return list(self._registrations.values()) + + def _resolve_source_class(self, source_type: str) -> Type[BibliographicSource]: + normalized = source_type.strip().lower().replace('-', '_') + if normalized in {'crossref', 'cross_ref'}: + from citegeist.sources.crossref import CrossRefSource + + return CrossRefSource + if normalized in {'opencitations', 'open_citations'}: + from citegeist.sources.opencitations import OpenCitationsSource + + return OpenCitationsSource + if normalized == 'unpaywall': + from citegeist.sources.unpaywall import UnpaywallSource + + return UnpaywallSource + if normalized in {'europepmc', 'europe_pmc'}: + from citegeist.sources.europepmc import EuropePmcSource + + return EuropePmcSource + if normalized in {'semanticscholar', 'semantic_scholar'}: + from citegeist.sources.semanticscholar import SemanticScholarSource + + return SemanticScholarSource + if normalized in {"openlibrary", "open_library"}: + from citegeist.sources.openlibrary import OpenLibrarySource + + return OpenLibrarySource + raise ValueError(f"Unknown source type: {source_type}") + + +# Global registry instance +_global_registry = SourceRegistry() + + +def get_registry() -> SourceRegistry: + """Get the global source registry instance. 
class SemanticScholarSource(BibliographicSource):
    """Semantic Scholar source for broad scientific metadata coverage."""

    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    DEFAULT_FIELDS = (
        "paperId,title,year,abstract,authors,externalIds,journal,venue,url,"
        "openAccessPdf,citationCount,publicationTypes"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read the API key from config or the SEMANTIC_SCHOLAR_API_KEY env var."""
        super().__init__(config)
        configured_key = (
            self.config.get("api_key")
            or os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
            or ""
        )
        self.api_key = str(configured_key).strip()
        self.user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Fetch a single paper via the /paper/DOI:<doi> endpoint."""
        doi_text = doi.strip()
        if not doi_text:
            return None
        paper_path = urllib.parse.quote(f"DOI:{doi_text}", safe="")
        payload = self._get_json(f"{self.BASE_URL}/paper/{paper_path}?fields={self.DEFAULT_FIELDS}")
        return self.normalize(payload) if payload else None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the top search hit for the title, if any."""
        hits = self.search(title, limit=1)
        return hits[0] if hits else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Relevance-search papers and normalize every parseable row."""
        terms = " ".join(query.split())
        if not terms:
            return []
        query_string = urllib.parse.urlencode(
            {"query": terms, "limit": max(1, limit), "fields": self.DEFAULT_FIELDS}
        )
        payload = self._get_json(f"{self.BASE_URL}/paper/search?{query_string}")
        if not payload:
            return []
        entries: List[BibEntry] = []
        for row in payload.get("data", []):
            entry = self.normalize(row)
            if entry is not None:
                entries.append(entry)
        return entries

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map a Semantic Scholar paper payload onto a BibEntry; None without a title."""
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        external_ids = record.get("externalIds") or {}
        doi = str(external_ids.get("DOI") or "").strip()
        raw_paper_id = str(record.get("paperId") or "")
        paper_id = raw_paper_id.strip()
        names = []
        for author in record.get("authors", []):
            name = str(author.get("name") or "").strip()
            if name:
                names.append(name)
        author_text = " and ".join(names)
        year = str(record.get("year") or "").strip()
        abstract = str(record.get("abstract") or "").strip()
        journal_info = record.get("journal") or {}
        venue_name = str(journal_info.get("name") or record.get("venue") or "").strip()
        oa_pdf = record.get("openAccessPdf") or {}
        entry_type = self._entry_type(record)

        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if paper_id:
            fields["semanticscholar_id"] = paper_id
        if year:
            fields["year"] = year
        if author_text:
            fields["author"] = author_text
        if abstract:
            fields["abstract"] = abstract
        if venue_name:
            # Conference papers carry the venue as booktitle, others as journal.
            venue_field = "booktitle" if entry_type == "inproceedings" else "journal"
            fields[venue_field] = venue_name
        link = str(oa_pdf.get("url") or record.get("url") or "").strip()
        if link:
            fields["url"] = link
        if oa_pdf:
            fields["is_oa"] = "true"
        cite_total = record.get("citationCount")
        if cite_total:
            fields["semanticscholar_citation_count"] = str(cite_total)

        return BibEntry(
            entry_type=entry_type,
            citation_key=self._citation_key(doi, raw_paper_id, author_text, year, title),
            fields=fields,
        )

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the normalized entry's URL field for a DOI, if any."""
        entry = self.lookup_by_doi(doi)
        return entry.fields.get("url") if entry is not None else None

    def get_identifier_scheme(self) -> str:
        """DOIs are the primary identifier scheme for this source."""
        return "doi"

    def _entry_type(self, record: Dict[str, Any]) -> str:
        """Pick a BibTeX entry type from publicationTypes / venue hints."""
        kinds = [str(item).lower() for item in (record.get("publicationTypes") or [])]
        if any("conference" in kind for kind in kinds):
            return "inproceedings"
        if any("review" in kind for kind in kinds):
            return "article"
        if record.get("journal") or record.get("venue"):
            return "article"
        return "misc"

    def _citation_key(self, doi: str, paper_id: str, authors: str, year: str, title: str) -> str:
        """Build a stable citation key: DOI, then S2 paper id, then author/year/title."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if paper_id:
            return "s2" + "".join(ch for ch in paper_id.lower() if ch.isalnum())
        surname = authors.split(" and ")[0].split()[-1] if authors else "ref"
        surname = "".join(ch for ch in surname.lower() if ch.isalnum()) or "ref"
        words = title.split()
        lead = "".join(ch for ch in (words[0] if words else "untitled").lower() if ch.isalnum())
        return f"{surname}{year or 'nd'}{lead or 'untitled'}"

    def _get_json(self, url: str) -> Dict[str, Any] | None:
        """GET a URL, attaching the API key header when set; None on any error."""
        headers = {"User-Agent": self.user_agent}
        if self.api_key:
            headers["x-api-key"] = self.api_key
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req) as resp:
                return json.loads(resp.read().decode("utf-8"))
        except Exception:
            return None
"https://api.unpaywall.org/v2" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(config) + user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)") + self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent) + self.email = str( + self.config.get("email") + or os.environ.get("UNPAYWALL_EMAIL") + or os.environ.get("NCBI_EMAIL") + or "" + ).strip() + + def lookup_by_doi(self, doi: str) -> Optional[BibEntry]: + payload = self.lookup_oa_record(doi) + if not payload: + return None + return self.normalize(payload) + + def lookup_by_title(self, title: str) -> Optional[BibEntry]: + return None + + def search(self, query: str, limit: int = 10) -> list[BibEntry]: + return [] + + def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]: + doi = str(record.get("doi") or "").strip() + title = str(record.get("title") or "").strip() or (doi and f"OA record for DOI {doi}") + if not doi or not title: + return None + + fields: Dict[str, str] = { + "title": title, + "doi": doi, + } + if year := str(record.get("year") or "").strip(): + fields["year"] = year + if landing_url := self._best_landing_url(record): + fields["url"] = landing_url + fields["best_oa_url"] = landing_url + if pdf_url := self._best_pdf_url(record): + fields["best_oa_pdf_url"] = pdf_url + if oa_status := str(record.get("oa_status") or "").strip(): + fields["oa_status"] = oa_status + if license_name := self._best_license(record): + fields["oa_license"] = license_name + if host_type := self._best_host_type(record): + fields["oa_host_type"] = host_type + if version := self._best_version(record): + fields["oa_version"] = version + if evidence := self._best_evidence(record): + fields["oa_evidence"] = evidence + if record.get("is_oa") is not None: + fields["is_oa"] = "true" if bool(record.get("is_oa")) else "false" + + citation_key = "doi" + "".join(ch for ch in doi.lower() if ch.isalnum()) + return 
BibEntry(entry_type="misc", citation_key=citation_key, fields=fields) + + def get_fulltext_url(self, doi: str) -> Optional[str]: + payload = self.lookup_oa_record(doi) + if not payload: + return None + return self._best_pdf_url(payload) or self._best_landing_url(payload) + + def get_identifier_scheme(self) -> str: + return "doi" + + def is_available(self) -> bool: + return self.enabled and bool(self.email) + + def lookup_oa_record(self, doi: str) -> Dict[str, Any] | None: + normalized = doi.strip() + if not normalized or not self.email: + return None + encoded = urllib.parse.quote(normalized, safe="") + query = urllib.parse.urlencode({"email": self.email}) + return self.source_client.try_get_json(f"{self.BASE_URL}/{encoded}?{query}") + + def _best_landing_url(self, payload: Dict[str, Any]) -> str: + location = payload.get("best_oa_location") or {} + return str(location.get("url") or location.get("url_for_landing_page") or "").strip() + + def _best_pdf_url(self, payload: Dict[str, Any]) -> str: + location = payload.get("best_oa_location") or {} + return str(location.get("url_for_pdf") or "").strip() + + def _best_license(self, payload: Dict[str, Any]) -> str: + location = payload.get("best_oa_location") or {} + return str(location.get("license") or "").strip() + + def _best_host_type(self, payload: Dict[str, Any]) -> str: + location = payload.get("best_oa_location") or {} + return str(location.get("host_type") or "").strip() + + def _best_version(self, payload: Dict[str, Any]) -> str: + location = payload.get("best_oa_location") or {} + return str(location.get("version") or "").strip() + + def _best_evidence(self, payload: Dict[str, Any]) -> str: + location = payload.get("best_oa_location") or {} + return str(location.get("evidence") or "").strip() diff --git a/src/citegeist/talkorigins.py b/src/citegeist/talkorigins.py index 069cc45..172d486 100644 --- a/src/citegeist/talkorigins.py +++ b/src/citegeist/talkorigins.py @@ -138,6 +138,7 @@ class 
from __future__ import annotations

from citegeist.bibtex import BibEntry
from citegeist.resolve import MetadataResolver
from citegeist.sources import EuropePmcSource, SourceRegistry, list_source_catalog


def _med_payload(**extra: object) -> dict:
    """Minimal Europe PMC MED record shared by the tests below."""
    payload: dict = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
    }
    payload.update(extra)
    return payload


def test_europepmc_source_normalizes_core_record() -> None:
    source = EuropePmcSource(config={})
    entry = source.normalize(
        _med_payload(
            pmcid="PMC10000001",
            journalVolume="16",
            issue="1",
            pageInfo="10-20",
            abstractText="Abstract text.",
            isOpenAccess="Y",
            citedByCount=12,
            fullTextUrlList={"fullTextUrl": [{"url": "https://europepmc.org/articles/PMC10000001?pdf=render"}]},
        )
    )

    assert entry is not None
    assert entry.fields["doi"] == "10.1000/example"
    assert entry.fields["pmid"] == "37158217"
    assert entry.fields["pmcid"] == "PMC10000001"
    assert entry.fields["journal"] == "Biomed Journal"
    assert entry.fields["url"] == "https://europepmc.org/articles/PMC10000001?pdf=render"
    assert entry.fields["is_oa"] == "true"


def test_europepmc_registry_and_catalog() -> None:
    registry = SourceRegistry()
    registry.from_config_dict({"sources": {"europepmc": {"source_type": "europepmc", "enabled": True}}})
    assert isinstance(registry.get("europepmc"), EuropePmcSource)

    catalog = {item.key: item for item in list_source_catalog()}
    assert catalog["europe_pmc"].current_status == "integrated"
    assert catalog["europe_pmc"].priority == "now"


def test_metadata_resolver_uses_europepmc_doi_after_primary_lookups_fail() -> None:
    resolver = MetadataResolver()
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

    def fake_lookup(_doi: str):
        return resolver.europepmc.normalize(_med_payload())

    resolver.europepmc.lookup_by_doi = fake_lookup  # type: ignore[method-assign]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"doi": "10.1000/example", "title": "Biomedical Example"},
        )
    )

    assert result is not None
    assert result.source_label == "europepmc:doi:10.1000/example"
    assert result.entry.fields["pmid"] == "37158217"


def test_metadata_resolver_uses_europepmc_title_search_after_pubmed() -> None:
    resolver = MetadataResolver()
    # Every earlier title-search stage must come up empty.
    for name in (
        "search_crossref_best_match",
        "search_datacite_best_match",
        "search_openalex_best_match",
        "search_pubmed_best_match",
    ):
        setattr(resolver, name, lambda *args, **kwargs: None)

    resolver.europepmc.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.europepmc.normalize(_med_payload())
    ]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"title": "Biomedical Example", "author": "Doe J", "year": "2024"},
        )
    )

    assert result is not None
    assert result.source_label == "europepmc:search:Biomedical Example"
from __future__ import annotations

from citegeist.expand import OpenCitationsExpander
from citegeist.sources import OpenCitationsSource
from citegeist.storage import BibliographyStore


def _seed_bibtex(doi: str) -> str:
    """BibTeX for a single seed article carrying the given DOI."""
    return f"""
@article{{seed2024,
    author = {{Seed, Alice}},
    title = {{Seed Paper}},
    year = {{2024}},
    doi = {{{doi}}}
}}
"""


def test_opencitations_source_normalizes_metadata_row() -> None:
    source = OpenCitationsSource(config={})
    entry = source.normalize(
        {
            "id": "doi:10.1000/example openalex:W1234567890 omid:br/06123",
            "title": "Example Work",
            "author": "Doe, Jane [omid:ra/1]; Roe, Alex [omid:ra/2]",
            "pub_date": "2024-05",
            "venue": "Journal of Examples [issn:1234-5678]",
            "volume": "12",
            "issue": "3",
            "page": "10-20",
            "type": "journal article",
            "publisher": "Example Press [crossref:123]",
        }
    )

    assert entry is not None
    assert entry.fields["doi"] == "10.1000/example"
    assert entry.fields["openalex"] == "W1234567890"
    assert entry.fields["author"] == "Doe, Jane and Roe, Alex"
    assert entry.fields["journal"] == "Journal of Examples"
    assert entry.fields["publisher"] == "Example Press"
    assert entry.fields["year"] == "2024"


def test_opencitations_source_builds_edges_for_references() -> None:
    source = OpenCitationsSource(config={})

    def fake_get_json(_url: str) -> list[dict[str, str]]:
        return [
            {
                "oci": "1-2",
                "citing": "omid:br/1 doi:10.1000/source",
                "cited": "omid:br/2 doi:10.1000/target",
                "creation": "2024-01-01",
            }
        ]

    source.source_client.get_json = fake_get_json  # type: ignore[method-assign]

    edges = source.get_citations("10.1000/source", relation_type="cites", limit=10)
    assert len(edges) == 1
    assert edges[0].source_work_id == "doi:10.1000/source"
    assert edges[0].target_work_id == "doi:10.1000/target"


def test_opencitations_expander_creates_reference_nodes_and_relations() -> None:
    store = BibliographyStore()
    try:
        store.ingest_bibtex(_seed_bibtex("10.1000/source"))

        expander = OpenCitationsExpander()

        def fake_get_json(url: str) -> list[dict[str, str]]:
            # Reference endpoint yields the edge; metadata endpoint the node.
            if "/references/" in url:
                return [
                    {
                        "oci": "1-2",
                        "citing": "omid:br/1 doi:10.1000/source",
                        "cited": "omid:br/2 doi:10.1000/target",
                        "creation": "2024-01-01",
                    }
                ]
            return [
                {
                    "id": "doi:10.1000/target omid:br/2",
                    "title": "Target Work",
                    "author": "Doe, Jane [omid:ra/1]",
                    "pub_date": "2023",
                    "venue": "Journal of Targets [issn:1111-1111]",
                    "type": "journal article",
                }
            ]

        expander.source.source_client.get_json = fake_get_json  # type: ignore[method-assign]
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

        results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=10)

        assert [item.discovered_citation_key for item in results] == ["doi101000target"]
        discovered = store.get_entry("doi101000target")
        assert discovered is not None
        assert discovered["title"] == "Target Work"
        assert store.get_relations("seed2024") == ["doi101000target"]
    finally:
        store.close()


def test_opencitations_expander_supports_cited_by_direction() -> None:
    store = BibliographyStore()
    try:
        store.ingest_bibtex(_seed_bibtex("10.1000/seed"))

        expander = OpenCitationsExpander()

        def fake_get_json(url: str) -> list[dict[str, str]]:
            if "/citations/" in url:
                return [
                    {
                        "oci": "2-1",
                        "citing": "omid:br/2 doi:10.1000/citing",
                        "cited": "omid:br/1 doi:10.1000/seed",
                        "creation": "2024-01-01",
                    }
                ]
            return [
                {
                    "id": "doi:10.1000/citing omid:br/2",
                    "title": "Citing Work",
                    "author": "Doe, Jane [omid:ra/1]",
                    "pub_date": "2025",
                    "venue": "Journal of Citers [issn:1111-1111]",
                    "type": "journal article",
                }
            ]

        expander.source.source_client.get_json = fake_get_json  # type: ignore[method-assign]
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

        results = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=10)

        assert [item.discovered_citation_key for item in results] == ["doi101000citing"]
        assert store.get_relations("doi101000citing") == ["seed2024"]
    finally:
        store.close()
from __future__ import annotations

from citegeist.bibtex import BibEntry
from citegeist.resolve import MetadataResolver
from citegeist.sources import OpenLibrarySource, SourceRegistry, list_source_catalog

# Title-search stages that must come up empty before OpenLibrary is tried.
_UPSTREAM_SEARCHES = (
    "search_crossref",
    "search_datacite",
    "search_openalex",
    "search_pubmed",
    "search_europepmc",
    "search_semanticscholar",
)


class FakeSourceClient:
    """Stub HTTP client answering every request with a fixed payload."""

    def __init__(self, payload: dict[str, object]) -> None:
        self.payload = payload

    def try_get_json(self, _url: str) -> dict[str, object]:
        return dict(self.payload)


def _silence_upstream_searches(resolver: MetadataResolver) -> None:
    """Force every pre-OpenLibrary title search to return no candidates."""
    for name in _UPSTREAM_SEARCHES:
        setattr(resolver, name, lambda _title, limit=5: [])


def test_openlibrary_source_normalizes_book_record() -> None:
    source = OpenLibrarySource(config={"source_client": FakeSourceClient({})})
    entry = source.normalize(
        {
            "title": "The Nature of the Stratigraphic Record",
            "author_name": ["D. V. Ager"],
            "first_publish_year": 1973,
            "publisher": ["Macmillan"],
            "key": "/works/OL82563W",
            "edition_key": ["OL12345M"],
            "isbn": ["9781234567890"],
        }
    )

    assert entry is not None
    assert entry.entry_type == "book"
    assert entry.fields["title"] == "The Nature of the Stratigraphic Record"
    assert entry.fields["author"] == "D. V. Ager"
    assert entry.fields["year"] == "1973"
    assert entry.fields["publisher"] == "Macmillan"
    assert entry.fields["openlibrary_work"] == "/works/OL82563W"
    assert entry.fields["openlibrary_edition"] == "OL12345M"
    assert entry.fields["isbn"] == "9781234567890"


def test_openlibrary_registry_and_catalog() -> None:
    registry = SourceRegistry()
    registry.from_config_dict({"sources": {"openlibrary": {"source_type": "openlibrary", "enabled": True}}})
    assert isinstance(registry.get("openlibrary"), OpenLibrarySource)

    catalog = {item.key: item for item in list_source_catalog()}
    assert catalog["open_library"].current_status == "integrated"
    assert "book_metadata" in catalog["open_library"].capabilities


def test_metadata_resolver_uses_openlibrary_after_other_searches_fail() -> None:
    resolver = MetadataResolver()
    _silence_upstream_searches(resolver)
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The Nature of the Stratigraphic Record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="book",
            citation_key="seed1973",
            fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
        )
    )

    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"


def test_metadata_resolver_trace_records_fallback_attempts() -> None:
    resolver = MetadataResolver()
    _silence_upstream_searches(resolver)
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    ]

    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="book",
            citation_key="seed1980",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    )

    assert outcome.resolution is not None
    assert outcome.resolution.source_label == "openlibrary:search:Example Book"
    assert [attempt.source_name for attempt in outcome.attempts[-2:]] == ["semanticscholar", "openlibrary"]
    assert outcome.attempts[-1].matched is True
    assert outcome.attempts[-1].candidate_count == 1


def test_metadata_resolver_uses_fuzzy_catalog_match_for_book_titles() -> None:
    resolver = MetadataResolver()
    _silence_upstream_searches(resolver)
    # Catalog title differs slightly ("stratigraphical") from the query.
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The nature of the stratigraphical record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="book",
            citation_key="seed1973",
            fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
        )
    )

    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"


def test_metadata_resolver_skips_openlibrary_for_article_like_entries() -> None:
    resolver = MetadataResolver()
    _silence_upstream_searches(resolver)
    called = {"openlibrary": False}

    def fake_openlibrary(_title: str, limit: int = 5) -> list[BibEntry]:
        called["openlibrary"] = True
        return []

    resolver.search_openlibrary = fake_openlibrary  # type: ignore[method-assign]
    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="article",
            citation_key="seed1977",
            fields={
                "title": "Fast locomotion of some African ungulates",
                "author": "Alexander, R. M.",
                "year": "1977",
                "journal": "Journal of Zoology",
            },
        )
    )

    assert outcome.resolution is None
    assert called["openlibrary"] is False
    assert all(attempt.source_name != "openlibrary" for attempt in outcome.attempts)
"""Tests for identifier resolution and normalization."""
from __future__ import annotations

import pytest

from citegeist.resolver import (
    IdentifierExtractor,
    IdentifierNormalizer,
    IdentifierResolver,
    extract_identifiers,
    normalize_identifier,
    get_primary_identifier,
    resolve_identifiers,
)


class TestIdentifierExtractor:
    """Behaviour of IdentifierExtractor."""

    def test_extract_from_entry(self):
        """Identifier-bearing fields are kept; descriptive fields are not."""
        entry_fields = {
            'doi': '10.1234/example',
            'title': 'Test Title',
            'author': 'John Doe',
            'pmid': '123456',
        }

        found = IdentifierExtractor.extract(entry_fields)

        assert 'doi' in found
        assert found['doi'] == '10.1234/example'
        assert 'pmid' in found
        assert found['pmid'] == '123456'
        assert 'title' not in found  # Title is not an identifier

    def test_extract_multiple_identifiers(self):
        """Every supported scheme present in the fields is extracted."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '123456',
            'arxiv': '2310.12345',
            'isbn': '978-0-123456-78-9',
        }

        found = IdentifierExtractor.extract(entry_fields)

        assert len(found) == 4
        assert found['doi'] == '10.1234/example'
        assert found['pmid'] == '123456'
        assert found['arxiv'] == '2310.12345'
        assert found['isbn'] == '978-0-123456-78-9'


class TestIdentifierNormalizer:
    """Behaviour of IdentifierNormalizer."""

    def test_normalize_doi(self):
        """DOIs are lower-cased; malformed values yield None."""
        assert IdentifierNormalizer.normalize_doi('10.1234/EXAMPLE') == '10.1234/example'
        assert IdentifierNormalizer.normalize_doi('10.1234/test') == '10.1234/test'
        assert IdentifierNormalizer.normalize_doi('invalid') is None

    def test_normalize_pmid(self):
        """Numeric PMIDs pass through; non-numeric values yield None."""
        assert IdentifierNormalizer.normalize_pmid('12345') == '12345'
        assert IdentifierNormalizer.normalize_pmid('1234567') == '1234567'
        assert IdentifierNormalizer.normalize_pmid('invalid') is None

    def test_normalize_pmcid(self):
        """PMCIDs are lower-cased; malformed values yield None."""
        assert IdentifierNormalizer.normalize_pmcid('PMC12345') == 'pmc12345'
        assert IdentifierNormalizer.normalize_pmcid('PMCabcdef') == 'pmcabcdef'
        assert IdentifierNormalizer.normalize_pmcid('invalid') is None

    def test_normalize_arxiv(self):
        """Version suffixes are stripped from arXiv ids."""
        assert IdentifierNormalizer.normalize_arxiv('2310.12345') == '2310.12345'
        assert IdentifierNormalizer.normalize_arxiv('2310.12345v1') == '2310.12345'
        assert IdentifierNormalizer.normalize_arxiv('INVALID') is None

    def test_normalize_orcid(self):
        """Only the canonical dashed ORCID form is accepted."""
        assert IdentifierNormalizer.normalize_orcid('0000-0001-2345-6789') == '0000-0001-2345-6789'
        # ORCID with spaces is invalid according to the canonical format
        assert IdentifierNormalizer.normalize_orcid('0000 0001 2345 6789') is None
        assert IdentifierNormalizer.normalize_orcid('invalid') is None

    def test_normalize_identifier(self):
        """Dispatching normalization by scheme name."""
        assert IdentifierNormalizer.normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')
        assert IdentifierNormalizer.normalize_identifier('pmid', '12345') == ('pmid', '12345')
        assert IdentifierNormalizer.normalize_identifier('invalid', 'value') is None


class TestIdentifierResolver:
    """Behaviour of IdentifierResolver."""

    def test_resolve_with_doi(self):
        """A DOI field produces at least one resolved DOI identifier."""
        entry_fields = {'doi': '10.1234/example', 'title': 'Test Title'}

        resolved = IdentifierResolver.resolve(entry_fields)

        assert len(resolved) >= 1
        assert len([pair for pair in resolved if pair[0] == 'doi']) > 0

    def test_resolve_with_multiple_identifiers(self):
        """Several identifier fields each contribute to resolution."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'arxiv': '2310.12345',
        }

        resolved = IdentifierResolver.resolve(entry_fields)

        assert len(resolved) >= 2
        assert len([pair for pair in resolved if pair[0] == 'doi']) > 0

    def test_resolve_without_identifiers(self):
        """Without identifiers, a title fingerprint is still produced."""
        entry_fields = {'title': 'Test Title', 'author': 'John Doe'}

        resolved = IdentifierResolver.resolve(entry_fields)

        # Should have at least title fingerprint
        assert len(resolved) >= 1
        assert len([pair for pair in resolved if pair[0] == 'title']) > 0

    def test_get_primary_identifier(self):
        """DOI outranks other schemes as the primary identifier."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'title': 'Test Title',
        }

        primary = IdentifierResolver.get_primary_identifier(entry_fields)

        assert primary is not None
        # DOI should be first priority
        assert primary[0] == 'doi'

    def test_get_scheme_value(self):
        """Per-scheme lookup returns the raw value or None."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
        }

        assert IdentifierResolver.get_scheme_value('doi', entry_fields) == '10.1234/example'
        assert IdentifierResolver.get_scheme_value('pmid', entry_fields) == '12345'
        assert IdentifierResolver.get_scheme_value('isbn', entry_fields) is None


class TestConvenienceFunctions:
    """Module-level convenience wrappers mirror the class methods."""

    def test_extract_identifiers(self):
        found = extract_identifiers({'doi': '10.1234/example', 'pmid': '12345'})
        assert 'doi' in found
        assert 'pmid' in found

    def test_normalize_identifier(self):
        assert normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')

    def test_get_primary_identifier(self):
        assert get_primary_identifier({'doi': '10.1234/example'}) == ('doi', '10.1234/example')

    def test_resolve_identifiers(self):
        resolved = resolve_identifiers({'doi': '10.1234/example'})
        assert len(resolved) > 0
+ { + "sources": { + "semanticscholar": { + "source_type": "semanticscholar", + "enabled": True, + } + } + } + ) + source = registry.get("semanticscholar") + assert isinstance(source, SemanticScholarSource) + + catalog = {entry.key: entry for entry in list_source_catalog()} + assert catalog["semantic_scholar"].current_status == "integrated" + assert catalog["semantic_scholar"].priority == "now" + + +def test_metadata_resolver_uses_semanticscholar_doi_after_other_lookups_fail() -> None: + resolver = MetadataResolver() + resolver.resolve_doi = lambda _doi: None # type: ignore[method-assign] + resolver.resolve_datacite_doi = lambda _doi: None # type: ignore[method-assign] + resolver.resolve_europepmc_doi = lambda _doi: None # type: ignore[method-assign] + resolver.semanticscholar.lookup_by_doi = lambda _doi: resolver.semanticscholar.normalize( # type: ignore[method-assign] + { + "paperId": "abcdef123456", + "title": "Physics Example", + "year": 2024, + "authors": [{"name": "Jane Doe"}], + "externalIds": {"DOI": "10.1000/physics"}, + "journal": {"name": "Physical Review Example"}, + "publicationTypes": ["JournalArticle"], + } + ) + + from citegeist.bibtex import BibEntry + + result = resolver.resolve_entry( + BibEntry( + entry_type="article", + citation_key="seed2024", + fields={"doi": "10.1000/physics", "title": "Physics Example"}, + ) + ) + + assert result is not None + assert result.source_label == "semanticscholar:doi:10.1000/physics" + assert result.entry.fields["journal"] == "Physical Review Example" + + +def test_metadata_resolver_uses_semanticscholar_title_search_after_other_searches_fail() -> None: + resolver = MetadataResolver() + resolver.search_crossref_best_match = lambda *args, **kwargs: None # type: ignore[method-assign] + resolver.search_datacite_best_match = lambda *args, **kwargs: None # type: ignore[method-assign] + resolver.search_openalex_best_match = lambda *args, **kwargs: None # type: ignore[method-assign] + resolver.search_pubmed_best_match = 
lambda *args, **kwargs: None  # type: ignore[method-assign]
+    resolver.search_europepmc_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
+    resolver.semanticscholar.search = lambda _title, limit=5: [  # type: ignore[method-assign]
+        resolver.semanticscholar.normalize(
+            {
+                "paperId": "abcdef123456",
+                "title": "Physics Example",
+                "year": 2024,
+                "authors": [{"name": "Jane Doe"}],
+                "externalIds": {"DOI": "10.1000/physics"},
+                "journal": {"name": "Physical Review Example"},
+                "publicationTypes": ["JournalArticle"],
+            }
+        )
+    ]
+
+    from citegeist.bibtex import BibEntry
+
+    result = resolver.resolve_entry(
+        BibEntry(
+            entry_type="article",
+            citation_key="seed2024",
+            fields={"title": "Physics Example", "author": "Jane Doe", "year": "2024"},
+        )
+    )
+
+    assert result is not None
+    assert result.source_label == "semanticscholar:search:Physics Example"
diff --git a/tests/test_sources_catalog.py b/tests/test_sources_catalog.py
new file mode 100644
index 0000000..0537e6c
--- /dev/null
+++ b/tests/test_sources_catalog.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from citegeist.sources import CrossRefSource, OpenCitationsSource, SourceRegistry, list_source_catalog, prioritized_source_keys
+
+
+def test_catalog_prioritizes_existing_core_sources() -> None:
+    keys = prioritized_source_keys()
+    assert keys[:6] == ["crossref", "datacite", "europe_pmc", "openalex", "open_citations", "pubmed"]
+
+
+def test_catalog_includes_open_citation_and_access_sources() -> None:
+    catalog = {entry.key: entry for entry in list_source_catalog()}
+    assert "open_citations" in catalog
+    assert "unpaywall" in catalog
+    assert catalog["open_citations"].priority == "now"
+    assert "doi_citations" in catalog["open_citations"].capabilities
+
+
+def test_registry_loads_known_source_from_config() -> None:
+    registry = SourceRegistry()
+    registry.from_config_dict(
+        {
+            "sources": {
+                "crossref": {
+                    "source_type": "crossref",
+                    "enabled": True,
+                }
+            }
+        }
+    )
+
+
    source = registry.get("crossref")
+    assert isinstance(source, CrossRefSource)
+
+
+def test_registry_rejects_unknown_source_type() -> None:
+    registry = SourceRegistry()
+    try:
+        registry.from_config_dict({"sources": {"mystery": {"source_type": "mystery"}}})
+    except ValueError as exc:
+        assert "Unknown source type" in str(exc)
+    else:
+        raise AssertionError("expected ValueError for unknown source type")
+
+
+def test_registry_loads_opencitations_from_config() -> None:
+    registry = SourceRegistry()
+    registry.from_config_dict(
+        {
+            "sources": {
+                "opencitations": {
+                    "source_type": "opencitations",
+                    "enabled": True,
+                }
+            }
+        }
+    )
+
+    source = registry.get("opencitations")
+    assert isinstance(source, OpenCitationsSource)
diff --git a/tests/test_sources_plugin.py b/tests/test_sources_plugin.py
new file mode 100644
index 0000000..c1bd6da
--- /dev/null
+++ b/tests/test_sources_plugin.py
@@ -0,0 +1,171 @@
+"""Tests for the source plugin architecture."""
+from __future__ import annotations
+
+import pytest
+
+from citegeist.sources import BibliographicSource, SourceRegistry, CrossRefSource
+
+
+class MockSource(BibliographicSource):
+    """Mock source for testing."""
+
+    def __init__(self, config: dict | None = None):
+        super().__init__(config)
+        self.lookup_calls = []
+
+    def lookup_by_doi(self, doi: str) -> None:
+        """Return None to indicate not found."""
+        self.lookup_calls.append(('doi', doi))
+        return None
+
+    def lookup_by_title(self, title: str) -> None:
+        """Return None to indicate not found."""
+        self.lookup_calls.append(('title', title))
+        return None
+
+    def search(self, query: str, limit: int = 10) -> list:
+        return []
+
+    def normalize(self, record: dict) -> None:
+        return None
+
+
+def test_source_base_interface():
+    """Test that BibliographicSource base class works."""
+    source = MockSource()
+    assert source.is_available()
+    assert source.get_identifier_scheme() == 'mocksource'
+    assert source.get_fulltext_url('doi:test') is None
+    assert
source.get_embedding('doi:test') is None
+
+
+def test_mock_source():
+    """Test that mock source implements interface correctly."""
+    source = MockSource()
+    source.lookup_by_doi('10.1234/test')
+    source.lookup_by_title('Test Title')
+
+    assert source.lookup_calls == [
+        ('doi', '10.1234/test'),
+        ('title', 'Test Title')
+    ]
+
+
+def test_source_registry():
+    """Test source registry functionality."""
+    registry = SourceRegistry()
+
+    # Register a source
+    registry.register(MockSource, name='mock_source', config={'enabled': True})
+
+    # List sources
+    sources = registry.list_sources()
+    assert 'mock_source' in sources
+
+    # Get source instance
+    source = registry.get('mock_source')
+    assert source is not None
+    assert isinstance(source, MockSource)
+    assert source.is_available()
+
+
+def test_source_registry_disabled():
+    """Test that disabled sources are not returned."""
+    registry = SourceRegistry()
+
+    registry.register(
+        MockSource,
+        name='disabled_source',
+        config={'enabled': False}
+    )
+
+    sources = registry.list_sources()
+    assert 'disabled_source' in sources
+
+    # Getting disabled source should return None
+    source = registry.get('disabled_source')
+    assert source is None
+
+
+def test_crossref_source():
+    """Test CrossRef source plugin."""
+    registry = SourceRegistry()
+    registry.register(CrossRefSource, name='crossref', config={})
+
+    source = registry.get('crossref')
+    assert source is not None
+    assert source.is_available()
+    assert source.get_identifier_scheme() == 'doi'
+
+    entry = source.normalize(
+        {
+            'message': {
+                'DOI': '10.1234/example',
+                'title': ['Test Title'],
+                'author': [{'given': 'Jane', 'family': 'Doe'}],
+                'published-print': {'date-parts': [[2024]]},
+                'container-title': ['Journal of Tests'],
+                'publisher': 'Test Publisher',
+                'URL': 'https://doi.org/10.1234/example',
+                'abstract': 'Example abstract',
+            }
+        }
+    )
+
+    assert entry is not None
+    assert entry.fields['doi'] == '10.1234/example'
+    assert entry.fields['title'] == 'Test
Title'
+    assert entry.fields['year'] == '2024'
+    assert entry.fields['journal'] == 'Journal of Tests'
+
+
+def test_crossref_search_item_normalization():
+    source = CrossRefSource()
+
+    entry = source.normalize(
+        {
+            'DOI': '10.1234/example',
+            'title': ['Search Result'],
+            'author': [{'family': 'Doe'}],
+            'issued': {'date-parts': [[2023]]},
+        }
+    )
+
+    assert entry is not None
+    assert entry.fields['doi'] == '10.1234/example'
+    assert entry.fields['year'] == '2023'
+
+
+def test_source_record():
+    """Test SourceRecord dataclass."""
+    from citegeist.sources import SourceRecord
+
+    record = SourceRecord(
+        raw={'test': 'data'},
+        source_type='test',
+        source_label='test_source',
+        timestamp='2024-01-01',
+        confidence=1.0
+    )
+
+    assert record.source_type == 'test'
+    assert record.source_label == 'test_source'
+    assert record.confidence == 1.0
+    assert record.raw == {'test': 'data'}
+
+
+def test_citation_edge():
+    """Test CitationEdge dataclass."""
+    from citegeist.sources import CitationEdge
+
+    edge = CitationEdge(
+        source_work_id='doi:10.1234',
+        target_work_id='doi:10.5678',
+        relation_type='cites',
+        source_type='crossref',
+        source_label='crossref:test',
+        confidence=0.9
+    )
+
+    assert edge.relation_type == 'cites'
+    assert edge.confidence == 0.9
diff --git a/tests/test_talkorigins.py b/tests/test_talkorigins.py
index e9a8a1d..11feff7 100644
--- a/tests/test_talkorigins.py
+++ b/tests/test_talkorigins.py
@@ -530,6 +530,88 @@ def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Pat
     assert results[0].weak_reasons_after == []
 
 
+def test_talkorigins_enrich_weak_canonicals_includes_resolution_attempts(tmp_path: Path):
+    base_url = "https://www.talkorigins.org/origins/biblio/"
+    scraper = TalkOriginsScraper(
+        source_client=FakeSourceClient(
+            {
+                base_url: INDEX_HTML,
+                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
+                f"{base_url}evolution.html": EVOLUTION_HTML,
+            }
+        )
+    )
+
+    export = scraper.scrape_to_directory(base_url=base_url,
output_dir=tmp_path)
+    Path(export.seed_sets[0].seed_bib).write_text(
+        """
+@misc{weak1,
+    author = "Smith, Jane",
+    year = "1999",
+    title = "Weak Duplicate"
+}
+
+@misc{weak2,
+    author = "Smith, Jane",
+    year = "1999",
+    title = "Weak Duplicate",
+    note = "Copied from legacy source"
+}
+""",
+        encoding="utf-8",
+    )
+    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")
+
+    from citegeist.resolve import Resolution, ResolutionAttempt, ResolutionOutcome
+
+    scraper.resolver.resolve_entry_with_trace = lambda entry: ResolutionOutcome(  # type: ignore[method-assign]
+        resolution=Resolution(
+            entry=BibEntry(
+                entry_type="article",
+                citation_key="resolved",
+                fields={
+                    "author": entry.fields["author"],
+                    "title": entry.fields["title"],
+                    "year": entry.fields["year"],
+                    "doi": "10.1000/weak",
+                    "journal": "Journal of Better Metadata",
+                },
+            ),
+            source_type="resolver",
+            source_label="crossref:search:Weak Duplicate",
+        ),
+        attempts=[
+            ResolutionAttempt(
+                source_name="crossref",
+                strategy="title_search",
+                query_value="Weak Duplicate",
+                matched=True,
+                candidate_count=1,
+                source_label="crossref:search:Weak Duplicate",
+            )
+        ],
+    )
+
+    store = BibliographyStore()
+    try:
+        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False)
+    finally:
+        store.close()
+
+    assert len(results) == 1
+    assert results[0].resolution_attempts == [
+        {
+            "source_name": "crossref",
+            "strategy": "title_search",
+            "query_value": "Weak Duplicate",
+            "matched": True,
+            "candidate_count": 1,
+            "source_label": "crossref:search:Weak Duplicate",
+            "error": "",
+        }
+    ]
+
+
 def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
     base_url = "https://www.talkorigins.org/origins/biblio/"
     scraper = TalkOriginsScraper(
@@ -799,6 +881,7 @@ def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_pat
     assert review.items[0]["canonical"]["citation_key"] == "weak2"
     assert review.items[0]["enrichment"]["resolved"] is True
     assert review.items[0]["enrichment"]["applied"] is False
+    assert review.items[0]["enrichment"]["resolution_attempts"] == []
 
 
 def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):
diff --git a/tests/test_unpaywall.py b/tests/test_unpaywall.py
new file mode 100644
index 0000000..44d25dc
--- /dev/null
+++ b/tests/test_unpaywall.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+from citegeist.cli import _run_enrich_oa
+from citegeist.sources import SourceRegistry, UnpaywallSource, list_source_catalog, prioritized_source_keys
+from citegeist.storage import BibliographyStore
+
+
+def test_unpaywall_source_normalizes_oa_record() -> None:
+    source = UnpaywallSource(config={"email": "tester@example.org"})
+    entry = source.normalize(
+        {
+            "doi": "10.1000/example",
+            "title": "Example Article",
+            "year": 2024,
+            "is_oa": True,
+            "oa_status": "gold",
+            "best_oa_location": {
+                "url": "https://example.org/article",
+                "url_for_pdf": "https://example.org/article.pdf",
+                "license": "cc-by",
+                "host_type": "publisher",
+                "version": "publishedVersion",
+                "evidence": "open (via free pdf)",
+            },
+        }
+    )
+
+    assert entry is not None
+    assert entry.fields["doi"] == "10.1000/example"
+    assert entry.fields["best_oa_url"] == "https://example.org/article"
+    assert entry.fields["best_oa_pdf_url"] == "https://example.org/article.pdf"
+    assert entry.fields["oa_status"] == "gold"
+    assert entry.fields["oa_license"] == "cc-by"
+    assert entry.fields["is_oa"] == "true"
+
+
+def test_unpaywall_registry_and_catalog() -> None:
+    registry = SourceRegistry()
+    registry.from_config_dict(
+        {
+            "sources": {
+                "unpaywall": {
+                    "source_type": "unpaywall",
+                    "enabled": True,
+                    "email": "tester@example.org",
+                }
+            }
+        }
+    )
+    source = registry.get("unpaywall")
+    assert isinstance(source, UnpaywallSource)
+
+    catalog = {entry.key: entry for entry in list_source_catalog()}
+    assert catalog["unpaywall"].current_status == "integrated"
+    assert catalog["unpaywall"].priority
== "now"
+    assert "unpaywall" in prioritized_source_keys()
+
+
+def test_run_enrich_oa_updates_entry() -> None:
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+@article{seed2024,
+    author = {Seed, Alice},
+    title = {Seed Paper},
+    year = {2024},
+    doi = {10.1000/example}
+}
+"""
+        )
+
+        original_lookup = UnpaywallSource.lookup_by_doi
+
+        def fake_lookup(self: UnpaywallSource, doi: str):
+            return self.normalize(
+                {
+                    "doi": doi,
+                    "title": "Seed Paper",
+                    "year": 2024,
+                    "is_oa": True,
+                    "oa_status": "green",
+                    "best_oa_location": {
+                        "url": "https://repository.example.org/seed",
+                        "url_for_pdf": "https://repository.example.org/seed.pdf",
+                        "license": "cc-by",
+                        "host_type": "repository",
+                        "version": "acceptedVersion",
+                        "evidence": "oa repository",
+                    },
+                }
+            )
+
+        UnpaywallSource.lookup_by_doi = fake_lookup  # type: ignore[method-assign]
+        try:
+            assert _run_enrich_oa(store, ["seed2024"], "tester@example.org") == 0
+        finally:
+            UnpaywallSource.lookup_by_doi = original_lookup  # type: ignore[method-assign]
+
+        entry = store.get_entry("seed2024")
+        assert entry is not None
+        assert entry["best_oa_url"] == "https://repository.example.org/seed"
+        assert entry["best_oa_pdf_url"] == "https://repository.example.org/seed.pdf"
+        assert entry["oa_status"] == "green"
+        assert entry["oa_host_type"] == "repository"
+        provenance = store.get_field_provenance("seed2024")
+        assert any(item["source_type"] == "oa_enrich" for item in provenance)
+    finally:
+        store.close()
+
+
+def test_run_enrich_oa_requires_email() -> None:
+    store = BibliographyStore()
+    try:
+        assert _run_enrich_oa(store, ["missing"], None) == 1
+    finally:
+        store.close()