Add source tracing and broader open source coverage

This commit is contained in:
welsberr 2026-04-25 22:27:53 -04:00
parent 39fe5ea86c
commit 0497e18f04
37 changed files with 4975 additions and 86 deletions

View File

@ -0,0 +1,185 @@
-- Migration: Multi-source bibliographic schema
-- Description: Add multi-source support with works, identifiers, source records, citations, and embeddings
-- ============================================================================
-- WORKS TABLE - Canonical metadata for works
-- ============================================================================
-- Canonical per-work metadata, merged across bibliographic sources.
CREATE TABLE IF NOT EXISTS works (
id INTEGER PRIMARY KEY AUTOINCREMENT,          -- surrogate integer key
work_id TEXT NOT NULL UNIQUE,                  -- canonical text identifier for the work
title TEXT,
abstract TEXT,
publication_year INTEGER,
publication_date TEXT,                         -- full date as text (format set by callers; presumably ISO-8601 — confirm)
journal_name TEXT,
publisher TEXT,
volume TEXT,
issue TEXT,
pages TEXT,
-- External identifiers (also tracked per-scheme in work_identifiers)
doi TEXT,
pmid TEXT,
pmcid TEXT,
arxiv_id TEXT,
dblp_key TEXT,
openalex_id TEXT,
isbn TEXT,
issn TEXT,
entry_type TEXT NOT NULL DEFAULT 'article',    -- entry/work type; defaults to 'article'
-- Citation counters (semantics depend on the reporting source — confirm per source)
citation_count INTEGER DEFAULT 0,
cited_by_count INTEGER DEFAULT 0,
influential_citations INTEGER DEFAULT 0,
-- Open-access status and best available OA link
is_open_access BOOLEAN DEFAULT 0,
best_oa_url TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- ============================================================================
-- WORK IDENTIFIERS TABLE - Mapping scheme + value to works
-- ============================================================================
-- Maps identifier scheme + value pairs (doi, pmid, arxiv, ...) to works.
CREATE TABLE IF NOT EXISTS work_identifiers (
id INTEGER PRIMARY KEY AUTOINCREMENT,
work_id TEXT NOT NULL,                         -- canonical work key (works.work_id)
scheme TEXT NOT NULL,                          -- identifier scheme, e.g. 'doi', 'pmid'
value TEXT NOT NULL,                           -- identifier value as supplied by the source
is_primary BOOLEAN DEFAULT 0,                  -- 1 when this is the preferred identifier for the work
normalized_value TEXT,                         -- canonical form used for matching
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(work_id, scheme, value),
-- FIX: work_id is a TEXT key, so the FK must target the UNIQUE text column
-- works(work_id), not the INTEGER surrogate works(id).
FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
-- ============================================================================
-- SOURCE RECORDS TABLE - Raw API responses with provenance
-- ============================================================================
-- Raw per-source API responses, kept verbatim for provenance/re-merge.
CREATE TABLE IF NOT EXISTS source_records (
id INTEGER PRIMARY KEY AUTOINCREMENT,
work_id TEXT NOT NULL,                         -- canonical work key (works.work_id)
source_type TEXT NOT NULL,                     -- source family, e.g. 'crossref'
source_label TEXT NOT NULL,                    -- human-readable source label
raw_data_json TEXT NOT NULL,                   -- full raw response as JSON text
raw_record_id TEXT,                            -- source-native record id, when available
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(work_id, source_type, source_label),
-- FIX: work_id is a TEXT key, so the FK must target the UNIQUE text column
-- works(work_id), not the INTEGER surrogate works(id).
FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
-- ============================================================================
-- CITATIONS TABLE - Citation graph with provenance
-- ============================================================================
-- Citation-graph edges between works, with per-edge source provenance.
CREATE TABLE IF NOT EXISTS citations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_work_id TEXT NOT NULL,                  -- citing work (works.work_id)
target_work_id TEXT NOT NULL,                  -- cited work (works.work_id)
relation_type TEXT NOT NULL,                   -- edge kind, e.g. 'cites'
source_type TEXT NOT NULL,                     -- source that reported the edge
source_label TEXT NOT NULL,
confidence REAL DEFAULT 1.0,                   -- edge confidence in [0, 1]; defaults to certain
is_verified BOOLEAN DEFAULT 0,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(source_work_id, target_work_id, relation_type),
-- FIX: both endpoint columns are TEXT keys, so the FKs must target the
-- UNIQUE text column works(work_id), not the INTEGER surrogate works(id).
FOREIGN KEY (source_work_id) REFERENCES works(work_id) ON DELETE CASCADE,
FOREIGN KEY (target_work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
-- ============================================================================
-- WORK EMBEDDINGS TABLE - Vector storage for semantic search
-- ============================================================================
-- Per-work embedding vectors for semantic search, one row per (work, model).
CREATE TABLE IF NOT EXISTS work_embeddings (
id INTEGER PRIMARY KEY AUTOINCREMENT,
work_id TEXT NOT NULL,                         -- canonical work key (works.work_id)
embedding TEXT NOT NULL,                       -- serialized vector (format set by callers; presumably JSON — confirm)
model_name TEXT NOT NULL,                      -- embedding model identifier
model_version TEXT,
dimension INTEGER NOT NULL,                    -- vector dimensionality
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(work_id, model_name),
-- FIX: work_id is a TEXT key, so the FK must target the UNIQUE text column
-- works(work_id), not the INTEGER surrogate works(id).
FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
-- ============================================================================
-- INDEXES - For performance optimization
-- ============================================================================
-- Work identifiers indexes
-- Work identifiers indexes: support lookup by scheme, raw value, normalized
-- value, and reverse lookup of all identifiers for one work.
CREATE INDEX IF NOT EXISTS idx_work_identifiers_scheme ON work_identifiers(scheme);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_value ON work_identifiers(value);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_work_id ON work_identifiers(work_id);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_normalized ON work_identifiers(normalized_value);
-- Source records indexes: per-work provenance listing and per-source scans.
CREATE INDEX IF NOT EXISTS idx_source_records_work_id ON source_records(work_id);
CREATE INDEX IF NOT EXISTS idx_source_records_source_type ON source_records(source_type);
-- Citations indexes: forward/backward graph traversal plus filtering by
-- relation kind and reporting source.
CREATE INDEX IF NOT EXISTS idx_citations_source_work ON citations(source_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_target_work ON citations(target_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_relation_type ON citations(relation_type);
CREATE INDEX IF NOT EXISTS idx_citations_source_type ON citations(source_type);
-- Works indexes: direct lookup by each external identifier column, OA
-- filtering, and recency scans.
CREATE INDEX IF NOT EXISTS idx_works_doi ON works(doi);
CREATE INDEX IF NOT EXISTS idx_works_pmid ON works(pmid);
CREATE INDEX IF NOT EXISTS idx_works_pmcid ON works(pmcid);
CREATE INDEX IF NOT EXISTS idx_works_arxiv ON works(arxiv_id);
CREATE INDEX IF NOT EXISTS idx_works_openalex ON works(openalex_id);
CREATE INDEX IF NOT EXISTS idx_works_is_open_access ON works(is_open_access);
CREATE INDEX IF NOT EXISTS idx_works_created_at ON works(created_at);
-- Embeddings indexes: per-work and per-model retrieval.
CREATE INDEX IF NOT EXISTS idx_embeddings_work_id ON work_embeddings(work_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_model_name ON work_embeddings(model_name);
-- ============================================================================
-- PostgreSQL-specific extensions and vector indexing
-- ============================================================================
-- Note: The following are PostgreSQL-specific and should be run when using pgvector
-- Uncomment these when using PostgreSQL with pgvector extension:
-- CREATE EXTENSION IF NOT EXISTS vector;
-- CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON work_embeddings
-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
-- ============================================================================
-- TRIGGERS - For automatic timestamp updates
-- ============================================================================
-- Works table update trigger
-- Keep works.updated_at current on every UPDATE that did not set it itself.
-- FIX: the previous condition WHEN (new.updated_at IS NULL) could never fire,
-- because updated_at is declared NOT NULL (an UPDATE setting it to NULL fails
-- the constraint before the trigger runs). Fire instead when the updating
-- statement left updated_at unchanged.
CREATE TRIGGER IF NOT EXISTS works_updated_at
AFTER UPDATE ON works
FOR EACH ROW
WHEN (new.updated_at = old.updated_at)
BEGIN
UPDATE works SET updated_at = CURRENT_TIMESTAMP WHERE id = old.id;
END;
-- Work identifiers update trigger
-- Preserve work_identifiers.created_at (the first-insert timestamp) across
-- UPDATEs; the table has no updated_at column to maintain.
-- FIX: the previous condition WHEN (new.created_at IS NULL) could never fire,
-- because created_at is declared NOT NULL — and resetting created_at to
-- CURRENT_TIMESTAMP on update would destroy provenance. Instead, restore the
-- original value whenever an UPDATE attempts to overwrite it.
CREATE TRIGGER IF NOT EXISTS work_identifiers_updated_at
AFTER UPDATE ON work_identifiers
FOR EACH ROW
WHEN (new.created_at <> old.created_at)
BEGIN
UPDATE work_identifiers SET created_at = old.created_at WHERE id = old.id;
END;
-- ============================================================================
-- VIEWS - For simplified queries
-- ============================================================================
-- View to join works with their identifiers
-- One row per work, with all its identifiers concatenated as a
-- comma-separated "scheme:value" list.
CREATE VIEW IF NOT EXISTS works_with_identifiers AS
SELECT
w.id,
w.work_id,
w.title,
w.abstract,
w.publication_year,
w.journal_name,
w.publisher,
w.doi,
w.pmid,
w.pmcid,
w.arxiv_id,
w.dblp_key,
w.openalex_id,
-- FIX: SQLite rejects GROUP_CONCAT(DISTINCT x, sep) with "DISTINCT
-- aggregates must have exactly one argument"; use the single-argument form
-- (default ',' separator).
GROUP_CONCAT(DISTINCT wi.scheme || ':' || wi.value) AS identifiers
FROM works w
-- FIX: work_identifiers.work_id stores the TEXT work key, so join on
-- w.work_id rather than the INTEGER surrogate w.id.
LEFT JOIN work_identifiers wi ON w.work_id = wi.work_id
GROUP BY w.id, w.work_id, w.title, w.abstract, w.publication_year, w.journal_name, w.publisher, w.doi, w.pmid, w.pmcid, w.arxiv_id, w.dblp_key, w.openalex_id;

103
docs/README.md Normal file
View File

@ -0,0 +1,103 @@
# CiteGeist Source Planning Documentation
Welcome to the source-planning documentation for CiteGeist.
## Quick Overview
The immediate planning question is which additional open bibliographic sources should be incorporated next.
This documentation therefore emphasizes:
- the current source baseline already present in the repository
- the next highest-value open sources to add
- a smaller, more realistic source-layer abstraction
- explicit deferral of unrelated database/vector ambitions
## Documentation Files
### Planning and Status
- **[source-landscape.md](./source-landscape.md)** - recommended next open bibliographic sources
- **[implementation-progress.md](./implementation-progress.md)** - sources-first progress tracker
- **[phase-completion.md](./phase-completion.md)** - short status summary
- **[file-structure.md](./file-structure.md)** - file structure and module notes
### Existing Architecture References
- **[architecture-current.md](./architecture-current.md)** - current architecture overview
- **[schema-current.sql](./schema-current.sql)** - existing database schema
## Current Status
### Current Baseline
1. Crossref, OpenAlex, PubMed, Europe PMC, Semantic Scholar, DataCite, DBLP, arXiv, and OAI-PMH are already in play.
2. OpenCitations and Unpaywall are now integrated as source-layer additions.
3. The SQLite-based local workflow remains the baseline.
### Recommended Next Sources
1. OpenAIRE only if repository-acquisition scope expands
### Explicitly Deferred
1. Database redesign
2. pgvector / embedding-first work
## Source Layer
The source-layer code now provides:
- `BibliographicSource` as the common interface
- `SourceRegistry` for known concrete source classes
- `CrossRefSource` as the repaired first concrete plugin
- `OpenCitationsSource` plus DOI-based graph expansion
- `UnpaywallSource` plus DOI-based OA-link enrichment
- `EuropePmcSource` plus biomedical resolver/search support
- `SemanticScholarSource` plus broader biological/physical sciences resolver/search support
- a source catalog with current status and priority order
- compatibility with the existing `SourceClient`-based resolver and expander code
## Quick Start
```python
from citegeist.sources import (
CrossRefSource,
EuropePmcSource,
OpenCitationsSource,
SemanticScholarSource,
SourceRegistry,
UnpaywallSource,
list_source_catalog,
prioritized_source_keys,
)
registry = SourceRegistry()
registry.register(CrossRefSource, name="crossref", config={})
registry.register(EuropePmcSource, name="europepmc", config={})
registry.register(OpenCitationsSource, name="opencitations", config={})
registry.register(SemanticScholarSource, name="semanticscholar", config={})
registry.register(UnpaywallSource, name="unpaywall", config={"email": "you@example.org"})
source = registry.get("crossref")
catalog = list_source_catalog()
priority = prioritized_source_keys()
```
## Tests
Relevant tests for the refocused source work:
- `tests/test_sources_plugin.py`
- `tests/test_sources_catalog.py`
The existing broader repository test suite should continue to pass as the source-layer changes are integrated.
## Next Steps
1. Decide whether `OpenAIRE` is worth adding for repository-acquisition breadth.
2. Keep database/vector redesign work deferred unless a source need forces it.
## License
Same as the CiteGeist project.
---
**Last Updated:** 2026-04-25
**Status:** Sources-first plan in effect

View File

@ -0,0 +1,87 @@
# CiteGeist Current Architecture
## Overview
CiteGeist is currently designed as a local BibTeX-native tooling system with:
- BibTeX parsing and storage
- Local text search (FTS5)
- Entry provenance tracking
- Citation graph traversal
- Topic-based expansion
## Core Modules
### Source Management
- **sources.py**: `SourceClient` class for HTTP requests with caching and retry logic
- Base HTTP client with JSON/XML/text support
- Built-in retry with exponential backoff
- Cache directory support
### Metadata Resolution
- **resolve.py**: `MetadataResolver` class for entry resolution
- DOI → CrossRef lookup
- PMID → PubMed lookup
- arXiv, DBLP, OpenAlex lookup
- Title search fallback with best-match selection
- DataCite integration
- Returns `Resolution` objects with provenance
### Storage
- **storage.py**: `BibliographyStore` class (SQLite)
- Tables: entries, creators, entry_creators, identifiers, relations, topics, entry_topics, field_provenance, relation_provenance
- FTS5 text search integration
- Field-level provenance tracking
- Citation graph support (cites, cited_by edges)
### BibTeX Processing
- **bibtex.py**: BibEntry dataclass and parsing/rendering
- BibTeX → BibEntry conversion
- BibEntry → BibTeX rendering
- Citation key generation
### CLI and Server
- **cli.py**: Command-line interface
- **app_server.py**: Local HTTP server for UI/JSON API
- **app_api.py**: JSON API adapter surface
### Expansion and Discovery
- **expand.py**: Citation graph expansion workflows
- **extract.py**: Plaintext reference extraction
- **bootstrap.py**: Topic bootstrap and expansion
## Current State Summary
**Completed/Usable:**
- BibTeX parsing and storage
- Identifier-based resolution (DOI, PMID, arXiv, DBLP, OpenAlex)
- Title search with best-match selection
- Citation graph traversal and expansion
- Field provenance tracking
- Local search with FTS5
- Topic-based discovery workflows
**Not Yet Implemented (from new roadmap):**
- Plugin-based source architecture
- Multi-source record merging
- PGVector embeddings
- Full-text OA link retrieval
- Semantic Scholar integration
- OpenCitations integration
- Unified API endpoints for multi-source queries
## Data Flow
1. **Ingest**: BibTeX file → parse → store in entries table
2. **Resolve**: Entry → resolve_doi/resolve_pmid/resolve_arxiv → fetch metadata → merge with existing
3. **Expand**: Start from entry → traverse citation edges → discover new entries
4. **Search**: Query FTS5 index → retrieve relevant entries
5. **Export**: Entries → render BibTeX → output file
## Database Schema
SQLite-based storage with:
- Normalized entry fields
- Creator relationships
- Identifier mapping
- Citation relations
- Topic associations
- Field provenance metadata

165
docs/file-structure.md Normal file
View File

@ -0,0 +1,165 @@
# CiteGeist Multi-Source File Structure
**Date:** 2026-04-25
## Project Structure
```
/home/netuser/dev/CiteGeist/
├── db/
│ └── migrations/
│ └── 0001_multisource.sql ✅ NEW - Multi-source schema
├── docs/
│ ├── architecture-current.md ✅ NEW - Current architecture docs
│ ├── implementation-progress.md ✅ NEW - Implementation progress tracker
│ ├── schema-current.sql ✅ NEW - Current schema SQL
│ └── file-structure.md ✅ NEW - This file
├── src/citegeist/
│ ├── sources/ ✅ NEW - Source plugin architecture
│ │ ├── __init__.py ✅ NEW - Package exports
│ │ ├── __all__.py ✅ NEW - Public API
│ │ ├── base.py ✅ NEW - Base BibliographicSource class
│ │ ├── registry.py ✅ NEW - SourceRegistry implementation
│ │ ├── crossref.py ✅ NEW - CrossRef source plugin
│ │ └── _old_sources_compat.py ✅ NEW - Backward compatibility
│ │
│ ├── resolver/ ✅ NEW - Identifier resolution
│ │ ├── __init__.py ✅ NEW - Module exports
│ │ └── identifiers.py ✅ NEW - Extract, normalize, resolve
│ │
│ ├── db/ ✅ NEW - Database operations
│ │ └── __init__.py 🚧 TO DO - Database client
│ │
│ ├── ... (existing files)
│ ├── sources.py 📦 Existing - Old SourceClient
│ ├── resolve.py 📦 Existing - MetadataResolver
│ └── storage.py 📦 Existing - BibliographyStore
└── tests/
├── test_sources_plugin.py ✅ NEW - Source plugin tests
└── test_resolver_identifiers.py ✅ NEW - Identifier tests
```
## Module Documentation
### New Modules
#### `src/citegeist/sources/`
Plugin architecture for bibliographic sources.
**Classes:**
- `BibliographicSource` - Abstract base class for source plugins
- `SourceRecord` - Raw source record dataclass
- `CitationEdge` - Citation relationship dataclass
- `SourceRegistry` - Manages source plugins
**Plugin:**
- `CrossRefSource` - CrossRef API implementation
#### `src/citegeist/resolver/`
Identifier extraction, normalization, and resolution.
**Classes:**
- `IdentifierExtractor` - Extract identifiers from entry fields
- `IdentifierNormalizer` - Normalize identifiers to canonical form
- `IdentifierResolver` - Resolve identifiers with lookup priority
**Functions:**
- `extract_identifiers()` - Quick identifier extraction
- `normalize_identifier()` - Quick normalization
- `get_primary_identifier()` - Get primary identifier
- `resolve_identifiers()` - Resolve all identifiers
#### `src/citegeist/db/`
Database operations (to be implemented).
**Planned:**
- Database client for works table
- Migration runner
- Query builders
#### `db/migrations/0001_multisource.sql`
Multi-source database schema migration.
**Tables:**
1. `works` - Canonical work metadata
2. `work_identifiers` - Multi-scheme identifiers
3. `source_records` - Raw API responses
4. `citations` - Citation graph
5. `work_embeddings` - Vector embeddings
### Existing Modules (Preserved)
- `src/citegeist/sources.py` - Old SourceClient (backward compatible)
- `src/citegeist/resolve.py` - Old MetadataResolver
- `src/citegeist/storage.py` - Old BibliographyStore
## Test Coverage
**New Tests:**
- `tests/test_sources_plugin.py` (7 tests)
- `tests/test_resolver_identifiers.py` (17 tests)
**Total:** 24 tests passing
## Dependencies
**New Dependencies Required:**
- No new Python packages (uses stdlib only)
**Planned Dependencies (Future phases):**
- `pgvector` - PostgreSQL vector extension
- `sentence-transformers` - Local embedding model
- `fastapi` - API framework
- `unpaywall` - OA link retrieval (if needed)
## Implementation Status
### Completed (100%)
- ✅ Phase 0: Baseline Audit
- ✅ Phase 1: Source Plugin Architecture
- ✅ Phase 2: Identifier Resolution Layer
### In Progress (50%)
- 🚧 Phase 3: Database Schema Upgrade
### Pending (0%)
- ⏳ Phase 4: High-Value Source Integrations
- ⏳ Phase 5: Merge & Deduplication Engine
- ⏳ Phase 6: Citation Graph Construction
- ⏳ Phase 7: Embedding Pipeline
- ⏳ Phase 8: Full-Text Retrieval Layer
- ⏳ Phase 9: API Layer
- ⏳ Phase 10: Ranking & Relevance
- ⏳ Phase 12: Observability & QA
- ⏳ Phase 13: Performance Optimization
## Quick Start
```python
# Register a source
from citegeist.sources import SourceRegistry, CrossRefSource
registry = SourceRegistry()
registry.register(CrossRefSource, name='crossref', config={})
# Get source instance
source = registry.get('crossref')
entry = source.lookup_by_doi('10.1234/example')
# Resolve identifiers
from citegeist.resolver import resolve_identifiers
fields = {'doi': '10.1234/example', 'title': 'Test'}
resolved = resolve_identifiers(fields)
# Returns [('doi', '10.1234/example'), ('title', 'test title')]
```
## Next Steps
1. ✅ Phase 0-2: Complete
2. 🚧 Phase 3: Implement Python interface for database operations
3. ⏳ Phase 4: Add Unpaywall, Semantic Scholar, OpenCitations integrations
4. ⏳ Phase 5: Build merge engine

View File

@ -0,0 +1,122 @@
# CiteGeist Sources-First Progress
**Last Updated:** 2026-04-25
This document tracks the refocused plan for source incorporation. The working question is which additional open bibliographic sources CiteGeist should integrate next, not whether it needs a new storage platform first.
---
## Phase 0: Scope Reframe ✅ COMPLETE
**Status:** Completed
**Deliverables:**
- ✅ `/docs/source-landscape.md` - source inventory and recommendation document
- ✅ `/src/citegeist/sources/catalog.py` - code-backed source catalog
**Completed:**
- Identified which source integrations already exist in the repository
- Split source-expansion planning from database/vector-search ambitions
- Prioritized open-source additions by workflow value
---
## Phase 1: Source Layer Tightening ✅ COMPLETE
**Status:** Completed
**Deliverables:**
- ✅ `/src/citegeist/sources/base.py` - Base `BibliographicSource` interface
- ✅ `/src/citegeist/sources/registry.py` - Registry for known concrete sources
- ✅ `/src/citegeist/sources/crossref.py` - Repaired CrossRef source implementation
- ✅ `/src/citegeist/sources/catalog.py` - Open-source inventory
- ✅ `/src/citegeist/sources/__init__.py` - Package initialization
- ✅ `/tests/test_sources_plugin.py` - Source plugin tests
- ✅ `/tests/test_sources_catalog.py` - Source catalog and registry tests
**Completed:**
- ✅ Created `BibliographicSource` abstract base class
- ✅ Repaired `SourceRegistry` so config-backed loading resolves real source classes
- ✅ Fixed `CrossRefSource` normalization for direct lookup and search-style payloads
- ✅ Replaced path-specific compatibility loading with repo-relative loading
- ✅ Added a source catalog that captures current status and next-priority sources
**Features:**
- Abstract interface for source plugins
- Registry for known source discovery and instantiation
- Config-driven enable/disable for known source types
- Source prioritization metadata
- Compatibility with the existing `SourceClient`-based resolver/expander code
---
## Current Integrated Sources ✅ AVAILABLE
- `Crossref`
- `OpenAlex`
- `OpenCitations`
- `Unpaywall`
- `PubMed`
- `Europe PMC`
- `Semantic Scholar`
- `DataCite`
- `DBLP`
- `arXiv`
- `OAI-PMH`
These are already sufficient for a credible local enrichment-and-discovery workflow. The next work should complement them rather than restart infrastructure underneath them.
---
## Phase 2: Next Source Additions 🚧 IN PROGRESS
**Status:** In Progress
**Priority Order:**
1. `OpenAIRE` only if repository-acquisition scope expands
**Completed Deliverables:**
- ✅ OpenCitations adapter for DOI citation/reference lookup
- ✅ OpenCitations graph expansion support in CLI and topic expansion flows
- ✅ Unpaywall adapter for DOI OA-link enrichment
- ✅ `enrich-oa` CLI flow for applying OA metadata to stored entries
- ✅ Europe PMC biomedical resolver/search integration
- ✅ Semantic Scholar broad-science resolver/search integration
**Planned Deliverables:**
- ⏳ Decide whether repository-acquisition breadth needs another dedicated source
**Rationale:**
- `OpenCitations` now improves open citation-edge coverage
- `Unpaywall` now improves access-link enrichment
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage
- `Semantic Scholar` now improves broader biological and physical sciences coverage
- neither requires a new database architecture to become useful
---
## Phase 3: Optional Source Evaluation ⏳ PLANNED
**Status:** Planned
- `OpenAIRE`
**Decision Rule:**
- add them only if they solve a concrete discovery or acquisition gap that current open sources do not already cover well
---
## Explicitly Deferred
- second-schema redesign work
- pgvector integration
- embedding-first retrieval
- broad canonical-work reconstruction
---
## Summary
**Completed:** scope reframe and source-layer cleanup
**Planned next:** `OpenAIRE` reevaluation
**Deferred:** database/vector expansion work not required by the source question

111
docs/phase-completion.md Normal file
View File

@ -0,0 +1,111 @@
# Sources-First Status
**Current Focus:** identify and prioritize the next open bibliographic sources to add, using the existing SQLite-based workflow as the baseline.
---
## Phase Matrix
| Phase | Title | Status | Outcome |
|-------|-------|--------|---------|
| **0** | Scope Reframe | ✅ Complete | Planning now answers the source question directly |
| **1** | Source Layer Tightening | ✅ Complete | Registry, CrossRef plugin, compatibility seam, and source catalog repaired |
| **2** | Next Open Source Additions | 🚧 In Progress | OpenCitations, Unpaywall, Europe PMC, and Semantic Scholar integrated |
| **3** | Optional Source Evaluation | ⏳ Planned | OpenAIRE evaluated later if acquisition breadth matters |
| **D** | Database / Vector Expansion | ⏸ Deferred | Not required for the current source-incorporation decision |
---
## Test Coverage Summary
```
✅ test_sources_plugin.py
✅ test_sources_catalog.py
✅ existing full suite still expected to pass
```
---
## Key Artifacts
### Documentation
```
docs/
├── source-landscape.md ✅ Source inventory and recommendations
├── implementation-progress.md ✅ Sources-first progress tracker
└── phase-completion.md ✅ Short status summary
```
### Source Layer
```
src/citegeist/sources/
├── base.py ✅ Base source interface
├── catalog.py ✅ Source inventory in code
├── registry.py ✅ Registry for known source classes
├── crossref.py ✅ Repaired CrossRef plugin
└── _old_sources_compat.py ✅ Repo-relative compatibility bridge
```
### Tests
```
tests/
├── test_sources_plugin.py ✅ Source plugin tests
└── test_sources_catalog.py ✅ Source catalog/registry tests
```
---
## Key Features Implemented
- ✅ Source catalog covering current and candidate open sources
- ✅ Config-driven registry loading for known real source classes
- ✅ CrossRef normalization that works for both single-record and search-result payloads
- ✅ Compatibility bridge that no longer depends on one checkout path
- ✅ OpenCitations DOI-based graph expansion with CLI support
- ✅ Unpaywall OA-link enrichment with CLI support
- ✅ Europe PMC biomedical resolver/search support
- ✅ Semantic Scholar broad-science resolver/search support
---
## Next Milestones
### Immediate
1. Decide whether repository-acquisition scope justifies `OpenAIRE`
2. Keep the OA-enrichment flow aligned with review/export needs
3. Keep graph-source scope disciplined as broader coverage grows
### Later
1. Monitor `Semantic Scholar` rate-limit and policy constraints now that it is integrated
2. Evaluate `OpenAIRE`
3. Revisit database/vector work only if a concrete source need demands it
---
## Success Metrics
### Completed
- ✅ Planning now matches the actual source question
- ✅ Source-layer defects from the first pass have been corrected
- ✅ OpenCitations is now a working integrated source
- ✅ Unpaywall is now a working integrated source
- ✅ Europe PMC is now a working integrated source
- ✅ Semantic Scholar is now a working integrated source
- ✅ The next source priorities are explicit
### Planned
- ⏳ Better source selection discipline before adding more integrations
---
## Recommendations
1. Treat the current SQLite/FTS workflow as the baseline, not as a blocker.
2. Add source integrations only when they materially improve bibliographic coverage, citation coverage, or open-access linkage.
3. Keep database/vector work explicitly subordinate to source-incorporation needs.
---
**Last Updated:** 2026-04-25
**Status:** Sources-first plan in effect
**Confidence:** High

131
docs/schema-current.sql Normal file
View File

@ -0,0 +1,131 @@
-- CiteGeist Current Schema (SQLite)
-- Entries table
-- One row per bibliographic (BibTeX) entry.
CREATE TABLE IF NOT EXISTS entries (
id INTEGER PRIMARY KEY,
citation_key TEXT NOT NULL UNIQUE,             -- BibTeX citation key
entry_type TEXT NOT NULL,                      -- BibTeX entry type (article, book, ...)
review_status TEXT NOT NULL DEFAULT 'draft',   -- review workflow state; starts as 'draft'
title TEXT,
year TEXT,                                     -- kept as TEXT (BibTeX years are free-form)
journal TEXT,
booktitle TEXT,
publisher TEXT,
abstract TEXT,
keywords TEXT,
url TEXT,
doi TEXT,
isbn TEXT,
fulltext TEXT,
raw_bibtex TEXT,                               -- original BibTeX source for the entry
extra_fields_json TEXT NOT NULL DEFAULT '{}',  -- JSON object for fields not modeled as columns
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Creators table
-- Deduplicated creators (authors/editors), keyed by full name.
CREATE TABLE IF NOT EXISTS creators (
id INTEGER PRIMARY KEY,
full_name TEXT NOT NULL UNIQUE,    -- UNIQUE: one row per distinct name string
family_name TEXT,
given_names TEXT
);
-- Entry-Creators relationship
-- Ordered entry-to-creator links, one row per (entry, role, position).
CREATE TABLE IF NOT EXISTS entry_creators (
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE,
role TEXT NOT NULL,                -- creator role (presumably 'author'/'editor' — confirm against callers)
ordinal INTEGER NOT NULL,          -- position within the role's ordered list
-- PK is by position, not by creator, so the same creator may appear at
-- multiple ordinals for one entry.
PRIMARY KEY (entry_id, role, ordinal)
);
-- Identifiers table
-- External identifiers attached to entries.
CREATE TABLE IF NOT EXISTS identifiers (
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
scheme TEXT NOT NULL,              -- identifier scheme, e.g. 'doi', 'pmid'
value TEXT NOT NULL,
-- PK is (scheme, value): a given identifier maps to exactly one entry.
PRIMARY KEY (scheme, value)
);
-- Relations table (citation graph)
-- Citation-graph edges. Targets are stored by citation key (plain TEXT, not
-- a foreign key), so an edge may point at an entry not yet ingested.
CREATE TABLE IF NOT EXISTS relations (
source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
target_citation_key TEXT NOT NULL,
relation_type TEXT NOT NULL,       -- edge kind (e.g. 'cites'/'cited_by' per architecture docs)
PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
);
-- Topics table
-- Discovery topics used for bootstrap and expansion workflows.
CREATE TABLE IF NOT EXISTS topics (
id INTEGER PRIMARY KEY,
slug TEXT NOT NULL UNIQUE,         -- URL/key-safe unique topic handle
name TEXT NOT NULL,
source_type TEXT NOT NULL,         -- where the topic came from
source_url TEXT,
expansion_phrase TEXT,             -- phrase actually used for expansion queries
suggested_phrase TEXT,             -- machine-suggested phrase awaiting review
phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed',
phrase_review_notes TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Entry-Topics relationship
-- Entry-to-topic assignments with the labeling source and its confidence.
CREATE TABLE IF NOT EXISTS entry_topics (
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE,
source_label TEXT NOT NULL,        -- which process/source assigned the topic
confidence REAL,                   -- optional assignment confidence
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (entry_id, topic_id)
);
-- Field Provenance table
-- Append-only audit log of where each entry field value came from.
CREATE TABLE IF NOT EXISTS field_provenance (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
field_name TEXT NOT NULL,
field_value TEXT,                  -- value recorded at the time of the operation
source_type TEXT NOT NULL,         -- source family that supplied the value
source_label TEXT NOT NULL,
operation TEXT NOT NULL,           -- what was done (set/merge/etc. — confirm against callers)
confidence REAL,
recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Relation Provenance table
-- Provenance for citation edges: which source asserted each relation.
-- Keyed by the same (source entry, target key, type) triple as relations.
CREATE TABLE IF NOT EXISTS relation_provenance (
id INTEGER PRIMARY KEY,
source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
target_citation_key TEXT NOT NULL,
relation_type TEXT NOT NULL,
source_type TEXT NOT NULL,
source_label TEXT NOT NULL,
confidence REAL,
recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Full-text Search (FTS5)
-- FTS5 index over entries. This is an *external content* table
-- (content='entries'): the indexed text lives in entries, and the index must
-- be kept in sync by triggers.
CREATE VIRTUAL TABLE IF NOT EXISTS entries_fts USING fts5(
title,
abstract,
keywords,
content='entries',
content_rowid='id'
);
-- Trigger to sync entries with FTS
-- Keep the external-content FTS5 index in sync with entries.
CREATE TRIGGER IF NOT EXISTS entries_ai AFTER INSERT ON entries BEGIN
INSERT INTO entries_fts(rowid, title, abstract, keywords)
VALUES (new.id, new.title, new.abstract, new.keywords);
END;
-- FIX: entries_fts uses content='entries', so plain DELETE/UPDATE statements
-- against it corrupt the index — FTS5 needs the OLD column values to remove
-- index terms, and cannot recover them once the content row has changed.
-- External-content tables must be updated via the special 'delete' command
-- carrying the old values (SQLite FTS5, "External Content Tables").
CREATE TRIGGER IF NOT EXISTS entries_ad AFTER DELETE ON entries BEGIN
INSERT INTO entries_fts(entries_fts, rowid, title, abstract, keywords)
VALUES ('delete', old.id, old.title, old.abstract, old.keywords);
END;
CREATE TRIGGER IF NOT EXISTS entries_au AFTER UPDATE ON entries BEGIN
INSERT INTO entries_fts(entries_fts, rowid, title, abstract, keywords)
VALUES ('delete', old.id, old.title, old.abstract, old.keywords);
INSERT INTO entries_fts(rowid, title, abstract, keywords)
VALUES (new.id, new.title, new.abstract, new.keywords);
END;

131
docs/source-landscape.md Normal file
View File

@ -0,0 +1,131 @@
# Open Bibliographic Source Landscape
This document answers the question that prompted the recent planning work: which additional open bibliographic sources are worth incorporating into CiteGeist, given the sources it already uses?
## Current Baseline
CiteGeist already has useful source coverage for a local BibTeX-first workflow:
- `Crossref`: DOI lookup, title search, and reference-list expansion.
- `OpenAlex`: work lookup, title/discovery search, and citation-graph expansion.
- `PubMed`: authoritative biomedical metadata lookup by PMID and title search fallback.
- `Europe PMC`: biomedical metadata/fulltext complement to PubMed.
- `Semantic Scholar`: broad cross-domain metadata with strong biological and physical sciences coverage.
- `DataCite`: DOI-backed dataset/report/non-article metadata.
- `DBLP`: strong computer-science metadata.
- `arXiv`: preprint metadata.
- `OAI-PMH`: repository harvesting for theses, dissertations, and institutional collections.
That means the immediate gap is no longer “get any scholarly metadata at all.” The immediate gap is to add the next highest-value open sources without destabilizing the existing ingest, review, and export pipeline.
## Recommended Priorities
### OpenCitations
Why:
- It directly improves open citation-edge coverage.
- It fits CiteGeist's graph-discovery workflow better than another generic metadata source.
- It complements OpenAlex rather than replacing it.
Expected role:
- DOI-to-citations lookup
- DOI-to-references lookup
- provenance for citation edges
Status:
- now integrated as a DOI-based citation/reference source in the source layer and graph expansion flow
Main risk:
- coverage is narrower than OpenAlex, so merge rules need to treat it as an additional edge source rather than a primary metadata authority.
### Unpaywall
Why:
- It solves a different problem from Crossref/OpenAlex: full-text access and OA status.
- It improves the “can I get the paper?” part of the workflow without forcing a storage redesign.
Expected role:
- DOI-to-best-open-access-link lookup
- OA status enrichment
Status:
- now integrated as an OA-link enrichment source with a dedicated `enrich-oa` CLI flow
Main risk:
- It should remain an access-link enrichment layer, not become entangled with identity resolution logic.
### Europe PMC
Why:
- It is valuable for biomedical and life-sciences use cases.
- It complements PubMed with richer open-access and citation-related information.
Expected role:
- domain-specific metadata enrichment
- biomedical search
- OA/full-text linkage
Status:
- now integrated as a biomedical resolver/search complement to `PubMed`
Main risk:
- this should remain a domain-specific source, not be treated as a universal resolver.
### Semantic Scholar
Pros:
- good graph and relevance signals
- useful for discovery quality
Status:
- now integrated as a broad resolver/search complement with good biological and physical sciences coverage
Main risk:
- rate limits and product-policy changes still matter more here than for the more explicitly open bibliographic sources
## Evaluate But Do Not Make Core Yet
### OpenAIRE
Pros:
- strong repository and OA/project linkage
- good for European repository acquisition
Cons:
- better suited to corpus acquisition than first-line metadata resolution
Recommendation:
- treat as an acquisition adapter, not an immediate resolver target
## What Not To Prioritize Right Now
### Database Redesign
The repository already has a working SQLite storage model and FTS-backed local workflow. A second schema track should not lead the next phase of work unless a concrete source integration is blocked on it.
### Vector Search
Optional semantic ranking may become useful later, but it was not the motivating question and does not need to be a prerequisite for source incorporation.
## Suggested Execution Order
1. Keep the source abstraction aligned with sources already in use.
2. Revisit `OpenAIRE` after the current source additions settle.

113
new-roadmap.md Normal file
View File

@ -0,0 +1,113 @@
# CiteGeist Roadmap: Sources-First Expansion
## Purpose
The primary question is not “how do we redesign CiteGeist around a new storage engine?” The primary question is “which additional open bibliographic sources should CiteGeist incorporate next?”
This roadmap treats the current SQLite-based local workflow as the baseline and focuses on source evaluation, source integration order, and reviewable source behavior.
## Baseline
Already present in the repository:
- local BibTeX ingest, review, export, and graph traversal
- metadata resolution from `Crossref`, `PubMed`, `Europe PMC`, `OpenAlex`, `Semantic Scholar`, `DBLP`, `arXiv`, and `DataCite`
- citation-graph expansion using `Crossref` and `OpenAlex`
- repository harvesting via `OAI-PMH`
That means the next planning step is source prioritization, not another platform pivot.
## Phase 0: Reframe Scope
Goal:
Put source-incorporation decisions ahead of database and vector-search ambitions.
Tasks:
- [x] identify which source integrations already exist
- [x] separate “source expansion” work from “new database/vector stack” work
- [x] document the source landscape and recommended order
Deliverables:
- `/docs/source-landscape.md`
- `/src/citegeist/sources/catalog.py`
## Phase 1: Tighten The Source Layer
Goal:
Make the new source abstraction useful for the repository that already exists, rather than speculative infrastructure.
Tasks:
- [x] keep the compatibility bridge to the existing `SourceClient`
- [x] fix the initial `CrossRefSource` implementation so normalization works
- [x] make config-driven registry loading work for known concrete sources
- [x] add a code-backed source catalog for planning and prioritization
Deliverables:
- `/src/citegeist/sources/base.py`
- `/src/citegeist/sources/registry.py`
- `/src/citegeist/sources/crossref.py`
- `/src/citegeist/sources/catalog.py`
## Phase 2: Highest-Value Open Source Additions
Goal:
Incorporate the next open sources that materially improve the current workflow.
Priority order:
1. `OpenAIRE` only if repository-acquisition scope expands
Tasks:
- [x] add `OpenCitations` DOI-to-citation and DOI-to-reference lookup
- [x] merge `OpenCitations` edges into the existing graph-expansion workflow with provenance
- [x] add `Unpaywall` DOI-to-OA-link enrichment
- [x] expose OA-link enrichment in a dedicated CLI flow
- [x] add `Europe PMC` as a biomedical metadata/fulltext complement to `PubMed`
- [x] add `Semantic Scholar` as a broader scientific metadata complement across biological and physical sciences
Why these first:
- `OpenCitations` directly answers the open-citation-coverage gap
- `Unpaywall` now solves access-link enrichment without forcing a storage redesign
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage without changing the storage model
- `Semantic Scholar` now improves broader biological and physical sciences coverage without changing the storage model
## Phase 3: Evaluate Optional Sources, Do Not Commit Prematurely
Goal:
Assess sources that may be useful, but are not clearly the next source-first move.
Candidates:
- `OpenAIRE`
Tasks:
- [ ] document API limits, openness constraints, and integration risk
- [ ] decide whether each source belongs in core resolution, graph expansion, or corpus acquisition
- [ ] avoid adding sources that duplicate existing coverage without a clear payoff
## Deferred Work
These are valid future ideas, but they are not the current planning driver:
- a second database schema
- pgvector integration
- embedding-first search
- large-scale canonical-work reconstruction
The repository already has a working local storage/search path. Those ideas should only return to the front of the plan if a concrete source-integration need forces them there.
## Immediate Next Steps
1. Land the source inventory and source-layer cleanup.
2. Reassess whether `OpenAIRE` is worth adding for repository-acquisition breadth.

View File

@ -2,7 +2,7 @@ from .app_api import LiteratureExplorerApi
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
from .bibtex import BibEntry, parse_bibtex from .bibtex import BibEntry, parse_bibtex
from .bootstrap import BootstrapResult, Bootstrapper from .bootstrap import BootstrapResult, Bootstrapper
from .expand import CrossrefExpander, OpenAlexExpander from .expand import CrossrefExpander, OpenAlexExpander, OpenCitationsExpander
from .extract import ( from .extract import (
available_extraction_backends, available_extraction_backends,
check_extraction_comparison_summary, check_extraction_comparison_summary,
@ -16,6 +16,10 @@ from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
from .llm_verify import VerificationLlmClient, VerificationLlmConfig from .llm_verify import VerificationLlmClient, VerificationLlmConfig
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
from .sources import SourceClient from .sources import SourceClient
from .sources import EuropePmcSource
from .sources import OpenLibrarySource
from .sources import SemanticScholarSource
from .sources import UnpaywallSource
from .storage import BibliographyStore from .storage import BibliographyStore
from .verify import BibliographyVerifier, VerificationResult, VerificationMatch from .verify import BibliographyVerifier, VerificationResult, VerificationMatch
@ -31,10 +35,15 @@ __all__ = [
"LiteratureExplorerApi", "LiteratureExplorerApi",
"MetadataResolver", "MetadataResolver",
"OpenAlexExpander", "OpenAlexExpander",
"OpenCitationsExpander",
"OaiPmhHarvester", "OaiPmhHarvester",
"OaiMetadataFormat", "OaiMetadataFormat",
"OaiSet", "OaiSet",
"SourceClient", "SourceClient",
"EuropePmcSource",
"OpenLibrarySource",
"SemanticScholarSource",
"UnpaywallSource",
"VerificationLlmClient", "VerificationLlmClient",
"VerificationLlmConfig", "VerificationLlmConfig",
"VerificationMatch", "VerificationMatch",

View File

@ -173,6 +173,13 @@ def build_parser() -> argparse.ArgumentParser:
resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources") resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich") resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
enrich_oa_parser = subparsers.add_parser(
"enrich-oa",
help="Enrich DOI-bearing entries with Unpaywall OA link metadata",
)
enrich_oa_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
enrich_oa_parser.add_argument("--email", help="Email address required by the Unpaywall API")
resolve_stubs_parser = subparsers.add_parser( resolve_stubs_parser = subparsers.add_parser(
"resolve-stubs", "resolve-stubs",
help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates", help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates",
@ -237,7 +244,7 @@ def build_parser() -> argparse.ArgumentParser:
expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand") expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand")
expand_parser.add_argument( expand_parser.add_argument(
"--source", "--source",
choices=["crossref", "openalex"], choices=["crossref", "openalex", "opencitations"],
default="crossref", default="crossref",
help="Graph expansion source", help="Graph expansion source",
) )
@ -260,7 +267,7 @@ def build_parser() -> argparse.ArgumentParser:
) )
expand_topic_parser.add_argument( expand_topic_parser.add_argument(
"--source", "--source",
choices=["crossref", "openalex"], choices=["crossref", "openalex", "opencitations"],
default="openalex", default="openalex",
help="Topic graph expansion source", help="Topic graph expansion source",
) )
@ -749,6 +756,8 @@ def main(argv: list[str] | None = None) -> int:
) )
if args.command == "resolve": if args.command == "resolve":
return _run_resolve(store, args.citation_keys) return _run_resolve(store, args.citation_keys)
if args.command == "enrich-oa":
return _run_enrich_oa(store, args.citation_keys, args.email)
if args.command == "resolve-stubs": if args.command == "resolve-stubs":
return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview) return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview)
if args.command == "graph": if args.command == "graph":
@ -1215,6 +1224,72 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
return exit_code return exit_code
def _run_enrich_oa(store: BibliographyStore, citation_keys: list[str], email: str | None) -> int:
    """Enrich DOI-bearing stored entries with Unpaywall open-access link metadata.

    Looks up each citation key in the store, queries Unpaywall by DOI, and
    replaces the stored entry with a merged version.  Emits a JSON report of
    per-key outcomes to stdout.  Returns 1 when Unpaywall is not configured
    (no email), otherwise 0.
    """
    from .sources import UnpaywallSource

    oa_source = UnpaywallSource(config={"email": email} if email else {})
    if not oa_source.is_available():
        print("Unpaywall enrichment requires --email or UNPAYWALL_EMAIL", file=sys.stderr)
        return 1
    report: list[dict[str, object]] = []
    key_count = len(citation_keys)
    for position, key in enumerate(citation_keys, start=1):
        _print_progress("enriching OA", position, key_count, key)
        stored = store.get_entry(key)
        if stored is None:
            report.append({"citation_key": key, "status": "missing"})
            continue
        doi = str(stored.get("doi") or "").strip()
        if not doi:
            report.append({"citation_key": key, "status": "no_doi"})
            continue
        oa_entry = oa_source.lookup_by_doi(doi)
        if oa_entry is None:
            report.append({"citation_key": key, "status": "no_record", "doi": doi})
            continue
        # Start from the existing string fields, overlay the Unpaywall fields,
        # then restore core bibliographic fields so OA enrichment never
        # clobbers curated metadata.
        combined: dict[str, str] = {
            name: value for name, value in stored.items() if isinstance(value, str)
        }
        combined.update(oa_entry.fields)
        for protected in ("title", "year", "author", "journal", "booktitle", "publisher", "abstract", "keywords"):
            kept = str(stored.get(protected) or "").strip()
            if kept:
                combined[protected] = kept
        store.replace_entry(
            key,
            BibEntry(
                entry_type=str(stored.get("entry_type") or "misc"),
                citation_key=key,
                fields=combined,
            ),
            source_type="oa_enrich",
            source_label=f"unpaywall:doi:{doi}",
            review_status=str(stored.get("review_status") or "enriched"),
        )
        refreshed = store.get_entry(key) or {}
        report.append(
            {
                "citation_key": key,
                "status": "enriched",
                "doi": doi,
                "is_oa": refreshed.get("is_oa"),
                "oa_status": refreshed.get("oa_status"),
                "best_oa_url": refreshed.get("best_oa_url"),
                "best_oa_pdf_url": refreshed.get("best_oa_pdf_url"),
            }
        )
    print(json.dumps(report, indent=2))
    return 0
def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool: def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool:
existing = store.get_entry(citation_key) existing = store.get_entry(citation_key)
if existing is None: if existing is None:
@ -1664,6 +1739,15 @@ def _run_expand(
for relation_name in _expand_relation_types(relation) for relation_name in _expand_relation_types(relation)
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit) for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
] ]
elif source == "opencitations":
from .expand import OpenCitationsExpander
expander = OpenCitationsExpander()
expand_fn = lambda key: [
item
for relation_name in _expand_relation_types(relation)
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
]
else: else:
print(f"Unsupported expansion source: {source}", file=sys.stderr) print(f"Unsupported expansion source: {source}", file=sys.stderr)
return 1 return 1

View File

@ -9,6 +9,7 @@ from urllib.parse import quote, urlencode
from .bibtex import BibEntry, parse_bibtex from .bibtex import BibEntry, parse_bibtex
from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob
from .resolve import MetadataResolver, merge_entries from .resolve import MetadataResolver, merge_entries
from .sources import OpenCitationsSource
from .storage import BibliographyStore from .storage import BibliographyStore
@ -219,14 +220,94 @@ class OpenAlexExpander:
return _normalize_openalex_id(results[0].get("id", "")) return _normalize_openalex_id(results[0].get("id", ""))
class OpenCitationsExpander:
    """Expand a stored entry's citation graph using OpenCitations DOI edges."""

    def __init__(self, resolver: MetadataResolver | None = None, source: OpenCitationsSource | None = None) -> None:
        self.resolver = resolver or MetadataResolver()
        # Reuse the resolver's HTTP client so request behavior stays consistent
        # across sources.
        self.source = source or OpenCitationsSource(config={"source_client": self.resolver.source_client})

    def expand_entry(
        self,
        store: BibliographyStore,
        citation_key: str,
        relation_type: str = "cites",
        limit: int = 25,
    ) -> list[ExpansionResult]:
        """Discover works linked to *citation_key* via OpenCitations and persist them.

        For each edge, reuse an existing stored entry when one matches the
        discovered work; otherwise upsert a resolved (or stub) entry. A
        "cites" relation is recorded for every edge, oriented according to
        *relation_type*. Returns one ExpansionResult per edge; returns []
        when the seed entry is missing or has no DOI.
        """
        entry = store.get_entry(citation_key)
        if entry is None:
            return []
        doi = str(entry.get("doi") or "")
        if not doi:
            return []
        edges = self.source.get_citations(doi, relation_type=relation_type, limit=limit)
        results: list[ExpansionResult] = []
        for edge in edges:
            # Edge work ids carry a 4-character prefix before the bare DOI —
            # presumably "doi:"; TODO confirm against OpenCitationsSource.
            discovered_doi = edge.target_work_id[4:] if relation_type == "cites" else edge.source_work_id[4:]
            discovered = self._lookup_discovered_entry(discovered_doi)
            if discovered is None:
                discovered = _opencitations_stub_entry(discovered_doi, citation_key)
            existing_key = _existing_entry_key_for_discovered_work(store, discovered)
            target_key = existing_key or discovered.citation_key
            created = False
            # Only insert when the work is unknown under both the matched key
            # and the generated key.
            if existing_key is None and store.get_entry(discovered.citation_key) is None:
                store.upsert_entry(
                    discovered,
                    raw_bibtex=None,
                    source_type="graph_expand",
                    source_label=edge.source_label,
                    review_status="draft",
                )
                store.connection.commit()
                created = True
            # Relations are always stored as "cites"; for non-"cites"
            # expansion the discovered work is the citing side.
            if relation_type == "cites":
                source_key = citation_key
                relation_target_key = target_key
            else:
                source_key = target_key
                relation_target_key = citation_key
            store.add_relation(
                source_key,
                relation_target_key,
                "cites",
                source_type="graph_expand",
                source_label=edge.source_label,
                confidence=edge.confidence,
            )
            results.append(
                ExpansionResult(
                    source_citation_key=source_key,
                    discovered_citation_key=target_key,
                    created_entry=created,
                    relation_type=relation_type,
                    source_label=edge.source_label,
                )
            )
        return results

    def _lookup_discovered_entry(self, doi: str) -> BibEntry | None:
        """Resolve a discovered DOI: try the resolver's DOI lookup, then
        DataCite, then fall back to OpenCitations' own metadata."""
        resolution = self.resolver.resolve_doi(doi)
        if resolution is not None:
            return resolution.entry
        resolution = self.resolver.resolve_datacite_doi(doi)
        if resolution is not None:
            return resolution.entry
        return self.source.lookup_by_doi(doi)
class TopicExpander: class TopicExpander:
def __init__( def __init__(
self, self,
crossref_expander: CrossrefExpander | None = None, crossref_expander: CrossrefExpander | None = None,
openalex_expander: OpenAlexExpander | None = None, openalex_expander: OpenAlexExpander | None = None,
opencitations_expander: OpenCitationsExpander | None = None,
) -> None: ) -> None:
self.crossref_expander = crossref_expander or CrossrefExpander() self.crossref_expander = crossref_expander or CrossrefExpander()
self.openalex_expander = openalex_expander or OpenAlexExpander() self.openalex_expander = openalex_expander or OpenAlexExpander()
self.opencitations_expander = opencitations_expander or OpenCitationsExpander()
self.last_run_meta: dict[str, object] = {} self.last_run_meta: dict[str, object] = {}
def expand_topic( def expand_topic(
@ -362,6 +443,17 @@ class TopicExpander:
) -> list[tuple[ExpansionResult, dict[str, object] | None]]: ) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
if source == "crossref": if source == "crossref":
expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key) expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
elif source == "opencitations":
expansion_rows = []
for relation_name in _expand_relation_types(relation_type):
expansion_rows.extend(
self.opencitations_expander.expand_entry(
store,
citation_key,
relation_type=relation_name,
limit=limit,
)
)
else: else:
expansion_rows: list[ExpansionResult] = [] expansion_rows: list[ExpansionResult] = []
for relation_name in _expand_relation_types(relation_type): for relation_name in _expand_relation_types(relation_type):
@ -385,6 +477,11 @@ class TopicExpander:
) -> list[tuple[ExpansionResult, dict[str, object]]]: ) -> list[tuple[ExpansionResult, dict[str, object]]]:
if source == "crossref": if source == "crossref":
return self._preview_crossref_discoveries(store, citation_key, limit) return self._preview_crossref_discoveries(store, citation_key, limit)
if source == "opencitations":
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for relation_name in _expand_relation_types(relation_type):
rows.extend(self._preview_opencitations_discoveries(store, citation_key, relation_name, limit))
return rows
rows: list[tuple[ExpansionResult, dict[str, object]]] = [] rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for relation_name in _expand_relation_types(relation_type): for relation_name in _expand_relation_types(relation_type):
rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit)) rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit))
@ -467,6 +564,40 @@ class TopicExpander:
) )
return rows return rows
def _preview_opencitations_discoveries(
    self,
    store: BibliographyStore,
    citation_key: str,
    relation_type: str,
    limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
    """Preview OpenCitations discoveries for one entry without writing to the store.

    Returns (result, fields) pairs describing what an expansion would do:
    which key each discovered work would land under, and whether a new
    entry would be created.  Returns [] when the seed entry is missing or
    has no DOI.
    """
    seed = store.get_entry(citation_key)
    if seed is None or not seed.get("doi"):
        return []
    seed_doi = str(seed["doi"])
    expander = self.opencitations_expander
    preview_rows: list[tuple[ExpansionResult, dict[str, object]]] = []
    for edge in expander.source.get_citations(seed_doi, relation_type=relation_type, limit=limit):
        if relation_type == "cites":
            found_doi = edge.target_work_id[4:]
        else:
            found_doi = edge.source_work_id[4:]
        candidate = expander._lookup_discovered_entry(found_doi)
        if candidate is None:
            candidate = _opencitations_stub_entry(found_doi, citation_key)
        known_key = _existing_entry_key_for_discovered_work(store, candidate)
        discovered_key = known_key or candidate.citation_key
        would_create = known_key is None and store.get_entry(candidate.citation_key) is None
        preview = ExpansionResult(
            source_citation_key=citation_key if relation_type == "cites" else discovered_key,
            discovered_citation_key=discovered_key,
            created_entry=would_create,
            relation_type=relation_type,
            source_label=edge.source_label,
        )
        preview_rows.append((preview, dict(candidate.fields)))
    return preview_rows
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
title = _crossref_reference_title(reference, ordinal) title = _crossref_reference_title(reference, ordinal)
@ -567,6 +698,20 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int
return f"{family}{year or 'nd'}{first_word}{ordinal}" return f"{family}{year or 'nd'}{first_word}{ordinal}"
def _opencitations_stub_entry(doi: str, source_citation_key: str) -> BibEntry:
    """Build a minimal placeholder entry for a DOI discovered via OpenCitations.

    The citation key is derived from the DOI with non-alphanumeric
    characters stripped; the note field records which entry led here.
    """
    key_suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
    stub_fields = {
        "title": f"Referenced work for DOI {doi}",
        "doi": doi,
        "url": f"https://doi.org/{doi}",
        "note": f"discovered_from = {{{source_citation_key}}}",
    }
    return BibEntry(entry_type="misc", citation_key=f"doi{key_suffix}", fields=stub_fields)
def _normalize_text(value: str) -> str: def _normalize_text(value: str) -> str:
without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value)) without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value))
normalized = " ".join(without_tags.split()) normalized = " ".join(without_tags.split())

View File

@ -7,17 +7,38 @@ import re
import urllib.error import urllib.error
import urllib.parse import urllib.parse
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from dataclasses import dataclass from dataclasses import dataclass, field
from .bibtex import BibEntry, parse_bibtex from .bibtex import BibEntry, parse_bibtex
from .sources.europepmc import EuropePmcSource
from .sources.openlibrary import OpenLibrarySource
from .sources.semanticscholar import SemanticScholarSource
from .sources import SourceClient from .sources import SourceClient
@dataclass(slots=True)
class ResolutionAttempt:
    """One recorded attempt to resolve an entry against a single source."""

    source_name: str  # source identifier, e.g. "crossref", "pubmed"
    strategy: str  # lookup style, e.g. "doi_lookup" or "title_search"
    query_value: str  # identifier or title text sent to the source
    matched: bool  # True when the attempt produced a resolution
    candidate_count: int | None = None  # number of search candidates considered, when known
    source_label: str = ""  # provenance label from a successful resolution
    error: str = ""  # stringified exception when the attempt raised
@dataclass(slots=True) @dataclass(slots=True)
class Resolution: class Resolution:
entry: BibEntry entry: BibEntry
source_type: str source_type: str
source_label: str source_label: str
attempts: list[ResolutionAttempt] = field(default_factory=list)
@dataclass(slots=True)
class ResolutionOutcome:
    """Result of a traced resolution: the winning Resolution (or None) plus
    every attempt made along the way."""

    resolution: Resolution | None  # None when no source matched
    attempts: list[ResolutionAttempt]  # ordered trace of all attempts
class MetadataResolver: class MetadataResolver:
@ -31,70 +52,109 @@ class MetadataResolver:
) -> None: ) -> None:
self.user_agent = user_agent self.user_agent = user_agent
self.source_client = source_client or SourceClient(user_agent=user_agent) self.source_client = source_client or SourceClient(user_agent=user_agent)
self.europepmc = EuropePmcSource(config={"source_client": self.source_client, "user_agent": user_agent})
self.openlibrary = OpenLibrarySource(config={"source_client": self.source_client, "user_agent": user_agent})
self.semanticscholar = SemanticScholarSource(config={"user_agent": user_agent})
self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "") self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "")
self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist") self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist")
self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "") self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "")
def resolve_entry(self, entry: BibEntry) -> Resolution | None: def resolve_entry(self, entry: BibEntry) -> Resolution | None:
return self.resolve_entry_with_trace(entry).resolution
def resolve_entry_with_trace(self, entry: BibEntry) -> ResolutionOutcome:
attempts: list[ResolutionAttempt] = []
if doi := entry.fields.get("doi"): if doi := entry.fields.get("doi"):
resolved = self.resolve_doi(doi) resolved = self._attempt_direct_resolution(attempts, "crossref", "doi_lookup", doi, self.resolve_doi)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self.resolve_datacite_doi(doi) resolved = self._attempt_direct_resolution(
attempts, "datacite", "doi_lookup", doi, self.resolve_datacite_doi
)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self._attempt_direct_resolution(
attempts, "europepmc", "doi_lookup", doi, self.resolve_europepmc_doi
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self._attempt_direct_resolution(
attempts, "semanticscholar", "doi_lookup", doi, self.resolve_semanticscholar_doi
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
if pmid := entry.fields.get("pmid"): if pmid := entry.fields.get("pmid"):
resolved = self.resolve_pmid(pmid) resolved = self._attempt_direct_resolution(attempts, "pubmed", "pmid_lookup", pmid, self.resolve_pmid)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
if openalex_id := entry.fields.get("openalex"): if openalex_id := entry.fields.get("openalex"):
resolved = self.resolve_openalex(openalex_id) resolved = self._attempt_direct_resolution(
attempts, "openalex", "work_lookup", openalex_id, self.resolve_openalex
)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
if dblp_key := entry.fields.get("dblp"): if dblp_key := entry.fields.get("dblp"):
resolved = self.resolve_dblp(dblp_key) resolved = self._attempt_direct_resolution(attempts, "dblp", "key_lookup", dblp_key, self.resolve_dblp)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
if arxiv_id := entry.fields.get("arxiv"): if arxiv_id := entry.fields.get("arxiv"):
resolved = self.resolve_arxiv(arxiv_id) resolved = self._attempt_direct_resolution(
attempts, "arxiv", "id_lookup", arxiv_id, self.resolve_arxiv
)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
if title := entry.fields.get("title"): if title := entry.fields.get("title"):
resolved = self.search_crossref_best_match( author_text = entry.fields.get("author", "")
title=title, year = entry.fields.get("year", "")
author_text=entry.fields.get("author", ""), resolved = self._attempt_title_search_resolution(
year=entry.fields.get("year", ""), attempts, "crossref", title, author_text, year, self.search_crossref
) )
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self.search_datacite_best_match( resolved = self._attempt_title_search_resolution(
title=title, attempts, "datacite", title, author_text, year, self.search_datacite
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
) )
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self.search_openalex_best_match( resolved = self._attempt_title_search_resolution(
title=title, attempts, "openalex", title, author_text, year, self.search_openalex
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
) )
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self.search_pubmed_best_match( resolved = self._attempt_title_search_resolution(
title=title, attempts, "pubmed", title, author_text, year, self.search_pubmed
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
) )
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self._attempt_title_search_resolution(
attempts, "europepmc", title, author_text, year, self.search_europepmc
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self._attempt_title_search_resolution(
attempts, "semanticscholar", title, author_text, year, self.search_semanticscholar
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
if _entry_prefers_catalog_search(entry):
resolved = self._attempt_title_search_resolution(
attempts,
"openlibrary",
title,
author_text,
year,
self.search_openlibrary,
selector=_select_best_catalog_title_match,
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
return None return ResolutionOutcome(resolution=None, attempts=attempts)
def resolve_doi(self, doi: str) -> Resolution | None: def resolve_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="") encoded = urllib.parse.quote(doi, safe="")
@ -124,19 +184,7 @@ class MetadataResolver:
author_text: str = "", author_text: str = "",
year: str = "", year: str = "",
) -> Resolution | None: ) -> Resolution | None:
candidate = _select_best_title_match( return self._search_best_match_resolution("crossref", title, author_text, year, self.search_crossref)
self.search_crossref(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"crossref:search:{title}",
)
def resolve_dblp(self, dblp_key: str) -> Resolution | None: def resolve_dblp(self, dblp_key: str) -> Resolution | None:
encoded_key = urllib.parse.quote(dblp_key, safe="/:") encoded_key = urllib.parse.quote(dblp_key, safe="/:")
@ -245,19 +293,7 @@ class MetadataResolver:
author_text: str = "", author_text: str = "",
year: str = "", year: str = "",
) -> Resolution | None: ) -> Resolution | None:
candidate = _select_best_title_match( return self._search_best_match_resolution("datacite", title, author_text, year, self.search_datacite)
self.search_datacite(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"datacite:search:{title}",
)
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]: def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"search": title, "per-page": limit}) query = urllib.parse.urlencode({"search": title, "per-page": limit})
@ -290,6 +326,35 @@ class MetadataResolver:
return [] return []
return self._fetch_pubmed_entries(ids[:limit]) return self._fetch_pubmed_entries(ids[:limit])
def resolve_europepmc_doi(self, doi: str) -> Resolution | None:
    """Resolve a DOI through Europe PMC; returns None when it has no record."""
    record = self.europepmc.lookup_by_doi(doi)
    if record is not None:
        return Resolution(
            entry=record,
            source_type="resolver",
            source_label=f"europepmc:doi:{doi}",
        )
    return None
def search_europepmc(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Delegate a title search to the Europe PMC source adapter."""
    return self.europepmc.search(title, limit=limit)
def search_openlibrary(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Delegate a title search to the Open Library source adapter."""
    return self.openlibrary.search(title, limit=limit)
def resolve_semanticscholar_doi(self, doi: str) -> Resolution | None:
    """Resolve a DOI through Semantic Scholar; returns None when it has no record."""
    record = self.semanticscholar.lookup_by_doi(doi)
    if record is not None:
        return Resolution(
            entry=record,
            source_type="resolver",
            source_label=f"semanticscholar:doi:{doi}",
        )
    return None
def search_semanticscholar(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Delegate a title search to the Semantic Scholar source adapter."""
    return self.semanticscholar.search(title, limit=limit)
def _safe_get_json(self, url: str) -> dict | None: def _safe_get_json(self, url: str) -> dict | None:
try: try:
return self.source_client.get_json(url) return self.source_client.get_json(url)
@ -333,19 +398,7 @@ class MetadataResolver:
author_text: str = "", author_text: str = "",
year: str = "", year: str = "",
) -> Resolution | None: ) -> Resolution | None:
candidate = _select_best_title_match( return self._search_best_match_resolution("openalex", title, author_text, year, self.search_openalex)
self.search_openalex(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"openalex:search:{title}",
)
def search_pubmed_best_match( def search_pubmed_best_match(
self, self,
@ -353,19 +406,122 @@ class MetadataResolver:
author_text: str = "", author_text: str = "",
year: str = "", year: str = "",
) -> Resolution | None: ) -> Resolution | None:
candidate = _select_best_title_match( return self._search_best_match_resolution("pubmed", title, author_text, year, self.search_pubmed)
self.search_pubmed(title, limit=5),
title=title, def search_europepmc_best_match(
author_text=author_text, self,
year=year, title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
return self._search_best_match_resolution("europepmc", title, author_text, year, self.search_europepmc)
def search_semanticscholar_best_match(
self,
title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
return self._search_best_match_resolution(
"semanticscholar", title, author_text, year, self.search_semanticscholar
) )
def search_openlibrary_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Best title match from Open Library, wrapped as a Resolution (None if no match)."""
    return self._search_best_match_resolution("openlibrary", title, author_text, year, self.search_openlibrary)
def _search_best_match_resolution(
    self, source_name: str, title: str, author_text: str, year: str, search_func
) -> Resolution | None:
    """Shared title-search flow used by the per-source *_best_match helpers.

    Runs *search_func* (capped at 5 candidates), scores the candidates with
    _select_best_title_match, and wraps the winner in a Resolution labeled
    "<source>:search:<title>". Returns None when nothing is good enough.
    """
    candidates = search_func(title, limit=5)
    candidate = _select_best_title_match(candidates, title=title, author_text=author_text, year=year)
    if candidate is None:
        return None
    return Resolution(entry=candidate, source_type="resolver", source_label=f"{source_name}:search:{title}")
def _attempt_direct_resolution(
    self,
    attempts: list[ResolutionAttempt],
    source_name: str,
    strategy: str,
    query_value: str,
    resolver_func,
) -> Resolution | None:
    """Run one identifier-based resolver and record the attempt.

    Appends a ResolutionAttempt to *attempts* whether the lookup raised,
    missed, or matched. On a match whose Resolution has no attempt trace
    yet, a snapshot of *attempts* is attached to it.
    """
    try:
        resolution = resolver_func(query_value)
    except Exception as exc:
        # Failed lookups are recorded with the error text, never re-raised.
        attempts.append(
            ResolutionAttempt(
                source_name=source_name,
                strategy=strategy,
                query_value=query_value,
                matched=False,
                error=str(exc),
            )
        )
        return None
    matched = resolution is not None
    attempts.append(
        ResolutionAttempt(
            source_name=source_name,
            strategy=strategy,
            query_value=query_value,
            matched=matched,
            source_label=resolution.source_label if matched else "",
        )
    )
    if matched and not resolution.attempts:
        resolution.attempts = list(attempts)
    return resolution
def _attempt_title_search_resolution(
    self,
    attempts: list[ResolutionAttempt],
    source_name: str,
    title: str,
    author_text: str,
    year: str,
    search_func,
    selector=None,
) -> Resolution | None:
    """Run one title-search resolver and record the attempt.

    Searches via *search_func* (capped at 5 candidates), picks the best
    candidate with *selector* (default _select_best_title_match), and
    appends a ResolutionAttempt — including the candidate count — to
    *attempts*. Errors are recorded, not raised. On a match whose
    Resolution carries no attempt trace yet, a snapshot of *attempts*
    is attached to it.
    """
    try:
        candidates = search_func(title, limit=5)
    except Exception as exc:
        attempts.append(
            ResolutionAttempt(
                source_name=source_name,
                strategy="title_search",
                query_value=title,
                matched=False,
                error=str(exc),
            )
        )
        return None
    match_selector = selector or _select_best_title_match
    candidate = match_selector(candidates, title=title, author_text=author_text, year=year)
    resolution = None
    if candidate is not None:
        resolution = Resolution(
            entry=candidate,
            source_type="resolver",
            source_label=f"{source_name}:search:{title}",
        )
    attempts.append(
        ResolutionAttempt(
            source_name=source_name,
            strategy="title_search",
            query_value=title,
            matched=resolution is not None,
            candidate_count=len(candidates),
            source_label=resolution.source_label if resolution is not None else "",
        )
    )
    if resolution is not None and not resolution.attempts:
        resolution.attempts = list(attempts)
    return resolution
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]: def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid] ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid]
@ -768,6 +924,42 @@ def _select_best_title_match(
return None return None
def _select_best_catalog_title_match(
    candidates: list[BibEntry],
    title: str,
    author_text: str = "",
    year: str = "",
) -> BibEntry | None:
    """Pick the catalog candidate whose title best overlaps *title*.

    Candidates are Jaccard-scored on catalog title tokens and must reach
    0.6; candidates whose year or authors conflict with the query are
    dropped. Ties break on the lexically smallest citation key. Returns
    None when nothing qualifies.
    """
    if not candidates:
        return None
    query_tokens = _catalog_title_tokens(title)
    wanted_authors = _author_match_tokens(author_text)
    wanted_year = str(year or "").strip()
    acceptable: list[tuple[float, BibEntry]] = []
    for entry in candidates:
        entry_tokens = _catalog_title_tokens(entry.fields.get("title", ""))
        if not entry_tokens:
            continue
        union = query_tokens | entry_tokens
        score = len(query_tokens & entry_tokens) / len(union) if union else 0.0
        if score < 0.6:
            continue
        entry_year = str(entry.fields.get("year", "") or "").strip()
        if wanted_year and entry_year and wanted_year != entry_year:
            continue
        if wanted_authors and not _candidate_matches_author_tokens(entry, wanted_authors):
            continue
        acceptable.append((score, entry))
    if not acceptable:
        return None
    # Highest score wins; equal scores fall back to citation-key order.
    return min(acceptable, key=lambda pair: (-pair[0], pair[1].citation_key))[1]
def _author_match_tokens(author_text: str) -> set[str]: def _author_match_tokens(author_text: str) -> set[str]:
normalized = _normalize_match_text(author_text) normalized = _normalize_match_text(author_text)
if not normalized: if not normalized:
@ -788,6 +980,39 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str
return bool(author_tokens & candidate_tokens) return bool(author_tokens & candidate_tokens)
def _catalog_title_tokens(value: str) -> set[str]:
    """Tokenize a title for fuzzy catalog matching.

    Normalizes the text, keeps only alphanumeric tokens of length >= 4 that
    are not stopwords, and folds an "-ical" suffix to "-ic" (e.g.
    "historical" -> "historic") so close adjective variants still overlap.
    """
    normalized = _normalize_match_text(value)
    stopwords = {"the", "and", "for", "with", "from", "into", "after", "all"}
    return {
        f"{token[:-4]}ic" if token.endswith("ical") and len(token) > 6 else token
        for token in re.findall(r"[a-z0-9]+", normalized)
        if len(token) >= 4 and token not in stopwords
    }
def _entry_prefers_catalog_search(entry: BibEntry) -> bool:
    """Return True when *entry* should be resolved via book/catalog search.

    Book-like entry types always prefer the catalog. Beyond that, only
    "misc" entries qualify, and only when their publisher/venue or title
    looks like reference-work material.
    """
    if entry.entry_type in {"book", "incollection", "phdthesis", "mastersthesis"}:
        return True
    # Guard first: only "misc" entries are considered further. Checking this
    # before normalizing title/venue avoids pointless work for every other
    # entry type (the original computed both strings and then discarded them).
    if entry.entry_type != "misc":
        return False
    title = _normalize_match_text(entry.fields.get("title", ""))
    venue = _normalize_match_text(
        " ".join(
            filter(
                None,
                [
                    entry.fields.get("publisher", ""),
                    entry.fields.get("howpublished", ""),
                    entry.fields.get("booktitle", ""),
                ],
            )
        )
    )
    if any(token in venue for token in ("press", "university", "house", "dictionary", "christendom")):
        return True
    return any(token in title for token in ("dictionary", "history", "world", "universe", "record"))
def _normalize_pmid(value: str) -> str: def _normalize_pmid(value: str) -> str:
return "".join(ch for ch in str(value) if ch.isdigit()) return "".join(ch for ch in str(value) if ch.isdigit())

View File

@ -0,0 +1,27 @@
"""
Identifier resolution and normalization module.
Provides functions for extracting, normalizing, and resolving
bibliographic identifiers across multiple schemes.
"""
from __future__ import annotations
from citegeist.resolver.identifiers import (
IdentifierExtractor,
IdentifierNormalizer,
IdentifierResolver,
extract_identifiers,
normalize_identifier,
get_primary_identifier,
resolve_identifiers,
)
__all__ = [
'IdentifierExtractor',
'IdentifierNormalizer',
'IdentifierResolver',
'extract_identifiers',
'normalize_identifier',
'get_primary_identifier',
'resolve_identifiers',
]

View File

@ -0,0 +1,418 @@
"""
Identifier resolution and normalization module.
This module provides functions for extracting, normalizing, and resolving
bibliographic identifiers across multiple schemes (DOI, PMID, arXiv, ORCID, etc.).
"""
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# Identifier scheme patterns
DOI_PATTERN = re.compile(
    r'^10\.\d{4,9}/\S+$',
    re.IGNORECASE
)
# PMIDs are plain integers; current assignments run up to 8 digits, and
# short historical PMIDs can have fewer than 5 (the old {5,7} range
# rejected both ends).
PMID_PATTERN = re.compile(r'^\d{1,8}$')
PMCID_PATTERN = re.compile(
    r'^PMC\d+$|^PMC[0-9a-f]+$',
    re.IGNORECASE
)
# New-style arXiv identifiers: YYMM.NNNNN with an optional version suffix.
ARXIV_PATTERN = re.compile(
    r'^\d{4}\.\d{4,5}(v\d+)?$',
    re.IGNORECASE
)
ORCID_PATTERN = re.compile(
    r'^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$',
    re.IGNORECASE
)
# ROR IDs are lowercase: a leading "0", six base-32 characters, then two
# check digits (e.g. "02mhbdp94"). The previous uppercase-only class could
# never match the lowercased input produced by normalize_ror.
ROR_PATTERN = re.compile(
    r'^https?://ror\.org/0[a-z0-9]{6}[0-9]{2}$'
)
# DBLP record keys are slash-separated (e.g. "conf/nips/Vaswani17"); the
# colon-separated form is kept for backward compatibility.
DBLP_PATTERN = re.compile(
    r'^[a-zA-Z0-9_]+([/:][a-zA-Z0-9_.-]+)+$',
    re.IGNORECASE
)
# OpenAlex work IDs are "W" followed by digits (e.g. "W2741809807"); the
# previous "W\d{4}-[A-F0-9]{4}" shape matches no real OpenAlex ID.
OPENALEX_PATTERN = re.compile(
    r'^W\d+$',
    re.IGNORECASE
)
class IdentifierExtractor:
    """Extract identifiers from BibEntry fields."""

    # Entry field names that double as identifier scheme names, in the
    # order they should appear in the extracted mapping.
    _SCHEME_FIELDS = ('doi', 'pmid', 'pmcid', 'arxiv', 'dblp', 'openalex', 'isbn', 'issn')

    @staticmethod
    def extract(entry_fields: Dict[str, str]) -> Dict[str, str]:
        """Extract all identifier schemes from entry fields.

        Args:
            entry_fields: Dictionary of entry fields

        Returns:
            Dictionary mapping scheme names to their (non-empty) values
        """
        return {
            scheme: value
            for scheme in IdentifierExtractor._SCHEME_FIELDS
            if (value := entry_fields.get(scheme))
        }
class IdentifierNormalizer:
    """Normalize identifiers to canonical form.

    Every normalize_* method returns the canonical string on success and
    None for empty or malformed input.
    """

    @staticmethod
    def _checked(candidate: str, pattern) -> Optional[str]:
        """Return *candidate* when it matches *pattern*, else None."""
        return candidate if pattern.match(candidate) else None

    @staticmethod
    def normalize_doi(doi: str) -> Optional[str]:
        """Lowercase and validate a DOI."""
        if not doi:
            return None
        return IdentifierNormalizer._checked(doi.strip().lower(), DOI_PATTERN)

    @staticmethod
    def normalize_pmid(pmid: str) -> Optional[str]:
        """Validate a PMID and return it as a string."""
        if not pmid:
            return None
        return IdentifierNormalizer._checked(str(pmid).strip(), PMID_PATTERN)

    @staticmethod
    def normalize_pmcid(pmcid: str) -> Optional[str]:
        """Lowercase and validate a PMCID."""
        if not pmcid:
            return None
        return IdentifierNormalizer._checked(pmcid.strip().lower(), PMCID_PATTERN)

    @staticmethod
    def normalize_arxiv(arxiv: str) -> Optional[str]:
        """Lowercase an arXiv ID, drop any version suffix, and validate."""
        if not arxiv:
            return None
        candidate = arxiv.strip().lower()
        # "2101.12345v2" -> "2101.12345". Splitting at the first 'v' also
        # truncates non-numeric input, which then simply fails validation.
        if 'v' in candidate:
            candidate = candidate.split('v')[0]
        return IdentifierNormalizer._checked(candidate, ARXIV_PATTERN)

    @staticmethod
    def normalize_orcid(orcid: str) -> Optional[str]:
        """Uppercase an ORCID, strip spaces, and validate the XXXX-XXXX-XXXX-XXX[0-9X] shape."""
        if not orcid:
            return None
        candidate = orcid.strip().upper().replace(' ', '')
        return IdentifierNormalizer._checked(candidate, ORCID_PATTERN)

    @staticmethod
    def normalize_ror(ror_url: str) -> Optional[str]:
        """Lowercase and validate a ROR URL."""
        if not ror_url:
            return None
        return IdentifierNormalizer._checked(ror_url.strip().lower(), ROR_PATTERN)

    @staticmethod
    def normalize_dblp(dblp_key: str) -> Optional[str]:
        """Trim and validate a DBLP key."""
        if not dblp_key:
            return None
        return IdentifierNormalizer._checked(dblp_key.strip(), DBLP_PATTERN)

    @staticmethod
    def normalize_openalex(openalex_id: str) -> Optional[str]:
        """Uppercase and validate an OpenAlex ID."""
        if not openalex_id:
            return None
        return IdentifierNormalizer._checked(openalex_id.strip().upper(), OPENALEX_PATTERN)

    @staticmethod
    def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
        """Normalize *value* under *scheme*.

        Args:
            scheme: Identifier scheme name (case-insensitive)
            value: Identifier value

        Returns:
            (scheme, normalized_value) on success, else None. Schemes with
            no registered normalizer (e.g. isbn, issn) always return None.
        """
        key = scheme.lower()
        dispatch = {
            'doi': IdentifierNormalizer.normalize_doi,
            'pmid': IdentifierNormalizer.normalize_pmid,
            'pmcid': IdentifierNormalizer.normalize_pmcid,
            'arxiv': IdentifierNormalizer.normalize_arxiv,
            'orcid': IdentifierNormalizer.normalize_orcid,
            'ror': IdentifierNormalizer.normalize_ror,
            'dblp': IdentifierNormalizer.normalize_dblp,
            'openalex': IdentifierNormalizer.normalize_openalex,
        }
        normalize = dispatch.get(key)
        if normalize is None:
            return None
        normalized = normalize(value)
        return (key, normalized) if normalized else None
class IdentifierResolver:
    """Resolve identifiers across multiple schemes."""

    # Lookup priority: schemes should be checked in this order
    LOOKUP_PRIORITY = [
        ('doi', IdentifierNormalizer.normalize_doi),
        ('pmid', IdentifierNormalizer.normalize_pmid),
        ('pmcid', IdentifierNormalizer.normalize_pmcid),
        ('arxiv', IdentifierNormalizer.normalize_arxiv),
        ('dblp', IdentifierNormalizer.normalize_dblp),
        ('openalex', IdentifierNormalizer.normalize_openalex),
    ]

    @staticmethod
    def resolve(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
        """Collect every normalizable identifier from *entry_fields*.

        Returns (scheme, normalized_value) pairs in extraction order, with a
        ('title', fingerprint) pair appended as a last-resort lookup key.
        """
        extracted = IdentifierExtractor.extract(entry_fields)
        resolved = [
            pair
            for scheme, value in extracted.items()
            if (pair := IdentifierNormalizer.normalize_identifier(scheme, value))
        ]
        if (title := entry_fields.get('title')) and (
            fingerprint := IdentifierResolver._create_title_fingerprint(title)
        ):
            resolved.append(('title', fingerprint))
        return resolved

    @staticmethod
    def _create_title_fingerprint(title: str) -> Optional[str]:
        """Normalize *title* into a lowercase, punctuation-free lookup key."""
        if not title:
            return None
        # Word runs joined by single spaces: equivalent to replacing
        # punctuation with spaces, collapsing whitespace, and stripping.
        return ' '.join(re.findall(r'\w+', title.lower()))

    @staticmethod
    def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
        """Return the highest-priority resolved identifier, or None."""
        resolved = IdentifierResolver.resolve(entry_fields)
        for scheme, _ in IdentifierResolver.LOOKUP_PRIORITY:
            hit = next((pair for pair in resolved if pair[0] == scheme), None)
            if hit is not None:
                return hit
        return None

    @staticmethod
    def get_scheme_value(scheme: str, entry_fields: Dict[str, str]) -> Optional[str]:
        """Return the normalized value of *scheme* from *entry_fields*, or None."""
        value = entry_fields.get(scheme)
        if not value:
            return None
        normalized = IdentifierNormalizer.normalize_identifier(scheme, value)
        return normalized[1] if normalized else None
# Convenience functions
def extract_identifiers(entry_fields: Dict[str, str]) -> Dict[str, str]:
    """Extract all identifiers from entry fields.

    Module-level convenience wrapper around IdentifierExtractor.extract.

    Args:
        entry_fields: Dictionary of entry fields

    Returns:
        Dictionary mapping scheme names to values
    """
    return IdentifierExtractor.extract(entry_fields)
def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
    """Normalize an identifier.

    Module-level convenience wrapper around
    IdentifierNormalizer.normalize_identifier.

    Args:
        scheme: Identifier scheme name
        value: Identifier value

    Returns:
        Tuple of (scheme, normalized_value), or None if invalid
    """
    return IdentifierNormalizer.normalize_identifier(scheme, value)
def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
    """Get the primary identifier.

    Module-level convenience wrapper around
    IdentifierResolver.get_primary_identifier.

    Args:
        entry_fields: Dictionary of entry fields

    Returns:
        Tuple of (scheme, value), or None if no identifier found
    """
    return IdentifierResolver.get_primary_identifier(entry_fields)
def resolve_identifiers(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
    """Resolve identifiers from entry fields.

    Module-level convenience wrapper around IdentifierResolver.resolve.

    Args:
        entry_fields: Dictionary of entry fields

    Returns:
        List of (scheme, value) tuples
    """
    return IdentifierResolver.resolve(entry_fields)

View File

@ -0,0 +1,29 @@
"""Export all source plugins."""
from __future__ import annotations
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
from citegeist.sources.registry import SourceRegistry, get_registry
from citegeist.sources.crossref import CrossRefSource
from citegeist.sources.europepmc import EuropePmcSource
from citegeist.sources.opencitations import OpenCitationsSource
from citegeist.sources.openlibrary import OpenLibrarySource
from citegeist.sources.semanticscholar import SemanticScholarSource
from citegeist.sources.unpaywall import UnpaywallSource
__all__ = [
'BibliographicSource',
'SourceRecord',
'CitationEdge',
'SourceCatalogEntry',
'SourceRegistry',
'get_registry',
'list_source_catalog',
'prioritized_source_keys',
'CrossRefSource',
'EuropePmcSource',
'OpenCitationsSource',
'OpenLibrarySource',
'SemanticScholarSource',
'UnpaywallSource',
]

View File

@ -0,0 +1,44 @@
"""
Bibliographic source plugins.
This package provides a plugin architecture for integrating multiple
bibliographic data sources (CrossRef, PubMed, Semantic Scholar, etc.).
"""
# Import old sources module for backward compatibility
from . import _old_sources_compat
# Import new plugin architecture
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
from citegeist.sources.registry import SourceRegistry, get_registry
from citegeist.sources.crossref import CrossRefSource
from citegeist.sources.europepmc import EuropePmcSource
from citegeist.sources.opencitations import OpenCitationsSource
from citegeist.sources.openlibrary import OpenLibrarySource
from citegeist.sources.semanticscholar import SemanticScholarSource
from citegeist.sources.unpaywall import UnpaywallSource
# Re-export old classes for compatibility
__all__ = [
# New plugin architecture
'BibliographicSource',
'SourceRecord',
'CitationEdge',
'SourceCatalogEntry',
'SourceRegistry',
'get_registry',
'list_source_catalog',
'prioritized_source_keys',
'CrossRefSource',
'EuropePmcSource',
'OpenCitationsSource',
'OpenLibrarySource',
'SemanticScholarSource',
'UnpaywallSource',
# Old API (for backward compatibility)
'SourceClient',
]
# Backward compatibility - make SourceClient available from this module
SourceClient = _old_sources_compat.SourceClient

View File

@ -0,0 +1,25 @@
"""
Backward compatibility module for old sources module.
This module re-exports the old SourceClient class for compatibility.
"""
from pathlib import Path
import importlib.util
from .base import BibliographicSource, SourceRecord, CitationEdge
from .registry import SourceRegistry, get_registry
from .crossref import CrossRefSource
# Load the old sources.py module from the citegeist package root
_OLD_SOURCES_PATH = Path(__file__).resolve().parents[1] / "sources.py"
spec = importlib.util.spec_from_file_location(
    "citegeist.sources_old",
    _OLD_SOURCES_PATH
)
try:
    if spec and spec.loader:
        old_sources = importlib.util.module_from_spec(spec)
        # exec_module raises FileNotFoundError when sources.py has been
        # removed; the previous code only guarded against a missing
        # spec/loader, so the documented "file doesn't exist" fallback
        # never actually triggered.
        spec.loader.exec_module(old_sources)
        SourceClient = old_sources.SourceClient
    else:
        SourceClient = None
except FileNotFoundError:
    # Fallback if old sources.py doesn't exist
    SourceClient = None

View File

@ -0,0 +1,189 @@
"""
Base interface for bibliographic sources.
This module defines the abstract base class that all source plugins must implement.
Plugins can register themselves with the SourceRegistry for dynamic loading.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
@dataclass(slots=True)
class SourceRecord:
    """Represents a raw record from a source API.

    Captures provenance for one payload: the raw data, which source
    produced it, a human-readable label, when it was captured, and a
    confidence score.
    """

    raw: Dict[str, Any]  # already-parsed payload from the source API
    source_type: str  # producing source kind, e.g. a plugin class name
    source_label: str  # human-readable provenance label
    timestamp: str  # capture time; format set by the producer
    confidence: float  # trust score — presumably in [0, 1]; confirm with callers
@dataclass(slots=True)
class CitationEdge:
    """Represents a citation relationship between two works."""

    source_work_id: str  # work the edge starts from
    target_work_id: str  # work the edge points to
    relation_type: str  # "cites" or "cited_by"
    source_type: str  # which bibliographic source reported the edge
    source_label: str  # human-readable provenance label
    confidence: float  # trust score — presumably in [0, 1]; confirm with callers
class BibliographicSource(ABC):
    """Abstract base class for bibliographic data sources.

    All source plugins must inherit from this class and implement the
    required lookup/search/normalize methods. Optional capabilities
    (citations, related works, fulltext links, embeddings) default to
    "not supported" and may be overridden per source.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source with optional configuration.

        Args:
            config: Source-specific configuration dictionary. Recognized
                keys: 'enabled' (bool, default True) and 'source_type'
                (str, defaults to the concrete class name).
        """
        self.config = config or {}
        self.enabled = self.config.get('enabled', True)
        self.source_type = self.config.get('source_type', self.__class__.__name__)

    @abstractmethod
    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibEntry if found, None otherwise
        """

    @abstractmethod
    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Look up a work by title.

        Args:
            title: Work title

        Returns:
            BibEntry if found, None otherwise
        """

    @abstractmethod
    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search for works matching the query.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching BibEntry objects
        """

    @abstractmethod
    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw API record to a canonical BibEntry.

        Args:
            record: Raw record from source API

        Returns:
            BibEntry if normalization succeeds, None otherwise
        """

    def get_citations(self, work_id: str, relation_type: str = 'cites', limit: int = 10) -> List[CitationEdge]:
        """Get citations for a work. Default: capability not supported.

        Args:
            work_id: Work identifier (DOI, PMID, etc.)
            relation_type: Type of relation ('cites' or 'cited_by')
            limit: Maximum number of results

        Returns:
            List of CitationEdge objects (empty by default)
        """
        return []

    def get_related(self, work_id: str, limit: int = 10) -> List[BibEntry]:
        """Get works related to a work. Default: capability not supported.

        Args:
            work_id: Work identifier
            limit: Maximum number of results

        Returns:
            List of related BibEntry objects (empty by default)
        """
        return []

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Get full-text URL for a work. Default: capability not supported.

        Args:
            doi: Digital Object Identifier

        Returns:
            Full-text URL if available, None otherwise
        """
        return None

    def get_embedding(self, work_id: str) -> Optional[List[float]]:
        """Get embedding vector for a work. Default: capability not supported.

        Args:
            work_id: Work identifier

        Returns:
            Embedding vector if available, None otherwise
        """
        return None

    def get_identifier_scheme(self) -> str:
        """Get the identifier scheme used by this source.

        Returns:
            Identifier scheme (e.g., 'doi', 'pmid', 'openalex')
        """
        return self.source_type.lower()

    def record_source_metadata(self, entry: BibEntry, operation: str = 'ingest') -> SourceRecord:
        """Create a source record for provenance tracking.

        Args:
            entry: The BibEntry to record
            operation: Operation type (e.g., 'ingest', 'enrich').
                NOTE(review): currently unused; kept for interface stability.

        Returns:
            SourceRecord with metadata and a UTC capture timestamp
        """
        from datetime import datetime, timezone

        return SourceRecord(
            raw=self._entry_to_dict(entry),
            source_type=self.source_type,
            source_label=f"{self.source_type}:{self.config.get('name', self.__class__.__name__)}",
            # Previously hard-coded to '', which defeated provenance tracking.
            timestamp=datetime.now(timezone.utc).isoformat(),
            confidence=1.0
        )

    def _entry_to_dict(self, entry: BibEntry) -> Dict[str, Any]:
        """Convert BibEntry to dictionary for source records."""
        return {
            'entry_type': entry.entry_type,
            'citation_key': entry.citation_key,
            'fields': entry.fields
        }

    def is_available(self) -> bool:
        """Check if the source is available and enabled.

        Returns:
            True if enabled and available, False otherwise
        """
        return self.enabled

View File

@ -0,0 +1,173 @@
"""Open bibliographic source inventory and prioritization helpers."""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class SourceCatalogEntry:
    """One row of the source inventory: what a source offers and its rollout status."""

    key: str  # stable machine key, e.g. "crossref"
    label: str  # human-readable name
    category: str  # broad kind, e.g. "metadata", "graph", or combinations
    access: str  # access model, e.g. "open API"
    capabilities: tuple[str, ...]  # machine-readable capability tags
    strengths: str  # prose: why this source is useful
    caveats: str  # prose: known limitations
    current_status: str  # integration state, e.g. "integrated", "planned"
    priority: str  # rollout priority: "now", "next", "selective", "evaluate"
_CATALOG: tuple[SourceCatalogEntry, ...] = (
SourceCatalogEntry(
key="crossref",
label="Crossref",
category="metadata",
access="open API",
capabilities=("doi_lookup", "title_search", "reference_lists"),
strengths="Broad DOI coverage and good article-level metadata.",
caveats="Citation coverage is incomplete and some references are unstructured blobs.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="openalex",
label="OpenAlex",
category="metadata+graph",
access="open API",
capabilities=("work_lookup", "title_search", "citations", "cited_by", "topics"),
strengths="Best current open source for citation graph expansion and work-level discovery.",
caveats="Occasional noisy secondary records require conservative admission rules.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="pubmed",
label="PubMed / NCBI E-utilities",
category="metadata",
access="open API",
capabilities=("pmid_lookup", "title_search", "biomedical_metadata"),
strengths="High-value authoritative metadata for biomedical literature.",
caveats="Domain-specific coverage outside biomedicine is limited.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="datacite",
label="DataCite",
category="metadata",
access="open API",
capabilities=("doi_lookup", "title_search", "datasets"),
strengths="Important for datasets, reports, and non-traditional DOI-bearing objects.",
caveats="Less useful than Crossref/OpenAlex for mainstream article citation graphs.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="dblp",
label="DBLP",
category="metadata",
access="open API",
capabilities=("key_lookup", "search", "computer_science"),
strengths="Excellent computer-science coverage and clean bibliographic records.",
caveats="Discipline-specific rather than general-purpose.",
current_status="integrated",
priority="selective",
),
SourceCatalogEntry(
key="arxiv",
label="arXiv",
category="metadata+fulltext",
access="open API",
capabilities=("id_lookup", "search", "preprints"),
strengths="Useful for preprint-first fields and free full-text links.",
caveats="Not a general citation graph source.",
current_status="integrated",
priority="selective",
),
SourceCatalogEntry(
key="open_citations",
label="OpenCitations",
category="graph",
access="open API",
capabilities=("doi_citations", "doi_references", "provenance"),
strengths="Directly aligned with open citation-edge expansion.",
caveats="Coverage is narrower than OpenAlex and needs merge discipline.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="semantic_scholar",
label="Semantic Scholar",
category="metadata+graph",
access="free API with limits",
capabilities=("work_lookup", "search", "citations", "references"),
strengths="Strong graph and relevance signals, especially for discovery workflows.",
caveats="Not fully open data in the same sense as Crossref/OpenAlex/OpenCitations.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="unpaywall",
label="Unpaywall",
category="access-links",
access="open API",
capabilities=("doi_fulltext_links", "oa_status"),
strengths="Best open source for landing-page and OA-link enrichment.",
caveats="Improves access, not bibliographic identity or graph completeness.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="europe_pmc",
label="Europe PMC",
category="metadata+fulltext",
access="open API",
capabilities=("search", "citations", "fulltext_links", "biomedical"),
strengths="Valuable biomedical complement to PubMed with richer open-access linkage.",
caveats="Domain-specific and partially overlapping with PubMed/OpenAlex.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="open_library",
label="Open Library",
category="metadata",
access="open API",
capabilities=("title_search", "book_metadata", "author_catalog", "isbn_hints"),
strengths="Useful general-purpose open catalog coverage for books, monographs, and reference works.",
caveats="Catalog metadata is not a citation graph and can be noisy for article-like titles.",
current_status="integrated",
priority="selective",
),
SourceCatalogEntry(
key="openaire",
label="OpenAIRE",
category="metadata+repository",
access="open API",
capabilities=("repository_metadata", "oa_links", "project_links"),
strengths="Good for repository, project, and European OA discovery.",
caveats="May be better treated as a corpus-acquisition adapter than a first-line resolver.",
current_status="planned",
priority="evaluate",
),
SourceCatalogEntry(
key="oai_pmh",
label="OAI-PMH Repositories",
category="repository",
access="open protocol",
capabilities=("repository_harvest", "set_discovery", "metadata_formats"),
strengths="Already useful for theses, dissertations, and institutional repositories.",
caveats="Heterogeneous metadata quality; not a single canonical bibliographic source.",
current_status="integrated",
priority="selective",
),
)
def list_source_catalog() -> list[SourceCatalogEntry]:
    """Return a fresh list copy of the full source catalog."""
    return list(_CATALOG)
def prioritized_source_keys() -> list[str]:
    """Catalog keys ordered by rollout priority, then alphabetically by label.

    Unknown priority strings sort after all known ones instead of raising
    (the previous direct ``order[...]`` indexing raised KeyError as soon as
    a catalog entry used a new priority value).
    """
    order = {"now": 0, "next": 1, "selective": 2, "evaluate": 3}
    return [
        entry.key
        for entry in sorted(
            _CATALOG,
            key=lambda entry: (order.get(entry.priority, len(order)), entry.label.lower()),
        )
    ]

View File

@ -0,0 +1,210 @@
"""
CrossRef source plugin.
CrossRef provides metadata for DOIs for scholarly works.
"""
from __future__ import annotations
import json
import urllib.request
import urllib.parse
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources.base import BibliographicSource
class CrossRefSource(BibliographicSource):
"""CrossRef source for DOI-based metadata lookup."""
BASE_URL = "https://api.crossref.org"
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""Initialize CrossRef source.
Args:
config: Configuration with optional 'api_key'
"""
super().__init__(config)
self.api_key = self.config.get('api_key', '')
self.user_agent = self.config.get(
'user_agent',
'citegeist/0.1 (local research tool)',
)
def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
"""Look up a work by DOI.
Args:
doi: Digital Object Identifier
Returns:
BibEntry if found, None otherwise
"""
if not doi:
return None
encoded = urllib.parse.quote(doi, safe="")
url = f"{self.BASE_URL}/works/{encoded}"
headers = {'User-Agent': self.user_agent}
if self.api_key:
headers['X-Api-Key'] = self.api_key
try:
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
payload = json.loads(data)
return self._normalize_crossref(payload)
except Exception:
return None
def lookup_by_title(self, title: str) -> Optional[BibEntry]:
"""CrossRef doesn't support title-only lookup.
Returns None as this is not a supported operation.
"""
return None
def search(self, query: str, limit: int = 10) -> List[BibEntry]:
"""Search CrossRef for works.
Args:
query: Search query string
limit: Maximum number of results
Returns:
List of matching BibEntry objects
"""
if not query:
return []
encoded_query = urllib.parse.quote(query, safe="")
url = f"{self.BASE_URL}/works?query={encoded_query}&rows={limit}"
headers = {'User-Agent': self.user_agent}
if self.api_key:
headers['X-Api-Key'] = self.api_key
try:
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
payload = json.loads(data)
items = payload.get('message', {}).get('items', [])
return [entry for item in items if (entry := self._normalize_crossref(item)) is not None]
except Exception:
return []
def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
"""Normalize a raw CrossRef record to a BibEntry.
Args:
record: Raw record from CrossRef API
Returns:
BibEntry if normalization succeeds
"""
return self._normalize_crossref(record)
def get_identifier_scheme(self) -> str:
"""Return 'doi' as the identifier scheme."""
return 'doi'
def _normalize_crossref(self, payload: Dict[str, Any]) -> Optional[BibEntry]:
"""Normalize a CrossRef payload to a BibEntry.
Args:
payload: Raw JSON payload from CrossRef
Returns:
BibEntry object
"""
message = payload.get('message', payload)
if not message:
return None
# Extract basic fields
doi = str(message.get('DOI', ''))
title = ' '.join(message.get('title', [])) if message.get('title') else ''
author_data = message.get('author', [])
year = self._extract_year(message)
# Format authors
authors = []
for author in author_data:
given = str(author.get('given', ''))
family = str(author.get('family', ''))
if given and family:
authors.append(f"{given} {family}")
elif family:
authors.append(family)
# Get publisher
publisher = str(message.get('publisher', ''))
# Get journal info
container_title = message.get('container-title', [])
journal = container_title[0] if container_title else ''
# Get URL
url = str(message.get('URL', ''))
# Get abstract
abstract = self._extract_abstract(message.get('abstract'))
# Map to BibEntry
fields: Dict[str, str] = {}
if title:
fields['title'] = title
if authors:
fields['author'] = ' and '.join(authors)
if year:
fields['year'] = year
if doi:
fields['doi'] = doi
if journal:
fields['journal'] = journal
if publisher:
fields['publisher'] = publisher
if url:
fields['url'] = url
if abstract:
fields['abstract'] = abstract
citation_key = f"{authors[0] if authors else 'crossref'}_{year or 'n.d.'}_{title or doi}"
return BibEntry(
entry_type='article',
citation_key=citation_key,
fields=fields
)
def _extract_year(self, message: Dict[str, Any]) -> str:
    """Return the first publication year found, checking date fields in priority order."""
    date_fields = ('published-print', 'published-online', 'issued', 'created')
    candidates = (
        self._extract_year_from_date_parts(message.get(name, {}))
        for name in date_fields
    )
    return next((year for year in candidates if year), '')
def _extract_year_from_date_parts(self, field: Dict[str, Any]) -> str:
date_parts = field.get('date-parts', [])
if not date_parts:
return ''
first_part = date_parts[0]
if not first_part:
return ''
year = first_part[0]
return str(year) if year else ''
def _extract_abstract(self, raw_abstract: Any) -> str:
if isinstance(raw_abstract, str):
return raw_abstract.strip()
if isinstance(raw_abstract, list):
for item in raw_abstract:
if isinstance(item, dict):
text = str(item.get('value', '')).strip()
if text:
return text
elif isinstance(item, str) and item.strip():
return item.strip()
return ''

View File

@ -0,0 +1,157 @@
"""Europe PMC source plugin."""
from __future__ import annotations
import urllib.parse
from typing import Any, Dict, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources._old_sources_compat import SourceClient
from citegeist.sources.base import BibliographicSource
class EuropePmcSource(BibliographicSource):
    """Europe PMC source for biomedical metadata and OA/fulltext links."""

    # REST search endpoint; all queries go through this single module.
    BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent`` and an
                injectable ``source_client`` (useful for testing).
        """
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a single record by DOI.

        Args:
            doi: DOI string; surrounding whitespace is ignored.

        Returns:
            Normalized BibEntry, or None when the DOI is blank or unmatched.
        """
        normalized = doi.strip()
        if not normalized:
            return None
        query = f'DOI:"{normalized}"'
        row = self._search_one(query)
        return self.normalize(row) if row else None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Look up a single record by quoted title phrase.

        Args:
            title: Title text; internal whitespace is collapsed.

        Returns:
            Normalized BibEntry for the first hit, or None.
        """
        query_text = " ".join(title.split())
        if not query_text:
            return None
        query = f'TITLE:"{query_text}"'
        row = self._search_one(query)
        return self.normalize(row) if row else None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Search Europe PMC by title phrase.

        Args:
            query: Free-text query; collapsed and quoted into a TITLE search.
            limit: Maximum number of results requested (floored at 1).

        Returns:
            Normalized entries for every result row that normalizes cleanly.
        """
        query_text = " ".join(query.split())
        if not query_text:
            return []
        payload = self._search_payload(f'TITLE:"{query_text}"', max(1, limit))
        results = payload.get("resultList", {}).get("result", []) if payload else []
        return [entry for row in results if (entry := self.normalize(row)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw Europe PMC result row into a BibEntry.

        Args:
            record: A single ``resultList.result`` row (resultType=core).

        Returns:
            BibEntry, or None when the row has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None
        doi = str(record.get("doi") or "").strip()
        # For MED-sourced rows the Europe PMC "id" IS the PMID, so fall back to it.
        pmid = str(record.get("pmid") or record.get("id") or "").strip() if str(record.get("source") or "") == "MED" else str(record.get("pmid") or "").strip()
        pmcid = str(record.get("pmcid") or "").strip()
        year = str(record.get("pubYear") or "").strip()
        author_text = self._normalize_author_string(str(record.get("authorString") or "").strip())
        journal_title = str(record.get("journalTitle") or "").strip()
        abstract = str(record.get("abstractText") or "").strip()
        # Only non-empty values are emitted as fields.
        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if pmid:
            fields["pmid"] = pmid
        if pmcid:
            fields["pmcid"] = pmcid
        if year:
            fields["year"] = year
        if author_text:
            fields["author"] = author_text
        if journal_title:
            fields["journal"] = journal_title
        if volume := str(record.get("journalVolume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("pageInfo") or "").strip():
            fields["pages"] = pages
        if abstract:
            fields["abstract"] = abstract
        # Prefer a fulltext link; otherwise fall back to the article landing page.
        if fulltext_url := self._fulltext_url(record):
            fields["url"] = fulltext_url
        elif article_url := self._article_url(record):
            fields["url"] = article_url
        # Europe PMC encodes open-access status as "Y"/"N".
        if str(record.get("isOpenAccess") or "").strip():
            fields["is_oa"] = "true" if str(record.get("isOpenAccess")).upper() == "Y" else "false"
        if cited_by := str(record.get("citedByCount") or "").strip():
            fields["europepmc_cited_by_count"] = cited_by
        if source := str(record.get("source") or "").strip():
            fields["europepmc_source"] = source
        citation_key = self._citation_key(doi, pmid, author_text, year, title)
        return BibEntry(entry_type="article", citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the best fulltext (or landing-page) URL for a DOI, if any.

        Args:
            doi: DOI string; surrounding whitespace is ignored.

        Returns:
            URL string, or None when the DOI is blank or unmatched.
        """
        normalized = doi.strip()
        if not normalized:
            return None
        payload = self._search_payload(f'DOI:"{normalized}"', 1)
        results = payload.get("resultList", {}).get("result", []) if payload else []
        if not results:
            return None
        return self._fulltext_url(results[0]) or self._article_url(results[0])

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the primary identifier scheme."""
        return "doi"

    def _search_one(self, query: str) -> Dict[str, Any] | None:
        """Run a search and return only the first result row, if any."""
        payload = self._search_payload(query, 1)
        results = payload.get("resultList", {}).get("result", []) if payload else []
        return results[0] if results else None

    def _search_payload(self, query: str, page_size: int) -> Dict[str, Any] | None:
        """Fetch the raw JSON search payload (resultType=core) for a query."""
        params = {
            "query": query,
            "format": "json",
            "resultType": "core",
            "pageSize": max(1, page_size),
        }
        return self.source_client.try_get_json(f"{self.BASE_URL}?{urllib.parse.urlencode(params)}")

    def _fulltext_url(self, record: Dict[str, Any]) -> str:
        """Return the first fulltext URL in the record, or '' when none exist."""
        candidates = record.get("fullTextUrlList", {})
        if isinstance(candidates, dict):
            urls = candidates.get("fullTextUrl", [])
            # A single entry may arrive as a bare dict rather than a list.
            if isinstance(urls, dict):
                urls = [urls]
            if isinstance(urls, list):
                for item in urls:
                    if not isinstance(item, dict):
                        continue
                    url = str(item.get("url") or "").strip()
                    if url:
                        return url
        return ""

    def _article_url(self, record: Dict[str, Any]) -> str:
        """Build the Europe PMC landing-page URL from source + id, or ''."""
        source = str(record.get("source") or "").strip()
        identifier = str(record.get("id") or "").strip()
        if source and identifier:
            return f"https://europepmc.org/article/{source}/{identifier}"
        return ""

    def _normalize_author_string(self, value: str) -> str:
        """Convert Europe PMC's comma-joined author string to BibTeX 'and' form.

        Trailing periods on initials are stripped (e.g. "Doe J." -> "Doe J").
        """
        if not value:
            return ""
        authors = [part.strip().rstrip(".") for part in value.split(",") if part.strip()]
        return " and ".join(authors)

    def _citation_key(self, doi: str, pmid: str, author_text: str, year: str, title: str) -> str:
        """Build an alphanumeric citation key, preferring DOI, then PMID."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if pmid:
            return f"pmid{pmid}"
        # Fall back to first author's family name + year + first title word.
        family = author_text.split(" and ")[0].split()[-1] if author_text else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

View File

@ -0,0 +1,178 @@
"""OpenCitations source plugin."""
from __future__ import annotations
import urllib.parse
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources.base import BibliographicSource, CitationEdge
from citegeist.sources._old_sources_compat import SourceClient
class OpenCitationsSource(BibliographicSource):
    """OpenCitations source for DOI metadata and citation edges."""

    # Index API serves citation edges; Meta API serves bibliographic metadata.
    INDEX_BASE_URL = "https://api.opencitations.net/index/v2"
    META_BASE_URL = "https://api.opencitations.net/meta/v1"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent`` and an
                injectable ``source_client`` (useful for testing).
        """
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up metadata for a DOI via the OpenCitations Meta API.

        Args:
            doi: DOI string, with or without a 'doi:' prefix.

        Returns:
            Normalized BibEntry, or None when the DOI is blank or unmatched.
        """
        normalized = self._normalize_doi_pid(doi)
        if not normalized:
            return None
        rows = self.source_client.try_get_json(f"{self.META_BASE_URL}/metadata/{normalized}")
        if not rows:
            return None
        return self.normalize(rows[0])

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Title lookup is unsupported by OpenCitations; always returns None."""
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Free-text search is unsupported by OpenCitations; always returns []."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize an OpenCitations Meta row into a BibEntry.

        Args:
            record: A single metadata row from the Meta API.

        Returns:
            BibEntry, or None when the row lacks identifiers or a title.
        """
        ids = str(record.get("id") or "")
        title = str(record.get("title") or "").strip()
        if not ids or not title:
            return None
        doi = self._extract_id_value(ids, "doi")
        openalex = self._extract_id_value(ids, "openalex")
        year = self._extract_year(str(record.get("pub_date") or ""))
        authors = self._normalize_author_field(str(record.get("author") or ""))
        venue, venue_ids = self._parse_venue_field(str(record.get("venue") or ""))
        entry_type = self._map_entry_type(str(record.get("type") or ""))
        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
            fields["url"] = f"https://doi.org/{doi}"
        if openalex:
            fields["openalex"] = openalex
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if venue:
            # Articles carry the venue as 'journal'; everything else as 'booktitle'.
            if entry_type == "article":
                fields["journal"] = venue
            else:
                fields["booktitle"] = venue
        if volume := str(record.get("volume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("page") or "").strip():
            fields["pages"] = pages
        if publisher := self._strip_bracketed_ids(str(record.get("publisher") or "")):
            fields["publisher"] = publisher
        if venue_ids:
            fields["note"] = f"opencitations_venue_ids = {{{venue_ids}}}"
        citation_key = self._citation_key(doi, openalex, authors, year, title)
        return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)

    def get_citations(self, work_id: str, relation_type: str = "cites", limit: int = 10) -> List[CitationEdge]:
        """Fetch citation edges for a work from the OpenCitations Index.

        Args:
            work_id: DOI (with or without 'doi:' prefix) identifying the work.
            relation_type: 'cites' fetches the work's references; anything else
                fetches incoming citations.
            limit: Maximum number of edges to return.

        Returns:
            List of CitationEdge objects, each oriented as citing -> cited.
        """
        normalized = self._normalize_doi_pid(work_id)
        if not normalized:
            return []
        path = "references" if relation_type == "cites" else "citations"
        rows = self.source_client.try_get_json(f"{self.INDEX_BASE_URL}/{path}/{normalized}")
        if not rows:
            return []
        edges: List[CitationEdge] = []
        for row in rows[:limit]:
            citing = self._extract_id_value(str(row.get("citing") or ""), "doi")
            cited = self._extract_id_value(str(row.get("cited") or ""), "doi")
            if not citing or not cited:
                continue
            # Both Index endpoints already orient each row as a citing/cited
            # pair for a "cites" relation, so no swap is needed for either
            # direction. (An earlier version branched on relation_type with
            # two identical assignments; that dead branch has been removed.)
            edges.append(
                CitationEdge(
                    source_work_id=f"doi:{citing}",
                    target_work_id=f"doi:{cited}",
                    relation_type="cites",
                    source_type="opencitations",
                    source_label=f"opencitations:{path}:{normalized}",
                    confidence=0.85,
                )
            )
        return edges

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the primary identifier scheme."""
        return "doi"

    def _normalize_doi_pid(self, value: str) -> str:
        """Normalize a DOI into the 'doi:<value>' PID form the APIs expect."""
        doi = value.strip()
        if not doi:
            return ""
        if doi.lower().startswith("doi:"):
            doi = doi[4:]
        return f"doi:{doi}"

    def _extract_id_value(self, identifiers: str, scheme: str) -> str:
        """Extract the value for a scheme from a space-separated PID string."""
        prefix = f"{scheme}:"
        for token in identifiers.split():
            if token.startswith(prefix):
                return token[len(prefix):]
        return ""

    def _extract_year(self, pub_date: str) -> str:
        """Return the leading 4-digit year of a pub_date string, or ''."""
        pub_date = pub_date.strip()
        if len(pub_date) >= 4 and pub_date[:4].isdigit():
            return pub_date[:4]
        return ""

    def _normalize_author_field(self, raw_authors: str) -> str:
        """Convert a semicolon-joined author field (with bracketed IDs) to BibTeX 'and' form."""
        authors: List[str] = []
        for part in raw_authors.split(";"):
            cleaned = self._strip_bracketed_ids(part)
            cleaned = " ".join(cleaned.split())
            if cleaned:
                authors.append(cleaned)
        return " and ".join(authors)

    def _parse_venue_field(self, raw_venue: str) -> tuple[str, str]:
        """Split a venue field into (title, bracketed identifier string)."""
        raw_venue = raw_venue.strip()
        if not raw_venue:
            return "", ""
        if "[" not in raw_venue:
            return raw_venue, ""
        title, _, remainder = raw_venue.partition("[")
        return title.strip(), remainder.rstrip("] ").strip()

    def _strip_bracketed_ids(self, value: str) -> str:
        """Drop a trailing '[...]' identifier block from a field value."""
        return value.split("[", 1)[0].strip()

    def _map_entry_type(self, raw_type: str) -> str:
        """Map an OpenCitations type label to a BibTeX entry type."""
        lowered = raw_type.casefold()
        if lowered == "journal article":
            return "article"
        if lowered == "book":
            return "book"
        if lowered == "book chapter":
            return "incollection"
        if lowered in {"proceedings article", "conference paper"}:
            return "inproceedings"
        if "thesis" in lowered or "dissertation" in lowered:
            return "phdthesis"
        return "misc"

    def _citation_key(self, doi: str, openalex: str, authors: str, year: str, title: str) -> str:
        """Build an alphanumeric citation key, preferring DOI, then OpenAlex ID."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if openalex:
            return "openalex" + "".join(ch for ch in openalex.lower() if ch.isalnum())
        family = authors.split(" and ")[0].split(",")[0].split()[-1] if authors else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

View File

@ -0,0 +1,100 @@
"""Open Library source plugin."""
from __future__ import annotations
import urllib.parse
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources.base import BibliographicSource
from citegeist.sources._old_sources_compat import SourceClient
class OpenLibrarySource(BibliographicSource):
    """Open Library source for broad book and monograph metadata."""

    SEARCH_URL = "https://openlibrary.org/search.json"
    WORK_URL = "https://openlibrary.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source; honors 'user_agent' and injectable 'source_client' config keys."""
        super().__init__(config)
        agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Open Library has no DOI lookup; always returns None."""
        return None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best title match, if any."""
        matches = self.search(title, limit=1)
        return matches[0] if matches else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search Open Library by title and normalize the matching docs."""
        cleaned = " ".join(query.split())
        if not cleaned:
            return []
        encoded = urllib.parse.urlencode({"title": cleaned, "limit": max(1, limit), "fields": "*"})
        payload = self.source_client.try_get_json(f"{self.SEARCH_URL}?{encoded}")
        docs = payload.get("docs", []) if payload else []
        if not isinstance(docs, list):
            return []
        entries: List[BibEntry] = []
        for doc in docs:
            if not isinstance(doc, dict):
                continue
            entry = self.normalize(doc)
            if entry is not None:
                entries.append(entry)
        return entries

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize an Open Library search doc into a book BibEntry, or None without a title."""
        title = str(record.get("title") or "").strip()
        if not title:
            return None
        author_text = self._join_list(record.get("author_name"))
        year_text = self._extract_year(record)
        publisher_text = self._join_list(record.get("publisher"))
        work_key = str(record.get("key") or "").strip()
        fields: Dict[str, str] = {"title": title}
        if author_text:
            fields["author"] = author_text
        if year_text:
            fields["year"] = year_text
        if publisher_text:
            fields["publisher"] = publisher_text
        if work_key:
            fields["openlibrary_work"] = work_key
            fields["url"] = f"{self.WORK_URL}{work_key}"
        editions = record.get("edition_key") or []
        if isinstance(editions, list) and editions:
            fields["openlibrary_edition"] = str(editions[0])
        isbns = record.get("isbn") or []
        if isinstance(isbns, list) and isbns:
            fields["isbn"] = str(isbns[0])
        return BibEntry(
            entry_type="book",
            citation_key=self._citation_key(work_key, author_text, year_text, title),
            fields=fields,
        )

    def get_identifier_scheme(self) -> str:
        """Return 'openlibrary' as the identifier scheme."""
        return "openlibrary"

    def _extract_year(self, record: Dict[str, Any]) -> str:
        """Prefer first_publish_year; otherwise the first entry of publish_year."""
        first = record.get("first_publish_year")
        if first:
            return str(first)
        years = record.get("publish_year")
        return str(years[0]) if isinstance(years, list) and years else ""

    def _join_list(self, value: Any) -> str:
        """Join a list of values into BibTeX 'and' form, skipping blanks."""
        if not isinstance(value, list):
            return ""
        return " and ".join(text for item in value if (text := str(item).strip()))

    def _citation_key(self, work_key: str, authors: str, year: str, title: str) -> str:
        """Build an alphanumeric key, preferring the Open Library work key."""
        if work_key:
            sanitized = "".join(ch for ch in work_key.lower() if ch.isalnum())
            return f"ol{sanitized}"
        family = authors.split(" and ")[0].split()[-1] if authors else "book"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "book"
        words = title.split()
        lead = "".join(ch for ch in (words[0] if words else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{lead or 'untitled'}"

View File

@ -0,0 +1,253 @@
"""
Source registry for managing bibliographic source plugins.
This module provides a registry that can discover, load, and manage
multiple bibliographic source plugins.
"""
from __future__ import annotations
import importlib.util
import inspect
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Type
from citegeist.sources.base import BibliographicSource
@dataclass(slots=True)
class SourceRegistration:
    """Registration information for a source plugin."""

    # Name under which the source was registered.
    name: str
    # Class implementing BibliographicSource; instantiated lazily by the registry.
    source_class: Type[BibliographicSource]
    # Configuration dict handed to the source constructor.
    config: Dict[str, Any]
    # Whether the registry may instantiate this source.
    enabled: bool
class SourceRegistry:
    """Registry for bibliographic source plugins.

    This class manages the discovery, registration, and instantiation
    of bibliographic source plugins. Instances are created lazily and
    cached per registered name.
    """

    def __init__(self) -> None:
        """Initialize the source registry."""
        # name -> registration metadata
        self._registrations: Dict[str, SourceRegistration] = {}
        # name -> lazily created, cached source instance
        self._instances: Dict[str, BibliographicSource] = {}

    def register(
        self,
        source_class: Type[BibliographicSource],
        name: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Register a source class.

        Re-registering a name replaces the previous registration (but note
        that an already-cached instance is not evicted).

        Args:
            source_class: The source class to register (must inherit from BibliographicSource)
            name: Optional name for the source (uses class name if not provided)
            config: Optional configuration dictionary; its 'enabled' key
                (default True) controls whether the source may be instantiated

        Raises:
            ValueError: If source_class is not a BibliographicSource subclass
        """
        if not inspect.isclass(source_class) or not issubclass(source_class, BibliographicSource):
            raise ValueError(f"{source_class} must be a subclass of BibliographicSource")
        source_name = name or source_class.__name__
        self._registrations[source_name] = SourceRegistration(
            name=source_name,
            source_class=source_class,
            config=config or {},
            enabled=config.get('enabled', True) if config else True
        )

    def get(self, name: str) -> Optional[BibliographicSource]:
        """Get a source instance by name.

        Args:
            name: Name of the source

        Returns:
            Source instance if registered and enabled, None otherwise
        """
        if name not in self._registrations:
            return None
        registration = self._registrations[name]
        # Return cached instance if available.
        # NOTE(review): the cache is consulted before the enabled flag, so an
        # instance created while enabled is still returned after a re-register
        # with enabled=False — confirm this is intended.
        if name in self._instances:
            return self._instances[name]
        # Create new instance
        if not registration.enabled:
            return None
        instance = registration.source_class(config=registration.config)
        self._instances[name] = instance
        return instance

    def list_sources(self, enabled_only: bool = False) -> List[str]:
        """List registered source names.

        Args:
            enabled_only: Only return enabled sources

        Returns:
            List of source names
        """
        sources = list(self._registrations.keys())
        if enabled_only:
            return [name for name, reg in self._registrations.items() if reg.enabled]
        return sources

    def get_config(self, name: str) -> Optional[Dict[str, Any]]:
        """Get configuration for a source.

        Args:
            name: Name of the source

        Returns:
            Configuration dictionary, or None if not found
        """
        registration = self._registrations.get(name)
        return registration.config if registration else None

    def load_from_file(self, filepath: str) -> None:
        """Load source plugins from a Python file.

        Every BibliographicSource subclass found in the module is registered
        under its class name with an empty configuration.

        Args:
            filepath: Path to Python file containing source classes

        Raises:
            ImportError: If the file cannot be loaded as a module
        """
        spec = importlib.util.spec_from_file_location("module.sources", filepath)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load module from {filepath}")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        # Find all classes that inherit from BibliographicSource
        # (excluding the base class itself, which plugin modules import).
        for name, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, BibliographicSource) and obj is not BibliographicSource:
                self.register(obj)

    def load_from_directory(self, directory: str) -> None:
        """Load source plugins from a directory.

        Only top-level '*.py' files are considered; underscore-prefixed
        files (e.g. __init__.py, private helpers) are skipped.

        Args:
            directory: Path to directory containing source plugin files
        """
        import os
        for filename in os.listdir(directory):
            if filename.endswith('.py') and not filename.startswith('_'):
                filepath = os.path.join(directory, filename)
                self.load_from_file(filepath)

    def from_config_dict(self, config: Dict[str, Any]) -> None:
        """Load sources from a configuration dictionary.

        Example config format:
            {
                "sources": {
                    "crossref": {
                        "source_type": "crossref",
                        "enabled": true
                    },
                    "semantic_scholar": {
                        "source_type": "semantic_scholar",
                        "enabled": true,
                        "api_key": "..."
                    }
                }
            }

        Args:
            config: Configuration dictionary; ignored when it has no 'sources' key

        Raises:
            ValueError: If an entry's source_type is not a known source
        """
        if 'sources' not in config:
            return
        for name, source_config in config['sources'].items():
            source_name = str(name)
            # The entry name doubles as the source type when none is given.
            source_type = str(source_config.get('source_type', source_name))
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=source_config
            )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize registry to dictionary.

        Returns:
            Dictionary representation of registry
        """
        return {
            name: {
                'enabled': reg.enabled,
                'config': reg.config
            }
            for name, reg in self._registrations.items()
        }

    def from_dict(self, data: Dict[str, Any]) -> None:
        """Load registry from dictionary.

        Args:
            data: Dictionary representation of registry; each entry may carry
                'source_type' (defaults to the entry name) and 'config'
                (defaults to the entry itself)
        """
        for name, source_data in data.items():
            source_name = str(name)
            source_type = str(source_data.get('source_type', source_name))
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=source_data.get('config', source_data)
            )

    def get_registered_sources(self) -> List[SourceRegistration]:
        """Get all registered source registrations.

        Returns:
            List of SourceRegistration objects
        """
        return list(self._registrations.values())

    def _resolve_source_class(self, source_type: str) -> Type[BibliographicSource]:
        """Map a source-type string to its plugin class.

        Imports are deferred to avoid loading every plugin module up front.

        Raises:
            ValueError: If the source type is not recognized
        """
        normalized = source_type.strip().lower().replace('-', '_')
        if normalized in {'crossref', 'cross_ref'}:
            from citegeist.sources.crossref import CrossRefSource
            return CrossRefSource
        if normalized in {'opencitations', 'open_citations'}:
            from citegeist.sources.opencitations import OpenCitationsSource
            return OpenCitationsSource
        if normalized == 'unpaywall':
            from citegeist.sources.unpaywall import UnpaywallSource
            return UnpaywallSource
        if normalized in {'europepmc', 'europe_pmc'}:
            from citegeist.sources.europepmc import EuropePmcSource
            return EuropePmcSource
        if normalized in {'semanticscholar', 'semantic_scholar'}:
            from citegeist.sources.semanticscholar import SemanticScholarSource
            return SemanticScholarSource
        if normalized in {"openlibrary", "open_library"}:
            from citegeist.sources.openlibrary import OpenLibrarySource
            return OpenLibrarySource
        raise ValueError(f"Unknown source type: {source_type}")
# Global registry instance, created at import time and shared process-wide.
_global_registry = SourceRegistry()


def get_registry() -> SourceRegistry:
    """Get the global source registry instance.

    Returns:
        The global SourceRegistry instance
    """
    return _global_registry

View File

@ -0,0 +1,140 @@
"""Semantic Scholar source plugin."""
from __future__ import annotations
import json
import os
import urllib.parse
import urllib.request
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources.base import BibliographicSource
class SemanticScholarSource(BibliographicSource):
    """Semantic Scholar source for broad scientific metadata coverage."""

    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    # Fields requested from the Graph API on every call.
    DEFAULT_FIELDS = (
        "paperId,title,year,abstract,authors,externalIds,journal,venue,url,"
        "openAccessPdf,citationCount,publicationTypes"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``api_key`` (falling
                back to the SEMANTIC_SCHOLAR_API_KEY environment variable)
                and ``user_agent``.
        """
        super().__init__(config)
        self.api_key = str(
            self.config.get("api_key")
            or os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
            or ""
        ).strip()
        self.user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a paper by DOI via the Graph API.

        Args:
            doi: DOI string; surrounding whitespace is ignored.

        Returns:
            Normalized BibEntry, or None on blank DOI or request failure.
        """
        normalized = doi.strip()
        if not normalized:
            return None
        encoded = urllib.parse.quote(f"DOI:{normalized}", safe="")
        payload = self._get_json(f"{self.BASE_URL}/paper/{encoded}?fields={self.DEFAULT_FIELDS}")
        if not payload:
            return None
        return self.normalize(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best search match for a title, if any."""
        matches = self.search(title, limit=1)
        return matches[0] if matches else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search the Graph API for papers matching a free-text query.

        Args:
            query: Free-text query; internal whitespace is collapsed.
            limit: Maximum number of results requested (floored at 1).

        Returns:
            Normalized entries for every result row that normalizes cleanly.
        """
        query_text = " ".join(query.split())
        if not query_text:
            return []
        params = urllib.parse.urlencode(
            {"query": query_text, "limit": max(1, limit), "fields": self.DEFAULT_FIELDS}
        )
        payload = self._get_json(f"{self.BASE_URL}/paper/search?{params}")
        if not payload:
            return []
        return [entry for row in payload.get("data", []) if (entry := self.normalize(row)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a Semantic Scholar paper record into a BibEntry.

        Args:
            record: A paper object from the Graph API.

        Returns:
            BibEntry, or None when the record has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None
        external_ids = record.get("externalIds") or {}
        doi = str(external_ids.get("DOI") or "").strip()
        authors = " and ".join(
            str(author.get("name") or "").strip()
            for author in record.get("authors", [])
            if str(author.get("name") or "").strip()
        )
        year = str(record.get("year") or "").strip()
        abstract = str(record.get("abstract") or "").strip()
        journal = record.get("journal") or {}
        # Prefer the journal name; fall back to the venue string.
        journal_name = str(journal.get("name") or record.get("venue") or "").strip()
        open_access_pdf = record.get("openAccessPdf") or {}
        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if paper_id := str(record.get("paperId") or "").strip():
            fields["semanticscholar_id"] = paper_id
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if abstract:
            fields["abstract"] = abstract
        if journal_name:
            # Conference papers carry the venue as 'booktitle'.
            if self._entry_type(record) == "inproceedings":
                fields["booktitle"] = journal_name
            else:
                fields["journal"] = journal_name
        # Prefer the OA PDF link over the paper landing page.
        if url := str(open_access_pdf.get("url") or record.get("url") or "").strip():
            fields["url"] = url
        if open_access_pdf:
            fields["is_oa"] = "true"
        # NOTE(review): a falsy citationCount (0) is skipped here — confirm
        # that zero counts should be omitted rather than recorded as "0".
        if citation_count := record.get("citationCount"):
            fields["semanticscholar_citation_count"] = str(citation_count)
        citation_key = self._citation_key(doi, str(record.get("paperId") or ""), authors, year, title)
        return BibEntry(entry_type=self._entry_type(record), citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the URL recorded for a DOI's entry, if the lookup succeeds."""
        entry = self.lookup_by_doi(doi)
        if entry is None:
            return None
        return entry.fields.get("url")

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the primary identifier scheme."""
        return "doi"

    def _entry_type(self, record: Dict[str, Any]) -> str:
        """Derive a BibTeX entry type from publicationTypes / venue data."""
        publication_types = [str(item).lower() for item in (record.get("publicationTypes") or [])]
        if any("conference" in item for item in publication_types):
            return "inproceedings"
        if any("review" in item for item in publication_types):
            return "article"
        if record.get("journal") or record.get("venue"):
            return "article"
        return "misc"

    def _citation_key(self, doi: str, paper_id: str, authors: str, year: str, title: str) -> str:
        """Build an alphanumeric citation key, preferring DOI, then the S2 paper id."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if paper_id:
            return "s2" + "".join(ch for ch in paper_id.lower() if ch.isalnum())
        family = authors.split(" and ")[0].split()[-1] if authors else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

    def _get_json(self, url: str) -> Dict[str, Any] | None:
        """Fetch JSON from the API, attaching the API key header when configured.

        Any failure (network, HTTP error, bad JSON) deliberately degrades to
        None so callers treat it as 'no result'.
        """
        headers = {"User-Agent": self.user_agent}
        if self.api_key:
            headers["x-api-key"] = self.api_key
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read().decode("utf-8"))
        except Exception:
            return None

View File

@ -0,0 +1,116 @@
"""Unpaywall source plugin."""
from __future__ import annotations
import os
import urllib.parse
from typing import Any, Dict, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources._old_sources_compat import SourceClient
from citegeist.sources.base import BibliographicSource
class UnpaywallSource(BibliographicSource):
    """Unpaywall source for DOI-based OA link enrichment."""

    BASE_URL = "https://api.unpaywall.org/v2"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent``, an
                injectable ``source_client``, and ``email`` (falling back to
                the UNPAYWALL_EMAIL or NCBI_EMAIL environment variables —
                Unpaywall requires an email on every request).
        """
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)
        self.email = str(
            self.config.get("email")
            or os.environ.get("UNPAYWALL_EMAIL")
            or os.environ.get("NCBI_EMAIL")
            or ""
        ).strip()

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up OA metadata for a DOI.

        Args:
            doi: DOI string; surrounding whitespace is ignored.

        Returns:
            Normalized BibEntry, or None when unavailable.
        """
        payload = self.lookup_oa_record(doi)
        if not payload:
            return None
        return self.normalize(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Unpaywall has no title lookup; always returns None."""
        return None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Unpaywall has no search endpoint; always returns []."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize an Unpaywall record into a BibEntry of OA fields.

        Args:
            record: Raw Unpaywall JSON record.

        Returns:
            BibEntry (entry_type 'misc'), or None when the record has no DOI.
        """
        doi = str(record.get("doi") or "").strip()
        # Synthesize a title from the DOI when the record carries none.
        title = str(record.get("title") or "").strip() or (doi and f"OA record for DOI {doi}")
        if not doi or not title:
            return None
        fields: Dict[str, str] = {
            "title": title,
            "doi": doi,
        }
        if year := str(record.get("year") or "").strip():
            fields["year"] = year
        if landing_url := self._best_landing_url(record):
            fields["url"] = landing_url
            fields["best_oa_url"] = landing_url
        if pdf_url := self._best_pdf_url(record):
            fields["best_oa_pdf_url"] = pdf_url
        if oa_status := str(record.get("oa_status") or "").strip():
            fields["oa_status"] = oa_status
        if license_name := self._best_license(record):
            fields["oa_license"] = license_name
        if host_type := self._best_host_type(record):
            fields["oa_host_type"] = host_type
        if version := self._best_version(record):
            fields["oa_version"] = version
        if evidence := self._best_evidence(record):
            fields["oa_evidence"] = evidence
        # is_oa is a tri-state: absent/None means unknown, so only emit when set.
        if record.get("is_oa") is not None:
            fields["is_oa"] = "true" if bool(record.get("is_oa")) else "false"
        citation_key = "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        return BibEntry(entry_type="misc", citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the best OA PDF URL for a DOI, falling back to the landing page."""
        payload = self.lookup_oa_record(doi)
        if not payload:
            return None
        return self._best_pdf_url(payload) or self._best_landing_url(payload)

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the identifier scheme."""
        return "doi"

    def is_available(self) -> bool:
        """Usable only when enabled and a contact email is configured."""
        return self.enabled and bool(self.email)

    def lookup_oa_record(self, doi: str) -> Dict[str, Any] | None:
        """Fetch the raw Unpaywall record for a DOI, or None when not possible."""
        normalized = doi.strip()
        if not normalized or not self.email:
            return None
        encoded = urllib.parse.quote(normalized, safe="")
        query = urllib.parse.urlencode({"email": self.email})
        return self.source_client.try_get_json(f"{self.BASE_URL}/{encoded}?{query}")

    def _best_location_field(self, payload: Dict[str, Any], key: str, fallback_key: str = "") -> str:
        """Read one field from best_oa_location, with an optional fallback key.

        Shared by the _best_* accessors below, which previously each
        duplicated this extraction. Returns '' when absent.
        """
        location = payload.get("best_oa_location") or {}
        value = location.get(key)
        if not value and fallback_key:
            value = location.get(fallback_key)
        return str(value or "").strip()

    def _best_landing_url(self, payload: Dict[str, Any]) -> str:
        """Landing-page URL of the best OA location, or ''."""
        return self._best_location_field(payload, "url", "url_for_landing_page")

    def _best_pdf_url(self, payload: Dict[str, Any]) -> str:
        """Direct PDF URL of the best OA location, or ''."""
        return self._best_location_field(payload, "url_for_pdf")

    def _best_license(self, payload: Dict[str, Any]) -> str:
        """License of the best OA location, or ''."""
        return self._best_location_field(payload, "license")

    def _best_host_type(self, payload: Dict[str, Any]) -> str:
        """Host type (e.g. publisher/repository) of the best OA location, or ''."""
        return self._best_location_field(payload, "host_type")

    def _best_version(self, payload: Dict[str, Any]) -> str:
        """Manuscript version of the best OA location, or ''."""
        return self._best_location_field(payload, "version")

    def _best_evidence(self, payload: Dict[str, Any]) -> str:
        """Evidence string of the best OA location, or ''."""
        return self._best_location_field(payload, "evidence")

View File

@ -138,6 +138,7 @@ class TalkOriginsEnrichmentResult:
applied: bool applied: bool
source_label: str = "" source_label: str = ""
weak_reasons_after: list[str] | None = None weak_reasons_after: list[str] | None = None
resolution_attempts: list[dict[str, object]] | None = None
conflicts: list[dict[str, str]] | None = None conflicts: list[dict[str, str]] | None = None
error: str = "" error: str = ""
@ -545,8 +546,28 @@ class TalkOriginsScraper:
if not weak_reasons_before: if not weak_reasons_before:
continue continue
resolution = None resolution = None
attempts: list[dict[str, object]] = []
error = "" error = ""
try: try:
resolver_with_trace = getattr(self.resolver, "resolve_entry_with_trace", None)
resolver_plain = getattr(self.resolver, "resolve_entry", None)
plain_func = getattr(resolver_plain, "__func__", None)
trace_func = getattr(resolver_with_trace, "__func__", None)
use_trace = (
resolver_with_trace is not None
and (
trace_func is None
or (
plain_func is MetadataResolver.resolve_entry
and trace_func is MetadataResolver.resolve_entry_with_trace
)
)
)
if use_trace:
outcome = self.resolver.resolve_entry_with_trace(canonical)
resolution = outcome.resolution
attempts = [asdict(attempt) for attempt in outcome.attempts]
else:
resolution = self.resolver.resolve_entry(canonical) resolution = self.resolver.resolve_entry(canonical)
except Exception as exc: except Exception as exc:
error = str(exc) error = str(exc)
@ -559,6 +580,7 @@ class TalkOriginsScraper:
applied=False, applied=False,
source_label=resolution.source_label if resolution is not None else "", source_label=resolution.source_label if resolution is not None else "",
error=error, error=error,
resolution_attempts=attempts,
) )
if resolution is not None: if resolution is not None:

123
tests/test_europepmc.py Normal file
View File

@ -0,0 +1,123 @@
from __future__ import annotations
from citegeist.resolve import MetadataResolver
from citegeist.sources import EuropePmcSource, SourceRegistry, list_source_catalog
def test_europepmc_source_normalizes_core_record() -> None:
    """A raw Europe PMC MED record maps onto the expected BibTeX-style fields."""
    raw_record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "pmcid": "PMC10000001",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
        "journalVolume": "16",
        "issue": "1",
        "pageInfo": "10-20",
        "abstractText": "Abstract text.",
        "isOpenAccess": "Y",
        "citedByCount": 12,
        "fullTextUrlList": {"fullTextUrl": [{"url": "https://europepmc.org/articles/PMC10000001?pdf=render"}]},
    }
    normalized = EuropePmcSource(config={}).normalize(raw_record)
    assert normalized is not None
    expected = {
        "doi": "10.1000/example",
        "pmid": "37158217",
        "pmcid": "PMC10000001",
        "journal": "Biomed Journal",
        "url": "https://europepmc.org/articles/PMC10000001?pdf=render",
        "is_oa": "true",
    }
    for field, value in expected.items():
        assert normalized.fields[field] == value
def test_europepmc_registry_and_catalog() -> None:
    """The registry builds a EuropePmcSource and the catalog marks it integrated."""
    config = {"sources": {"europepmc": {"source_type": "europepmc", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("europepmc"), EuropePmcSource)
    by_key = {item.key: item for item in list_source_catalog()}
    europe_pmc = by_key["europe_pmc"]
    assert europe_pmc.current_status == "integrated"
    assert europe_pmc.priority == "now"
def test_metadata_resolver_uses_europepmc_doi_after_primary_lookups_fail() -> None:
    """Resolver falls back to the Europe PMC DOI lookup once the Crossref and
    DataCite DOI lookups both report "not found"."""
    resolver = MetadataResolver()
    # Stub the higher-priority DOI lookups so they miss.
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
    # Europe PMC lookup returns one normalized record for the requested DOI.
    resolver.europepmc.lookup_by_doi = lambda _doi: resolver.europepmc.normalize(  # type: ignore[method-assign]
        {
            "id": "37158217",
            "source": "MED",
            "pmid": "37158217",
            "doi": "10.1000/example",
            "title": "Biomedical Example",
            "authorString": "Doe J, Roe A",
            "journalTitle": "Biomed Journal",
            "pubYear": "2024",
        }
    )
    from citegeist.bibtex import BibEntry
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"doi": "10.1000/example", "title": "Biomedical Example"},
        )
    )
    assert result is not None
    # The winning label records both the provider and the DOI used.
    assert result.source_label == "europepmc:doi:10.1000/example"
    assert result.entry.fields["pmid"] == "37158217"
def test_metadata_resolver_uses_europepmc_title_search_after_pubmed() -> None:
    """Title search falls through the Crossref/DataCite/OpenAlex/PubMed stubs
    and lands on the Europe PMC search."""
    resolver = MetadataResolver()
    # Every higher-priority best-match search is stubbed to "no match".
    resolver.search_crossref_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_datacite_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_openalex_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_pubmed_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    # Europe PMC search yields exactly one normalized candidate.
    resolver.europepmc.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.europepmc.normalize(
            {
                "id": "37158217",
                "source": "MED",
                "pmid": "37158217",
                "doi": "10.1000/example",
                "title": "Biomedical Example",
                "authorString": "Doe J, Roe A",
                "journalTitle": "Biomed Journal",
                "pubYear": "2024",
            }
        )
    ]
    from citegeist.bibtex import BibEntry
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"title": "Biomedical Example", "author": "Doe J", "year": "2024"},
        )
    )
    assert result is not None
    # Label records that the Europe PMC title search produced the match.
    assert result.source_label == "europepmc:search:Biomedical Example"

137
tests/test_opencitations.py Normal file
View File

@ -0,0 +1,137 @@
from __future__ import annotations
from citegeist.expand import OpenCitationsExpander
from citegeist.sources import OpenCitationsSource
from citegeist.storage import BibliographyStore
def test_opencitations_source_normalizes_metadata_row() -> None:
    """An OpenCitations Meta row is split into clean bibliographic fields."""
    meta_row = {
        "id": "doi:10.1000/example openalex:W1234567890 omid:br/06123",
        "title": "Example Work",
        "author": "Doe, Jane [omid:ra/1]; Roe, Alex [omid:ra/2]",
        "pub_date": "2024-05",
        "venue": "Journal of Examples [issn:1234-5678]",
        "volume": "12",
        "issue": "3",
        "page": "10-20",
        "type": "journal article",
        "publisher": "Example Press [crossref:123]",
    }
    entry = OpenCitationsSource(config={}).normalize(meta_row)
    assert entry is not None
    fields = entry.fields
    # Identifiers are pulled out of the space-separated "id" column, and the
    # bracketed identifier annotations are stripped from author/venue/publisher.
    assert fields["doi"] == "10.1000/example"
    assert fields["openalex"] == "W1234567890"
    assert fields["author"] == "Doe, Jane and Roe, Alex"
    assert fields["journal"] == "Journal of Examples"
    assert fields["publisher"] == "Example Press"
    assert fields["year"] == "2024"
def test_opencitations_source_builds_edges_for_references() -> None:
    """COCI citation rows become edges keyed by doi:-scheme work ids."""
    source = OpenCitationsSource(config={})
    # Canned COCI response: one citing -> cited pair carrying omid + doi ids.
    source.source_client.get_json = lambda _url: [  # type: ignore[method-assign]
        {
            "oci": "1-2",
            "citing": "omid:br/1 doi:10.1000/source",
            "cited": "omid:br/2 doi:10.1000/target",
            "creation": "2024-01-01",
        }
    ]
    edges = source.get_citations("10.1000/source", relation_type="cites", limit=10)
    assert len(edges) == 1
    # Work ids are normalized to the doi: scheme (omid dropped).
    assert edges[0].source_work_id == "doi:10.1000/source"
    assert edges[0].target_work_id == "doi:10.1000/target"
def test_opencitations_expander_creates_reference_nodes_and_relations() -> None:
    """Expanding a seed entry along 'cites' creates the referenced work in the
    store and records a seed -> target relation."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/source}
            }
            """
        )
        expander = OpenCitationsExpander()
        # One fake client serves both endpoints: the COCI /references/ call
        # returns the outgoing edge; any other URL returns the Meta row for
        # the target work.
        expander.source.source_client.get_json = lambda url: [  # type: ignore[method-assign]
            {
                "oci": "1-2",
                "citing": "omid:br/1 doi:10.1000/source",
                "cited": "omid:br/2 doi:10.1000/target",
                "creation": "2024-01-01",
            }
        ] if "/references/" in url else [
            {
                "id": "doi:10.1000/target omid:br/2",
                "title": "Target Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2023",
                "venue": "Journal of Targets [issn:1111-1111]",
                "type": "journal article",
            }
        ]
        # DOI resolution is disabled so metadata comes from OpenCitations only.
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
        results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=10)
        # Discovered key is the slugified DOI of the target work.
        assert [item.discovered_citation_key for item in results] == ["doi101000target"]
        discovered = store.get_entry("doi101000target")
        assert discovered is not None
        assert discovered["title"] == "Target Work"
        assert store.get_relations("seed2024") == ["doi101000target"]
    finally:
        store.close()
def test_opencitations_expander_supports_cited_by_direction() -> None:
    """Expanding along 'cited_by' creates the citing work and records the
    relation in the reversed (citing -> seed) direction."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/seed}
            }
            """
        )
        expander = OpenCitationsExpander()
        # /citations/ returns the inbound edge; other URLs return the Meta row
        # for the citing work.
        expander.source.source_client.get_json = lambda url: [  # type: ignore[method-assign]
            {
                "oci": "2-1",
                "citing": "omid:br/2 doi:10.1000/citing",
                "cited": "omid:br/1 doi:10.1000/seed",
                "creation": "2024-01-01",
            }
        ] if "/citations/" in url else [
            {
                "id": "doi:10.1000/citing omid:br/2",
                "title": "Citing Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2025",
                "venue": "Journal of Citers [issn:1111-1111]",
                "type": "journal article",
            }
        ]
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
        results = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=10)
        assert [item.discovered_citation_key for item in results] == ["doi101000citing"]
        # The relation points from the citing work to the seed.
        assert store.get_relations("doi101000citing") == ["seed2024"]
    finally:
        store.close()

188
tests/test_openlibrary.py Normal file
View File

@ -0,0 +1,188 @@
from __future__ import annotations
from citegeist.bibtex import BibEntry
from citegeist.resolve import MetadataResolver
from citegeist.sources import OpenLibrarySource, SourceRegistry, list_source_catalog
class FakeSourceClient:
    """Minimal HTTP-client stand-in: answers every request with one canned payload."""

    def __init__(self, payload: dict[str, object]) -> None:
        # Kept by reference; a fresh shallow copy is handed out per request.
        self.payload = payload

    def try_get_json(self, _url: str) -> dict[str, object]:
        """Ignore the URL and return a shallow copy of the canned payload."""
        return {**self.payload}
def test_openlibrary_source_normalizes_book_record() -> None:
    """An Open Library search doc becomes a @book entry with catalog identifiers."""
    doc = {
        "title": "The Nature of the Stratigraphic Record",
        "author_name": ["D. V. Ager"],
        "first_publish_year": 1973,
        "publisher": ["Macmillan"],
        "key": "/works/OL82563W",
        "edition_key": ["OL12345M"],
        "isbn": ["9781234567890"],
    }
    source = OpenLibrarySource(config={"source_client": FakeSourceClient({})})
    entry = source.normalize(doc)
    assert entry is not None
    assert entry.entry_type == "book"
    expected_fields = {
        "title": "The Nature of the Stratigraphic Record",
        "author": "D. V. Ager",
        "year": "1973",
        "publisher": "Macmillan",
        "openlibrary_work": "/works/OL82563W",
        "openlibrary_edition": "OL12345M",
        "isbn": "9781234567890",
    }
    for name, value in expected_fields.items():
        assert entry.fields[name] == value
def test_openlibrary_registry_and_catalog() -> None:
    """The registry builds an OpenLibrarySource; the catalog lists its capabilities."""
    config = {"sources": {"openlibrary": {"source_type": "openlibrary", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("openlibrary"), OpenLibrarySource)
    by_key = {item.key: item for item in list_source_catalog()}
    open_library = by_key["open_library"]
    assert open_library.current_status == "integrated"
    assert "book_metadata" in open_library.capabilities
def test_metadata_resolver_uses_openlibrary_after_other_searches_fail() -> None:
    """For book entries, Open Library is the last title-search fallback."""
    resolver = MetadataResolver()
    # Every earlier search tier returns no candidates.
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]
    # Open Library returns a matching catalog record.
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The Nature of the Stratigraphic Record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="book",
            citation_key="seed1973",
            fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
        )
    )
    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
def test_metadata_resolver_trace_records_fallback_attempts() -> None:
    """resolve_entry_with_trace logs one attempt per consulted source, in order."""
    resolver = MetadataResolver()
    # All search tiers before Open Library return nothing.
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    ]
    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="book",
            citation_key="seed1980",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    )
    assert outcome.resolution is not None
    assert outcome.resolution.source_label == "openlibrary:search:Example Book"
    # The final two attempts are Semantic Scholar (miss) then Open Library (hit).
    assert [attempt.source_name for attempt in outcome.attempts[-2:]] == ["semanticscholar", "openlibrary"]
    assert outcome.attempts[-1].matched is True
    assert outcome.attempts[-1].candidate_count == 1
def test_metadata_resolver_uses_fuzzy_catalog_match_for_book_titles() -> None:
    """A catalog title differing in spelling and case still matches the seed book."""
    resolver = MetadataResolver()
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]
    # Candidate title uses "stratigraphical" (lower-case) vs the query's
    # "Stratigraphic" — the fuzzy matcher must still accept it.
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The nature of the stratigraphical record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="book",
            citation_key="seed1973",
            fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
        )
    )
    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
def test_metadata_resolver_skips_openlibrary_for_article_like_entries() -> None:
    """Journal-article entries never consult the Open Library catalog."""
    resolver = MetadataResolver()
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]
    # Flag records whether Open Library was (wrongly) consulted.
    called = {"openlibrary": False}

    def fake_openlibrary(_title: str, limit: int = 5) -> list[BibEntry]:
        called["openlibrary"] = True
        return []

    resolver.search_openlibrary = fake_openlibrary  # type: ignore[method-assign]
    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="article",
            citation_key="seed1977",
            fields={
                "title": "Fast locomotion of some African ungulates",
                "author": "Alexander, R. M.",
                "year": "1977",
                "journal": "Journal of Zoology",
            },
        )
    )
    assert outcome.resolution is None
    assert called["openlibrary"] is False
    # The trace must not even contain an openlibrary attempt.
    assert all(attempt.source_name != "openlibrary" for attempt in outcome.attempts)

View File

@ -0,0 +1,201 @@
"""Tests for identifier resolution and normalization."""
from __future__ import annotations
import pytest
from citegeist.resolver import (
IdentifierExtractor,
IdentifierNormalizer,
IdentifierResolver,
extract_identifiers,
normalize_identifier,
get_primary_identifier,
resolve_identifiers,
)
class TestIdentifierExtractor:
    """Behavior of IdentifierExtractor.extract."""

    def test_extract_from_entry(self):
        """Only identifier-bearing fields survive extraction."""
        extracted = IdentifierExtractor.extract(
            {
                'doi': '10.1234/example',
                'title': 'Test Title',
                'author': 'John Doe',
                'pmid': '123456',
            }
        )
        assert extracted.get('doi') == '10.1234/example'
        assert extracted.get('pmid') == '123456'
        # Descriptive fields such as the title are not identifiers.
        assert 'title' not in extracted

    def test_extract_multiple_identifiers(self):
        """Every supported scheme present in the fields is extracted."""
        fields = {
            'doi': '10.1234/example',
            'pmid': '123456',
            'arxiv': '2310.12345',
            'isbn': '978-0-123456-78-9',
        }
        extracted = IdentifierExtractor.extract(fields)
        assert len(extracted) == 4
        assert extracted == fields
class TestIdentifierNormalizer:
    """Per-scheme normalization rules of IdentifierNormalizer."""

    def test_normalize_doi(self):
        """DOIs are lower-cased; malformed input yields None."""
        normalize = IdentifierNormalizer.normalize_doi
        assert normalize('10.1234/EXAMPLE') == '10.1234/example'
        assert normalize('10.1234/test') == '10.1234/test'
        assert normalize('invalid') is None

    def test_normalize_pmid(self):
        """PMIDs must be all-digit strings."""
        normalize = IdentifierNormalizer.normalize_pmid
        assert normalize('12345') == '12345'
        assert normalize('1234567') == '1234567'
        assert normalize('invalid') is None

    def test_normalize_pmcid(self):
        """PMCIDs keep the pmc prefix, lower-cased."""
        normalize = IdentifierNormalizer.normalize_pmcid
        assert normalize('PMC12345') == 'pmc12345'
        assert normalize('PMCabcdef') == 'pmcabcdef'
        assert normalize('invalid') is None

    def test_normalize_arxiv(self):
        """arXiv ids drop any version suffix; non-ids yield None."""
        normalize = IdentifierNormalizer.normalize_arxiv
        assert normalize('2310.12345') == '2310.12345'
        assert normalize('2310.12345v1') == '2310.12345'
        assert normalize('INVALID') is None

    def test_normalize_orcid(self):
        """Only the canonical hyphenated 16-digit ORCID form is accepted."""
        normalize = IdentifierNormalizer.normalize_orcid
        assert normalize('0000-0001-2345-6789') == '0000-0001-2345-6789'
        # Space-separated groups are not the canonical format.
        assert normalize('0000 0001 2345 6789') is None
        assert normalize('invalid') is None

    def test_normalize_identifier(self):
        """The generic dispatcher returns (scheme, value), or None for unknown schemes."""
        assert IdentifierNormalizer.normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')
        assert IdentifierNormalizer.normalize_identifier('pmid', '12345') == ('pmid', '12345')
        assert IdentifierNormalizer.normalize_identifier('invalid', 'value') is None
class TestIdentifierResolver:
    """Resolution of entry fields into (scheme, value) pairs."""

    def test_resolve_with_doi(self):
        """A DOI-bearing entry resolves to at least one doi pair."""
        pairs = IdentifierResolver.resolve({'doi': '10.1234/example', 'title': 'Test Title'})
        assert pairs
        assert any(pair[0] == 'doi' for pair in pairs)

    def test_resolve_with_multiple_identifiers(self):
        """Multiple identifier schemes produce multiple resolved pairs."""
        pairs = IdentifierResolver.resolve(
            {'doi': '10.1234/example', 'pmid': '12345', 'arxiv': '2310.12345'}
        )
        assert len(pairs) >= 2
        assert any(pair[0] == 'doi' for pair in pairs)

    def test_resolve_without_identifiers(self):
        """Without identifiers a title fingerprint is still produced."""
        pairs = IdentifierResolver.resolve({'title': 'Test Title', 'author': 'John Doe'})
        assert pairs
        assert any(pair[0] == 'title' for pair in pairs)

    def test_get_primary_identifier(self):
        """The DOI outranks other schemes as the primary identifier."""
        primary = IdentifierResolver.get_primary_identifier(
            {
                'doi': '10.1234/example',
                'pmid': '12345',
                'title': 'Test Title',
            }
        )
        assert primary is not None
        assert primary[0] == 'doi'

    def test_get_scheme_value(self):
        """Scheme-specific lookup returns the value, or None when absent."""
        fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
        }
        assert IdentifierResolver.get_scheme_value('doi', fields) == '10.1234/example'
        assert IdentifierResolver.get_scheme_value('pmid', fields) == '12345'
        assert IdentifierResolver.get_scheme_value('isbn', fields) is None
class TestConvenienceFunctions:
    """The module-level wrappers mirror the class APIs."""

    def test_extract_identifiers(self):
        extracted = extract_identifiers({'doi': '10.1234/example', 'pmid': '12345'})
        assert 'doi' in extracted
        assert 'pmid' in extracted

    def test_normalize_identifier(self):
        assert normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')

    def test_get_primary_identifier(self):
        assert get_primary_identifier({'doi': '10.1234/example'}) == ('doi', '10.1234/example')

    def test_resolve_identifiers(self):
        # Truthiness: a non-empty list of resolved pairs.
        assert resolve_identifiers({'doi': '10.1234/example'})

View File

@ -0,0 +1,117 @@
from __future__ import annotations
from citegeist.resolve import MetadataResolver
from citegeist.sources import SemanticScholarSource, SourceRegistry, list_source_catalog
def test_semanticscholar_source_normalizes_record() -> None:
    """A Graph-API paper record maps onto bibliographic fields plus OA extras."""
    record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "abstract": "Abstract text.",
        "authors": [{"name": "Jane Doe"}, {"name": "Alex Roe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "openAccessPdf": {"url": "https://example.org/paper.pdf"},
        "citationCount": 42,
        "publicationTypes": ["JournalArticle"],
    }
    entry = SemanticScholarSource(config={}).normalize(record)
    assert entry is not None
    expected = {
        "doi": "10.1000/physics",
        "author": "Jane Doe and Alex Roe",
        "journal": "Physical Review Example",
        "url": "https://example.org/paper.pdf",
        "is_oa": "true",
        "semanticscholar_citation_count": "42",
    }
    for field, value in expected.items():
        assert entry.fields[field] == value
def test_semanticscholar_registry_and_catalog() -> None:
    """The registry builds a SemanticScholarSource and the catalog marks it live."""
    config = {"sources": {"semanticscholar": {"source_type": "semanticscholar", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("semanticscholar"), SemanticScholarSource)
    by_key = {item.key: item for item in list_source_catalog()}
    semantic = by_key["semantic_scholar"]
    assert semantic.current_status == "integrated"
    assert semantic.priority == "now"
def test_metadata_resolver_uses_semanticscholar_doi_after_other_lookups_fail() -> None:
    """DOI lookup falls back to Semantic Scholar once Crossref, DataCite and
    Europe PMC all miss."""
    resolver = MetadataResolver()
    # Stub the higher-priority DOI lookups to "not found".
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_europepmc_doi = lambda _doi: None  # type: ignore[method-assign]
    # Semantic Scholar lookup returns one normalized record.
    resolver.semanticscholar.lookup_by_doi = lambda _doi: resolver.semanticscholar.normalize(  # type: ignore[method-assign]
        {
            "paperId": "abcdef123456",
            "title": "Physics Example",
            "year": 2024,
            "authors": [{"name": "Jane Doe"}],
            "externalIds": {"DOI": "10.1000/physics"},
            "journal": {"name": "Physical Review Example"},
            "publicationTypes": ["JournalArticle"],
        }
    )
    from citegeist.bibtex import BibEntry
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"doi": "10.1000/physics", "title": "Physics Example"},
        )
    )
    assert result is not None
    assert result.source_label == "semanticscholar:doi:10.1000/physics"
    assert result.entry.fields["journal"] == "Physical Review Example"
def test_metadata_resolver_uses_semanticscholar_title_search_after_other_searches_fail() -> None:
    """Title search falls through all earlier best-match stubs and lands on
    the Semantic Scholar search."""
    resolver = MetadataResolver()
    # Every higher-priority best-match search returns no match.
    resolver.search_crossref_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_datacite_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_openalex_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_pubmed_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_europepmc_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    # Semantic Scholar search yields exactly one normalized candidate.
    resolver.semanticscholar.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.semanticscholar.normalize(
            {
                "paperId": "abcdef123456",
                "title": "Physics Example",
                "year": 2024,
                "authors": [{"name": "Jane Doe"}],
                "externalIds": {"DOI": "10.1000/physics"},
                "journal": {"name": "Physical Review Example"},
                "publicationTypes": ["JournalArticle"],
            }
        )
    ]
    from citegeist.bibtex import BibEntry
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"title": "Physics Example", "author": "Jane Doe", "year": "2024"},
        )
    )
    assert result is not None
    assert result.source_label == "semanticscholar:search:Physics Example"

View File

@ -0,0 +1,60 @@
from __future__ import annotations
from citegeist.sources import CrossRefSource, OpenCitationsSource, SourceRegistry, list_source_catalog, prioritized_source_keys
def test_catalog_prioritizes_existing_core_sources() -> None:
    """The first six prioritized keys are the already-integrated core sources."""
    expected = ["crossref", "datacite", "europe_pmc", "openalex", "open_citations", "pubmed"]
    assert prioritized_source_keys()[:6] == expected
def test_catalog_includes_open_citation_and_access_sources() -> None:
    """OpenCitations and Unpaywall both appear in the catalog with expected traits."""
    catalog = {item.key: item for item in list_source_catalog()}
    assert "open_citations" in catalog
    assert "unpaywall" in catalog
    open_citations = catalog["open_citations"]
    assert open_citations.priority == "now"
    assert "doi_citations" in open_citations.capabilities
def test_registry_loads_known_source_from_config() -> None:
    """A crossref config stanza yields a live CrossRefSource instance."""
    config = {"sources": {"crossref": {"source_type": "crossref", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("crossref"), CrossRefSource)
def test_registry_rejects_unknown_source_type() -> None:
    """An unrecognized source_type raises a ValueError naming the problem."""
    registry = SourceRegistry()
    raised = False
    try:
        registry.from_config_dict({"sources": {"mystery": {"source_type": "mystery"}}})
    except ValueError as exc:
        raised = True
        assert "Unknown source type" in str(exc)
    if not raised:
        raise AssertionError("expected ValueError for unknown source type")
def test_registry_loads_opencitations_from_config() -> None:
    """An opencitations config stanza yields a live OpenCitationsSource instance."""
    config = {"sources": {"opencitations": {"source_type": "opencitations", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("opencitations"), OpenCitationsSource)

View File

@ -0,0 +1,171 @@
"""Tests for the source plugin architecture."""
from __future__ import annotations
import pytest
from citegeist.sources import BibliographicSource, SourceRegistry, CrossRefSource
class MockSource(BibliographicSource):
    """Test double that records every lookup and never finds anything."""

    def __init__(self, config: dict | None = None):
        super().__init__(config)
        # Chronological (kind, argument) record of lookup calls.
        self.lookup_calls = []

    def lookup_by_doi(self, doi: str) -> None:
        """Record the DOI lookup; always report "not found"."""
        self.lookup_calls.append(('doi', doi))
        return None

    def lookup_by_title(self, title: str) -> None:
        """Record the title lookup; always report "not found"."""
        self.lookup_calls.append(('title', title))
        return None

    def search(self, query: str, limit: int = 10) -> list:
        """No search results, ever."""
        return []

    def normalize(self, record: dict) -> None:
        """The mock cannot normalize anything."""
        return None
def test_source_base_interface():
    """Defaults inherited from BibliographicSource behave sensibly."""
    source = MockSource()
    assert source.is_available()
    # The default identifier scheme is derived from the class name.
    assert source.get_identifier_scheme() == 'mocksource'
    # Optional capabilities default to "not supported".
    for probe in (source.get_fulltext_url, source.get_embedding):
        assert probe('doi:test') is None
def test_mock_source():
    """The mock records each lookup, in call order."""
    source = MockSource()
    source.lookup_by_doi('10.1234/test')
    source.lookup_by_title('Test Title')
    expected = [('doi', '10.1234/test'), ('title', 'Test Title')]
    assert source.lookup_calls == expected
def test_source_registry():
    """Registered sources are listed and retrievable as live instances."""
    registry = SourceRegistry()
    registry.register(MockSource, name='mock_source', config={'enabled': True})
    assert 'mock_source' in registry.list_sources()
    instance = registry.get('mock_source')
    assert instance is not None
    assert isinstance(instance, MockSource)
    assert instance.is_available()
def test_source_registry_disabled():
    """Disabled sources stay listed but are not handed out by get()."""
    registry = SourceRegistry()
    registry.register(MockSource, name='disabled_source', config={'enabled': False})
    assert 'disabled_source' in registry.list_sources()
    assert registry.get('disabled_source') is None
def test_crossref_source():
    """The CrossRef plugin registers and normalizes a works-API message."""
    registry = SourceRegistry()
    registry.register(CrossRefSource, name='crossref', config={})
    source = registry.get('crossref')
    assert source is not None
    assert source.is_available()
    assert source.get_identifier_scheme() == 'doi'
    message = {
        'DOI': '10.1234/example',
        'title': ['Test Title'],
        'author': [{'given': 'Jane', 'family': 'Doe'}],
        'published-print': {'date-parts': [[2024]]},
        'container-title': ['Journal of Tests'],
        'publisher': 'Test Publisher',
        'URL': 'https://doi.org/10.1234/example',
        'abstract': '<jats:p>Example abstract</jats:p>',
    }
    entry = source.normalize({'message': message})
    assert entry is not None
    assert entry.fields['doi'] == '10.1234/example'
    assert entry.fields['title'] == 'Test Title'
    assert entry.fields['year'] == '2024'
    assert entry.fields['journal'] == 'Journal of Tests'
def test_crossref_search_item_normalization():
    """Bare search items (no 'message' wrapper) also normalize."""
    item = {
        'DOI': '10.1234/example',
        'title': ['Search Result'],
        'author': [{'family': 'Doe'}],
        'issued': {'date-parts': [[2023]]},
    }
    entry = CrossRefSource().normalize(item)
    assert entry is not None
    assert entry.fields['doi'] == '10.1234/example'
    # Year comes from the 'issued' date-parts for search items.
    assert entry.fields['year'] == '2023'
def test_source_record():
    """SourceRecord keeps the raw payload together with its provenance."""
    from citegeist.sources import SourceRecord
    payload = {'test': 'data'}
    record = SourceRecord(
        raw=payload,
        source_type='test',
        source_label='test_source',
        timestamp='2024-01-01',
        confidence=1.0,
    )
    assert (record.source_type, record.source_label) == ('test', 'test_source')
    assert record.confidence == 1.0
    assert record.raw == payload
def test_citation_edge():
    """CitationEdge captures a typed, scored link between two works."""
    from citegeist.sources import CitationEdge
    edge = CitationEdge(
        source_work_id='doi:10.1234',
        target_work_id='doi:10.5678',
        relation_type='cites',
        source_type='crossref',
        source_label='crossref:test',
        confidence=0.9,
    )
    assert edge.relation_type == 'cites'
    assert edge.confidence == 0.9

View File

@ -530,6 +530,88 @@ def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Pat
assert results[0].weak_reasons_after == [] assert results[0].weak_reasons_after == []
def test_talkorigins_enrich_weak_canonicals_includes_resolution_attempts(tmp_path: Path):
    """enrich_weak_canonicals surfaces the resolver's per-source attempt trace
    (serialized to plain dicts) on each enrichment result."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Two near-duplicate weak entries force a canonical that needs enrichment.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
        @misc{weak1,
          author = "Smith, Jane",
          year = "1999",
          title = "Weak Duplicate"
        }
        @misc{weak2,
          author = "Smith, Jane",
          year = "1999",
          title = "Weak Duplicate",
          note = "Copied from legacy source"
        }
        """,
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")
    from citegeist.resolve import Resolution, ResolutionAttempt, ResolutionOutcome
    # Stub the traced resolver with one successful crossref title-search attempt.
    scraper.resolver.resolve_entry_with_trace = lambda entry: ResolutionOutcome(  # type: ignore[method-assign]
        resolution=Resolution(
            entry=BibEntry(
                entry_type="article",
                citation_key="resolved",
                fields={
                    "author": entry.fields["author"],
                    "title": entry.fields["title"],
                    "year": entry.fields["year"],
                    "doi": "10.1000/weak",
                    "journal": "Journal of Better Metadata",
                },
            ),
            source_type="resolver",
            source_label="crossref:search:Weak Duplicate",
        ),
        attempts=[
            ResolutionAttempt(
                source_name="crossref",
                strategy="title_search",
                query_value="Weak Duplicate",
                matched=True,
                candidate_count=1,
                source_label="crossref:search:Weak Duplicate",
            )
        ],
    )
    store = BibliographyStore()
    try:
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False)
    finally:
        store.close()
    assert len(results) == 1
    # The attempt dataclass is serialized via asdict, including its default error="".
    assert results[0].resolution_attempts == [
        {
            "source_name": "crossref",
            "strategy": "title_search",
            "query_value": "Weak Duplicate",
            "matched": True,
            "candidate_count": 1,
            "source_label": "crossref:search:Weak Duplicate",
            "error": "",
        }
    ]
def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path): def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
base_url = "https://www.talkorigins.org/origins/biblio/" base_url = "https://www.talkorigins.org/origins/biblio/"
scraper = TalkOriginsScraper( scraper = TalkOriginsScraper(
@ -799,6 +881,7 @@ def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_pat
assert review.items[0]["canonical"]["citation_key"] == "weak2" assert review.items[0]["canonical"]["citation_key"] == "weak2"
assert review.items[0]["enrichment"]["resolved"] is True assert review.items[0]["enrichment"]["resolved"] is True
assert review.items[0]["enrichment"]["applied"] is False assert review.items[0]["enrichment"]["applied"] is False
assert review.items[0]["enrichment"]["resolution_attempts"] == []
def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path): def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):

117
tests/test_unpaywall.py Normal file
View File

@ -0,0 +1,117 @@
from __future__ import annotations
from citegeist.cli import _run_enrich_oa
from citegeist.sources import SourceRegistry, UnpaywallSource, list_source_catalog, prioritized_source_keys
from citegeist.storage import BibliographyStore
def test_unpaywall_source_normalizes_oa_record() -> None:
    """Normalizing a raw Unpaywall payload maps OA metadata onto flat entry fields."""
    raw_record = {
        "doi": "10.1000/example",
        "title": "Example Article",
        "year": 2024,
        "is_oa": True,
        "oa_status": "gold",
        "best_oa_location": {
            "url": "https://example.org/article",
            "url_for_pdf": "https://example.org/article.pdf",
            "license": "cc-by",
            "host_type": "publisher",
            "version": "publishedVersion",
            "evidence": "open (via free pdf)",
        },
    }
    entry = UnpaywallSource(config={"email": "tester@example.org"}).normalize(raw_record)
    assert entry is not None
    # Nested best_oa_location data is flattened; booleans become lowercase strings.
    expected_fields = {
        "doi": "10.1000/example",
        "best_oa_url": "https://example.org/article",
        "best_oa_pdf_url": "https://example.org/article.pdf",
        "oa_status": "gold",
        "oa_license": "cc-by",
        "is_oa": "true",
    }
    for field_name, expected_value in expected_fields.items():
        assert entry.fields[field_name] == expected_value
def test_unpaywall_registry_and_catalog() -> None:
    """Unpaywall is constructible from config and advertised in the source catalog."""
    config = {
        "sources": {
            "unpaywall": {
                "source_type": "unpaywall",
                "enabled": True,
                "email": "tester@example.org",
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(config)
    # The registry must instantiate the concrete Unpaywall adapter for this key.
    assert isinstance(registry.get("unpaywall"), UnpaywallSource)
    by_key = {item.key: item for item in list_source_catalog()}
    unpaywall_info = by_key["unpaywall"]
    assert unpaywall_info.current_status == "integrated"
    assert unpaywall_info.priority == "now"
    assert "unpaywall" in prioritized_source_keys()
def test_run_enrich_oa_updates_entry() -> None:
    """_run_enrich_oa writes OA fields onto the stored entry and records provenance.

    Patches ``UnpaywallSource.lookup_by_doi`` with a canned green-OA response so no
    network traffic occurs, then verifies the store entry and its field provenance.
    """
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/example}
}
"""
        )

        saved_lookup = UnpaywallSource.lookup_by_doi

        def fake_lookup(self: UnpaywallSource, doi: str):
            # Canned repository-hosted (green) OA record for any requested DOI.
            return self.normalize(
                {
                    "doi": doi,
                    "title": "Seed Paper",
                    "year": 2024,
                    "is_oa": True,
                    "oa_status": "green",
                    "best_oa_location": {
                        "url": "https://repository.example.org/seed",
                        "url_for_pdf": "https://repository.example.org/seed.pdf",
                        "license": "cc-by",
                        "host_type": "repository",
                        "version": "acceptedVersion",
                        "evidence": "oa repository",
                    },
                }
            )

        UnpaywallSource.lookup_by_doi = fake_lookup  # type: ignore[method-assign]
        try:
            # Exit code 0 signals a successful enrichment run.
            assert _run_enrich_oa(store, ["seed2024"], "tester@example.org") == 0
        finally:
            UnpaywallSource.lookup_by_doi = saved_lookup  # type: ignore[method-assign]

        entry = store.get_entry("seed2024")
        assert entry is not None
        assert entry["best_oa_url"] == "https://repository.example.org/seed"
        assert entry["best_oa_pdf_url"] == "https://repository.example.org/seed.pdf"
        assert entry["oa_status"] == "green"
        assert entry["oa_host_type"] == "repository"
        # Enrichment must be traceable: at least one provenance row tagged oa_enrich.
        provenance = store.get_field_provenance("seed2024")
        assert any(item["source_type"] == "oa_enrich" for item in provenance)
    finally:
        store.close()
def test_run_enrich_oa_requires_email() -> None:
    """Without a contact email, OA enrichment must refuse to run (exit code 1)."""
    store = BibliographyStore()
    try:
        exit_code = _run_enrich_oa(store, ["missing"], None)
    finally:
        store.close()
    assert exit_code == 1