Add source tracing and broader open source coverage
This commit is contained in:
parent
39fe5ea86c
commit
0497e18f04
|
|
@ -0,0 +1,185 @@
|
|||
-- Migration: Multi-source bibliographic schema
|
||||
-- Description: Add multi-source support with works, identifiers, source records, citations, and embeddings
|
||||
|
||||
-- ============================================================================
|
||||
-- WORKS TABLE - Canonical metadata for works
|
||||
-- ============================================================================
|
||||
-- Canonical, merged metadata for a scholarly work.
-- One row per deduplicated work; per-source raw payloads live in source_records.
CREATE TABLE IF NOT EXISTS works (
    id INTEGER PRIMARY KEY AUTOINCREMENT,           -- surrogate key targeted by child-table FKs
    work_id TEXT NOT NULL UNIQUE,                   -- canonical external work identifier
    title TEXT,
    abstract TEXT,
    publication_year INTEGER,
    publication_date TEXT,                          -- stored as text; format not enforced here
    journal_name TEXT,
    publisher TEXT,
    volume TEXT,
    issue TEXT,
    pages TEXT,
    -- Convenience copies of common identifiers (also mapped in work_identifiers)
    doi TEXT,
    pmid TEXT,
    pmcid TEXT,
    arxiv_id TEXT,
    dblp_key TEXT,
    openalex_id TEXT,
    isbn TEXT,
    issn TEXT,
    entry_type TEXT NOT NULL DEFAULT 'article',     -- BibTeX-style entry type
    -- Citation metrics; presumably aggregated from sources — confirm which source wins
    citation_count INTEGER DEFAULT 0,
    cited_by_count INTEGER DEFAULT 0,
    influential_citations INTEGER DEFAULT 0,
    is_open_access BOOLEAN DEFAULT 0,               -- SQLite stores BOOLEAN as 0/1
    best_oa_url TEXT,                               -- best open-access link for the work
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP  -- see works_updated_at trigger below
);
|
||||
|
||||
-- ============================================================================
|
||||
-- WORK IDENTIFIERS TABLE - Mapping scheme + value to works
|
||||
-- ============================================================================
|
||||
-- Maps (scheme, value) identifier pairs onto works.
CREATE TABLE IF NOT EXISTS work_identifiers (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    -- NOTE(review): work_id is declared TEXT but the FK below targets works(id),
    -- which is INTEGER. SQLite's affinity rules make numeric strings match, but
    -- confirm whether this should instead reference works(work_id).
    work_id TEXT NOT NULL,
    scheme TEXT NOT NULL,                           -- identifier scheme (e.g. 'doi', 'pmid' — verify naming)
    value TEXT NOT NULL,                            -- identifier value as received
    is_primary BOOLEAN DEFAULT 0,                   -- marks the preferred identifier for a work
    normalized_value TEXT,                          -- canonicalized value used for matching
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(work_id, scheme, value),
    FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE
);
|
||||
|
||||
-- ============================================================================
|
||||
-- SOURCE RECORDS TABLE - Raw API responses with provenance
|
||||
-- ============================================================================
|
||||
-- Raw per-source API responses, kept for provenance.
CREATE TABLE IF NOT EXISTS source_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    -- NOTE(review): TEXT child column vs INTEGER works(id) FK target — see
    -- the same concern on work_identifiers; confirm the intended key.
    work_id TEXT NOT NULL,
    source_type TEXT NOT NULL,                      -- kind of source that produced the record
    source_label TEXT NOT NULL,                     -- human-readable source name
    raw_data_json TEXT NOT NULL,                    -- unmodified API payload, JSON-encoded
    raw_record_id TEXT,                             -- source-native record identifier, if any
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- At most one stored record per (work, source_type, source_label)
    UNIQUE(work_id, source_type, source_label),
    FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE
);
|
||||
|
||||
-- ============================================================================
|
||||
-- CITATIONS TABLE - Citation graph with provenance
|
||||
-- ============================================================================
|
||||
-- Citation graph edges between works, with provenance of the claiming source.
CREATE TABLE IF NOT EXISTS citations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    -- NOTE(review): TEXT child columns vs INTEGER works(id) FK target — see
    -- the same concern on work_identifiers; confirm the intended key.
    source_work_id TEXT NOT NULL,                   -- citing work
    target_work_id TEXT NOT NULL,                   -- cited work
    relation_type TEXT NOT NULL,                    -- edge kind (e.g. cites/cited_by — verify vocabulary)
    source_type TEXT NOT NULL,                      -- provenance: kind of source asserting the edge
    source_label TEXT NOT NULL,                     -- provenance: source name
    confidence REAL DEFAULT 1.0,                    -- 1.0 = fully trusted edge
    is_verified BOOLEAN DEFAULT 0,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- One edge per (source, target, relation); later sources do not duplicate it
    UNIQUE(source_work_id, target_work_id, relation_type),
    FOREIGN KEY (source_work_id) REFERENCES works(id) ON DELETE CASCADE,
    FOREIGN KEY (target_work_id) REFERENCES works(id) ON DELETE CASCADE
);
|
||||
|
||||
-- ============================================================================
|
||||
-- WORK EMBEDDINGS TABLE - Vector storage for semantic search
|
||||
-- ============================================================================
|
||||
-- Vector embeddings for semantic search, one row per (work, model).
CREATE TABLE IF NOT EXISTS work_embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    -- NOTE(review): TEXT child column vs INTEGER works(id) FK target — see
    -- the same concern on work_identifiers; confirm the intended key.
    work_id TEXT NOT NULL,
    embedding TEXT NOT NULL,                        -- serialized vector; text here, pgvector on PostgreSQL
    model_name TEXT NOT NULL,                       -- embedding model that produced the vector
    model_version TEXT,
    dimension INTEGER NOT NULL,                     -- vector length
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- One embedding per work per model (re-embedding replaces, not appends)
    UNIQUE(work_id, model_name),
    FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE
);
|
||||
|
||||
-- ============================================================================
|
||||
-- INDEXES - For performance optimization
|
||||
-- ============================================================================
|
||||
-- Work identifiers indexes: support lookup by scheme, by raw value, by owning
-- work, and by normalized value (used for identifier matching).
CREATE INDEX IF NOT EXISTS idx_work_identifiers_scheme ON work_identifiers(scheme);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_value ON work_identifiers(value);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_work_id ON work_identifiers(work_id);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_normalized ON work_identifiers(normalized_value);

-- Source records indexes: per-work provenance lookups and per-source scans.
CREATE INDEX IF NOT EXISTS idx_source_records_work_id ON source_records(work_id);
CREATE INDEX IF NOT EXISTS idx_source_records_source_type ON source_records(source_type);

-- Citations indexes: both graph directions plus edge-type and provenance filters.
CREATE INDEX IF NOT EXISTS idx_citations_source_work ON citations(source_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_target_work ON citations(target_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_relation_type ON citations(relation_type);
CREATE INDEX IF NOT EXISTS idx_citations_source_type ON citations(source_type);

-- Works indexes: identifier-column lookups, OA filtering, and recency scans.
-- (works.work_id needs no explicit index: its UNIQUE constraint already creates one.)
CREATE INDEX IF NOT EXISTS idx_works_doi ON works(doi);
CREATE INDEX IF NOT EXISTS idx_works_pmid ON works(pmid);
CREATE INDEX IF NOT EXISTS idx_works_pmcid ON works(pmcid);
CREATE INDEX IF NOT EXISTS idx_works_arxiv ON works(arxiv_id);
CREATE INDEX IF NOT EXISTS idx_works_openalex ON works(openalex_id);
CREATE INDEX IF NOT EXISTS idx_works_is_open_access ON works(is_open_access);
CREATE INDEX IF NOT EXISTS idx_works_created_at ON works(created_at);

-- Embeddings indexes: per-work and per-model lookups.
CREATE INDEX IF NOT EXISTS idx_embeddings_work_id ON work_embeddings(work_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_model_name ON work_embeddings(model_name);
|
||||
|
||||
-- ============================================================================
|
||||
-- PostgreSQL-specific extensions and vector indexing
|
||||
-- ============================================================================
|
||||
-- Note: The following are PostgreSQL-specific and should be run when using pgvector
|
||||
|
||||
-- Uncomment these when using PostgreSQL with pgvector extension:
|
||||
-- CREATE EXTENSION IF NOT EXISTS vector;
|
||||
-- CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON work_embeddings
|
||||
-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
|
||||
|
||||
-- ============================================================================
|
||||
-- TRIGGERS - For automatic timestamp updates
|
||||
-- ============================================================================
|
||||
-- Works table update trigger: keeps works.updated_at current on row updates.
-- Fix: the original condition WHEN (new.updated_at IS NULL) could never be
-- true — updated_at is declared NOT NULL with a default — so the trigger was
-- dead code. Fire instead whenever the caller did not set updated_at itself
-- (i.e. the column value is unchanged by the UPDATE).
CREATE TRIGGER IF NOT EXISTS works_updated_at
AFTER UPDATE ON works
FOR EACH ROW
WHEN new.updated_at = old.updated_at
BEGIN
    UPDATE works SET updated_at = CURRENT_TIMESTAMP WHERE id = old.id;
END;
|
||||
|
||||
-- Work identifiers update trigger.
-- NOTE(review): this trigger is effectively dead code — created_at is declared
-- NOT NULL with a default, so the WHEN clause below can never be true. It also
-- rewrites created_at (a creation timestamp) on UPDATE, which looks unintended
-- since work_identifiers has no updated_at column. Confirm whether this trigger
-- should be removed or retargeted before relying on it.
CREATE TRIGGER IF NOT EXISTS work_identifiers_updated_at
AFTER UPDATE ON work_identifiers
FOR EACH ROW
WHEN (new.created_at IS NULL)
BEGIN
    UPDATE work_identifiers SET created_at = CURRENT_TIMESTAMP WHERE id = old.id;
END;
|
||||
|
||||
-- ============================================================================
|
||||
-- VIEWS - For simplified queries
|
||||
-- ============================================================================
|
||||
-- View to join works with their identifiers.
-- One row per work, with all known identifiers flattened into a single
-- comma-separated "scheme:value" list.
-- Fix: SQLite rejects GROUP_CONCAT(DISTINCT x, ', ') with the error
-- "DISTINCT aggregates must have exactly one argument", so the custom ', '
-- separator is dropped in favour of group_concat's default ',' separator.
CREATE VIEW IF NOT EXISTS works_with_identifiers AS
SELECT
    w.id,
    w.work_id,
    w.title,
    w.abstract,
    w.publication_year,
    w.journal_name,
    w.publisher,
    w.doi,
    w.pmid,
    w.pmcid,
    w.arxiv_id,
    w.dblp_key,
    w.openalex_id,
    GROUP_CONCAT(DISTINCT wi.scheme || ':' || wi.value) AS identifiers
FROM works w
-- NOTE(review): this compares works.id (INTEGER) with work_identifiers.work_id
-- (TEXT); SQLite's affinity rules make numeric strings match, but confirm
-- whether the join key should be works.work_id instead.
LEFT JOIN work_identifiers wi ON w.id = wi.work_id
GROUP BY w.id, w.work_id, w.title, w.abstract, w.publication_year, w.journal_name, w.publisher, w.doi, w.pmid, w.pmcid, w.arxiv_id, w.dblp_key, w.openalex_id;
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
# CiteGeist Source Planning Documentation
|
||||
|
||||
Welcome to the source-planning documentation for CiteGeist.
|
||||
|
||||
## Quick Overview
|
||||
|
||||
The immediate planning question is which additional open bibliographic sources should be incorporated next.
|
||||
|
||||
This documentation therefore emphasizes:
|
||||
|
||||
- the current source baseline already present in the repository
|
||||
- the next highest-value open sources to add
|
||||
- a smaller, more realistic source-layer abstraction
|
||||
- explicit deferral of unrelated database/vector ambitions
|
||||
|
||||
## Documentation Files
|
||||
|
||||
### Planning and Status
|
||||
- **[source-landscape.md](./source-landscape.md)** - recommended next open bibliographic sources
|
||||
- **[implementation-progress.md](./implementation-progress.md)** - sources-first progress tracker
|
||||
- **[phase-completion.md](./phase-completion.md)** - short status summary
|
||||
- **[file-structure.md](./file-structure.md)** - file structure and module notes
|
||||
|
||||
### Existing Architecture References
|
||||
- **[architecture-current.md](./architecture-current.md)** - current architecture overview
|
||||
- **[schema-current.sql](./schema-current.sql)** - existing database schema
|
||||
|
||||
## Current Status
|
||||
|
||||
### Current Baseline
|
||||
1. Crossref, OpenAlex, PubMed, Europe PMC, Semantic Scholar, DataCite, DBLP, arXiv, and OAI-PMH are already in play.
|
||||
2. OpenCitations and Unpaywall are now integrated as source-layer additions.
|
||||
3. The SQLite-based local workflow remains the baseline.
|
||||
|
||||
### Recommended Next Sources
|
||||
1. OpenAIRE only if repository-acquisition scope expands
|
||||
|
||||
### Explicitly Deferred
|
||||
1. Database redesign
|
||||
2. pgvector / embedding-first work
|
||||
|
||||
## Source Layer
|
||||
|
||||
The source-layer code now provides:
|
||||
|
||||
- `BibliographicSource` as the common interface
|
||||
- `SourceRegistry` for known concrete source classes
|
||||
- `CrossRefSource` as the repaired first concrete plugin
|
||||
- `OpenCitationsSource` plus DOI-based graph expansion
|
||||
- `UnpaywallSource` plus DOI-based OA-link enrichment
|
||||
- `EuropePmcSource` plus biomedical resolver/search support
|
||||
- `SemanticScholarSource` plus broader biological/physical sciences resolver/search support
|
||||
- a source catalog with current status and priority order
|
||||
- compatibility with the existing `SourceClient`-based resolver and expander code
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
from citegeist.sources import (
|
||||
CrossRefSource,
|
||||
EuropePmcSource,
|
||||
OpenCitationsSource,
|
||||
SemanticScholarSource,
|
||||
SourceRegistry,
|
||||
UnpaywallSource,
|
||||
list_source_catalog,
|
||||
prioritized_source_keys,
|
||||
)
|
||||
|
||||
registry = SourceRegistry()
|
||||
registry.register(CrossRefSource, name="crossref", config={})
|
||||
registry.register(EuropePmcSource, name="europepmc", config={})
|
||||
registry.register(OpenCitationsSource, name="opencitations", config={})
|
||||
registry.register(SemanticScholarSource, name="semanticscholar", config={})
|
||||
registry.register(UnpaywallSource, name="unpaywall", config={"email": "you@example.org"})
|
||||
|
||||
source = registry.get("crossref")
|
||||
catalog = list_source_catalog()
|
||||
priority = prioritized_source_keys()
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
Relevant tests for the refocused source work:
|
||||
|
||||
- `tests/test_sources_plugin.py`
|
||||
- `tests/test_sources_catalog.py`
|
||||
|
||||
The existing broader repository test suite should continue to pass as the source-layer changes are integrated.
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Decide whether `OpenAIRE` is worth adding for repository-acquisition breadth.
|
||||
2. Keep database/vector redesign work deferred unless a source need forces it.
|
||||
|
||||
## License
|
||||
|
||||
Same as the CiteGeist project.
|
||||
|
||||
---
|
||||
|
||||
**Last Updated:** 2026-04-25
|
||||
**Status:** Sources-first plan in effect
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
# CiteGeist Current Architecture
|
||||
|
||||
## Overview
|
||||
CiteGeist is currently designed as a local BibTeX-native tooling system with:
|
||||
- BibTeX parsing and storage
|
||||
- Local text search (FTS5)
|
||||
- Entry provenance tracking
|
||||
- Citation graph traversal
|
||||
- Topic-based expansion
|
||||
|
||||
## Core Modules
|
||||
|
||||
### Source Management
|
||||
- **sources.py**: `SourceClient` class for HTTP requests with caching and retry logic
|
||||
- Base HTTP client with JSON/XML/text support
|
||||
- Built-in retry with exponential backoff
|
||||
- Cache directory support
|
||||
|
||||
### Metadata Resolution
|
||||
- **resolve.py**: `MetadataResolver` class for entry resolution
|
||||
- DOI → CrossRef lookup
|
||||
- PMID → PubMed lookup
|
||||
- arXiv, DBLP, OpenAlex lookup
|
||||
- Title search fallback with best-match selection
|
||||
- DataCite integration
|
||||
- Returns `Resolution` objects with provenance
|
||||
|
||||
### Storage
|
||||
- **storage.py**: `BibliographyStore` class (SQLite)
|
||||
- Tables: entries, creators, entry_creators, identifiers, relations, topics, entry_topics, field_provenance, relation_provenance
|
||||
- FTS5 text search integration
|
||||
- Field-level provenance tracking
|
||||
- Citation graph support (cites, cited_by edges)
|
||||
|
||||
### BibTeX Processing
|
||||
- **bibtex.py**: BibEntry dataclass and parsing/rendering
|
||||
- BibTeX → BibEntry conversion
|
||||
- BibEntry → BibTeX rendering
|
||||
- Citation key generation
|
||||
|
||||
### CLI and Server
|
||||
- **cli.py**: Command-line interface
|
||||
- **app_server.py**: Local HTTP server for UI/JSON API
|
||||
- **app_api.py**: JSON API adapter surface
|
||||
|
||||
### Expansion and Discovery
|
||||
- **expand.py**: Citation graph expansion workflows
|
||||
- **extract.py**: Plaintext reference extraction
|
||||
- **bootstrap.py**: Topic bootstrap and expansion
|
||||
|
||||
## Current State Summary
|
||||
|
||||
**Completed/Usable:**
|
||||
- BibTeX parsing and storage
|
||||
- Identifier-based resolution (DOI, PMID, arXiv, DBLP, OpenAlex)
|
||||
- Title search with best-match selection
|
||||
- Citation graph traversal and expansion
|
||||
- Field provenance tracking
|
||||
- Local search with FTS5
|
||||
- Topic-based discovery workflows
|
||||
|
||||
**Not Yet Implemented (from new roadmap):**
|
||||
- Plugin-based source architecture
|
||||
- Multi-source record merging
|
||||
- PGVector embeddings
|
||||
- Full-text OA link retrieval
|
||||
- Semantic Scholar integration
|
||||
- OpenCitations integration
|
||||
- Unified API endpoints for multi-source queries
|
||||
|
||||
## Data Flow
|
||||
|
||||
1. **Ingest**: BibTeX file → parse → store in entries table
|
||||
2. **Resolve**: Entry → resolve_doi/resolve_pmid/resolve_arxiv → fetch metadata → merge with existing
|
||||
3. **Expand**: Start from entry → traverse citation edges → discover new entries
|
||||
4. **Search**: Query FTS5 index → retrieve relevant entries
|
||||
5. **Export**: Entries → render BibTeX → output file
|
||||
|
||||
## Database Schema
|
||||
|
||||
SQLite-based storage with:
|
||||
- Normalized entry fields
|
||||
- Creator relationships
|
||||
- Identifier mapping
|
||||
- Citation relations
|
||||
- Topic associations
|
||||
- Field provenance metadata
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
# CiteGeist Multi-Source File Structure
|
||||
|
||||
**Date:** 2026-04-25
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
/home/netuser/dev/CiteGeist/
|
||||
├── db/
|
||||
│ └── migrations/
|
||||
│ └── 0001_multisource.sql ✅ NEW - Multi-source schema
|
||||
│
|
||||
├── docs/
|
||||
│ ├── architecture-current.md ✅ NEW - Current architecture docs
|
||||
│ ├── implementation-progress.md ✅ NEW - Implementation progress tracker
|
||||
│ ├── schema-current.sql ✅ NEW - Current schema SQL
|
||||
│ └── file-structure.md ✅ NEW - This file
|
||||
│
|
||||
├── src/citegeist/
|
||||
│ ├── sources/ ✅ NEW - Source plugin architecture
|
||||
│ │ ├── __init__.py ✅ NEW - Package exports
|
||||
│ │ ├── __all__.py ✅ NEW - Public API
|
||||
│ │ ├── base.py ✅ NEW - Base BibliographicSource class
|
||||
│ │ ├── registry.py ✅ NEW - SourceRegistry implementation
|
||||
│ │ ├── crossref.py ✅ NEW - CrossRef source plugin
|
||||
│ │ └── _old_sources_compat.py ✅ NEW - Backward compatibility
|
||||
│ │
|
||||
│ ├── resolver/ ✅ NEW - Identifier resolution
|
||||
│ │ ├── __init__.py ✅ NEW - Module exports
|
||||
│ │ └── identifiers.py ✅ NEW - Extract, normalize, resolve
|
||||
│ │
|
||||
│ ├── db/ ✅ NEW - Database operations
|
||||
│ │ └── __init__.py 🚧 TO DO - Database client
|
||||
│ │
|
||||
│ ├── ... (existing files)
|
||||
│ ├── sources.py 📦 Existing - Old SourceClient
|
||||
│ ├── resolve.py 📦 Existing - MetadataResolver
|
||||
│ └── storage.py 📦 Existing - BibliographyStore
|
||||
│
|
||||
└── tests/
|
||||
├── test_sources_plugin.py ✅ NEW - Source plugin tests
|
||||
└── test_resolver_identifiers.py ✅ NEW - Identifier tests
|
||||
```
|
||||
|
||||
## Module Documentation
|
||||
|
||||
### New Modules
|
||||
|
||||
#### `src/citegeist/sources/`
|
||||
Plugin architecture for bibliographic sources.
|
||||
|
||||
**Classes:**
|
||||
- `BibliographicSource` - Abstract base class for source plugins
|
||||
- `SourceRecord` - Raw source record dataclass
|
||||
- `CitationEdge` - Citation relationship dataclass
|
||||
- `SourceRegistry` - Manages source plugins
|
||||
|
||||
**Plugin:**
|
||||
- `CrossRefSource` - CrossRef API implementation
|
||||
|
||||
#### `src/citegeist/resolver/`
|
||||
Identifier extraction, normalization, and resolution.
|
||||
|
||||
**Classes:**
|
||||
- `IdentifierExtractor` - Extract identifiers from entry fields
|
||||
- `IdentifierNormalizer` - Normalize identifiers to canonical form
|
||||
- `IdentifierResolver` - Resolve identifiers with lookup priority
|
||||
|
||||
**Functions:**
|
||||
- `extract_identifiers()` - Quick identifier extraction
|
||||
- `normalize_identifier()` - Quick normalization
|
||||
- `get_primary_identifier()` - Get primary identifier
|
||||
- `resolve_identifiers()` - Resolve all identifiers
|
||||
|
||||
#### `src/citegeist/db/`
|
||||
Database operations (to be implemented).
|
||||
|
||||
**Planned:**
|
||||
- Database client for works table
|
||||
- Migration runner
|
||||
- Query builders
|
||||
|
||||
#### `db/migrations/0001_multisource.sql`
|
||||
Multi-source database schema migration.
|
||||
|
||||
**Tables:**
|
||||
1. `works` - Canonical work metadata
|
||||
2. `work_identifiers` - Multi-scheme identifiers
|
||||
3. `source_records` - Raw API responses
|
||||
4. `citations` - Citation graph
|
||||
5. `work_embeddings` - Vector embeddings
|
||||
|
||||
### Existing Modules (Preserved)
|
||||
|
||||
- `src/citegeist/sources.py` - Old SourceClient (backward compatible)
|
||||
- `src/citegeist/resolve.py` - Old MetadataResolver
|
||||
- `src/citegeist/storage.py` - Old BibliographyStore
|
||||
|
||||
## Test Coverage
|
||||
|
||||
**New Tests:**
|
||||
- `tests/test_sources_plugin.py` (7 tests)
|
||||
- `tests/test_resolver_identifiers.py` (17 tests)
|
||||
|
||||
**Total:** 24 tests passing
|
||||
|
||||
## Dependencies
|
||||
|
||||
**New Dependencies Required:**
|
||||
- No new Python packages (uses stdlib only)
|
||||
|
||||
**Planned Dependencies (Future phases):**
|
||||
- `pgvector` - PostgreSQL vector extension
|
||||
- `sentence-transformers` - Local embedding model
|
||||
- `fastapi` - API framework
|
||||
- `unpaywall` - OA link retrieval (if needed)
|
||||
|
||||
## Implementation Status
|
||||
|
||||
### Completed (100%)
|
||||
- ✅ Phase 0: Baseline Audit
|
||||
- ✅ Phase 1: Source Plugin Architecture
|
||||
- ✅ Phase 2: Identifier Resolution Layer
|
||||
|
||||
### In Progress (50%)
|
||||
- 🚧 Phase 3: Database Schema Upgrade
|
||||
|
||||
### Pending (0%)
|
||||
- ⏳ Phase 4: High-Value Source Integrations
|
||||
- ⏳ Phase 5: Merge & Deduplication Engine
|
||||
- ⏳ Phase 6: Citation Graph Construction
|
||||
- ⏳ Phase 7: Embedding Pipeline
|
||||
- ⏳ Phase 8: Full-Text Retrieval Layer
|
||||
- ⏳ Phase 9: API Layer
|
||||
- ⏳ Phase 10: Ranking & Relevance
|
||||
- ⏳ Phase 12: Observability & QA
|
||||
- ⏳ Phase 13: Performance Optimization
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
# Register a source
|
||||
from citegeist.sources import SourceRegistry, CrossRefSource
|
||||
|
||||
registry = SourceRegistry()
|
||||
registry.register(CrossRefSource, name='crossref', config={})
|
||||
|
||||
# Get source instance
|
||||
source = registry.get('crossref')
|
||||
entry = source.lookup_by_doi('10.1234/example')
|
||||
|
||||
# Resolve identifiers
|
||||
from citegeist.resolver import resolve_identifiers
|
||||
|
||||
fields = {'doi': '10.1234/example', 'title': 'Test'}
|
||||
resolved = resolve_identifiers(fields)
|
||||
# Returns e.g. [('doi', '10.1234/example'), ('title', 'test')]
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. ✅ Phase 0-2: Complete
|
||||
2. 🚧 Phase 3: Implement Python interface for database operations
|
||||
3. ⏳ Phase 4: Add Unpaywall, Semantic Scholar, OpenCitations integrations
|
||||
4. ⏳ Phase 5: Build merge engine
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
# CiteGeist Sources-First Progress
|
||||
|
||||
**Last Updated:** 2026-04-25
|
||||
|
||||
This document tracks the refocused plan for source incorporation. The working question is which additional open bibliographic sources CiteGeist should integrate next, not whether it needs a new storage platform first.
|
||||
|
||||
---
|
||||
|
||||
## Phase 0: Scope Reframe ✅ COMPLETE
|
||||
|
||||
**Status:** Completed
|
||||
|
||||
**Deliverables:**
|
||||
- ✅ `/docs/source-landscape.md` - source inventory and recommendation document
|
||||
- ✅ `/src/citegeist/sources/catalog.py` - code-backed source catalog
|
||||
|
||||
**Completed:**
|
||||
- Identified which source integrations already exist in the repository
|
||||
- Split source-expansion planning from database/vector-search ambitions
|
||||
- Prioritized open-source additions by workflow value
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Source Layer Tightening ✅ COMPLETE
|
||||
|
||||
**Status:** Completed
|
||||
|
||||
**Deliverables:**
|
||||
- ✅ `/src/citegeist/sources/base.py` - Base `BibliographicSource` interface
|
||||
- ✅ `/src/citegeist/sources/registry.py` - Registry for known concrete sources
|
||||
- ✅ `/src/citegeist/sources/crossref.py` - Repaired CrossRef source implementation
|
||||
- ✅ `/src/citegeist/sources/catalog.py` - Open-source inventory
|
||||
- ✅ `/src/citegeist/sources/__init__.py` - Package initialization
|
||||
- ✅ `/tests/test_sources_plugin.py` - Source plugin tests
|
||||
- ✅ `/tests/test_sources_catalog.py` - Source catalog and registry tests
|
||||
|
||||
**Completed:**
|
||||
- ✅ Created `BibliographicSource` abstract base class
|
||||
- ✅ Repaired `SourceRegistry` so config-backed loading resolves real source classes
|
||||
- ✅ Fixed `CrossRefSource` normalization for direct lookup and search-style payloads
|
||||
- ✅ Replaced path-specific compatibility loading with repo-relative loading
|
||||
- ✅ Added a source catalog that captures current status and next-priority sources
|
||||
|
||||
**Features:**
|
||||
- Abstract interface for source plugins
|
||||
- Registry for known source discovery and instantiation
|
||||
- Config-driven enable/disable for known source types
|
||||
- Source prioritization metadata
|
||||
- Compatibility with the existing `SourceClient`-based resolver/expander code
|
||||
|
||||
---
|
||||
|
||||
## Current Integrated Sources ✅ AVAILABLE
|
||||
|
||||
- `Crossref`
|
||||
- `OpenAlex`
|
||||
- `OpenCitations`
|
||||
- `Unpaywall`
|
||||
- `PubMed`
|
||||
- `Europe PMC`
|
||||
- `Semantic Scholar`
|
||||
- `DataCite`
|
||||
- `DBLP`
|
||||
- `arXiv`
|
||||
- `OAI-PMH`
|
||||
|
||||
These are already sufficient for a credible local enrichment-and-discovery workflow. The next work should complement them rather than restart infrastructure underneath them.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Next Source Additions 🚧 IN PROGRESS
|
||||
|
||||
**Status:** In Progress
|
||||
|
||||
**Priority Order:**
|
||||
1. `OpenAIRE` only if repository-acquisition scope expands
|
||||
|
||||
**Completed Deliverables:**
|
||||
- ✅ OpenCitations adapter for DOI citation/reference lookup
|
||||
- ✅ OpenCitations graph expansion support in CLI and topic expansion flows
|
||||
- ✅ Unpaywall adapter for DOI OA-link enrichment
|
||||
- ✅ `enrich-oa` CLI flow for applying OA metadata to stored entries
|
||||
- ✅ Europe PMC biomedical resolver/search integration
|
||||
- ✅ Semantic Scholar broad-science resolver/search integration
|
||||
|
||||
**Planned Deliverables:**
|
||||
- ⏳ Decide whether repository-acquisition breadth needs another dedicated source
|
||||
|
||||
**Rationale:**
|
||||
- `OpenCitations` now improves open citation-edge coverage
|
||||
- `Unpaywall` now improves access-link enrichment
|
||||
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage
|
||||
- `Semantic Scholar` now improves broader biological and physical sciences coverage
|
||||
- neither requires a new database architecture to become useful
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Optional Source Evaluation ⏳ PLANNED
|
||||
|
||||
**Status:** Planned
|
||||
|
||||
- `OpenAIRE`
|
||||
|
||||
**Decision Rule:**
|
||||
- add them only if they solve a concrete discovery or acquisition gap that current open sources do not already cover well
|
||||
|
||||
---
|
||||
|
||||
## Explicitly Deferred
|
||||
|
||||
- second-schema redesign work
|
||||
- pgvector integration
|
||||
- embedding-first retrieval
|
||||
- broad canonical-work reconstruction
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
**Completed:** scope reframe and source-layer cleanup
|
||||
**Planned next:** `OpenAIRE` reevaluation
|
||||
**Deferred:** database/vector expansion work not required by the source question
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
# Sources-First Status
|
||||
|
||||
**Current Focus:** identify and prioritize the next open bibliographic sources to add, using the existing SQLite-based workflow as the baseline.
|
||||
|
||||
---
|
||||
|
||||
## Phase Matrix
|
||||
|
||||
| Phase | Title | Status | Outcome |
|
||||
|-------|-------|--------|---------|
|
||||
| **0** | Scope Reframe | ✅ Complete | Planning now answers the source question directly |
|
||||
| **1** | Source Layer Tightening | ✅ Complete | Registry, CrossRef plugin, compatibility seam, and source catalog repaired |
|
||||
| **2** | Next Open Source Additions | 🚧 In Progress | OpenCitations, Unpaywall, Europe PMC, and Semantic Scholar integrated |
|
||||
| **3** | Optional Source Evaluation | ⏳ Planned | OpenAIRE evaluated later if acquisition breadth matters |
|
||||
| **D** | Database / Vector Expansion | ⏸ Deferred | Not required for the current source-incorporation decision |
|
||||
|
||||
---
|
||||
|
||||
## Test Coverage Summary
|
||||
|
||||
```
|
||||
✅ test_sources_plugin.py
|
||||
✅ test_sources_catalog.py
|
||||
✅ existing full suite still expected to pass
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Artifacts
|
||||
|
||||
### Documentation
|
||||
```
|
||||
docs/
|
||||
├── source-landscape.md ✅ Source inventory and recommendations
|
||||
├── implementation-progress.md ✅ Sources-first progress tracker
|
||||
└── phase-completion.md ✅ Short status summary
|
||||
```
|
||||
|
||||
### Source Layer
|
||||
```
|
||||
src/citegeist/sources/
|
||||
├── base.py ✅ Base source interface
|
||||
├── catalog.py ✅ Source inventory in code
|
||||
├── registry.py ✅ Registry for known source classes
|
||||
├── crossref.py ✅ Repaired CrossRef plugin
|
||||
└── _old_sources_compat.py ✅ Repo-relative compatibility bridge
|
||||
```
|
||||
|
||||
### Tests
|
||||
```
|
||||
tests/
|
||||
├── test_sources_plugin.py ✅ Source plugin tests
|
||||
└── test_sources_catalog.py ✅ Source catalog/registry tests
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Features Implemented
|
||||
|
||||
- ✅ Source catalog covering current and candidate open sources
|
||||
- ✅ Config-driven registry loading for known real source classes
|
||||
- ✅ CrossRef normalization that works for both single-record and search-result payloads
|
||||
- ✅ Compatibility bridge that no longer depends on one checkout path
|
||||
- ✅ OpenCitations DOI-based graph expansion with CLI support
|
||||
- ✅ Unpaywall OA-link enrichment with CLI support
|
||||
- ✅ Europe PMC biomedical resolver/search support
|
||||
- ✅ Semantic Scholar broad-science resolver/search support
|
||||
|
||||
---
|
||||
|
||||
## Next Milestones
|
||||
|
||||
### Immediate
|
||||
1. Decide whether repository-acquisition scope justifies `OpenAIRE`
|
||||
2. Keep the OA-enrichment flow aligned with review/export needs
|
||||
3. Keep graph-source scope disciplined as broader coverage grows
|
||||
|
||||
### Later
|
||||
1. Deepen `Semantic Scholar` usage as needed (already integrated as a source)
|
||||
2. Evaluate `OpenAIRE`
|
||||
3. Revisit database/vector work only if a concrete source need demands it
|
||||
|
||||
---
|
||||
|
||||
## Success Metrics
|
||||
|
||||
### Completed
|
||||
- ✅ Planning now matches the actual source question
|
||||
- ✅ Source-layer defects from the first pass have been corrected
|
||||
- ✅ OpenCitations is now a working integrated source
|
||||
- ✅ Unpaywall is now a working integrated source
|
||||
- ✅ Europe PMC is now a working integrated source
|
||||
- ✅ Semantic Scholar is now a working integrated source
|
||||
- ✅ The next source priorities are explicit
|
||||
|
||||
### Planned
|
||||
- ⏳ Better source selection discipline before adding more integrations
|
||||
|
||||
---
|
||||
|
||||
## Recommendations
|
||||
|
||||
1. Treat the current SQLite/FTS workflow as the baseline, not as a blocker.
|
||||
2. Add source integrations only when they materially improve bibliographic coverage, citation coverage, or open-access linkage.
|
||||
3. Keep database/vector work explicitly subordinate to source-incorporation needs.
|
||||
|
||||
---
|
||||
|
||||
**Last Updated:** 2026-04-25
|
||||
**Status:** Sources-first plan in effect
|
||||
**Confidence:** High
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
-- CiteGeist Current Schema (SQLite)
|
||||
|
||||
-- Entries table: one row per stored bibliographic entry (the canonical local record).
CREATE TABLE IF NOT EXISTS entries (
    id INTEGER PRIMARY KEY,
    citation_key TEXT NOT NULL UNIQUE,              -- BibTeX citation key; primary lookup handle in the CLI
    entry_type TEXT NOT NULL,                       -- BibTeX entry type (article, misc, ...)
    review_status TEXT NOT NULL DEFAULT 'draft',    -- curation workflow state; new entries start as 'draft'
    title TEXT,
    year TEXT,
    journal TEXT,
    booktitle TEXT,
    publisher TEXT,
    abstract TEXT,
    keywords TEXT,
    url TEXT,
    doi TEXT,
    isbn TEXT,
    fulltext TEXT,                                  -- extracted full text, when available
    raw_bibtex TEXT,                                -- original BibTeX source, when the entry came from a file
    extra_fields_json TEXT NOT NULL DEFAULT '{}',   -- BibTeX fields without a dedicated column, as JSON
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Creators table: deduplicated people (authors, editors, ...), keyed by full name.
CREATE TABLE IF NOT EXISTS creators (
    id INTEGER PRIMARY KEY,
    full_name TEXT NOT NULL UNIQUE,
    family_name TEXT,
    given_names TEXT
);

-- Entry-Creators relationship: ordered creator list per entry and role.
CREATE TABLE IF NOT EXISTS entry_creators (
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE,
    role TEXT NOT NULL,                 -- creator role for this entry (e.g. author vs editor — confirm value set)
    ordinal INTEGER NOT NULL,           -- 1-based position within the role's ordered list
    PRIMARY KEY (entry_id, role, ordinal)
);

-- Identifiers table: external identifiers attached to entries.
-- PRIMARY KEY (scheme, value) means a given identifier maps to at most one entry.
CREATE TABLE IF NOT EXISTS identifiers (
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    scheme TEXT NOT NULL,               -- identifier scheme (doi, pmid, ... — confirm against ingest code)
    value TEXT NOT NULL,
    PRIMARY KEY (scheme, value)
);

-- Relations table (citation graph): edges from a stored entry to a citation key.
-- The target is a key, not a row id, so it may reference a not-yet-stored work.
CREATE TABLE IF NOT EXISTS relations (
    source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    target_citation_key TEXT NOT NULL,
    relation_type TEXT NOT NULL,        -- e.g. 'cites' (the value written by graph expansion)
    PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
);

-- Topics table: named topics used to group entries and drive topic expansion.
CREATE TABLE IF NOT EXISTS topics (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,          -- stable machine-friendly handle
    name TEXT NOT NULL,
    source_type TEXT NOT NULL,
    source_url TEXT,
    expansion_phrase TEXT,              -- phrase used when expanding this topic
    suggested_phrase TEXT,              -- candidate phrase awaiting review
    phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed',
    phrase_review_notes TEXT,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Entry-Topics relationship: topic membership per entry, with provenance.
CREATE TABLE IF NOT EXISTS entry_topics (
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE,
    source_label TEXT NOT NULL,         -- which source/process assigned this membership
    confidence REAL,                    -- optional assignment confidence
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (entry_id, topic_id)
);

-- Field Provenance table: append-only audit trail of where each field value came from.
CREATE TABLE IF NOT EXISTS field_provenance (
    id INTEGER PRIMARY KEY,
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    field_name TEXT NOT NULL,
    field_value TEXT,
    source_type TEXT NOT NULL,          -- category of source (e.g. oa_enrich, graph_expand)
    source_label TEXT NOT NULL,         -- specific source instance (e.g. unpaywall:doi:<doi>)
    operation TEXT NOT NULL,
    confidence REAL,
    recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Relation Provenance table: append-only audit trail for citation-graph edges.
CREATE TABLE IF NOT EXISTS relation_provenance (
    id INTEGER PRIMARY KEY,
    source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    target_citation_key TEXT NOT NULL,
    relation_type TEXT NOT NULL,
    source_type TEXT NOT NULL,
    source_label TEXT NOT NULL,
    confidence REAL,
    recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Full-text Search (FTS5)
-- External-content FTS5 table: only the index is stored here; the actual
-- row text lives in `entries` (content='entries', rowid mapped to entries.id).
-- The triggers defined after this table keep the index in sync.
CREATE VIRTUAL TABLE IF NOT EXISTS entries_fts USING fts5(
    title,
    abstract,
    keywords,
    content='entries',
    content_rowid='id'
);
|
||||
|
||||
-- Triggers to keep the external-content FTS5 index in sync with `entries`.
--
-- NOTE: for an FTS5 table declared with content='entries', plain
-- DELETE FROM / UPDATE against the FTS table does not remove the old index
-- terms (the FTS table has no stored content to diff against), which corrupts
-- the index. The documented pattern is the special
-- INSERT INTO entries_fts(entries_fts, rowid, ...) VALUES('delete', ...)
-- command, supplying the OLD column values; an update is a delete of the old
-- row followed by an insert of the new one.
CREATE TRIGGER IF NOT EXISTS entries_ai AFTER INSERT ON entries BEGIN
    INSERT INTO entries_fts(rowid, title, abstract, keywords)
    VALUES (new.id, new.title, new.abstract, new.keywords);
END;

CREATE TRIGGER IF NOT EXISTS entries_ad AFTER DELETE ON entries BEGIN
    INSERT INTO entries_fts(entries_fts, rowid, title, abstract, keywords)
    VALUES ('delete', old.id, old.title, old.abstract, old.keywords);
END;

CREATE TRIGGER IF NOT EXISTS entries_au AFTER UPDATE ON entries BEGIN
    INSERT INTO entries_fts(entries_fts, rowid, title, abstract, keywords)
    VALUES ('delete', old.id, old.title, old.abstract, old.keywords);
    INSERT INTO entries_fts(rowid, title, abstract, keywords)
    VALUES (new.id, new.title, new.abstract, new.keywords);
END;
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
# Open Bibliographic Source Landscape
|
||||
|
||||
This document answers the question that prompted the recent planning work: which additional open bibliographic sources are worth incorporating into CiteGeist, given the sources it already uses?
|
||||
|
||||
## Current Baseline
|
||||
|
||||
CiteGeist already has useful source coverage for a local BibTeX-first workflow:
|
||||
|
||||
- `Crossref`: DOI lookup, title search, and reference-list expansion.
|
||||
- `OpenAlex`: work lookup, title/discovery search, and citation-graph expansion.
|
||||
- `PubMed`: authoritative biomedical metadata lookup by PMID and title search fallback.
|
||||
- `Europe PMC`: biomedical metadata/fulltext complement to PubMed.
|
||||
- `Semantic Scholar`: broad cross-domain metadata with strong biological and physical sciences coverage.
|
||||
- `DataCite`: DOI-backed dataset/report/non-article metadata.
|
||||
- `DBLP`: strong computer-science metadata.
|
||||
- `arXiv`: preprint metadata.
|
||||
- `OAI-PMH`: repository harvesting for theses, dissertations, and institutional collections.
|
||||
|
||||
That means the immediate gap is no longer “get any scholarly metadata at all.” The immediate gap is to add the next highest-value open sources without destabilizing the existing ingest, review, and export pipeline.
|
||||
|
||||
## Recommended Priorities
|
||||
|
||||
### OpenCitations
|
||||
|
||||
Why:
|
||||
|
||||
- It directly improves open citation-edge coverage.
|
||||
- It fits CiteGeist's graph-discovery workflow better than another generic metadata source.
|
||||
- It complements OpenAlex rather than replacing it.
|
||||
|
||||
Expected role:
|
||||
|
||||
- DOI-to-citations lookup
|
||||
- DOI-to-references lookup
|
||||
- provenance for citation edges
|
||||
|
||||
Status:
|
||||
|
||||
- now integrated as a DOI-based citation/reference source in the source layer and graph expansion flow
|
||||
|
||||
Main risk:
|
||||
|
||||
- coverage is narrower than OpenAlex, so merge rules need to treat it as an additional edge source rather than a primary metadata authority.
|
||||
|
||||
### Unpaywall
|
||||
|
||||
Why:
|
||||
|
||||
- It solves a different problem from Crossref/OpenAlex: full-text access and OA status.
|
||||
- It improves the “can I get the paper?” part of the workflow without forcing a storage redesign.
|
||||
|
||||
Expected role:
|
||||
|
||||
- DOI-to-best-open-access-link lookup
|
||||
- OA status enrichment
|
||||
|
||||
Status:
|
||||
|
||||
- now integrated as an OA-link enrichment source with a dedicated `enrich-oa` CLI flow
|
||||
|
||||
Main risk:
|
||||
|
||||
- It should remain an access-link enrichment layer, not become entangled with identity resolution logic.
|
||||
|
||||
### Europe PMC
|
||||
|
||||
Why:
|
||||
|
||||
- It is valuable for biomedical and life-sciences use cases.
|
||||
- It complements PubMed with richer open-access and citation-related information.
|
||||
|
||||
Expected role:
|
||||
|
||||
- domain-specific metadata enrichment
|
||||
- biomedical search
|
||||
- OA/full-text linkage
|
||||
|
||||
Status:
|
||||
|
||||
- now integrated as a biomedical resolver/search complement to `PubMed`
|
||||
|
||||
Main risk:
|
||||
|
||||
- this should remain a domain-specific source, not be treated as a universal resolver.
|
||||
|
||||
### Semantic Scholar
|
||||
|
||||
Pros:
|
||||
|
||||
- good graph and relevance signals
|
||||
- useful for discovery quality
|
||||
|
||||
Status:
|
||||
|
||||
- now integrated as a broad resolver/search complement with good biological and physical sciences coverage
|
||||
|
||||
Main risk:
|
||||
|
||||
- rate limits and product-policy changes still matter more here than for the more explicitly open bibliographic sources
|
||||
|
||||
## Evaluate But Do Not Make Core Yet
|
||||
|
||||
### OpenAIRE
|
||||
|
||||
Pros:
|
||||
|
||||
- strong repository and OA/project linkage
|
||||
- good for European repository acquisition
|
||||
|
||||
Cons:
|
||||
|
||||
- better suited to corpus acquisition than first-line metadata resolution
|
||||
|
||||
Recommendation:
|
||||
|
||||
- treat as an acquisition adapter, not an immediate resolver target
|
||||
|
||||
## What Not To Prioritize Right Now
|
||||
|
||||
### Database Redesign
|
||||
|
||||
The repository already has a working SQLite storage model and FTS-backed local workflow. A second schema track should not lead the next phase of work unless a concrete source integration is blocked on it.
|
||||
|
||||
### Vector Search
|
||||
|
||||
Optional semantic ranking may become useful later, but it was not the motivating question and does not need to be a prerequisite for source incorporation.
|
||||
|
||||
## Suggested Execution Order
|
||||
|
||||
1. Keep the source abstraction aligned with sources already in use.
|
||||
2. Revisit `OpenAIRE` after the current source additions settle.
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
# CiteGeist Roadmap: Sources-First Expansion
|
||||
|
||||
## Purpose
|
||||
|
||||
The primary question is not “how do we redesign CiteGeist around a new storage engine?” The primary question is “which additional open bibliographic sources should CiteGeist incorporate next?”
|
||||
|
||||
This roadmap treats the current SQLite-based local workflow as the baseline and focuses on source evaluation, source integration order, and reviewable source behavior.
|
||||
|
||||
## Baseline
|
||||
|
||||
Already present in the repository:
|
||||
|
||||
- local BibTeX ingest, review, export, and graph traversal
|
||||
- metadata resolution from `Crossref`, `PubMed`, `Europe PMC`, `OpenAlex`, `Semantic Scholar`, `DBLP`, `arXiv`, and `DataCite`
|
||||
- citation-graph expansion using `Crossref` and `OpenAlex`
|
||||
- repository harvesting via `OAI-PMH`
|
||||
|
||||
That means the next planning step is source prioritization, not another platform pivot.
|
||||
|
||||
## Phase 0: Reframe Scope
|
||||
|
||||
Goal:
|
||||
|
||||
Put source-incorporation decisions ahead of database and vector-search ambitions.
|
||||
|
||||
Tasks:
|
||||
|
||||
- [x] identify which source integrations already exist
|
||||
- [x] separate “source expansion” work from “new database/vector stack” work
|
||||
- [x] document the source landscape and recommended order
|
||||
|
||||
Deliverables:
|
||||
|
||||
- `/docs/source-landscape.md`
|
||||
- `/src/citegeist/sources/catalog.py`
|
||||
|
||||
## Phase 1: Tighten The Source Layer
|
||||
|
||||
Goal:
|
||||
|
||||
Make the new source abstraction useful for the repository that already exists, rather than speculative infrastructure.
|
||||
|
||||
Tasks:
|
||||
|
||||
- [x] keep the compatibility bridge to the existing `SourceClient`
|
||||
- [x] fix the initial `CrossRefSource` implementation so normalization works
|
||||
- [x] make config-driven registry loading work for known concrete sources
|
||||
- [x] add a code-backed source catalog for planning and prioritization
|
||||
|
||||
Deliverables:
|
||||
|
||||
- `/src/citegeist/sources/base.py`
|
||||
- `/src/citegeist/sources/registry.py`
|
||||
- `/src/citegeist/sources/crossref.py`
|
||||
- `/src/citegeist/sources/catalog.py`
|
||||
|
||||
## Phase 2: Highest-Value Open Source Additions
|
||||
|
||||
Goal:
|
||||
|
||||
Incorporate the next open sources that materially improve the current workflow.
|
||||
|
||||
Priority order:
|
||||
|
||||
1. `OpenAIRE` only if repository-acquisition scope expands
|
||||
|
||||
Tasks:
|
||||
|
||||
- [x] add `OpenCitations` DOI-to-citation and DOI-to-reference lookup
|
||||
- [x] merge `OpenCitations` edges into the existing graph-expansion workflow with provenance
|
||||
- [x] add `Unpaywall` DOI-to-OA-link enrichment
|
||||
- [x] expose OA-link enrichment in a dedicated CLI flow
|
||||
- [x] add `Europe PMC` as a biomedical metadata/fulltext complement to `PubMed`
|
||||
- [x] add `Semantic Scholar` as a broader scientific metadata complement across biological and physical sciences
|
||||
|
||||
Why these first:
|
||||
|
||||
- `OpenCitations` directly answers the open-citation-coverage gap
|
||||
- `Unpaywall` now solves access-link enrichment without forcing a storage redesign
|
||||
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage without changing the storage model
|
||||
- `Semantic Scholar` now improves broader biological and physical sciences coverage without changing the storage model
|
||||
|
||||
## Phase 3: Evaluate Optional Sources, Do Not Commit Prematurely
|
||||
|
||||
Goal:
|
||||
|
||||
Assess sources that may be useful, but are not clearly the next source-first move.
|
||||
|
||||
Candidates:
|
||||
|
||||
- `OpenAIRE`
|
||||
|
||||
Tasks:
|
||||
|
||||
- [ ] document API limits, openness constraints, and integration risk
|
||||
- [ ] decide whether each source belongs in core resolution, graph expansion, or corpus acquisition
|
||||
- [ ] avoid adding sources that duplicate existing coverage without a clear payoff
|
||||
|
||||
## Deferred Work
|
||||
|
||||
These are valid future ideas, but they are not the current planning driver:
|
||||
|
||||
- a second database schema
|
||||
- pgvector integration
|
||||
- embedding-first search
|
||||
- large-scale canonical-work reconstruction
|
||||
|
||||
The repository already has a working local storage/search path. Those ideas should only return to the front of the plan if a concrete source-integration need forces them there.
|
||||
|
||||
## Immediate Next Steps
|
||||
|
||||
1. Land the source inventory and source-layer cleanup.
|
||||
2. Reassess whether `OpenAIRE` is worth adding for repository-acquisition breadth.
|
||||
|
|
@ -2,7 +2,7 @@ from .app_api import LiteratureExplorerApi
|
|||
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .bootstrap import BootstrapResult, Bootstrapper
|
||||
from .expand import CrossrefExpander, OpenAlexExpander
|
||||
from .expand import CrossrefExpander, OpenAlexExpander, OpenCitationsExpander
|
||||
from .extract import (
|
||||
available_extraction_backends,
|
||||
check_extraction_comparison_summary,
|
||||
|
|
@ -16,6 +16,10 @@ from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
|
|||
from .llm_verify import VerificationLlmClient, VerificationLlmConfig
|
||||
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
|
||||
from .sources import SourceClient
|
||||
from .sources import EuropePmcSource
|
||||
from .sources import OpenLibrarySource
|
||||
from .sources import SemanticScholarSource
|
||||
from .sources import UnpaywallSource
|
||||
from .storage import BibliographyStore
|
||||
from .verify import BibliographyVerifier, VerificationResult, VerificationMatch
|
||||
|
||||
|
|
@ -31,10 +35,15 @@ __all__ = [
|
|||
"LiteratureExplorerApi",
|
||||
"MetadataResolver",
|
||||
"OpenAlexExpander",
|
||||
"OpenCitationsExpander",
|
||||
"OaiPmhHarvester",
|
||||
"OaiMetadataFormat",
|
||||
"OaiSet",
|
||||
"SourceClient",
|
||||
"EuropePmcSource",
|
||||
"OpenLibrarySource",
|
||||
"SemanticScholarSource",
|
||||
"UnpaywallSource",
|
||||
"VerificationLlmClient",
|
||||
"VerificationLlmConfig",
|
||||
"VerificationMatch",
|
||||
|
|
|
|||
|
|
@ -173,6 +173,13 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
|
||||
resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
|
||||
|
||||
enrich_oa_parser = subparsers.add_parser(
|
||||
"enrich-oa",
|
||||
help="Enrich DOI-bearing entries with Unpaywall OA link metadata",
|
||||
)
|
||||
enrich_oa_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
|
||||
enrich_oa_parser.add_argument("--email", help="Email address required by the Unpaywall API")
|
||||
|
||||
resolve_stubs_parser = subparsers.add_parser(
|
||||
"resolve-stubs",
|
||||
help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates",
|
||||
|
|
@ -237,7 +244,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand")
|
||||
expand_parser.add_argument(
|
||||
"--source",
|
||||
choices=["crossref", "openalex"],
|
||||
choices=["crossref", "openalex", "opencitations"],
|
||||
default="crossref",
|
||||
help="Graph expansion source",
|
||||
)
|
||||
|
|
@ -260,7 +267,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
)
|
||||
expand_topic_parser.add_argument(
|
||||
"--source",
|
||||
choices=["crossref", "openalex"],
|
||||
choices=["crossref", "openalex", "opencitations"],
|
||||
default="openalex",
|
||||
help="Topic graph expansion source",
|
||||
)
|
||||
|
|
@ -749,6 +756,8 @@ def main(argv: list[str] | None = None) -> int:
|
|||
)
|
||||
if args.command == "resolve":
|
||||
return _run_resolve(store, args.citation_keys)
|
||||
if args.command == "enrich-oa":
|
||||
return _run_enrich_oa(store, args.citation_keys, args.email)
|
||||
if args.command == "resolve-stubs":
|
||||
return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview)
|
||||
if args.command == "graph":
|
||||
|
|
@ -1215,6 +1224,72 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
|
|||
return exit_code
|
||||
|
||||
|
||||
def _run_enrich_oa(store: BibliographyStore, citation_keys: list[str], email: str | None) -> int:
    """CLI handler: enrich DOI-bearing stored entries with Unpaywall OA metadata.

    Prints a JSON report with one item per citation key and returns a process
    exit code: 0 on completion, 1 when the Unpaywall source is not configured
    (no --email and no UNPAYWALL_EMAIL).
    """
    from .sources import UnpaywallSource

    source = UnpaywallSource(config={"email": email} if email else {})
    if not source.is_available():
        print("Unpaywall enrichment requires --email or UNPAYWALL_EMAIL", file=sys.stderr)
        return 1

    report: list[dict[str, object]] = []
    key_count = len(citation_keys)
    for position, key in enumerate(citation_keys, start=1):
        _print_progress("enriching OA", position, key_count, key)
        row = store.get_entry(key)
        if row is None:
            report.append({"citation_key": key, "status": "missing"})
            continue
        doi = str(row.get("doi") or "").strip()
        if not doi:
            report.append({"citation_key": key, "status": "no_doi"})
            continue

        oa_record = source.lookup_by_doi(doi)
        if oa_record is None:
            report.append({"citation_key": key, "status": "no_record", "doi": doi})
            continue

        # Start from the entry's current string fields, layer the Unpaywall
        # fields on top, then restore the core bibliographic fields so OA
        # enrichment never overwrites curated metadata.
        combined: dict[str, str] = {name: value for name, value in row.items() if isinstance(value, str)}
        combined.update(oa_record.fields)
        for protected in ("title", "year", "author", "journal", "booktitle", "publisher", "abstract", "keywords"):
            current = str(row.get(protected) or "").strip()
            if current:
                combined[protected] = current

        store.replace_entry(
            key,
            BibEntry(
                entry_type=str(row.get("entry_type") or "misc"),
                citation_key=key,
                fields=combined,
            ),
            source_type="oa_enrich",
            source_label=f"unpaywall:doi:{doi}",
            review_status=str(row.get("review_status") or "enriched"),
        )
        # Re-read the stored row so the report reflects what was persisted.
        refreshed = store.get_entry(key) or {}
        report.append(
            {
                "citation_key": key,
                "status": "enriched",
                "doi": doi,
                "is_oa": refreshed.get("is_oa"),
                "oa_status": refreshed.get("oa_status"),
                "best_oa_url": refreshed.get("best_oa_url"),
                "best_oa_pdf_url": refreshed.get("best_oa_pdf_url"),
            }
        )

    print(json.dumps(report, indent=2))
    return 0
|
||||
|
||||
|
||||
def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool:
|
||||
existing = store.get_entry(citation_key)
|
||||
if existing is None:
|
||||
|
|
@ -1664,6 +1739,15 @@ def _run_expand(
|
|||
for relation_name in _expand_relation_types(relation)
|
||||
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
|
||||
]
|
||||
elif source == "opencitations":
|
||||
from .expand import OpenCitationsExpander
|
||||
|
||||
expander = OpenCitationsExpander()
|
||||
expand_fn = lambda key: [
|
||||
item
|
||||
for relation_name in _expand_relation_types(relation)
|
||||
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
|
||||
]
|
||||
else:
|
||||
print(f"Unsupported expansion source: {source}", file=sys.stderr)
|
||||
return 1
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from urllib.parse import quote, urlencode
|
|||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob
|
||||
from .resolve import MetadataResolver, merge_entries
|
||||
from .sources import OpenCitationsSource
|
||||
from .storage import BibliographyStore
|
||||
|
||||
|
||||
|
|
@ -219,14 +220,94 @@ class OpenAlexExpander:
|
|||
return _normalize_openalex_id(results[0].get("id", ""))
|
||||
|
||||
|
||||
class OpenCitationsExpander:
    """Expand the local citation graph using OpenCitations DOI-based edges."""

    def __init__(self, resolver: MetadataResolver | None = None, source: OpenCitationsSource | None = None) -> None:
        # Share the resolver's HTTP client with the OpenCitations source by default.
        self.resolver = resolver or MetadataResolver()
        self.source = source or OpenCitationsSource(
            config={"source_client": self.resolver.source_client}
        )

    def expand_entry(
        self,
        store: BibliographyStore,
        citation_key: str,
        relation_type: str = "cites",
        limit: int = 25,
    ) -> list[ExpansionResult]:
        """Discover works linked to *citation_key* via OpenCitations and record them.

        Each discovered DOI is resolved to full metadata (or stubbed), stored
        when genuinely new, and a 'cites' relation is written in the correct
        direction. Returns one ExpansionResult per discovered edge; returns []
        when the seed entry is missing or has no DOI.
        """
        seed = store.get_entry(citation_key)
        if seed is None:
            return []
        seed_doi = str(seed.get("doi") or "")
        if not seed_doi:
            return []

        expansions: list[ExpansionResult] = []
        for edge in self.source.get_citations(seed_doi, relation_type=relation_type, limit=limit):
            # Edge work ids carry a 4-char prefix (presumably "doi:") — strip it
            # to get the bare DOI; confirm against OpenCitationsSource.
            if relation_type == "cites":
                other_doi = edge.target_work_id[4:]
            else:
                other_doi = edge.source_work_id[4:]

            candidate = self._lookup_discovered_entry(other_doi)
            if candidate is None:
                candidate = _opencitations_stub_entry(other_doi, citation_key)

            known_key = _existing_entry_key_for_discovered_work(store, candidate)
            final_key = known_key or candidate.citation_key

            was_created = False
            if known_key is None and store.get_entry(candidate.citation_key) is None:
                store.upsert_entry(
                    candidate,
                    raw_bibtex=None,
                    source_type="graph_expand",
                    source_label=edge.source_label,
                    review_status="draft",
                )
                store.connection.commit()
                was_created = True

            # Orient the stored relation so it always reads "A cites B".
            if relation_type == "cites":
                edge_source, edge_target = citation_key, final_key
            else:
                edge_source, edge_target = final_key, citation_key

            store.add_relation(
                edge_source,
                edge_target,
                "cites",
                source_type="graph_expand",
                source_label=edge.source_label,
                confidence=edge.confidence,
            )
            expansions.append(
                ExpansionResult(
                    source_citation_key=edge_source,
                    discovered_citation_key=final_key,
                    created_entry=was_created,
                    relation_type=relation_type,
                    source_label=edge.source_label,
                )
            )
        return expansions

    def _lookup_discovered_entry(self, doi: str) -> BibEntry | None:
        """Resolve *doi* to full metadata: Crossref first, then DataCite, then
        fall back to whatever OpenCitations itself can return."""
        for resolve in (self.resolver.resolve_doi, self.resolver.resolve_datacite_doi):
            outcome = resolve(doi)
            if outcome is not None:
                return outcome.entry
        return self.source.lookup_by_doi(doi)
|
||||
|
||||
|
||||
class TopicExpander:
|
||||
def __init__(
    self,
    crossref_expander: CrossrefExpander | None = None,
    openalex_expander: OpenAlexExpander | None = None,
    opencitations_expander: OpenCitationsExpander | None = None,
) -> None:
    """Coordinate topic expansion across Crossref, OpenAlex and OpenCitations.

    Any expander not injected is constructed with its defaults.
    """
    self.crossref_expander = crossref_expander or CrossrefExpander()
    self.openalex_expander = openalex_expander or OpenAlexExpander()
    self.opencitations_expander = opencitations_expander or OpenCitationsExpander()
    # Metadata about the most recent expansion run, for reporting/inspection.
    self.last_run_meta: dict[str, object] = {}
|
||||
|
||||
def expand_topic(
|
||||
|
|
@ -362,6 +443,17 @@ class TopicExpander:
|
|||
) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
|
||||
if source == "crossref":
|
||||
expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
|
||||
elif source == "opencitations":
|
||||
expansion_rows = []
|
||||
for relation_name in _expand_relation_types(relation_type):
|
||||
expansion_rows.extend(
|
||||
self.opencitations_expander.expand_entry(
|
||||
store,
|
||||
citation_key,
|
||||
relation_type=relation_name,
|
||||
limit=limit,
|
||||
)
|
||||
)
|
||||
else:
|
||||
expansion_rows: list[ExpansionResult] = []
|
||||
for relation_name in _expand_relation_types(relation_type):
|
||||
|
|
@ -385,6 +477,11 @@ class TopicExpander:
|
|||
) -> list[tuple[ExpansionResult, dict[str, object]]]:
|
||||
if source == "crossref":
|
||||
return self._preview_crossref_discoveries(store, citation_key, limit)
|
||||
if source == "opencitations":
|
||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
||||
for relation_name in _expand_relation_types(relation_type):
|
||||
rows.extend(self._preview_opencitations_discoveries(store, citation_key, relation_name, limit))
|
||||
return rows
|
||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
||||
for relation_name in _expand_relation_types(relation_type):
|
||||
rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit))
|
||||
|
|
@ -467,6 +564,40 @@ class TopicExpander:
|
|||
)
|
||||
return rows
|
||||
|
||||
def _preview_opencitations_discoveries(
    self,
    store: BibliographyStore,
    citation_key: str,
    relation_type: str,
    limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
    """Dry-run OpenCitations expansion for *citation_key*.

    Reports the would-be discoveries (ExpansionResult plus the candidate's
    fields) without writing entries or relations to the store. Returns []
    when the seed entry is missing or lacks a DOI.
    """
    seed = store.get_entry(citation_key)
    if seed is None or not seed.get("doi"):
        return []
    seed_doi = str(seed["doi"])

    expander = self.opencitations_expander
    previews: list[tuple[ExpansionResult, dict[str, object]]] = []
    for edge in expander.source.get_citations(seed_doi, relation_type=relation_type, limit=limit):
        # Edge work ids carry a 4-char prefix (presumably "doi:") — strip it.
        if relation_type == "cites":
            other_doi = edge.target_work_id[4:]
        else:
            other_doi = edge.source_work_id[4:]
        candidate = expander._lookup_discovered_entry(other_doi)
        if candidate is None:
            candidate = _opencitations_stub_entry(other_doi, citation_key)
        known_key = _existing_entry_key_for_discovered_work(store, candidate)
        final_key = known_key or candidate.citation_key
        preview = ExpansionResult(
            source_citation_key=citation_key if relation_type == "cites" else final_key,
            discovered_citation_key=final_key,
            # True when an actual run would insert a brand-new entry.
            created_entry=known_key is None and store.get_entry(candidate.citation_key) is None,
            relation_type=relation_type,
            source_label=edge.source_label,
        )
        previews.append((preview, dict(candidate.fields)))
    return previews
|
||||
|
||||
|
||||
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
|
||||
title = _crossref_reference_title(reference, ordinal)
|
||||
|
|
@ -567,6 +698,20 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int
|
|||
return f"{family}{year or 'nd'}{first_word}{ordinal}"
|
||||
|
||||
|
||||
def _opencitations_stub_entry(doi: str, source_citation_key: str) -> BibEntry:
    """Build a minimal placeholder entry for a DOI that could not be resolved.

    The citation key is "doi" plus the DOI with all non-alphanumerics removed,
    lowercased; the note records which stored entry led to the discovery.
    """
    compact = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
    stub_fields = {
        "title": f"Referenced work for DOI {doi}",
        "doi": doi,
        "url": f"https://doi.org/{doi}",
        "note": f"discovered_from = {{{source_citation_key}}}",
    }
    return BibEntry(entry_type="misc", citation_key=f"doi{compact}", fields=stub_fields)
|
||||
|
||||
|
||||
def _normalize_text(value: str) -> str:
|
||||
without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value))
|
||||
normalized = " ".join(without_tags.split())
|
||||
|
|
|
|||
|
|
@ -7,17 +7,38 @@ import re
|
|||
import urllib.error
|
||||
import urllib.parse
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .sources.europepmc import EuropePmcSource
|
||||
from .sources.openlibrary import OpenLibrarySource
|
||||
from .sources.semanticscholar import SemanticScholarSource
|
||||
from .sources import SourceClient
|
||||
|
||||
|
||||
@dataclass(slots=True)
class ResolutionAttempt:
    """Record of a single lookup attempt against one metadata source."""

    source_name: str  # source tried, e.g. "crossref", "datacite", "europepmc", "pubmed"
    strategy: str  # lookup style, e.g. "doi_lookup", "pmid_lookup", "work_lookup"
    query_value: str  # identifier value submitted to the source
    matched: bool  # True when the source produced a usable record
    candidate_count: int | None = None  # candidate count, when the source reports one
    source_label: str = ""  # provenance label of the matched record, if any
    error: str = ""  # error description when the attempt failed
|
||||
|
||||
|
||||
@dataclass(slots=True)
class Resolution:
    """A successful resolution: the resolved entry plus how it was obtained."""

    entry: BibEntry  # the resolved bibliographic entry
    source_type: str  # category of the source that produced the entry
    source_label: str  # specific source/identifier label for provenance
    attempts: list[ResolutionAttempt] = field(default_factory=list)  # trace of attempts made
|
||||
|
||||
|
||||
@dataclass(slots=True)
class ResolutionOutcome:
    """Full result of a traced resolution: the winner (if any) plus all attempts."""

    resolution: Resolution | None  # None when no source matched
    attempts: list[ResolutionAttempt]  # every attempt made, matched or not
|
||||
|
||||
|
||||
class MetadataResolver:
|
||||
|
|
@ -31,70 +52,109 @@ class MetadataResolver:
|
|||
) -> None:
|
||||
self.user_agent = user_agent
|
||||
self.source_client = source_client or SourceClient(user_agent=user_agent)
|
||||
self.europepmc = EuropePmcSource(config={"source_client": self.source_client, "user_agent": user_agent})
|
||||
self.openlibrary = OpenLibrarySource(config={"source_client": self.source_client, "user_agent": user_agent})
|
||||
self.semanticscholar = SemanticScholarSource(config={"user_agent": user_agent})
|
||||
self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "")
|
||||
self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist")
|
||||
self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "")
|
||||
|
||||
def resolve_entry(self, entry: BibEntry) -> Resolution | None:
    """Resolve *entry*, returning only the winning resolution (attempt trace discarded)."""
    outcome = self.resolve_entry_with_trace(entry)
    return outcome.resolution
|
||||
|
||||
def resolve_entry_with_trace(self, entry: BibEntry) -> ResolutionOutcome:
    """Resolve *entry* against every configured source, recording each attempt.

    Lookup order: direct identifier lookups first (a DOI is tried across
    several sources, then PMID, OpenAlex, DBLP, and arXiv ids), then
    title searches across the article-oriented sources, and finally the
    Open Library catalog for entries that look like books.  The first
    successful match wins and is returned with the full attempt trace.
    """
    attempts: list[ResolutionAttempt] = []

    if doi := entry.fields.get("doi"):
        # Several sources can resolve a DOI; try them in preference order.
        doi_resolvers = [
            ("crossref", self.resolve_doi),
            ("datacite", self.resolve_datacite_doi),
            ("europepmc", self.resolve_europepmc_doi),
            ("semanticscholar", self.resolve_semanticscholar_doi),
        ]
        for source_name, resolver in doi_resolvers:
            resolved = self._attempt_direct_resolution(attempts, source_name, "doi_lookup", doi, resolver)
            if resolved is not None:
                return ResolutionOutcome(resolution=resolved, attempts=attempts)

    # Single-source identifier lookups: (entry field, source, strategy, resolver).
    direct_lookups = [
        ("pmid", "pubmed", "pmid_lookup", self.resolve_pmid),
        ("openalex", "openalex", "work_lookup", self.resolve_openalex),
        ("dblp", "dblp", "key_lookup", self.resolve_dblp),
        ("arxiv", "arxiv", "id_lookup", self.resolve_arxiv),
    ]
    for field_key, source_name, strategy, resolver in direct_lookups:
        if value := entry.fields.get(field_key):
            resolved = self._attempt_direct_resolution(attempts, source_name, strategy, value, resolver)
            if resolved is not None:
                return ResolutionOutcome(resolution=resolved, attempts=attempts)

    if title := entry.fields.get("title"):
        author_text = entry.fields.get("author", "")
        year = entry.fields.get("year", "")
        title_sources = [
            ("crossref", self.search_crossref),
            ("datacite", self.search_datacite),
            ("openalex", self.search_openalex),
            ("pubmed", self.search_pubmed),
            ("europepmc", self.search_europepmc),
            ("semanticscholar", self.search_semanticscholar),
        ]
        for source_name, search_func in title_sources:
            resolved = self._attempt_title_search_resolution(
                attempts, source_name, title, author_text, year, search_func
            )
            if resolved is not None:
                return ResolutionOutcome(resolution=resolved, attempts=attempts)
        if _entry_prefers_catalog_search(entry):
            # Book-like entries get a dedicated catalog lookup with a
            # stricter title-matching heuristic.
            resolved = self._attempt_title_search_resolution(
                attempts,
                "openlibrary",
                title,
                author_text,
                year,
                self.search_openlibrary,
                selector=_select_best_catalog_title_match,
            )
            if resolved is not None:
                return ResolutionOutcome(resolution=resolved, attempts=attempts)

    return ResolutionOutcome(resolution=None, attempts=attempts)
|
||||
|
||||
def resolve_doi(self, doi: str) -> Resolution | None:
|
||||
encoded = urllib.parse.quote(doi, safe="")
|
||||
|
|
@ -124,19 +184,7 @@ class MetadataResolver:
|
|||
author_text: str = "",
|
||||
year: str = "",
|
||||
) -> Resolution | None:
|
||||
candidate = _select_best_title_match(
|
||||
self.search_crossref(title, limit=5),
|
||||
title=title,
|
||||
author_text=author_text,
|
||||
year=year,
|
||||
)
|
||||
if candidate is None:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=candidate,
|
||||
source_type="resolver",
|
||||
source_label=f"crossref:search:{title}",
|
||||
)
|
||||
return self._search_best_match_resolution("crossref", title, author_text, year, self.search_crossref)
|
||||
|
||||
def resolve_dblp(self, dblp_key: str) -> Resolution | None:
|
||||
encoded_key = urllib.parse.quote(dblp_key, safe="/:")
|
||||
|
|
@ -245,19 +293,7 @@ class MetadataResolver:
|
|||
author_text: str = "",
|
||||
year: str = "",
|
||||
) -> Resolution | None:
|
||||
candidate = _select_best_title_match(
|
||||
self.search_datacite(title, limit=5),
|
||||
title=title,
|
||||
author_text=author_text,
|
||||
year=year,
|
||||
)
|
||||
if candidate is None:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=candidate,
|
||||
source_type="resolver",
|
||||
source_label=f"datacite:search:{title}",
|
||||
)
|
||||
return self._search_best_match_resolution("datacite", title, author_text, year, self.search_datacite)
|
||||
|
||||
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
|
||||
query = urllib.parse.urlencode({"search": title, "per-page": limit})
|
||||
|
|
@ -290,6 +326,35 @@ class MetadataResolver:
|
|||
return []
|
||||
return self._fetch_pubmed_entries(ids[:limit])
|
||||
|
||||
def resolve_europepmc_doi(self, doi: str) -> Resolution | None:
    """Look up *doi* directly in Europe PMC; None when not found."""
    found = self.europepmc.lookup_by_doi(doi)
    if found is None:
        return None
    return Resolution(
        entry=found,
        source_type="resolver",
        source_label=f"europepmc:doi:{doi}",
    )

def search_europepmc(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Title search against Europe PMC, delegated to the source plugin."""
    return self.europepmc.search(title, limit=limit)

def search_openlibrary(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Title search against the Open Library catalog plugin."""
    return self.openlibrary.search(title, limit=limit)

def resolve_semanticscholar_doi(self, doi: str) -> Resolution | None:
    """Look up *doi* directly in Semantic Scholar; None when not found."""
    found = self.semanticscholar.lookup_by_doi(doi)
    if found is None:
        return None
    return Resolution(
        entry=found,
        source_type="resolver",
        source_label=f"semanticscholar:doi:{doi}",
    )

def search_semanticscholar(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Title search against Semantic Scholar, delegated to the source plugin."""
    return self.semanticscholar.search(title, limit=limit)
|
||||
|
||||
def _safe_get_json(self, url: str) -> dict | None:
|
||||
try:
|
||||
return self.source_client.get_json(url)
|
||||
|
|
@ -333,19 +398,7 @@ class MetadataResolver:
|
|||
author_text: str = "",
|
||||
year: str = "",
|
||||
) -> Resolution | None:
|
||||
candidate = _select_best_title_match(
|
||||
self.search_openalex(title, limit=5),
|
||||
title=title,
|
||||
author_text=author_text,
|
||||
year=year,
|
||||
)
|
||||
if candidate is None:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=candidate,
|
||||
source_type="resolver",
|
||||
source_label=f"openalex:search:{title}",
|
||||
)
|
||||
return self._search_best_match_resolution("openalex", title, author_text, year, self.search_openalex)
|
||||
|
||||
def search_pubmed_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search PubMed by title and return the best-matching resolution."""
    return self._search_best_match_resolution("pubmed", title, author_text, year, self.search_pubmed)

def search_europepmc_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search Europe PMC by title and return the best-matching resolution."""
    return self._search_best_match_resolution("europepmc", title, author_text, year, self.search_europepmc)

def search_semanticscholar_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search Semantic Scholar by title and return the best-matching resolution."""
    return self._search_best_match_resolution(
        "semanticscholar", title, author_text, year, self.search_semanticscholar
    )

def search_openlibrary_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search Open Library by title and return the best-matching resolution."""
    return self._search_best_match_resolution("openlibrary", title, author_text, year, self.search_openlibrary)

def _search_best_match_resolution(
    self, source_name: str, title: str, author_text: str, year: str, search_func
) -> Resolution | None:
    """Run *search_func*, select the best title match, wrap it in a Resolution.

    Shared implementation behind every ``search_*_best_match`` method;
    *search_func* must accept ``(title, limit=...)`` and return BibEntry
    candidates.  Returns None when no candidate passes the match heuristic.
    """
    candidates = search_func(title, limit=5)
    candidate = _select_best_title_match(candidates, title=title, author_text=author_text, year=year)
    if candidate is None:
        return None
    return Resolution(entry=candidate, source_type="resolver", source_label=f"{source_name}:search:{title}")
|
||||
|
||||
def _attempt_direct_resolution(
    self,
    attempts: list[ResolutionAttempt],
    source_name: str,
    strategy: str,
    query_value: str,
    resolver_func,
) -> Resolution | None:
    """Run one direct identifier lookup and append a trace record to *attempts*.

    Exceptions raised by *resolver_func* are recorded as a failed attempt
    (with the error message) rather than propagated, so one flaky source
    cannot abort the whole resolution chain.  Returns the resolution, or
    None on miss or error.
    """
    try:
        resolution = resolver_func(query_value)
    except Exception as exc:
        # Best-effort: record the failure and let the caller try the next source.
        attempts.append(
            ResolutionAttempt(
                source_name=source_name,
                strategy=strategy,
                query_value=query_value,
                matched=False,
                error=str(exc),
            )
        )
        return None
    attempts.append(
        ResolutionAttempt(
            source_name=source_name,
            strategy=strategy,
            query_value=query_value,
            matched=resolution is not None,
            source_label=resolution.source_label if resolution is not None else "",
        )
    )
    # Attach the trace so far, unless the resolver already supplied one.
    if resolution is not None and not resolution.attempts:
        resolution.attempts = list(attempts)
    return resolution
|
||||
|
||||
def _attempt_title_search_resolution(
    self,
    attempts: list[ResolutionAttempt],
    source_name: str,
    title: str,
    author_text: str,
    year: str,
    search_func,
    selector=None,
) -> Resolution | None:
    """Run one title search against a source and append a trace record.

    *selector* overrides the default best-match heuristic (used for
    catalog sources such as Open Library).  Search errors are recorded
    as a failed attempt instead of propagating.  Returns the resolution,
    or None on miss or error.
    """
    try:
        candidates = search_func(title, limit=5)
    except Exception as exc:
        # Best-effort: record the failure and let the caller try the next source.
        attempts.append(
            ResolutionAttempt(
                source_name=source_name,
                strategy="title_search",
                query_value=title,
                matched=False,
                error=str(exc),
            )
        )
        return None
    match_selector = selector or _select_best_title_match
    candidate = match_selector(candidates, title=title, author_text=author_text, year=year)
    resolution = None
    if candidate is not None:
        resolution = Resolution(
            entry=candidate,
            source_type="resolver",
            source_label=f"{source_name}:search:{title}",
        )
    attempts.append(
        ResolutionAttempt(
            source_name=source_name,
            strategy="title_search",
            query_value=title,
            matched=resolution is not None,
            candidate_count=len(candidates),
            source_label=resolution.source_label if resolution is not None else "",
        )
    )
    # Attach the trace so far, unless the resolution already carries one.
    if resolution is not None and not resolution.attempts:
        resolution.attempts = list(attempts)
    return resolution
|
||||
|
||||
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
|
||||
ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid]
|
||||
|
|
@ -768,6 +924,42 @@ def _select_best_title_match(
|
|||
return None
|
||||
|
||||
|
||||
def _select_best_catalog_title_match(
    candidates: list[BibEntry],
    title: str,
    author_text: str = "",
    year: str = "",
) -> BibEntry | None:
    """Pick the catalog candidate whose title best matches *title*.

    Scores candidates by Jaccard overlap of catalog title tokens with a
    0.6 floor, and rejects any candidate whose year or author conflicts
    with the query.  Returns None when nothing qualifies.
    """
    if not candidates:
        return None

    wanted_tokens = _catalog_title_tokens(title)
    wanted_authors = _author_match_tokens(author_text)
    wanted_year = str(year or "").strip()
    ranked: list[tuple[float, BibEntry]] = []

    for cand in candidates:
        cand_tokens = _catalog_title_tokens(cand.fields.get("title", ""))
        if not cand_tokens:
            continue
        union = wanted_tokens | cand_tokens
        score = len(wanted_tokens & cand_tokens) / len(union) if union else 0.0
        if score < 0.6:
            continue
        cand_year = str(cand.fields.get("year", "") or "").strip()
        if wanted_year and cand_year and wanted_year != cand_year:
            continue
        if wanted_authors and not _candidate_matches_author_tokens(cand, wanted_authors):
            continue
        ranked.append((score, cand))

    if not ranked:
        return None
    # Highest score wins; ties broken by citation key for determinism.
    return min(ranked, key=lambda pair: (-pair[0], pair[1].citation_key))[1]
|
||||
|
||||
|
||||
def _author_match_tokens(author_text: str) -> set[str]:
|
||||
normalized = _normalize_match_text(author_text)
|
||||
if not normalized:
|
||||
|
|
@ -788,6 +980,39 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str
|
|||
return bool(author_tokens & candidate_tokens)
|
||||
|
||||
|
||||
def _catalog_title_tokens(value: str) -> set[str]:
    """Tokenize a title for catalog matching.

    Keeps alphanumeric tokens of length >= 4, drops a small stopword
    set, and folds long "-ical" endings to "-ic" so that, e.g.,
    "historical" and "historic" compare equal.
    """
    stop = {"the", "and", "for", "with", "from", "into", "after", "all"}
    tokens: set[str] = set()
    for token in re.findall(r"[a-z0-9]+", _normalize_match_text(value)):
        if len(token) < 4 or token in stop:
            continue
        if token.endswith("ical") and len(token) > 6:
            token = token[:-4] + "ic"
        tokens.add(token)
    return tokens
|
||||
|
||||
|
||||
def _entry_prefers_catalog_search(entry: BibEntry) -> bool:
    """Return True when *entry* should be searched in a book catalog
    (Open Library) instead of the article-oriented sources.

    Book-like entry types always qualify.  Otherwise only ``misc``
    entries qualify, and only when the publisher/venue or the title
    contains book-ish keywords.
    """
    if entry.entry_type in {"book", "incollection", "phdthesis", "mastersthesis"}:
        return True
    # Only loosely-typed entries are worth a catalog lookup; bail out
    # before doing any text normalization for everything else.
    if entry.entry_type != "misc":
        return False
    venue = _normalize_match_text(
        " ".join(
            filter(
                None,
                [
                    entry.fields.get("publisher", ""),
                    entry.fields.get("howpublished", ""),
                    entry.fields.get("booktitle", ""),
                ],
            )
        )
    )
    if any(token in venue for token in ("press", "university", "house", "dictionary", "christendom")):
        return True
    title = _normalize_match_text(entry.fields.get("title", ""))
    return any(token in title for token in ("dictionary", "history", "world", "universe", "record"))
|
||||
|
||||
|
||||
def _normalize_pmid(value: str) -> str:
|
||||
return "".join(ch for ch in str(value) if ch.isdigit())
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,27 @@
|
|||
"""
|
||||
Identifier resolution and normalization module.
|
||||
|
||||
Provides functions for extracting, normalizing, and resolving
|
||||
bibliographic identifiers across multiple schemes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from citegeist.resolver.identifiers import (
|
||||
IdentifierExtractor,
|
||||
IdentifierNormalizer,
|
||||
IdentifierResolver,
|
||||
extract_identifiers,
|
||||
normalize_identifier,
|
||||
get_primary_identifier,
|
||||
resolve_identifiers,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'IdentifierExtractor',
|
||||
'IdentifierNormalizer',
|
||||
'IdentifierResolver',
|
||||
'extract_identifiers',
|
||||
'normalize_identifier',
|
||||
'get_primary_identifier',
|
||||
'resolve_identifiers',
|
||||
]
|
||||
|
|
@ -0,0 +1,418 @@
|
|||
"""
|
||||
Identifier resolution and normalization module.
|
||||
|
||||
This module provides functions for extracting, normalizing, and resolving
|
||||
bibliographic identifiers across multiple schemes (DOI, PMID, arXiv, ORCID, etc.).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Identifier scheme patterns.  Each pattern anchors the full string; the
# normalizers pre-clean the value (strip / case-fold) before matching.

DOI_PATTERN = re.compile(
    r'^10\.\d{4,9}/\S+$',
    re.IGNORECASE
)

# PMIDs are plain integers; current PubMed IDs run to 8 digits.
PMID_PATTERN = re.compile(r'^\d{1,8}$')

# PMCIDs are "PMC" followed by digits only (the previous hex alternative
# accepted invalid IDs such as "PMCdead").
PMCID_PATTERN = re.compile(
    r'^PMC\d+$',
    re.IGNORECASE
)

# New-style arXiv IDs (YYMM.NNNNN with optional version suffix).
# NOTE(review): old-style IDs such as "hep-th/9901001" are rejected —
# confirm whether callers ever pass those.
ARXIV_PATTERN = re.compile(
    r'^\d{4}\.\d{4,5}(v\d+)?$',
    re.IGNORECASE
)

ORCID_PATTERN = re.compile(
    r'^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$',
    re.IGNORECASE
)

# ROR IDs are lowercase alphanumerics; normalize_ror lowercases its input
# before matching, so the character class must be lowercase too (the
# previous [0-9A-Z] class could never match a letter after lowering).
ROR_PATTERN = re.compile(
    r'^https?://ror\.org/[0-9a-z]{4,10}$'
)

# NOTE(review): real DBLP keys look like "conf/nips/Vaswani17" (slashes),
# which this colon-based pattern rejects — confirm the expected key format.
DBLP_PATTERN = re.compile(
    r'^[a-zA-Z0-9_]+:[a-zA-Z0-9_]+$',
    re.IGNORECASE
)

# OpenAlex work IDs are "W" followed by an integer (e.g. W2741809807);
# the previous "Wnnnn-hhhh" shape matched no real OpenAlex ID.
OPENALEX_PATTERN = re.compile(
    r'^W\d{4,}$',
    re.IGNORECASE
)
|
||||
|
||||
|
||||
class IdentifierExtractor:
    """Extract identifiers from BibEntry fields."""

    # Entry-field names that double as identifier scheme names, in the
    # order they should appear in the extracted mapping.
    _SCHEMES = ('doi', 'pmid', 'pmcid', 'arxiv', 'dblp', 'openalex', 'isbn', 'issn')

    @staticmethod
    def extract(entry_fields: Dict[str, str]) -> Dict[str, str]:
        """Extract all identifier schemes from entry fields.

        Args:
            entry_fields: Dictionary of entry fields

        Returns:
            Dictionary mapping scheme names to their (non-empty) values
        """
        return {
            scheme: value
            for scheme in IdentifierExtractor._SCHEMES
            if (value := entry_fields.get(scheme))
        }
||||
|
||||
|
||||
class IdentifierNormalizer:
    """Normalize identifiers to canonical form.

    Every ``normalize_*`` helper returns the canonical string on success
    and ``None`` when the value is empty or fails scheme validation.
    """

    @staticmethod
    def normalize_doi(doi: str) -> Optional[str]:
        """Lowercase and validate a DOI."""
        if not doi:
            return None
        cleaned = doi.strip().lower()
        return cleaned if DOI_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_pmid(pmid: str) -> Optional[str]:
        """Validate a PMID, accepting non-string input."""
        if not pmid:
            return None
        cleaned = str(pmid).strip()
        return cleaned if PMID_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_pmcid(pmcid: str) -> Optional[str]:
        """Lowercase and validate a PMCID."""
        if not pmcid:
            return None
        cleaned = pmcid.strip().lower()
        return cleaned if PMCID_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_arxiv(arxiv: str) -> Optional[str]:
        """Validate an arXiv ID, stripping any trailing version suffix."""
        if not arxiv:
            return None
        cleaned = arxiv.strip().lower()
        # Drop the version suffix ("2101.00001v2" -> "2101.00001").
        if 'v' in cleaned:
            cleaned = cleaned.split('v')[0]
        return cleaned if ARXIV_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_orcid(orcid: str) -> Optional[str]:
        """Uppercase, de-space, and validate an ORCID (XXXX-XXXX-XXXX-XXXX)."""
        if not orcid:
            return None
        cleaned = orcid.strip().upper().replace(' ', '')
        return cleaned if ORCID_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_ror(ror_url: str) -> Optional[str]:
        """Lowercase and validate a ROR URL."""
        if not ror_url:
            return None
        cleaned = ror_url.strip().lower()
        return cleaned if ROR_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_dblp(dblp_key: str) -> Optional[str]:
        """Validate a DBLP key (no case folding applied)."""
        if not dblp_key:
            return None
        cleaned = dblp_key.strip()
        return cleaned if DBLP_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_openalex(openalex_id: str) -> Optional[str]:
        """Uppercase and validate an OpenAlex work ID."""
        if not openalex_id:
            return None
        cleaned = openalex_id.strip().upper()
        return cleaned if OPENALEX_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
        """Normalize *value* under *scheme*.

        Args:
            scheme: Identifier scheme name (case-insensitive)
            value: Identifier value

        Returns:
            ``(scheme, normalized_value)`` on success, ``None`` when the
            scheme is unknown or the value is invalid.
        """
        scheme = scheme.lower()
        normalizer = {
            'doi': IdentifierNormalizer.normalize_doi,
            'pmid': IdentifierNormalizer.normalize_pmid,
            'pmcid': IdentifierNormalizer.normalize_pmcid,
            'arxiv': IdentifierNormalizer.normalize_arxiv,
            'orcid': IdentifierNormalizer.normalize_orcid,
            'ror': IdentifierNormalizer.normalize_ror,
            'dblp': IdentifierNormalizer.normalize_dblp,
            'openalex': IdentifierNormalizer.normalize_openalex,
        }.get(scheme)
        if normalizer is None:
            return None
        normalized = normalizer(value)
        return (scheme, normalized) if normalized else None
|
||||
|
||||
|
||||
class IdentifierResolver:
    """Resolve identifiers across multiple schemes."""

    # Scheme lookup priority: most authoritative scheme first.
    LOOKUP_PRIORITY = [
        ('doi', IdentifierNormalizer.normalize_doi),
        ('pmid', IdentifierNormalizer.normalize_pmid),
        ('pmcid', IdentifierNormalizer.normalize_pmcid),
        ('arxiv', IdentifierNormalizer.normalize_arxiv),
        ('dblp', IdentifierNormalizer.normalize_dblp),
        ('openalex', IdentifierNormalizer.normalize_openalex),
    ]

    @staticmethod
    def resolve(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
        """Return normalized ``(scheme, value)`` pairs for *entry_fields*.

        Identifiers that fail validation are dropped.  A normalized title
        fingerprint is appended as a last-resort lookup key whenever a
        title is present.
        """
        resolved: List[Tuple[str, str]] = []
        for scheme, value in IdentifierExtractor.extract(entry_fields).items():
            pair = IdentifierNormalizer.normalize_identifier(scheme, value)
            if pair:
                resolved.append(pair)

        title = entry_fields.get('title')
        if title:
            fingerprint = IdentifierResolver._create_title_fingerprint(title)
            if fingerprint:
                resolved.append(('title', fingerprint))
        return resolved

    @staticmethod
    def _create_title_fingerprint(title: str) -> Optional[str]:
        """Collapse *title* to lowercase words separated by single spaces."""
        if not title:
            return None
        fingerprint = re.sub(r'[^\w\s]', ' ', title.lower())  # drop punctuation
        fingerprint = re.sub(r'\s+', ' ', fingerprint)        # squeeze whitespace
        return fingerprint.strip()

    @staticmethod
    def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
        """Return the highest-priority resolved identifier, or None."""
        resolved = IdentifierResolver.resolve(entry_fields)
        # First occurrence per scheme wins (reversed so earlier pairs overwrite).
        by_scheme = {scheme: (scheme, value) for scheme, value in reversed(resolved)}
        for scheme, _ in IdentifierResolver.LOOKUP_PRIORITY:
            if scheme in by_scheme:
                return by_scheme[scheme]
        return None

    @staticmethod
    def get_scheme_value(scheme: str, entry_fields: Dict[str, str]) -> Optional[str]:
        """Return the normalized value for one *scheme*, or None if absent/invalid."""
        raw = entry_fields.get(scheme)
        if not raw:
            return None
        pair = IdentifierNormalizer.normalize_identifier(scheme, raw)
        return pair[1] if pair else None
|
||||
|
||||
|
||||
# Convenience functions — thin module-level wrappers over the classes above.

def extract_identifiers(entry_fields: Dict[str, str]) -> Dict[str, str]:
    """Extract all identifier fields from *entry_fields* (scheme -> raw value)."""
    return IdentifierExtractor.extract(entry_fields)


def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
    """Normalize *value* under *scheme*; ``(scheme, normalized)`` or None if invalid."""
    return IdentifierNormalizer.normalize_identifier(scheme, value)


def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
    """Return the highest-priority normalized identifier, or None if none found."""
    return IdentifierResolver.get_primary_identifier(entry_fields)


def resolve_identifiers(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
    """Return every normalized ``(scheme, value)`` pair for *entry_fields*."""
    return IdentifierResolver.resolve(entry_fields)
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
"""Export all source plugins."""
|
||||
from __future__ import annotations
|
||||
|
||||
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
|
||||
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
|
||||
from citegeist.sources.registry import SourceRegistry, get_registry
|
||||
from citegeist.sources.crossref import CrossRefSource
|
||||
from citegeist.sources.europepmc import EuropePmcSource
|
||||
from citegeist.sources.opencitations import OpenCitationsSource
|
||||
from citegeist.sources.openlibrary import OpenLibrarySource
|
||||
from citegeist.sources.semanticscholar import SemanticScholarSource
|
||||
from citegeist.sources.unpaywall import UnpaywallSource
|
||||
|
||||
__all__ = [
|
||||
'BibliographicSource',
|
||||
'SourceRecord',
|
||||
'CitationEdge',
|
||||
'SourceCatalogEntry',
|
||||
'SourceRegistry',
|
||||
'get_registry',
|
||||
'list_source_catalog',
|
||||
'prioritized_source_keys',
|
||||
'CrossRefSource',
|
||||
'EuropePmcSource',
|
||||
'OpenCitationsSource',
|
||||
'OpenLibrarySource',
|
||||
'SemanticScholarSource',
|
||||
'UnpaywallSource',
|
||||
]
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
"""
|
||||
Bibliographic source plugins.
|
||||
|
||||
This package provides a plugin architecture for integrating multiple
|
||||
bibliographic data sources (CrossRef, PubMed, Semantic Scholar, etc.).
|
||||
"""
|
||||
|
||||
# Import old sources module for backward compatibility
|
||||
from . import _old_sources_compat
|
||||
|
||||
# Import new plugin architecture
|
||||
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
|
||||
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
|
||||
from citegeist.sources.registry import SourceRegistry, get_registry
|
||||
from citegeist.sources.crossref import CrossRefSource
|
||||
from citegeist.sources.europepmc import EuropePmcSource
|
||||
from citegeist.sources.opencitations import OpenCitationsSource
|
||||
from citegeist.sources.openlibrary import OpenLibrarySource
|
||||
from citegeist.sources.semanticscholar import SemanticScholarSource
|
||||
from citegeist.sources.unpaywall import UnpaywallSource
|
||||
|
||||
# Re-export old classes for compatibility
|
||||
__all__ = [
|
||||
# New plugin architecture
|
||||
'BibliographicSource',
|
||||
'SourceRecord',
|
||||
'CitationEdge',
|
||||
'SourceCatalogEntry',
|
||||
'SourceRegistry',
|
||||
'get_registry',
|
||||
'list_source_catalog',
|
||||
'prioritized_source_keys',
|
||||
'CrossRefSource',
|
||||
'EuropePmcSource',
|
||||
'OpenCitationsSource',
|
||||
'OpenLibrarySource',
|
||||
'SemanticScholarSource',
|
||||
'UnpaywallSource',
|
||||
# Old API (for backward compatibility)
|
||||
'SourceClient',
|
||||
]
|
||||
|
||||
# Backward compatibility - make SourceClient available from this module
|
||||
SourceClient = _old_sources_compat.SourceClient
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
"""
|
||||
Backward compatibility module for old sources module.
|
||||
|
||||
This module re-exports the old SourceClient class for compatibility.
|
||||
"""
|
||||
from pathlib import Path
|
||||
import importlib.util
|
||||
|
||||
from .base import BibliographicSource, SourceRecord, CitationEdge
|
||||
from .registry import SourceRegistry, get_registry
|
||||
from .crossref import CrossRefSource
|
||||
|
||||
# Load the old sources.py module from the citegeist package root
|
||||
_OLD_SOURCES_PATH = Path(__file__).resolve().parents[1] / "sources.py"
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
"citegeist.sources_old",
|
||||
_OLD_SOURCES_PATH
|
||||
)
|
||||
if spec and spec.loader:
|
||||
old_sources = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(old_sources)
|
||||
SourceClient = old_sources.SourceClient
|
||||
else:
|
||||
# Fallback if old sources.py doesn't exist
|
||||
SourceClient = None
|
||||
|
|
@ -0,0 +1,189 @@
|
|||
"""
|
||||
Base interface for bibliographic sources.
|
||||
|
||||
This module defines the abstract base class that all source plugins must implement.
|
||||
Plugins can register themselves with the SourceRegistry for dynamic loading.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
|
||||
|
||||
@dataclass(slots=True)
class SourceRecord:
    """Represents a raw record from a source API."""
    # Raw payload as returned by the source API (provenance snapshot).
    raw: Dict[str, Any]
    # Machine-readable source key (e.g. the plugin's source_type).
    source_type: str
    # Human-readable label identifying the source and operation context.
    source_label: str
    # When the record was captured; format is source-defined (ISO 8601 expected).
    timestamp: str
    # Source's confidence in this record, in [0.0, 1.0].
    confidence: float
|
||||
|
||||
|
||||
@dataclass(slots=True)
class CitationEdge:
    """Represents a citation relationship."""
    # Identifier of the citing work (scheme-prefixed, e.g. "doi:10.1/x").
    source_work_id: str
    # Identifier of the cited work.
    target_work_id: str
    relation_type: str  # "cites" or "cited_by"
    # Which source reported this edge (e.g. "opencitations").
    source_type: str
    # Human-readable provenance label (source + endpoint + query).
    source_label: str
    # Source's confidence in the edge, in [0.0, 1.0].
    confidence: float
|
||||
|
||||
|
||||
class BibliographicSource(ABC):
    """Abstract base class for bibliographic data sources.

    All source plugins must inherit from this class and implement the required
    methods (lookup_by_doi, lookup_by_title, search, normalize). The optional
    hooks (get_citations, get_related, get_fulltext_url, get_embedding) have
    safe no-op defaults so plugins only override what their API supports.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source with optional configuration.

        Args:
            config: Source-specific configuration dictionary
        """
        self.config = config or {}
        # Sources are enabled by default; config can opt out with enabled=False.
        self.enabled = self.config.get('enabled', True)
        # Machine-readable key used in provenance rows; defaults to class name.
        self.source_type = self.config.get('source_type', self.__class__.__name__)

    @abstractmethod
    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibEntry if found, None otherwise
        """
        pass

    @abstractmethod
    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Look up a work by title.

        Args:
            title: Work title

        Returns:
            BibEntry if found, None otherwise
        """
        pass

    @abstractmethod
    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search for works matching the query.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching BibEntry objects
        """
        pass

    @abstractmethod
    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw API record to a canonical BibEntry.

        Args:
            record: Raw record from source API

        Returns:
            BibEntry if normalization succeeds, None otherwise
        """
        pass

    def get_citations(self, work_id: str, relation_type: str = 'cites', limit: int = 10) -> List[CitationEdge]:
        """Get citations for a work.

        Default implementation returns no edges; override in sources that
        expose a citation graph.

        Args:
            work_id: Work identifier (DOI, PMID, etc.)
            relation_type: Type of relation ('cites' or 'cited_by')
            limit: Maximum number of results

        Returns:
            List of CitationEdge objects
        """
        return []

    def get_related(self, work_id: str, limit: int = 10) -> List[BibEntry]:
        """Get works related to a work.

        Default implementation returns no works; override where supported.

        Args:
            work_id: Work identifier
            limit: Maximum number of results

        Returns:
            List of related BibEntry objects
        """
        return []

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Get full-text URL for a work.

        Args:
            doi: Digital Object Identifier

        Returns:
            Full-text URL if available, None otherwise
        """
        return None

    def get_embedding(self, work_id: str) -> Optional[List[float]]:
        """Get embedding vector for a work.

        Args:
            work_id: Work identifier

        Returns:
            Embedding vector if available, None otherwise
        """
        return None

    def get_identifier_scheme(self) -> str:
        """Get the identifier scheme used by this source.

        Returns:
            Identifier scheme (e.g., 'doi', 'pmid', 'openalex')
        """
        return self.source_type.lower()

    def record_source_metadata(self, entry: BibEntry, operation: str = 'ingest') -> SourceRecord:
        """Create a source record for provenance tracking.

        Args:
            entry: The BibEntry to record
            operation: Operation type (e.g., 'ingest', 'enrich')
                NOTE(review): currently unused; kept for interface stability.

        Returns:
            SourceRecord with metadata

        BUG FIX: the record is now stamped with the current UTC time.
        Previously `timestamp` was left as '', which made provenance rows
        impossible to order — defeating the purpose of source tracing.
        """
        # Local import keeps the module's top-level dependencies unchanged.
        from datetime import datetime, timezone

        return SourceRecord(
            raw=self._entry_to_dict(entry),
            source_type=self.source_type,
            source_label=f"{self.source_type}:{self.config.get('name', self.__class__.__name__)}",
            timestamp=datetime.now(timezone.utc).isoformat(),
            confidence=1.0
        )

    def _entry_to_dict(self, entry: BibEntry) -> Dict[str, Any]:
        """Convert BibEntry to dictionary for source records."""
        return {
            'entry_type': entry.entry_type,
            'citation_key': entry.citation_key,
            'fields': entry.fields
        }

    def is_available(self) -> bool:
        """Check if the source is available and enabled.

        Returns:
            True if enabled and available, False otherwise
        """
        return self.enabled
|
||||
|
|
@ -0,0 +1,173 @@
|
|||
"""Open bibliographic source inventory and prioritization helpers."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
class SourceCatalogEntry:
    """One row of the open-source bibliographic inventory (immutable)."""
    # Stable machine key, used for prioritization and lookups.
    key: str
    # Human-readable display name.
    label: str
    # What the source provides (e.g. "metadata", "graph", "metadata+fulltext").
    category: str
    # Access model, e.g. "open API", "open protocol", "free API with limits".
    access: str
    # Capability tags describing supported operations.
    capabilities: tuple[str, ...]
    # Free-text summary of what the source is good at.
    strengths: str
    # Free-text summary of known limitations.
    caveats: str
    # Integration state, e.g. "integrated" or "planned".
    current_status: str
    # Integration priority: "now", "next", "selective", or "evaluate".
    priority: str
|
||||
|
||||
|
||||
# Static inventory of candidate open bibliographic sources. Treated as
# read-only; exposed to callers via list_source_catalog() and ranked by
# prioritized_source_keys().
_CATALOG: tuple[SourceCatalogEntry, ...] = (
    SourceCatalogEntry(
        key="crossref",
        label="Crossref",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "reference_lists"),
        strengths="Broad DOI coverage and good article-level metadata.",
        caveats="Citation coverage is incomplete and some references are unstructured blobs.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="openalex",
        label="OpenAlex",
        category="metadata+graph",
        access="open API",
        capabilities=("work_lookup", "title_search", "citations", "cited_by", "topics"),
        strengths="Best current open source for citation graph expansion and work-level discovery.",
        caveats="Occasional noisy secondary records require conservative admission rules.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="pubmed",
        label="PubMed / NCBI E-utilities",
        category="metadata",
        access="open API",
        capabilities=("pmid_lookup", "title_search", "biomedical_metadata"),
        strengths="High-value authoritative metadata for biomedical literature.",
        caveats="Domain-specific coverage outside biomedicine is limited.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="datacite",
        label="DataCite",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "datasets"),
        strengths="Important for datasets, reports, and non-traditional DOI-bearing objects.",
        caveats="Less useful than Crossref/OpenAlex for mainstream article citation graphs.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="dblp",
        label="DBLP",
        category="metadata",
        access="open API",
        capabilities=("key_lookup", "search", "computer_science"),
        strengths="Excellent computer-science coverage and clean bibliographic records.",
        caveats="Discipline-specific rather than general-purpose.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="arxiv",
        label="arXiv",
        category="metadata+fulltext",
        access="open API",
        capabilities=("id_lookup", "search", "preprints"),
        strengths="Useful for preprint-first fields and free full-text links.",
        caveats="Not a general citation graph source.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="open_citations",
        label="OpenCitations",
        category="graph",
        access="open API",
        capabilities=("doi_citations", "doi_references", "provenance"),
        strengths="Directly aligned with open citation-edge expansion.",
        caveats="Coverage is narrower than OpenAlex and needs merge discipline.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="semantic_scholar",
        label="Semantic Scholar",
        category="metadata+graph",
        access="free API with limits",
        capabilities=("work_lookup", "search", "citations", "references"),
        strengths="Strong graph and relevance signals, especially for discovery workflows.",
        caveats="Not fully open data in the same sense as Crossref/OpenAlex/OpenCitations.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="unpaywall",
        label="Unpaywall",
        category="access-links",
        access="open API",
        capabilities=("doi_fulltext_links", "oa_status"),
        strengths="Best open source for landing-page and OA-link enrichment.",
        caveats="Improves access, not bibliographic identity or graph completeness.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="europe_pmc",
        label="Europe PMC",
        category="metadata+fulltext",
        access="open API",
        capabilities=("search", "citations", "fulltext_links", "biomedical"),
        strengths="Valuable biomedical complement to PubMed with richer open-access linkage.",
        caveats="Domain-specific and partially overlapping with PubMed/OpenAlex.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="open_library",
        label="Open Library",
        category="metadata",
        access="open API",
        capabilities=("title_search", "book_metadata", "author_catalog", "isbn_hints"),
        strengths="Useful general-purpose open catalog coverage for books, monographs, and reference works.",
        caveats="Catalog metadata is not a citation graph and can be noisy for article-like titles.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="openaire",
        label="OpenAIRE",
        category="metadata+repository",
        access="open API",
        capabilities=("repository_metadata", "oa_links", "project_links"),
        strengths="Good for repository, project, and European OA discovery.",
        caveats="May be better treated as a corpus-acquisition adapter than a first-line resolver.",
        current_status="planned",
        priority="evaluate",
    ),
    SourceCatalogEntry(
        key="oai_pmh",
        label="OAI-PMH Repositories",
        category="repository",
        access="open protocol",
        capabilities=("repository_harvest", "set_discovery", "metadata_formats"),
        strengths="Already useful for theses, dissertations, and institutional repositories.",
        caveats="Heterogeneous metadata quality; not a single canonical bibliographic source.",
        current_status="integrated",
        priority="selective",
    ),
)
|
||||
|
||||
|
||||
def list_source_catalog() -> list[SourceCatalogEntry]:
    """Return the full source inventory as a fresh list (safe for callers to mutate)."""
    return list(_CATALOG)
|
||||
|
||||
|
||||
def prioritized_source_keys() -> list[str]:
    """Return catalog keys ordered by integration priority, then label.

    Priorities rank now < next < selective < evaluate. An unrecognized
    priority value now sorts last instead of raising KeyError (the previous
    `order[entry.priority]` crashed on any catalog entry with a new/typo'd
    priority string).

    Returns:
        Catalog keys, most urgent first; ties broken by case-insensitive label.
    """
    order = {"now": 0, "next": 1, "selective": 2, "evaluate": 3}
    unknown_rank = len(order)  # unknown priorities sort after all known ones
    ranked = sorted(
        _CATALOG,
        key=lambda entry: (order.get(entry.priority, unknown_rank), entry.label.lower()),
    )
    return [entry.key for entry in ranked]
|
||||
|
|
@ -0,0 +1,210 @@
|
|||
"""
|
||||
CrossRef source plugin.
|
||||
|
||||
CrossRef provides metadata for DOIs for scholarly works.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
class CrossRefSource(BibliographicSource):
    """CrossRef source for DOI-based metadata lookup.

    Uses the public CrossRef REST API (api.crossref.org). Network failures
    are treated as "not found": lookups return None and searches return [].
    """

    BASE_URL = "https://api.crossref.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize CrossRef source.

        Args:
            config: Configuration with optional 'api_key', 'user_agent', and
                'timeout' (seconds, default 30) keys.
        """
        super().__init__(config)
        self.api_key = self.config.get('api_key', '')
        self.user_agent = self.config.get(
            'user_agent',
            'citegeist/0.1 (local research tool)',
        )
        # Bounded network timeout so a stalled API call cannot hang the caller
        # (previously urlopen was called with no timeout at all).
        self.timeout = float(self.config.get('timeout', 30))

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibEntry if found, None otherwise
        """
        if not doi:
            return None
        encoded = urllib.parse.quote(doi, safe="")
        payload = self._request_json(f"{self.BASE_URL}/works/{encoded}")
        if payload is None:
            return None
        return self._normalize_crossref(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """CrossRef doesn't support title-only lookup.

        Returns None as this is not a supported operation.
        """
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search CrossRef for works.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching BibEntry objects
        """
        if not query:
            return []
        encoded_query = urllib.parse.quote(query, safe="")
        payload = self._request_json(f"{self.BASE_URL}/works?query={encoded_query}&rows={limit}")
        if payload is None:
            return []
        items = payload.get('message', {}).get('items', [])
        return [entry for item in items if (entry := self._normalize_crossref(item)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw CrossRef record to a BibEntry.

        Args:
            record: Raw record from CrossRef API

        Returns:
            BibEntry if normalization succeeds
        """
        return self._normalize_crossref(record)

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the identifier scheme."""
        return 'doi'

    def _request_json(self, url: str) -> Optional[Dict[str, Any]]:
        """Fetch *url* and decode the JSON body (best-effort).

        Returns None on any network/HTTP/decoding failure, matching the
        plugin's "missing, not fatal" contract. BUG FIX: the HTTP response is
        now closed via a context manager (it previously leaked), and the
        request carries a timeout.
        """
        headers = {'User-Agent': self.user_agent}
        if self.api_key:
            headers['X-Api-Key'] = self.api_key
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                return json.loads(response.read().decode('utf-8'))
        except Exception:
            return None

    def _normalize_crossref(self, payload: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a CrossRef payload to a BibEntry.

        Args:
            payload: Raw JSON payload from CrossRef (either a full /works
                response with a 'message' wrapper, or a bare item dict)

        Returns:
            BibEntry object, or None for an empty payload
        """
        message = payload.get('message', payload)
        if not message:
            return None

        # Extract basic fields
        doi = str(message.get('DOI', ''))
        title = ' '.join(message.get('title', [])) if message.get('title') else ''
        author_data = message.get('author', [])
        year = self._extract_year(message)

        # Format authors as "Given Family" strings
        authors = []
        for author in author_data:
            given = str(author.get('given', ''))
            family = str(author.get('family', ''))
            if given and family:
                authors.append(f"{given} {family}")
            elif family:
                authors.append(family)

        publisher = str(message.get('publisher', ''))

        # Journal info: CrossRef returns container-title as a list.
        container_title = message.get('container-title', [])
        journal = str(container_title[0]) if container_title else ''

        url = str(message.get('URL', ''))
        abstract = self._extract_abstract(message.get('abstract'))

        # Map to BibEntry fields, omitting anything empty
        fields: Dict[str, str] = {}
        if title:
            fields['title'] = title
        if authors:
            fields['author'] = ' and '.join(authors)
        if year:
            fields['year'] = year
        if doi:
            fields['doi'] = doi
        if journal:
            fields['journal'] = journal
        if publisher:
            fields['publisher'] = publisher
        if url:
            fields['url'] = url
        if abstract:
            fields['abstract'] = abstract

        citation_key = self._citation_key(doi, authors, year, title)

        return BibEntry(
            entry_type='article',
            citation_key=citation_key,
            fields=fields
        )

    def _citation_key(self, doi: str, authors: List[str], year: str, title: str) -> str:
        """Build an alphanumeric citation key.

        BUG FIX: the previous key embedded raw author names and the full
        title, producing keys with spaces and punctuation — invalid as BibTeX
        citation keys. This follows the same doi/family+year+word convention
        as the other source plugins in this package.
        """
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        family = authors[0].split()[-1] if authors else "crossref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "crossref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

    def _extract_year(self, message: Dict[str, Any]) -> str:
        """Return the first publication year found, preferring print dates."""
        for field_name in ('published-print', 'published-online', 'issued', 'created'):
            year = self._extract_year_from_date_parts(message.get(field_name, {}))
            if year:
                return year
        return ''

    def _extract_year_from_date_parts(self, field: Dict[str, Any]) -> str:
        """Pull the year out of a CrossRef 'date-parts' structure, or ''."""
        date_parts = field.get('date-parts', [])
        if not date_parts:
            return ''
        first_part = date_parts[0]
        if not first_part:
            return ''
        year = first_part[0]
        return str(year) if year else ''

    def _extract_abstract(self, raw_abstract: Any) -> str:
        """Coerce CrossRef's abstract field (str or list of str/dict) to text."""
        if isinstance(raw_abstract, str):
            return raw_abstract.strip()
        if isinstance(raw_abstract, list):
            for item in raw_abstract:
                if isinstance(item, dict):
                    text = str(item.get('value', '')).strip()
                    if text:
                        return text
                elif isinstance(item, str) and item.strip():
                    return item.strip()
        return ''
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
"""Europe PMC source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources._old_sources_compat import SourceClient
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
class EuropePmcSource(BibliographicSource):
    """Europe PMC source for biomedical metadata and OA/fulltext links.

    All requests go through the REST search endpoint with resultType=core;
    DOI and title lookups are expressed as field-restricted queries.
    """

    BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Set up the HTTP client, honoring an injected 'source_client' if given."""
        super().__init__(config)
        agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        injected = self.config.get("source_client")
        self.source_client = injected if injected else SourceClient(user_agent=agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Resolve a DOI to a BibEntry via an exact DOI field query."""
        cleaned = doi.strip()
        if not cleaned:
            return None
        hit = self._search_one(f'DOI:"{cleaned}"')
        if not hit:
            return None
        return self.normalize(hit)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Resolve a title (whitespace-normalized) to the top matching BibEntry."""
        text = " ".join(title.split())
        if not text:
            return None
        hit = self._search_one(f'TITLE:"{text}"')
        if not hit:
            return None
        return self.normalize(hit)

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Title-search Europe PMC and normalize every usable hit."""
        text = " ".join(query.split())
        if not text:
            return []
        payload = self._search_payload(f'TITLE:"{text}"', max(1, limit))
        entries: list[BibEntry] = []
        for row in self._rows(payload):
            entry = self.normalize(row)
            if entry is not None:
                entries.append(entry)
        return entries

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map a raw Europe PMC result row to a BibEntry (None when untitled)."""
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        doi = str(record.get("doi") or "").strip()
        # For MED (PubMed) rows the record id doubles as the PMID fallback.
        if str(record.get("source") or "") == "MED":
            pmid = str(record.get("pmid") or record.get("id") or "").strip()
        else:
            pmid = str(record.get("pmid") or "").strip()
        pmcid = str(record.get("pmcid") or "").strip()
        year = str(record.get("pubYear") or "").strip()
        author_text = self._normalize_author_string(str(record.get("authorString") or "").strip())
        journal_title = str(record.get("journalTitle") or "").strip()
        abstract = str(record.get("abstractText") or "").strip()

        fields: Dict[str, str] = {"title": title}
        # Insertion order matters for downstream serialization; keep it stable.
        optional = (
            ("doi", doi),
            ("pmid", pmid),
            ("pmcid", pmcid),
            ("year", year),
            ("author", author_text),
            ("journal", journal_title),
            ("volume", str(record.get("journalVolume") or "").strip()),
            ("number", str(record.get("issue") or "").strip()),
            ("pages", str(record.get("pageInfo") or "").strip()),
            ("abstract", abstract),
        )
        for name, value in optional:
            if value:
                fields[name] = value

        # Prefer a direct full-text link; fall back to the article landing page.
        link = self._fulltext_url(record) or self._article_url(record)
        if link:
            fields["url"] = link

        if str(record.get("isOpenAccess") or "").strip():
            open_access = str(record.get("isOpenAccess")).upper() == "Y"
            fields["is_oa"] = "true" if open_access else "false"
        cited_by = str(record.get("citedByCount") or "").strip()
        if cited_by:
            fields["europepmc_cited_by_count"] = cited_by
        origin = str(record.get("source") or "").strip()
        if origin:
            fields["europepmc_source"] = origin

        key = self._citation_key(doi, pmid, author_text, year, title)
        return BibEntry(entry_type="article", citation_key=key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Best full-text (or landing-page) URL for a DOI, if one is known."""
        cleaned = doi.strip()
        if not cleaned:
            return None
        rows = self._rows(self._search_payload(f'DOI:"{cleaned}"', 1))
        if not rows:
            return None
        return self._fulltext_url(rows[0]) or self._article_url(rows[0])

    def get_identifier_scheme(self) -> str:
        """DOI is the primary identifier scheme for this source."""
        return "doi"

    def _search_one(self, query: str) -> Dict[str, Any] | None:
        """Run *query* with pageSize=1 and return the single row, if any."""
        rows = self._rows(self._search_payload(query, 1))
        return rows[0] if rows else None

    def _rows(self, payload: Dict[str, Any] | None) -> list:
        """Extract the result rows from a search payload (empty list when absent)."""
        if not payload:
            return []
        return payload.get("resultList", {}).get("result", [])

    def _search_payload(self, query: str, page_size: int) -> Dict[str, Any] | None:
        """Issue a core-result JSON search and return the decoded payload."""
        encoded = urllib.parse.urlencode({
            "query": query,
            "format": "json",
            "resultType": "core",
            "pageSize": max(1, page_size),
        })
        return self.source_client.try_get_json(f"{self.BASE_URL}?{encoded}")

    def _fulltext_url(self, record: Dict[str, Any]) -> str:
        """First URL under fullTextUrlList, or '' when none is present."""
        container = record.get("fullTextUrlList", {})
        if not isinstance(container, dict):
            return ""
        urls = container.get("fullTextUrl", [])
        if isinstance(urls, dict):
            urls = [urls]
        if not isinstance(urls, list):
            return ""
        for candidate in urls:
            if not isinstance(candidate, dict):
                continue
            link = str(candidate.get("url") or "").strip()
            if link:
                return link
        return ""

    def _article_url(self, record: Dict[str, Any]) -> str:
        """Europe PMC landing-page URL built from source + id, or ''."""
        origin = str(record.get("source") or "").strip()
        ident = str(record.get("id") or "").strip()
        if not origin or not ident:
            return ""
        return f"https://europepmc.org/article/{origin}/{ident}"

    def _normalize_author_string(self, value: str) -> str:
        """Turn Europe PMC's comma-separated author string into BibTeX 'and' form."""
        if not value:
            return ""
        names = []
        for chunk in value.split(","):
            chunk = chunk.strip()
            if chunk:
                names.append(chunk.rstrip("."))
        return " and ".join(names)

    def _citation_key(self, doi: str, pmid: str, author_text: str, year: str, title: str) -> str:
        """Alphanumeric citation key: DOI-based, then PMID, then family+year+word."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if pmid:
            return f"pmid{pmid}"
        if author_text:
            family = author_text.split(" and ")[0].split()[-1]
        else:
            family = "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        title_words = title.split()
        first = title_words[0] if title_words else "untitled"
        first = "".join(ch for ch in first.lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first or 'untitled'}"
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
"""OpenCitations source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources.base import BibliographicSource, CitationEdge
|
||||
from citegeist.sources._old_sources_compat import SourceClient
|
||||
|
||||
|
||||
class OpenCitationsSource(BibliographicSource):
    """OpenCitations source for DOI metadata and citation edges.

    Wraps the OpenCitations Meta API (bibliographic metadata) and Index API
    (citation links). All lookups are DOI-keyed; title lookup and free-text
    search are not offered by the service.
    """

    INDEX_BASE_URL = "https://api.opencitations.net/index/v2"
    META_BASE_URL = "https://api.opencitations.net/meta/v1"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Set up the HTTP client, honoring an injected 'source_client' if given."""
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Fetch Meta API metadata for a DOI and normalize the first row.

        Args:
            doi: DOI, with or without a leading 'doi:' prefix

        Returns:
            BibEntry if found, None otherwise
        """
        normalized = self._normalize_doi_pid(doi)
        if not normalized:
            return None
        rows = self.source_client.try_get_json(f"{self.META_BASE_URL}/metadata/{normalized}")
        if not rows:
            return None
        return self.normalize(rows[0])

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Title lookup is not supported by the OpenCitations APIs."""
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Free-text search is not supported by the OpenCitations APIs."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map an OpenCitations Meta row to a BibEntry.

        Args:
            record: Raw Meta API row

        Returns:
            BibEntry, or None when the row lacks identifiers or a title
        """
        ids = str(record.get("id") or "")
        title = str(record.get("title") or "").strip()
        if not ids or not title:
            return None

        doi = self._extract_id_value(ids, "doi")
        openalex = self._extract_id_value(ids, "openalex")
        year = self._extract_year(str(record.get("pub_date") or ""))
        authors = self._normalize_author_field(str(record.get("author") or ""))
        venue, venue_ids = self._parse_venue_field(str(record.get("venue") or ""))
        entry_type = self._map_entry_type(str(record.get("type") or ""))

        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
            fields["url"] = f"https://doi.org/{doi}"
        if openalex:
            fields["openalex"] = openalex
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if venue:
            # Journal articles carry 'journal'; other entry types use 'booktitle'.
            if entry_type == "article":
                fields["journal"] = venue
            else:
                fields["booktitle"] = venue
        if volume := str(record.get("volume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("page") or "").strip():
            fields["pages"] = pages
        if publisher := self._strip_bracketed_ids(str(record.get("publisher") or "")):
            fields["publisher"] = publisher
        if venue_ids:
            # Keep the bracketed venue identifiers for downstream tracing.
            fields["note"] = f"opencitations_venue_ids = {{{venue_ids}}}"

        citation_key = self._citation_key(doi, openalex, authors, year, title)
        return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)

    def get_citations(self, work_id: str, relation_type: str = "cites", limit: int = 10) -> List[CitationEdge]:
        """Fetch citation edges for a DOI from the Index API.

        Both Index endpoints ('references' for outgoing, 'citations' for
        incoming) return rows with explicit 'citing' and 'cited' identifiers,
        so every edge is emitted in canonical citing->cited ("cites")
        direction regardless of which endpoint was queried. BUG FIX: the
        previous implementation had an if/else on relation_type whose two
        branches were byte-identical — collapsed into the single assignment.

        Args:
            work_id: DOI (with or without 'doi:' prefix)
            relation_type: 'cites' for outgoing references, anything else
                (e.g. 'cited_by') for incoming citations
            limit: Maximum number of edges returned

        Returns:
            List of CitationEdge objects (possibly empty)
        """
        normalized = self._normalize_doi_pid(work_id)
        if not normalized:
            return []
        path = "references" if relation_type == "cites" else "citations"
        rows = self.source_client.try_get_json(f"{self.INDEX_BASE_URL}/{path}/{normalized}")
        if not rows:
            return []

        edges: List[CitationEdge] = []
        for row in rows[:limit]:
            citing = self._extract_id_value(str(row.get("citing") or ""), "doi")
            cited = self._extract_id_value(str(row.get("cited") or ""), "doi")
            if not citing or not cited:
                continue
            edges.append(
                CitationEdge(
                    source_work_id=f"doi:{citing}",
                    target_work_id=f"doi:{cited}",
                    relation_type="cites",
                    source_type="opencitations",
                    source_label=f"opencitations:{path}:{normalized}",
                    confidence=0.85,
                )
            )
        return edges

    def get_identifier_scheme(self) -> str:
        """DOI is the primary identifier scheme for this source."""
        return "doi"

    def _normalize_doi_pid(self, value: str) -> str:
        """Return the OpenCitations PID form 'doi:<doi>' ('' for blank input)."""
        doi = value.strip()
        if not doi:
            return ""
        if doi.lower().startswith("doi:"):
            doi = doi[4:]
        return f"doi:{doi}"

    def _extract_id_value(self, identifiers: str, scheme: str) -> str:
        """Pull the value for *scheme* out of a space-separated id list, or ''."""
        prefix = f"{scheme}:"
        for token in identifiers.split():
            if token.startswith(prefix):
                return token[len(prefix):]
        return ""

    def _extract_year(self, pub_date: str) -> str:
        """First four digits of a pub_date string, or '' when absent."""
        pub_date = pub_date.strip()
        if len(pub_date) >= 4 and pub_date[:4].isdigit():
            return pub_date[:4]
        return ""

    def _normalize_author_field(self, raw_authors: str) -> str:
        """Convert the semicolon-separated Meta author field to BibTeX 'and' form."""
        authors: List[str] = []
        for part in raw_authors.split(";"):
            cleaned = self._strip_bracketed_ids(part)
            cleaned = " ".join(cleaned.split())
            if cleaned:
                authors.append(cleaned)
        return " and ".join(authors)

    def _parse_venue_field(self, raw_venue: str) -> tuple[str, str]:
        """Split a venue field into (title, bracketed identifier string)."""
        raw_venue = raw_venue.strip()
        if not raw_venue:
            return "", ""
        if "[" not in raw_venue:
            return raw_venue, ""
        title, _, remainder = raw_venue.partition("[")
        return title.strip(), remainder.rstrip("] ").strip()

    def _strip_bracketed_ids(self, value: str) -> str:
        """Drop any trailing '[...]' identifier block from a Meta field."""
        return value.split("[", 1)[0].strip()

    def _map_entry_type(self, raw_type: str) -> str:
        """Map an OpenCitations type string to the closest BibTeX entry type."""
        lowered = raw_type.casefold()
        if lowered == "journal article":
            return "article"
        if lowered == "book":
            return "book"
        if lowered == "book chapter":
            return "incollection"
        if lowered in {"proceedings article", "conference paper"}:
            return "inproceedings"
        if "thesis" in lowered or "dissertation" in lowered:
            return "phdthesis"
        return "misc"

    def _citation_key(self, doi: str, openalex: str, authors: str, year: str, title: str) -> str:
        """Alphanumeric citation key: DOI, then OpenAlex id, then family+year+word."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if openalex:
            return "openalex" + "".join(ch for ch in openalex.lower() if ch.isalnum())
        family = authors.split(" and ")[0].split(",")[0].split()[-1] if authors else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
"""Open Library source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
from citegeist.sources._old_sources_compat import SourceClient
|
||||
|
||||
|
||||
class OpenLibrarySource(BibliographicSource):
    """Open Library source for broad book and monograph metadata."""

    SEARCH_URL = "https://openlibrary.org/search.json"
    WORK_URL = "https://openlibrary.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent`` and
                ``source_client`` (injectable, e.g. for tests).
        """
        super().__init__(config)
        user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Open Library is not queried by DOI; always returns None."""
        return None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best match for a title search, or None."""
        matches = self.search(title, limit=1)
        return matches[0] if matches else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search Open Library by title and normalize the result docs.

        Args:
            query: Free-text title query (whitespace is collapsed).
            limit: Maximum number of results to request (floored at 1).

        Returns:
            Normalized entries; empty on blank query or request failure.
        """
        title = " ".join(query.split())
        if not title:
            return []
        params = urllib.parse.urlencode({"title": title, "limit": max(1, limit), "fields": "*"})
        payload = self.source_client.try_get_json(f"{self.SEARCH_URL}?{params}")
        if not payload:
            return []
        docs = payload.get("docs", [])
        if not isinstance(docs, list):
            return []
        return [entry for record in docs if isinstance(record, dict) and (entry := self.normalize(record)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Convert an Open Library search doc into a book BibEntry.

        Returns:
            A BibEntry, or None when the doc has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        authors = self._join_list(record.get("author_name"))
        year = self._extract_year(record)
        publishers = self._join_list(record.get("publisher"))
        work_key = str(record.get("key") or "").strip()
        edition_keys = record.get("edition_key") or []
        isbn_values = record.get("isbn") or []

        fields: Dict[str, str] = {"title": title}
        if authors:
            fields["author"] = authors
        if year:
            fields["year"] = year
        if publishers:
            fields["publisher"] = publishers
        if work_key:
            fields["openlibrary_work"] = work_key
            # Work keys look like "/works/OL...W", so simple concatenation
            # yields the public URL.
            fields["url"] = f"{self.WORK_URL}{work_key}"
        if isinstance(edition_keys, list) and edition_keys:
            fields["openlibrary_edition"] = str(edition_keys[0])
        if isinstance(isbn_values, list) and isbn_values:
            fields["isbn"] = str(isbn_values[0])

        return BibEntry(
            entry_type="book",
            citation_key=self._citation_key(work_key, authors, year, title),
            fields=fields,
        )

    def get_identifier_scheme(self) -> str:
        """Return the identifier scheme for this source ("openlibrary")."""
        return "openlibrary"

    def _extract_year(self, record: Dict[str, Any]) -> str:
        """Pick the first publish year, falling back to the publish_year list."""
        first_publish_year = record.get("first_publish_year")
        if first_publish_year:
            return str(first_publish_year)
        publish_year = record.get("publish_year")
        if isinstance(publish_year, list) and publish_year:
            return str(publish_year[0])
        return ""

    def _join_list(self, value: Any) -> str:
        """Join a list field with " and ", skipping blank items ("" if not a list)."""
        if not isinstance(value, list):
            return ""
        items = [str(item).strip() for item in value if str(item).strip()]
        return " and ".join(items)

    def _citation_key(self, work_key: str, authors: str, year: str, title: str) -> str:
        """Build a deterministic citation key, preferring the OL work key.

        Falls back to "<family><year><first-title-word>" with placeholders
        for missing components.
        """
        if work_key:
            return "ol" + "".join(ch for ch in work_key.lower() if ch.isalnum())
        # Guard against author strings that are non-empty but contain no
        # word tokens: bare [-1] indexing would raise IndexError.
        name_tokens = authors.split(" and ")[0].split() if authors else []
        family = name_tokens[-1] if name_tokens else "book"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "book"
        title_words = title.split()
        first_word = "".join(ch for ch in (title_words[0] if title_words else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"
|
||||
|
|
@ -0,0 +1,253 @@
|
|||
"""
|
||||
Source registry for managing bibliographic source plugins.
|
||||
|
||||
This module provides a registry that can discover, load, and manage
|
||||
multiple bibliographic source plugins.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import inspect
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
@dataclass(slots=True)
class SourceRegistration:
    """Registration information for a source plugin."""

    # Registry key the source is stored under (defaults to the class name).
    name: str
    # Concrete plugin class; must be a BibliographicSource subclass.
    source_class: Type[BibliographicSource]
    # Configuration dict passed to the class constructor on instantiation.
    config: Dict[str, Any]
    # When False, SourceRegistry.get() refuses to instantiate the source.
    enabled: bool
|
||||
|
||||
|
||||
class SourceRegistry:
    """Registry for bibliographic source plugins.

    This class manages the discovery, registration, and instantiation
    of bibliographic source plugins. Instances are created lazily on
    first lookup and cached per source name.
    """

    def __init__(self) -> None:
        """Initialize an empty source registry."""
        # Registration metadata keyed by source name.
        self._registrations: Dict[str, SourceRegistration] = {}
        # Lazily created source instances, cached by name.
        self._instances: Dict[str, BibliographicSource] = {}

    def register(
        self,
        source_class: Type[BibliographicSource],
        name: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Register a source class.

        Args:
            source_class: The source class to register (must inherit from BibliographicSource)
            name: Optional name for the source (uses class name if not provided)
            config: Optional configuration dictionary; its ``enabled`` key
                (default True) controls whether :meth:`get` will instantiate
                the source

        Raises:
            ValueError: If source_class is not a BibliographicSource subclass.
        """
        if not inspect.isclass(source_class) or not issubclass(source_class, BibliographicSource):
            raise ValueError(f"{source_class} must be a subclass of BibliographicSource")

        source_name = name or source_class.__name__
        # Drop any cached instance so a re-registration with new config is
        # not shadowed by a stale object built from the old config.
        self._instances.pop(source_name, None)
        self._registrations[source_name] = SourceRegistration(
            name=source_name,
            source_class=source_class,
            config=config or {},
            enabled=config.get('enabled', True) if config else True,
        )

    def get(self, name: str) -> Optional[BibliographicSource]:
        """Get a source instance by name.

        Args:
            name: Name of the source

        Returns:
            Source instance if registered and enabled, None otherwise
        """
        registration = self._registrations.get(name)
        if registration is None:
            return None

        # Return cached instance if available.
        if name in self._instances:
            return self._instances[name]

        # Disabled sources are never instantiated.
        if not registration.enabled:
            return None

        instance = registration.source_class(config=registration.config)
        self._instances[name] = instance
        return instance

    def list_sources(self, enabled_only: bool = False) -> List[str]:
        """List registered source names.

        Args:
            enabled_only: Only return enabled sources

        Returns:
            List of source names
        """
        if enabled_only:
            return [name for name, reg in self._registrations.items() if reg.enabled]
        return list(self._registrations)

    def get_config(self, name: str) -> Optional[Dict[str, Any]]:
        """Get configuration for a source.

        Args:
            name: Name of the source

        Returns:
            Configuration dictionary, or None if not found
        """
        registration = self._registrations.get(name)
        return registration.config if registration else None

    def load_from_file(self, filepath: str) -> None:
        """Load source plugins from a Python file.

        Every BibliographicSource subclass visible in the module (including
        imported ones) is registered under its class name.

        Args:
            filepath: Path to Python file containing source classes

        Raises:
            ImportError: If the file cannot be loaded as a module.
        """
        spec = importlib.util.spec_from_file_location("module.sources", filepath)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load module from {filepath}")

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Find all classes that inherit from BibliographicSource.
        for _member_name, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, BibliographicSource) and obj is not BibliographicSource:
                self.register(obj)

    def load_from_directory(self, directory: str) -> None:
        """Load source plugins from a directory.

        Args:
            directory: Path to directory containing source plugin files
        """
        import os
        # Sort for a deterministic registration order across filesystems.
        for filename in sorted(os.listdir(directory)):
            if filename.endswith('.py') and not filename.startswith('_'):
                self.load_from_file(os.path.join(directory, filename))

    def from_config_dict(self, config: Dict[str, Any]) -> None:
        """Load sources from a configuration dictionary.

        Example config format:
            {
                "sources": {
                    "crossref": {
                        "source_type": "crossref",
                        "enabled": true
                    },
                    "semantic_scholar": {
                        "source_type": "semantic_scholar",
                        "enabled": true,
                        "api_key": "..."
                    }
                }
            }

        Args:
            config: Configuration dictionary
        """
        if 'sources' not in config:
            return

        for name, source_config in config['sources'].items():
            source_name = str(name)
            source_type = str(source_config.get('source_type', source_name))
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=source_config,
            )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize registry to dictionary.

        Returns:
            Mapping of source name to {'enabled': ..., 'config': ...}.
        """
        return {
            name: {
                'enabled': reg.enabled,
                'config': reg.config,
            }
            for name, reg in self._registrations.items()
        }

    def from_dict(self, data: Dict[str, Any]) -> None:
        """Load registry from dictionary.

        Accepts both the nested shape produced by :meth:`to_dict`
        ({'enabled': ..., 'config': {...}}) and a flat per-source mapping.

        Args:
            data: Dictionary representation of registry
        """
        for name, source_data in data.items():
            source_name = str(name)
            config = dict(source_data.get('config', source_data))
            # Preserve a top-level 'enabled' flag: to_dict() stores it
            # outside 'config', so without this a disabled source would
            # round-trip as enabled.
            if 'enabled' in source_data and 'enabled' not in config:
                config['enabled'] = source_data['enabled']
            # Look for source_type at both levels for the same reason.
            source_type = str(source_data.get('source_type') or config.get('source_type') or source_name)
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=config,
            )

    def get_registered_sources(self) -> List[SourceRegistration]:
        """Get all registered source registrations.

        Returns:
            List of SourceRegistration objects
        """
        return list(self._registrations.values())

    def _resolve_source_class(self, source_type: str) -> Type[BibliographicSource]:
        """Map a source-type string onto its plugin class (lazy imports).

        Raises:
            ValueError: If the source type is not recognized.
        """
        normalized = source_type.strip().lower().replace('-', '_')
        if normalized in {'crossref', 'cross_ref'}:
            from citegeist.sources.crossref import CrossRefSource

            return CrossRefSource
        if normalized in {'opencitations', 'open_citations'}:
            from citegeist.sources.opencitations import OpenCitationsSource

            return OpenCitationsSource
        if normalized == 'unpaywall':
            from citegeist.sources.unpaywall import UnpaywallSource

            return UnpaywallSource
        if normalized in {'europepmc', 'europe_pmc'}:
            from citegeist.sources.europepmc import EuropePmcSource

            return EuropePmcSource
        if normalized in {'semanticscholar', 'semantic_scholar'}:
            from citegeist.sources.semanticscholar import SemanticScholarSource

            return SemanticScholarSource
        if normalized in {"openlibrary", "open_library"}:
            from citegeist.sources.openlibrary import OpenLibrarySource

            return OpenLibrarySource
        raise ValueError(f"Unknown source type: {source_type}")
|
||||
|
||||
|
||||
# Module-level singleton so all callers share one registry.
_global_registry = SourceRegistry()


def get_registry() -> SourceRegistry:
    """Get the global source registry instance.

    Returns:
        The module-level SourceRegistry shared across the process.
    """
    return _global_registry
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
"""Semantic Scholar source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
class SemanticScholarSource(BibliographicSource):
    """Semantic Scholar source for broad scientific metadata coverage."""

    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    # Fields requested from the Graph API for both lookup and search.
    DEFAULT_FIELDS = (
        "paperId,title,year,abstract,authors,externalIds,journal,venue,url,"
        "openAccessPdf,citationCount,publicationTypes"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``api_key`` (falls
                back to the SEMANTIC_SCHOLAR_API_KEY environment variable)
                and ``user_agent``.
        """
        super().__init__(config)
        self.api_key = str(
            self.config.get("api_key")
            or os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
            or ""
        ).strip()
        self.user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a single paper by DOI.

        Returns:
            A normalized BibEntry, or None on blank DOI, miss, or error.
        """
        normalized = doi.strip()
        if not normalized:
            return None
        encoded = urllib.parse.quote(f"DOI:{normalized}", safe="")
        payload = self._get_json(f"{self.BASE_URL}/paper/{encoded}?fields={self.DEFAULT_FIELDS}")
        if not payload:
            return None
        return self.normalize(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best title-search match, or None."""
        matches = self.search(title, limit=1)
        return matches[0] if matches else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search papers by relevance.

        Args:
            query: Free-text query (whitespace is collapsed).
            limit: Maximum number of results to request (floored at 1).

        Returns:
            Normalized entries; empty on blank query or request failure.
        """
        query_text = " ".join(query.split())
        if not query_text:
            return []
        params = urllib.parse.urlencode(
            {"query": query_text, "limit": max(1, limit), "fields": self.DEFAULT_FIELDS}
        )
        payload = self._get_json(f"{self.BASE_URL}/paper/search?{params}")
        if not payload:
            return []
        return [entry for row in payload.get("data", []) if (entry := self.normalize(row)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Convert a Graph API paper record into a BibEntry.

        Returns:
            A BibEntry, or None when the record has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        external_ids = record.get("externalIds") or {}
        doi = str(external_ids.get("DOI") or "").strip()
        # "authors" may be present but null in API responses; coalesce to []
        # like the other nullable fields above.
        authors = " and ".join(
            str(author.get("name") or "").strip()
            for author in (record.get("authors") or [])
            if str(author.get("name") or "").strip()
        )
        year = str(record.get("year") or "").strip()
        abstract = str(record.get("abstract") or "").strip()
        journal = record.get("journal") or {}
        journal_name = str(journal.get("name") or record.get("venue") or "").strip()
        open_access_pdf = record.get("openAccessPdf") or {}

        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if paper_id := str(record.get("paperId") or "").strip():
            fields["semanticscholar_id"] = paper_id
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if abstract:
            fields["abstract"] = abstract
        if journal_name:
            # Conference venues go in "booktitle" per BibTeX convention.
            if self._entry_type(record) == "inproceedings":
                fields["booktitle"] = journal_name
            else:
                fields["journal"] = journal_name
        if url := str(open_access_pdf.get("url") or record.get("url") or "").strip():
            fields["url"] = url
        if open_access_pdf:
            fields["is_oa"] = "true"
        if citation_count := record.get("citationCount"):
            fields["semanticscholar_citation_count"] = str(citation_count)

        citation_key = self._citation_key(doi, str(record.get("paperId") or ""), authors, year, title)
        return BibEntry(entry_type=self._entry_type(record), citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the best URL (OA PDF preferred) for a DOI, or None."""
        entry = self.lookup_by_doi(doi)
        if entry is None:
            return None
        return entry.fields.get("url")

    def get_identifier_scheme(self) -> str:
        """Return the identifier scheme for this source ("doi")."""
        return "doi"

    def _entry_type(self, record: Dict[str, Any]) -> str:
        """Infer a BibTeX entry type from publicationTypes and venue data."""
        publication_types = [str(item).lower() for item in (record.get("publicationTypes") or [])]
        if any("conference" in item for item in publication_types):
            return "inproceedings"
        if any("review" in item for item in publication_types):
            return "article"
        if record.get("journal") or record.get("venue"):
            return "article"
        return "misc"

    def _citation_key(self, doi: str, paper_id: str, authors: str, year: str, title: str) -> str:
        """Build a deterministic citation key, preferring DOI, then paper id.

        Falls back to "<family><year><first-title-word>" with placeholders
        for missing components.
        """
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if paper_id:
            return "s2" + "".join(ch for ch in paper_id.lower() if ch.isalnum())
        # Guard against author strings that are non-empty but contain no
        # word tokens: bare [-1] indexing would raise IndexError.
        name_tokens = authors.split(" and ")[0].split() if authors else []
        family = name_tokens[-1] if name_tokens else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        title_words = title.split()
        first_word = "".join(ch for ch in (title_words[0] if title_words else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

    def _get_json(self, url: str) -> Dict[str, Any] | None:
        """Fetch and decode JSON from the API; best-effort (None on any error)."""
        headers = {"User-Agent": self.user_agent}
        if self.api_key:
            headers["x-api-key"] = self.api_key
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read().decode("utf-8"))
        except Exception:
            # Deliberate best-effort: network/HTTP/JSON failures all yield
            # None so callers degrade gracefully.
            return None
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
"""Unpaywall source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources._old_sources_compat import SourceClient
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
class UnpaywallSource(BibliographicSource):
    """Unpaywall source for DOI-based OA link enrichment."""

    BASE_URL = "https://api.unpaywall.org/v2"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent``,
                ``source_client`` (injectable, e.g. for tests), and
                ``email`` (falls back to UNPAYWALL_EMAIL, then NCBI_EMAIL).
        """
        super().__init__(config)
        # Coerce through str() for consistency with the other source plugins.
        user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)
        # Unpaywall requires a contact email on every request.
        self.email = str(
            self.config.get("email")
            or os.environ.get("UNPAYWALL_EMAIL")
            or os.environ.get("NCBI_EMAIL")
            or ""
        ).strip()

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Fetch and normalize the OA record for a DOI (None on miss)."""
        payload = self.lookup_oa_record(doi)
        if not payload:
            return None
        return self.normalize(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Unpaywall has no title lookup; always returns None."""
        return None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Unpaywall has no search endpoint; always returns []."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Convert an Unpaywall record into a BibEntry of OA metadata.

        Returns:
            A BibEntry, or None when the record lacks a DOI.
        """
        doi = str(record.get("doi") or "").strip()
        # Synthesize a title when the record lacks one but has a DOI.
        title = str(record.get("title") or "").strip() or (doi and f"OA record for DOI {doi}")
        if not doi or not title:
            return None

        fields: Dict[str, str] = {
            "title": title,
            "doi": doi,
        }
        if year := str(record.get("year") or "").strip():
            fields["year"] = year
        if landing_url := self._best_landing_url(record):
            fields["url"] = landing_url
            fields["best_oa_url"] = landing_url
        if pdf_url := self._best_pdf_url(record):
            fields["best_oa_pdf_url"] = pdf_url
        if oa_status := str(record.get("oa_status") or "").strip():
            fields["oa_status"] = oa_status
        if license_name := self._best_license(record):
            fields["oa_license"] = license_name
        if host_type := self._best_host_type(record):
            fields["oa_host_type"] = host_type
        if version := self._best_version(record):
            fields["oa_version"] = version
        if evidence := self._best_evidence(record):
            fields["oa_evidence"] = evidence
        if record.get("is_oa") is not None:
            fields["is_oa"] = "true" if bool(record.get("is_oa")) else "false"

        citation_key = "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        return BibEntry(entry_type="misc", citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the best OA PDF (preferred) or landing-page URL for a DOI."""
        payload = self.lookup_oa_record(doi)
        if not payload:
            return None
        return self._best_pdf_url(payload) or self._best_landing_url(payload)

    def get_identifier_scheme(self) -> str:
        """Return the identifier scheme for this source ("doi")."""
        return "doi"

    def is_available(self) -> bool:
        """The source is usable only when enabled and a contact email is set."""
        return self.enabled and bool(self.email)

    def lookup_oa_record(self, doi: str) -> Dict[str, Any] | None:
        """Fetch the raw Unpaywall JSON record for a DOI.

        Returns:
            The decoded payload, or None on blank DOI, missing email, or
            request failure.
        """
        normalized = doi.strip()
        if not normalized or not self.email:
            return None
        encoded = urllib.parse.quote(normalized, safe="")
        query = urllib.parse.urlencode({"email": self.email})
        return self.source_client.try_get_json(f"{self.BASE_URL}/{encoded}?{query}")

    def _best_location_value(self, payload: Dict[str, Any], *keys: str) -> str:
        """Return the first non-empty value among ``keys`` in best_oa_location.

        Shared helper behind the ``_best_*`` accessors below; "" when the
        location is missing or no key has a non-blank value.
        """
        location = payload.get("best_oa_location") or {}
        for key in keys:
            value = str(location.get(key) or "").strip()
            if value:
                return value
        return ""

    def _best_landing_url(self, payload: Dict[str, Any]) -> str:
        """Best landing URL, preferring "url" over "url_for_landing_page"."""
        return self._best_location_value(payload, "url", "url_for_landing_page")

    def _best_pdf_url(self, payload: Dict[str, Any]) -> str:
        """Direct PDF URL of the best OA location, if any."""
        return self._best_location_value(payload, "url_for_pdf")

    def _best_license(self, payload: Dict[str, Any]) -> str:
        """License of the best OA location, if any."""
        return self._best_location_value(payload, "license")

    def _best_host_type(self, payload: Dict[str, Any]) -> str:
        """Host type of the best OA location, if any."""
        return self._best_location_value(payload, "host_type")

    def _best_version(self, payload: Dict[str, Any]) -> str:
        """Manuscript version of the best OA location, if any."""
        return self._best_location_value(payload, "version")

    def _best_evidence(self, payload: Dict[str, Any]) -> str:
        """Evidence string of the best OA location, if any."""
        return self._best_location_value(payload, "evidence")
|
||||
|
|
@ -138,6 +138,7 @@ class TalkOriginsEnrichmentResult:
|
|||
applied: bool
|
||||
source_label: str = ""
|
||||
weak_reasons_after: list[str] | None = None
|
||||
resolution_attempts: list[dict[str, object]] | None = None
|
||||
conflicts: list[dict[str, str]] | None = None
|
||||
error: str = ""
|
||||
|
||||
|
|
@ -545,8 +546,28 @@ class TalkOriginsScraper:
|
|||
if not weak_reasons_before:
|
||||
continue
|
||||
resolution = None
|
||||
attempts: list[dict[str, object]] = []
|
||||
error = ""
|
||||
try:
|
||||
resolver_with_trace = getattr(self.resolver, "resolve_entry_with_trace", None)
|
||||
resolver_plain = getattr(self.resolver, "resolve_entry", None)
|
||||
plain_func = getattr(resolver_plain, "__func__", None)
|
||||
trace_func = getattr(resolver_with_trace, "__func__", None)
|
||||
use_trace = (
|
||||
resolver_with_trace is not None
|
||||
and (
|
||||
trace_func is None
|
||||
or (
|
||||
plain_func is MetadataResolver.resolve_entry
|
||||
and trace_func is MetadataResolver.resolve_entry_with_trace
|
||||
)
|
||||
)
|
||||
)
|
||||
if use_trace:
|
||||
outcome = self.resolver.resolve_entry_with_trace(canonical)
|
||||
resolution = outcome.resolution
|
||||
attempts = [asdict(attempt) for attempt in outcome.attempts]
|
||||
else:
|
||||
resolution = self.resolver.resolve_entry(canonical)
|
||||
except Exception as exc:
|
||||
error = str(exc)
|
||||
|
|
@ -559,6 +580,7 @@ class TalkOriginsScraper:
|
|||
applied=False,
|
||||
source_label=resolution.source_label if resolution is not None else "",
|
||||
error=error,
|
||||
resolution_attempts=attempts,
|
||||
)
|
||||
|
||||
if resolution is not None:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,123 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.resolve import MetadataResolver
|
||||
from citegeist.sources import EuropePmcSource, SourceRegistry, list_source_catalog
|
||||
|
||||
|
||||
def test_europepmc_source_normalizes_core_record() -> None:
    """normalize() maps a Europe PMC core record onto BibEntry fields."""
    raw_record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "pmcid": "PMC10000001",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
        "journalVolume": "16",
        "issue": "1",
        "pageInfo": "10-20",
        "abstractText": "Abstract text.",
        "isOpenAccess": "Y",
        "citedByCount": 12,
        "fullTextUrlList": {"fullTextUrl": [{"url": "https://europepmc.org/articles/PMC10000001?pdf=render"}]},
    }

    entry = EuropePmcSource(config={}).normalize(raw_record)

    assert entry is not None
    expected_fields = {
        "doi": "10.1000/example",
        "pmid": "37158217",
        "pmcid": "PMC10000001",
        "journal": "Biomed Journal",
        "url": "https://europepmc.org/articles/PMC10000001?pdf=render",
        "is_oa": "true",
    }
    for field_name, expected_value in expected_fields.items():
        assert entry.fields[field_name] == expected_value
|
||||
|
||||
|
||||
def test_europepmc_registry_and_catalog() -> None:
    """Config-driven registration yields a EuropePmcSource; the catalog tracks it."""
    registry_config = {
        "sources": {
            "europepmc": {
                "source_type": "europepmc",
                "enabled": True,
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(registry_config)
    assert isinstance(registry.get("europepmc"), EuropePmcSource)

    catalog = {item.key: item for item in list_source_catalog()}
    europe_pmc_entry = catalog["europe_pmc"]
    assert europe_pmc_entry.current_status == "integrated"
    assert europe_pmc_entry.priority == "now"
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_europepmc_doi_after_primary_lookups_fail() -> None:
    """When Crossref/DataCite miss, the resolver falls back to Europe PMC by DOI."""
    from citegeist.bibtex import BibEntry

    europepmc_record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
    }

    resolver = MetadataResolver()
    # Force the primary DOI lookups to miss so the Europe PMC path runs.
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.europepmc.lookup_by_doi = lambda _doi: resolver.europepmc.normalize(europepmc_record)  # type: ignore[method-assign]

    seed_entry = BibEntry(
        entry_type="article",
        citation_key="seed2024",
        fields={"doi": "10.1000/example", "title": "Biomedical Example"},
    )
    result = resolver.resolve_entry(seed_entry)

    assert result is not None
    assert result.source_label == "europepmc:doi:10.1000/example"
    assert result.entry.fields["pmid"] == "37158217"
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_europepmc_title_search_after_pubmed() -> None:
    """When all earlier title searches miss, the Europe PMC search is used."""
    from citegeist.bibtex import BibEntry

    europepmc_record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
    }

    resolver = MetadataResolver()
    # Knock out every earlier title-search backend so Europe PMC is reached.
    resolver.search_crossref_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_datacite_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_openalex_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_pubmed_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.europepmc.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.europepmc.normalize(europepmc_record)
    ]

    seed_entry = BibEntry(
        entry_type="article",
        citation_key="seed2024",
        fields={"title": "Biomedical Example", "author": "Doe J", "year": "2024"},
    )
    result = resolver.resolve_entry(seed_entry)

    assert result is not None
    assert result.source_label == "europepmc:search:Biomedical Example"
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.expand import OpenCitationsExpander
|
||||
from citegeist.sources import OpenCitationsSource
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
def test_opencitations_source_normalizes_metadata_row() -> None:
    """normalize() splits OpenCitations compound ids, authors, and venue."""
    metadata_row = {
        "id": "doi:10.1000/example openalex:W1234567890 omid:br/06123",
        "title": "Example Work",
        "author": "Doe, Jane [omid:ra/1]; Roe, Alex [omid:ra/2]",
        "pub_date": "2024-05",
        "venue": "Journal of Examples [issn:1234-5678]",
        "volume": "12",
        "issue": "3",
        "page": "10-20",
        "type": "journal article",
        "publisher": "Example Press [crossref:123]",
    }

    entry = OpenCitationsSource(config={}).normalize(metadata_row)

    assert entry is not None
    assert entry.fields["doi"] == "10.1000/example"
    assert entry.fields["openalex"] == "W1234567890"
    assert entry.fields["author"] == "Doe, Jane and Roe, Alex"
    assert entry.fields["journal"] == "Journal of Examples"
    assert entry.fields["publisher"] == "Example Press"
    assert entry.fields["year"] == "2024"
|
||||
|
||||
|
||||
def test_opencitations_source_builds_edges_for_references() -> None:
    """A raw reference row from the API becomes a single DOI-keyed citation edge."""
    source = OpenCitationsSource(config={})
    stub_rows = [
        {
            "oci": "1-2",
            "citing": "omid:br/1 doi:10.1000/source",
            "cited": "omid:br/2 doi:10.1000/target",
            "creation": "2024-01-01",
        }
    ]
    # No network: every API call is answered with the canned row above.
    source.source_client.get_json = lambda _url: list(stub_rows)  # type: ignore[method-assign]

    edges = source.get_citations("10.1000/source", relation_type="cites", limit=10)

    assert len(edges) == 1
    edge = edges[0]
    assert edge.source_work_id == "doi:10.1000/source"
    assert edge.target_work_id == "doi:10.1000/target"
|
||||
|
||||
|
||||
def test_opencitations_expander_creates_reference_nodes_and_relations() -> None:
    """Expanding a seed entry's references stores the cited work and a cites-relation.

    The OpenCitations client is stubbed per endpoint: /references/ URLs return one
    citation row, any other URL returns the cited work's metadata record.  DOI
    resolvers are disabled so the discovered entry's metadata can only come from
    the stubbed metadata endpoint.
    """
    store = BibliographyStore()
    try:
        # Seed the store with the citing work (DOI 10.1000/source).
        store.ingest_bibtex(
            """
@article{seed2024,
    author = {Seed, Alice},
    title = {Seed Paper},
    year = {2024},
    doi = {10.1000/source}
}
"""
        )

        expander = OpenCitationsExpander()
        # Dispatch on the requested URL: citation rows vs. work metadata.
        expander.source.source_client.get_json = lambda url: [  # type: ignore[method-assign]
            {
                "oci": "1-2",
                "citing": "omid:br/1 doi:10.1000/source",
                "cited": "omid:br/2 doi:10.1000/target",
                "creation": "2024-01-01",
            }
        ] if "/references/" in url else [
            {
                "id": "doi:10.1000/target omid:br/2",
                "title": "Target Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2023",
                "venue": "Journal of Targets [issn:1111-1111]",
                "type": "journal article",
            }
        ]
        # Disable external DOI resolution so only the stub supplies metadata.
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

        results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=10)

        # The cited work is keyed by a slug derived from its DOI.
        assert [item.discovered_citation_key for item in results] == ["doi101000target"]
        discovered = store.get_entry("doi101000target")
        assert discovered is not None
        assert discovered["title"] == "Target Work"
        # The seed now points at the discovered work.
        assert store.get_relations("seed2024") == ["doi101000target"]
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_opencitations_expander_supports_cited_by_direction() -> None:
    """Expansion in the cited_by direction links the discovered citer back to the seed.

    Mirror of the references test: here the /citations/ endpoint is stubbed with
    one incoming citation, so the discovered "Citing Work" should end up with a
    relation pointing at the seed entry.
    """
    store = BibliographyStore()
    try:
        # Seed the store with the cited work (DOI 10.1000/seed).
        store.ingest_bibtex(
            """
@article{seed2024,
    author = {Seed, Alice},
    title = {Seed Paper},
    year = {2024},
    doi = {10.1000/seed}
}
"""
        )

        expander = OpenCitationsExpander()
        # Dispatch on the requested URL: incoming-citation rows vs. work metadata.
        expander.source.source_client.get_json = lambda url: [  # type: ignore[method-assign]
            {
                "oci": "2-1",
                "citing": "omid:br/2 doi:10.1000/citing",
                "cited": "omid:br/1 doi:10.1000/seed",
                "creation": "2024-01-01",
            }
        ] if "/citations/" in url else [
            {
                "id": "doi:10.1000/citing omid:br/2",
                "title": "Citing Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2025",
                "venue": "Journal of Citers [issn:1111-1111]",
                "type": "journal article",
            }
        ]
        # Disable external DOI resolution so only the stub supplies metadata.
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

        results = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=10)

        assert [item.discovered_citation_key for item in results] == ["doi101000citing"]
        # Edge direction is reversed relative to the "cites" case: citer -> seed.
        assert store.get_relations("doi101000citing") == ["seed2024"]
    finally:
        store.close()
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.resolve import MetadataResolver
|
||||
from citegeist.sources import OpenLibrarySource, SourceRegistry, list_source_catalog
|
||||
|
||||
|
||||
class FakeSourceClient:
    """Minimal HTTP-client stand-in that answers every request with one canned payload."""

    def __init__(self, payload: dict[str, object]) -> None:
        # Stored by reference; callers receive shallow copies from try_get_json.
        self.payload = payload

    def try_get_json(self, _url: str) -> dict[str, object]:
        """Ignore the URL and return a shallow copy of the configured payload."""
        return {**self.payload}
|
||||
|
||||
|
||||
def test_openlibrary_source_normalizes_book_record() -> None:
    """A raw Open Library search doc becomes a BibTeX book entry with mapped fields."""
    raw_doc = {
        "title": "The Nature of the Stratigraphic Record",
        "author_name": ["D. V. Ager"],
        "first_publish_year": 1973,
        "publisher": ["Macmillan"],
        "key": "/works/OL82563W",
        "edition_key": ["OL12345M"],
        "isbn": ["9781234567890"],
    }

    # No HTTP traffic: the fake client answers every request with an empty payload.
    source = OpenLibrarySource(config={"source_client": FakeSourceClient({})})
    entry = source.normalize(raw_doc)

    assert entry is not None
    assert entry.entry_type == "book"
    expected = {
        "title": "The Nature of the Stratigraphic Record",
        "author": "D. V. Ager",
        "year": "1973",
        "publisher": "Macmillan",
        "openlibrary_work": "/works/OL82563W",
        "openlibrary_edition": "OL12345M",
        "isbn": "9781234567890",
    }
    for field_name, value in expected.items():
        assert entry.fields[field_name] == value
|
||||
|
||||
|
||||
def test_openlibrary_registry_and_catalog() -> None:
    """The registry builds an OpenLibrarySource from config; the catalog marks it integrated."""
    config = {
        "sources": {
            "openlibrary": {
                "source_type": "openlibrary",
                "enabled": True,
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("openlibrary"), OpenLibrarySource)

    catalog = {item.key: item for item in list_source_catalog()}
    open_library = catalog["open_library"]
    assert open_library.current_status == "integrated"
    assert "book_metadata" in open_library.capabilities
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_openlibrary_after_other_searches_fail() -> None:
    """When every other title search is empty, the resolver falls back to Open Library."""
    resolver = MetadataResolver()
    # All higher-priority search backends report no candidates.
    for search_name in (
        "search_crossref",
        "search_datacite",
        "search_openalex",
        "search_pubmed",
        "search_europepmc",
        "search_semanticscholar",
    ):
        setattr(resolver, search_name, lambda _title, limit=5: [])

    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The Nature of the Stratigraphic Record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]

    seed = BibEntry(
        entry_type="book",
        citation_key="seed1973",
        fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
    )
    result = resolver.resolve_entry(seed)

    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
|
||||
|
||||
|
||||
def test_metadata_resolver_trace_records_fallback_attempts() -> None:
    """The resolution trace lists the failed backends before the Open Library match."""
    resolver = MetadataResolver()
    # Every higher-priority title search misses.
    for search_name in (
        "search_crossref",
        "search_datacite",
        "search_openalex",
        "search_pubmed",
        "search_europepmc",
        "search_semanticscholar",
    ):
        setattr(resolver, search_name, lambda _title, limit=5: [])

    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    ]

    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="book",
            citation_key="seed1980",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    )

    assert outcome.resolution is not None
    assert outcome.resolution.source_label == "openlibrary:search:Example Book"
    # The final two attempts are the last miss and the successful fallback.
    trailing_sources = [attempt.source_name for attempt in outcome.attempts[-2:]]
    assert trailing_sources == ["semanticscholar", "openlibrary"]
    final_attempt = outcome.attempts[-1]
    assert final_attempt.matched is True
    assert final_attempt.candidate_count == 1
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_fuzzy_catalog_match_for_book_titles() -> None:
    """An Open Library title with a minor spelling variant still counts as a match."""
    resolver = MetadataResolver()
    # All higher-priority search backends report no candidates.
    for search_name in (
        "search_crossref",
        "search_datacite",
        "search_openalex",
        "search_pubmed",
        "search_europepmc",
        "search_semanticscholar",
    ):
        setattr(resolver, search_name, lambda _title, limit=5: [])

    # Note the variant spelling "stratigraphical" vs. the seed's "Stratigraphic".
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The nature of the stratigraphical record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="book",
            citation_key="seed1973",
            fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
        )
    )

    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
|
||||
|
||||
|
||||
def test_metadata_resolver_skips_openlibrary_for_article_like_entries() -> None:
    """Journal-article entries never trigger an Open Library lookup."""
    resolver = MetadataResolver()
    # All other search backends report no candidates.
    for search_name in (
        "search_crossref",
        "search_datacite",
        "search_openalex",
        "search_pubmed",
        "search_europepmc",
        "search_semanticscholar",
    ):
        setattr(resolver, search_name, lambda _title, limit=5: [])

    # Record every title Open Library is asked about; it should stay empty.
    openlibrary_queries: list[str] = []

    def fake_openlibrary(_title: str, limit: int = 5) -> list[BibEntry]:
        openlibrary_queries.append(_title)
        return []

    resolver.search_openlibrary = fake_openlibrary  # type: ignore[method-assign]

    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="article",
            citation_key="seed1977",
            fields={
                "title": "Fast locomotion of some African ungulates",
                "author": "Alexander, R. M.",
                "year": "1977",
                "journal": "Journal of Zoology",
            },
        )
    )

    assert outcome.resolution is None
    assert openlibrary_queries == []
    assert all(attempt.source_name != "openlibrary" for attempt in outcome.attempts)
|
||||
|
|
@ -0,0 +1,201 @@
|
|||
"""Tests for identifier resolution and normalization."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from citegeist.resolver import (
|
||||
IdentifierExtractor,
|
||||
IdentifierNormalizer,
|
||||
IdentifierResolver,
|
||||
extract_identifiers,
|
||||
normalize_identifier,
|
||||
get_primary_identifier,
|
||||
resolve_identifiers,
|
||||
)
|
||||
|
||||
|
||||
class TestIdentifierExtractor:
    """Behaviour of IdentifierExtractor.extract on raw entry fields."""

    def test_extract_from_entry(self):
        """Identifier fields are picked out; descriptive fields are dropped."""
        entry_fields = {
            'doi': '10.1234/example',
            'title': 'Test Title',
            'author': 'John Doe',
            'pmid': '123456',
        }

        identifiers = IdentifierExtractor.extract(entry_fields)

        assert identifiers.get('doi') == '10.1234/example'
        assert identifiers.get('pmid') == '123456'
        # Descriptive metadata such as the title is not an identifier.
        assert 'title' not in identifiers

    def test_extract_multiple_identifiers(self):
        """Every supported identifier scheme present in the fields is extracted."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '123456',
            'arxiv': '2310.12345',
            'isbn': '978-0-123456-78-9',
        }

        identifiers = IdentifierExtractor.extract(entry_fields)

        # All four schemes survive extraction unchanged, and nothing extra appears.
        assert identifiers == entry_fields
|
||||
|
||||
|
||||
class TestIdentifierNormalizer:
    """Normalization rules for each supported identifier scheme."""

    def test_normalize_doi(self):
        """DOIs are lowercased; strings without a DOI shape are rejected."""
        for raw, expected in [
            ('10.1234/EXAMPLE', '10.1234/example'),
            ('10.1234/test', '10.1234/test'),
            ('invalid', None),
        ]:
            assert IdentifierNormalizer.normalize_doi(raw) == expected

    def test_normalize_pmid(self):
        """Numeric PMIDs pass through; non-numeric strings are rejected."""
        for raw, expected in [
            ('12345', '12345'),
            ('1234567', '1234567'),
            ('invalid', None),
        ]:
            assert IdentifierNormalizer.normalize_pmid(raw) == expected

    def test_normalize_pmcid(self):
        """PMCIDs are lowercased; strings without the PMC prefix are rejected."""
        for raw, expected in [
            ('PMC12345', 'pmc12345'),
            ('PMCabcdef', 'pmcabcdef'),
            ('invalid', None),
        ]:
            assert IdentifierNormalizer.normalize_pmcid(raw) == expected

    def test_normalize_arxiv(self):
        """Version suffixes are stripped from arXiv ids; malformed ids are rejected."""
        for raw, expected in [
            ('2310.12345', '2310.12345'),
            ('2310.12345v1', '2310.12345'),
            ('INVALID', None),
        ]:
            assert IdentifierNormalizer.normalize_arxiv(raw) == expected

    def test_normalize_orcid(self):
        """Only the canonical hyphenated ORCID form is accepted."""
        assert IdentifierNormalizer.normalize_orcid('0000-0001-2345-6789') == '0000-0001-2345-6789'
        # Space-separated groups are not the canonical format, nor is free text.
        for bad in ('0000 0001 2345 6789', 'invalid'):
            assert IdentifierNormalizer.normalize_orcid(bad) is None

    def test_normalize_identifier(self):
        """Scheme dispatch: known schemes yield a (scheme, value) pair, unknown yield None."""
        assert IdentifierNormalizer.normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')
        assert IdentifierNormalizer.normalize_identifier('pmid', '12345') == ('pmid', '12345')
        assert IdentifierNormalizer.normalize_identifier('invalid', 'value') is None
|
||||
|
||||
|
||||
class TestIdentifierResolver:
    """Resolution of entry fields into (scheme, value) identifier tuples."""

    def test_resolve_with_doi(self):
        """An entry carrying a DOI yields at least one doi-scheme tuple."""
        resolved = IdentifierResolver.resolve({'doi': '10.1234/example', 'title': 'Test Title'})

        assert len(resolved) >= 1
        assert any(item[0] == 'doi' for item in resolved)

    def test_resolve_with_multiple_identifiers(self):
        """Several identifier fields resolve to several tuples, DOI among them."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'arxiv': '2310.12345',
        }

        resolved = IdentifierResolver.resolve(entry_fields)

        assert len(resolved) >= 2
        assert any(item[0] == 'doi' for item in resolved)

    def test_resolve_without_identifiers(self):
        """Entries with no identifier fields still get a title fingerprint."""
        resolved = IdentifierResolver.resolve({'title': 'Test Title', 'author': 'John Doe'})

        assert len(resolved) >= 1
        assert any(item[0] == 'title' for item in resolved)

    def test_get_primary_identifier(self):
        """DOI outranks PMID when both identifiers are present."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'title': 'Test Title',
        }

        primary = IdentifierResolver.get_primary_identifier(entry_fields)

        assert primary is not None
        assert primary[0] == 'doi'

    def test_get_scheme_value(self):
        """Lookup by scheme returns the stored value, or None when absent."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
        }

        assert IdentifierResolver.get_scheme_value('doi', entry_fields) == '10.1234/example'
        assert IdentifierResolver.get_scheme_value('pmid', entry_fields) == '12345'
        assert IdentifierResolver.get_scheme_value('isbn', entry_fields) is None
|
||||
|
||||
|
||||
class TestConvenienceFunctions:
    """The module-level wrapper functions mirror the class-based API."""

    def test_extract_identifiers(self):
        """extract_identifiers surfaces every identifier field."""
        identifiers = extract_identifiers({'doi': '10.1234/example', 'pmid': '12345'})

        assert 'doi' in identifiers
        assert 'pmid' in identifiers

    def test_normalize_identifier(self):
        """normalize_identifier returns the normalized (scheme, value) pair."""
        assert normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')

    def test_get_primary_identifier(self):
        """get_primary_identifier picks the best available identifier."""
        assert get_primary_identifier({'doi': '10.1234/example'}) == ('doi', '10.1234/example')

    def test_resolve_identifiers(self):
        """resolve_identifiers yields at least one tuple for a DOI-bearing entry."""
        assert len(resolve_identifiers({'doi': '10.1234/example'})) > 0
|
||||
|
|
@ -0,0 +1,117 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.resolve import MetadataResolver
|
||||
from citegeist.sources import SemanticScholarSource, SourceRegistry, list_source_catalog
|
||||
|
||||
|
||||
def test_semanticscholar_source_normalizes_record() -> None:
    """A Semantic Scholar paper record is normalized into BibTeX-style fields."""
    raw_record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "abstract": "Abstract text.",
        "authors": [{"name": "Jane Doe"}, {"name": "Alex Roe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "openAccessPdf": {"url": "https://example.org/paper.pdf"},
        "citationCount": 42,
        "publicationTypes": ["JournalArticle"],
    }

    entry = SemanticScholarSource(config={}).normalize(raw_record)

    assert entry is not None
    expected = {
        "doi": "10.1000/physics",
        "author": "Jane Doe and Alex Roe",
        "journal": "Physical Review Example",
        "url": "https://example.org/paper.pdf",
        "is_oa": "true",
        "semanticscholar_citation_count": "42",
    }
    for field_name, value in expected.items():
        assert entry.fields[field_name] == value
|
||||
|
||||
|
||||
def test_semanticscholar_registry_and_catalog() -> None:
    """The registry builds a SemanticScholarSource; the catalog marks it integrated."""
    config = {
        "sources": {
            "semanticscholar": {
                "source_type": "semanticscholar",
                "enabled": True,
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("semanticscholar"), SemanticScholarSource)

    catalog = {item.key: item for item in list_source_catalog()}
    semantic_scholar = catalog["semantic_scholar"]
    assert semantic_scholar.current_status == "integrated"
    assert semantic_scholar.priority == "now"
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_semanticscholar_doi_after_other_lookups_fail() -> None:
    """When earlier DOI lookups miss, the Semantic Scholar DOI lookup resolves the entry."""
    from citegeist.bibtex import BibEntry

    resolver = MetadataResolver()
    # Every earlier DOI lookup in the chain reports a miss.
    for lookup_name in ("resolve_doi", "resolve_datacite_doi", "resolve_europepmc_doi"):
        setattr(resolver, lookup_name, lambda _doi: None)

    s2_record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "authors": [{"name": "Jane Doe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "publicationTypes": ["JournalArticle"],
    }

    def fake_lookup_by_doi(_doi):
        return resolver.semanticscholar.normalize(s2_record)

    resolver.semanticscholar.lookup_by_doi = fake_lookup_by_doi  # type: ignore[method-assign]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"doi": "10.1000/physics", "title": "Physics Example"},
        )
    )

    assert result is not None
    assert result.source_label == "semanticscholar:doi:10.1000/physics"
    assert result.entry.fields["journal"] == "Physical Review Example"
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_semanticscholar_title_search_after_other_searches_fail() -> None:
    """When every best-match title search misses, the Semantic Scholar search wins."""
    from citegeist.bibtex import BibEntry

    resolver = MetadataResolver()
    # All higher-priority best-match searches report no result.
    for search_name in (
        "search_crossref_best_match",
        "search_datacite_best_match",
        "search_openalex_best_match",
        "search_pubmed_best_match",
        "search_europepmc_best_match",
    ):
        setattr(resolver, search_name, lambda *args, **kwargs: None)

    s2_record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "authors": [{"name": "Jane Doe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "publicationTypes": ["JournalArticle"],
    }
    resolver.semanticscholar.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.semanticscholar.normalize(s2_record)
    ]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"title": "Physics Example", "author": "Jane Doe", "year": "2024"},
        )
    )

    assert result is not None
    assert result.source_label == "semanticscholar:search:Physics Example"
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.sources import CrossRefSource, OpenCitationsSource, SourceRegistry, list_source_catalog, prioritized_source_keys
|
||||
|
||||
|
||||
def test_catalog_prioritizes_existing_core_sources() -> None:
    """The six already-integrated core sources occupy the first priority slots."""
    expected_core = ["crossref", "datacite", "europe_pmc", "openalex", "open_citations", "pubmed"]
    assert prioritized_source_keys()[:6] == expected_core
|
||||
|
||||
|
||||
def test_catalog_includes_open_citation_and_access_sources() -> None:
    """OpenCitations and Unpaywall are catalogued; OpenCitations is a 'now' priority."""
    catalog = {item.key: item for item in list_source_catalog()}

    assert "open_citations" in catalog
    assert "unpaywall" in catalog

    open_citations = catalog["open_citations"]
    assert open_citations.priority == "now"
    assert "doi_citations" in open_citations.capabilities
|
||||
|
||||
|
||||
def test_registry_loads_known_source_from_config() -> None:
    """A config dict with a known source_type yields an instance of that source class."""
    config = {
        "sources": {
            "crossref": {
                "source_type": "crossref",
                "enabled": True,
            }
        }
    }

    registry = SourceRegistry()
    registry.from_config_dict(config)

    assert isinstance(registry.get("crossref"), CrossRefSource)
|
||||
|
||||
|
||||
def test_registry_rejects_unknown_source_type() -> None:
    """Loading a config whose source_type is unrecognised raises ValueError."""
    registry = SourceRegistry()
    bad_config = {"sources": {"mystery": {"source_type": "mystery"}}}

    caught: ValueError | None = None
    try:
        registry.from_config_dict(bad_config)
    except ValueError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("expected ValueError for unknown source type")
    assert "Unknown source type" in str(caught)
|
||||
|
||||
|
||||
def test_registry_loads_opencitations_from_config() -> None:
    """The opencitations source_type is wired up in the registry's factory table."""
    config = {
        "sources": {
            "opencitations": {
                "source_type": "opencitations",
                "enabled": True,
            }
        }
    }

    registry = SourceRegistry()
    registry.from_config_dict(config)

    assert isinstance(registry.get("opencitations"), OpenCitationsSource)
|
||||
|
|
@ -0,0 +1,171 @@
|
|||
"""Tests for the source plugin architecture."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from citegeist.sources import BibliographicSource, SourceRegistry, CrossRefSource
|
||||
|
||||
|
||||
class MockSource(BibliographicSource):
    """Never-matching source double that records every lookup it receives."""

    def __init__(self, config: dict | None = None):
        super().__init__(config)
        # Chronological (kind, value) pairs, one per lookup call.
        self.lookup_calls: list[tuple[str, str]] = []

    def _record(self, kind: str, value: str) -> None:
        """Append one lookup to the call log."""
        self.lookup_calls.append((kind, value))

    def lookup_by_doi(self, doi: str) -> None:
        """Log the DOI lookup and report not-found (None)."""
        self._record('doi', doi)

    def lookup_by_title(self, title: str) -> None:
        """Log the title lookup and report not-found (None)."""
        self._record('title', title)

    def search(self, query: str, limit: int = 10) -> list:
        """Searches never yield results."""
        return []

    def normalize(self, record: dict) -> None:
        """Nothing can be normalized by this stub."""
        return None
|
||||
|
||||
|
||||
def test_source_base_interface():
    """BibliographicSource defaults: available, derived scheme name, no optional extras."""
    source = MockSource()

    assert source.is_available()
    # 'mocksource' matches the lowercased subclass name.
    assert source.get_identifier_scheme() == 'mocksource'
    # Optional capabilities are absent by default.
    assert source.get_fulltext_url('doi:test') is None
    assert source.get_embedding('doi:test') is None
|
||||
|
||||
|
||||
def test_mock_source():
    """MockSource logs lookups in call order as (kind, value) pairs."""
    source = MockSource()
    source.lookup_by_doi('10.1234/test')
    source.lookup_by_title('Test Title')

    expected_log = [('doi', '10.1234/test'), ('title', 'Test Title')]
    assert source.lookup_calls == expected_log
|
||||
|
||||
|
||||
def test_source_registry():
    """A registered, enabled source is listed and returned as a usable instance."""
    registry = SourceRegistry()
    registry.register(MockSource, name='mock_source', config={'enabled': True})

    assert 'mock_source' in registry.list_sources()

    instance = registry.get('mock_source')
    assert instance is not None
    assert isinstance(instance, MockSource)
    assert instance.is_available()
|
||||
|
||||
|
||||
def test_source_registry_disabled():
    """Disabled sources remain listed but cannot be obtained via get()."""
    registry = SourceRegistry()
    registry.register(
        MockSource,
        name='disabled_source',
        config={'enabled': False},
    )

    # Registration is visible regardless of the enabled flag...
    assert 'disabled_source' in registry.list_sources()
    # ...but fetching a disabled source yields nothing.
    assert registry.get('disabled_source') is None
|
||||
|
||||
|
||||
def test_crossref_source():
    """CrossRef plugin registers, reports the doi scheme, and normalizes a works message."""
    registry = SourceRegistry()
    registry.register(CrossRefSource, name='crossref', config={})

    source = registry.get('crossref')
    assert source is not None
    assert source.is_available()
    assert source.get_identifier_scheme() == 'doi'

    works_message = {
        'message': {
            'DOI': '10.1234/example',
            'title': ['Test Title'],
            'author': [{'given': 'Jane', 'family': 'Doe'}],
            'published-print': {'date-parts': [[2024]]},
            'container-title': ['Journal of Tests'],
            'publisher': 'Test Publisher',
            'URL': 'https://doi.org/10.1234/example',
            'abstract': '<jats:p>Example abstract</jats:p>',
        }
    }
    entry = source.normalize(works_message)

    assert entry is not None
    expected = {
        'doi': '10.1234/example',
        'title': 'Test Title',
        'year': '2024',
        'journal': 'Journal of Tests',
    }
    for field_name, value in expected.items():
        assert entry.fields[field_name] == value
|
||||
|
||||
|
||||
def test_crossref_search_item_normalization():
    """Bare search items (no 'message' wrapper) normalize too, using 'issued' for the year."""
    search_item = {
        'DOI': '10.1234/example',
        'title': ['Search Result'],
        'author': [{'family': 'Doe'}],
        'issued': {'date-parts': [[2023]]},
    }

    entry = CrossRefSource().normalize(search_item)

    assert entry is not None
    assert entry.fields['doi'] == '10.1234/example'
    assert entry.fields['year'] == '2023'
|
||||
|
||||
|
||||
def test_source_record():
    """SourceRecord keeps the raw payload and provenance fields exactly as given."""
    from citegeist.sources import SourceRecord

    record = SourceRecord(
        raw={'test': 'data'},
        source_type='test',
        source_label='test_source',
        timestamp='2024-01-01',
        confidence=1.0,
    )

    assert (record.source_type, record.source_label) == ('test', 'test_source')
    assert record.confidence == 1.0
    assert record.raw == {'test': 'data'}
|
||||
|
||||
|
||||
def test_citation_edge():
    """CitationEdge stores the relation type and confidence unchanged."""
    from citegeist.sources import CitationEdge

    edge = CitationEdge(
        source_work_id='doi:10.1234',
        target_work_id='doi:10.5678',
        relation_type='cites',
        source_type='crossref',
        source_label='crossref:test',
        confidence=0.9,
    )

    assert edge.relation_type == 'cites'
    assert edge.confidence == 0.9
|
||||
|
|
@ -530,6 +530,88 @@ def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Pat
|
|||
assert results[0].weak_reasons_after == []
|
||||
|
||||
|
||||
def test_talkorigins_enrich_weak_canonicals_includes_resolution_attempts(tmp_path: Path):
    """Preview-mode enrichment surfaces the resolver's per-source attempt trace.

    The scraper is wired to canned HTML pages, the first seed bib is overwritten
    with two weak duplicate entries (differing only by a note), and the tracing
    resolver is stubbed to return one matched crossref attempt.  The serialized
    attempt dicts — including the empty "error" field — must appear verbatim in
    the enrichment results.
    """
    base_url = "https://www.talkorigins.org/origins/biblio/"
    # All HTTP fetches are served from this canned page map; no network access.
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Replace the first seed set with two near-identical entries so that one
    # weak-duplicate canonical is produced; empty the second seed set.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
    author = "Smith, Jane",
    year = "1999",
    title = "Weak Duplicate"
}

@misc{weak2,
    author = "Smith, Jane",
    year = "1999",
    title = "Weak Duplicate",
    note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")

    from citegeist.resolve import Resolution, ResolutionAttempt, ResolutionOutcome

    # Stub the tracing resolver: echo the weak entry's author/title/year back
    # with a DOI and journal added, and report exactly one matched attempt.
    scraper.resolver.resolve_entry_with_trace = lambda entry: ResolutionOutcome(  # type: ignore[method-assign]
        resolution=Resolution(
            entry=BibEntry(
                entry_type="article",
                citation_key="resolved",
                fields={
                    "author": entry.fields["author"],
                    "title": entry.fields["title"],
                    "year": entry.fields["year"],
                    "doi": "10.1000/weak",
                    "journal": "Journal of Better Metadata",
                },
            ),
            source_type="resolver",
            source_label="crossref:search:Weak Duplicate",
        ),
        attempts=[
            ResolutionAttempt(
                source_name="crossref",
                strategy="title_search",
                query_value="Weak Duplicate",
                matched=True,
                candidate_count=1,
                source_label="crossref:search:Weak Duplicate",
            )
        ],
    )

    store = BibliographyStore()
    try:
        # apply=False: preview only — the store must not be modified.
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False)
    finally:
        store.close()

    assert len(results) == 1
    # The ResolutionAttempt round-trips into a plain dict, with error defaulting to "".
    assert results[0].resolution_attempts == [
        {
            "source_name": "crossref",
            "strategy": "title_search",
            "query_value": "Weak Duplicate",
            "matched": True,
            "candidate_count": 1,
            "source_label": "crossref:search:Weak Duplicate",
            "error": "",
        }
    ]
|
||||
|
||||
|
||||
def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
|
||||
base_url = "https://www.talkorigins.org/origins/biblio/"
|
||||
scraper = TalkOriginsScraper(
|
||||
|
|
@ -799,6 +881,7 @@ def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_pat
|
|||
assert review.items[0]["canonical"]["citation_key"] == "weak2"
|
||||
assert review.items[0]["enrichment"]["resolved"] is True
|
||||
assert review.items[0]["enrichment"]["applied"] is False
|
||||
assert review.items[0]["enrichment"]["resolution_attempts"] == []
|
||||
|
||||
|
||||
def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,117 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.cli import _run_enrich_oa
|
||||
from citegeist.sources import SourceRegistry, UnpaywallSource, list_source_catalog, prioritized_source_keys
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
def test_unpaywall_source_normalizes_oa_record() -> None:
    """Normalizing a raw Unpaywall payload must surface the OA metadata fields."""
    payload = {
        "doi": "10.1000/example",
        "title": "Example Article",
        "year": 2024,
        "is_oa": True,
        "oa_status": "gold",
        "best_oa_location": {
            "url": "https://example.org/article",
            "url_for_pdf": "https://example.org/article.pdf",
            "license": "cc-by",
            "host_type": "publisher",
            "version": "publishedVersion",
            "evidence": "open (via free pdf)",
        },
    }

    source = UnpaywallSource(config={"email": "tester@example.org"})
    entry = source.normalize(payload)

    assert entry is not None
    # Every flattened field the normalizer is expected to produce.
    expected_fields = {
        "doi": "10.1000/example",
        "best_oa_url": "https://example.org/article",
        "best_oa_pdf_url": "https://example.org/article.pdf",
        "oa_status": "gold",
        "oa_license": "cc-by",
        "is_oa": "true",
    }
    for field_name, expected_value in expected_fields.items():
        assert entry.fields[field_name] == expected_value
|
||||
|
||||
|
||||
def test_unpaywall_registry_and_catalog() -> None:
    """Registry config should yield an UnpaywallSource; the catalog should rank it."""
    config = {
        "sources": {
            "unpaywall": {
                "source_type": "unpaywall",
                "enabled": True,
                "email": "tester@example.org",
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("unpaywall"), UnpaywallSource)

    # Catalog metadata: Unpaywall is integrated and scheduled in the "now" tier.
    catalog_by_key = {item.key: item for item in list_source_catalog()}
    unpaywall_entry = catalog_by_key["unpaywall"]
    assert unpaywall_entry.current_status == "integrated"
    assert unpaywall_entry.priority == "now"
    assert "unpaywall" in prioritized_source_keys()
|
||||
|
||||
|
||||
def test_run_enrich_oa_updates_entry() -> None:
    """_run_enrich_oa should write OA fields and provenance onto a stored entry."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/example}
}
"""
        )

        saved_lookup = UnpaywallSource.lookup_by_doi

        def stub_lookup(self: UnpaywallSource, doi: str):
            # Canned green-OA record so the test never touches the network.
            payload = {
                "doi": doi,
                "title": "Seed Paper",
                "year": 2024,
                "is_oa": True,
                "oa_status": "green",
                "best_oa_location": {
                    "url": "https://repository.example.org/seed",
                    "url_for_pdf": "https://repository.example.org/seed.pdf",
                    "license": "cc-by",
                    "host_type": "repository",
                    "version": "acceptedVersion",
                    "evidence": "oa repository",
                },
            }
            return self.normalize(payload)

        UnpaywallSource.lookup_by_doi = stub_lookup  # type: ignore[method-assign]
        try:
            assert _run_enrich_oa(store, ["seed2024"], "tester@example.org") == 0
        finally:
            # Always restore the real lookup so other tests are unaffected.
            UnpaywallSource.lookup_by_doi = saved_lookup  # type: ignore[method-assign]

        entry = store.get_entry("seed2024")
        assert entry is not None
        expected_fields = {
            "best_oa_url": "https://repository.example.org/seed",
            "best_oa_pdf_url": "https://repository.example.org/seed.pdf",
            "oa_status": "green",
            "oa_host_type": "repository",
        }
        for field_name, expected_value in expected_fields.items():
            assert entry[field_name] == expected_value
        provenance = store.get_field_provenance("seed2024")
        assert any(record["source_type"] == "oa_enrich" for record in provenance)
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_run_enrich_oa_requires_email() -> None:
    """Without an Unpaywall contact email the CLI helper must fail fast with exit code 1."""
    store = BibliographyStore()
    try:
        exit_code = _run_enrich_oa(store, ["missing"], None)
        assert exit_code == 1
    finally:
        store.close()
|
||||
Loading…
Reference in New Issue