Add source tracing and broader open source coverage

This commit is contained in:
welsberr 2026-04-25 22:27:53 -04:00
parent 39fe5ea86c
commit 0497e18f04
37 changed files with 4975 additions and 86 deletions

View File

@ -0,0 +1,185 @@
-- Migration: Multi-source bibliographic schema
-- Description: Add multi-source support with works, identifiers, source records, citations, and embeddings
-- ============================================================================
-- WORKS TABLE - Canonical metadata for works
-- ============================================================================
-- Canonical per-work metadata, merged across bibliographic sources.
CREATE TABLE IF NOT EXISTS works (
id INTEGER PRIMARY KEY AUTOINCREMENT,          -- surrogate integer key
work_id TEXT NOT NULL UNIQUE,                  -- canonical text identifier for the work
title TEXT,
abstract TEXT,
publication_year INTEGER,
publication_date TEXT,                         -- full date as text (format set by callers; presumably ISO-8601 — confirm)
journal_name TEXT,
publisher TEXT,
volume TEXT,
issue TEXT,
pages TEXT,
-- External identifiers (also tracked per-scheme in work_identifiers)
doi TEXT,
pmid TEXT,
pmcid TEXT,
arxiv_id TEXT,
dblp_key TEXT,
openalex_id TEXT,
isbn TEXT,
issn TEXT,
entry_type TEXT NOT NULL DEFAULT 'article',    -- entry/work type; defaults to 'article'
-- Citation counters (semantics depend on the reporting source — confirm per source)
citation_count INTEGER DEFAULT 0,
cited_by_count INTEGER DEFAULT 0,
influential_citations INTEGER DEFAULT 0,
-- Open-access status and best available OA link
is_open_access BOOLEAN DEFAULT 0,
best_oa_url TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- ============================================================================
-- WORK IDENTIFIERS TABLE - Mapping scheme + value to works
-- ============================================================================
-- Maps identifier scheme + value pairs (doi, pmid, arxiv, ...) to works.
CREATE TABLE IF NOT EXISTS work_identifiers (
id INTEGER PRIMARY KEY AUTOINCREMENT,
work_id TEXT NOT NULL,                         -- canonical work key (works.work_id)
scheme TEXT NOT NULL,                          -- identifier scheme, e.g. 'doi', 'pmid'
value TEXT NOT NULL,                           -- identifier value as supplied by the source
is_primary BOOLEAN DEFAULT 0,                  -- 1 when this is the preferred identifier for the work
normalized_value TEXT,                         -- canonical form used for matching
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(work_id, scheme, value),
-- FIX: work_id is a TEXT key, so the FK must target the UNIQUE text column
-- works(work_id), not the INTEGER surrogate works(id).
FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
-- ============================================================================
-- SOURCE RECORDS TABLE - Raw API responses with provenance
-- ============================================================================
-- Raw per-source API responses, kept verbatim for provenance/re-merge.
CREATE TABLE IF NOT EXISTS source_records (
id INTEGER PRIMARY KEY AUTOINCREMENT,
work_id TEXT NOT NULL,                         -- canonical work key (works.work_id)
source_type TEXT NOT NULL,                     -- source family, e.g. 'crossref'
source_label TEXT NOT NULL,                    -- human-readable source label
raw_data_json TEXT NOT NULL,                   -- full raw response as JSON text
raw_record_id TEXT,                            -- source-native record id, when available
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(work_id, source_type, source_label),
-- FIX: work_id is a TEXT key, so the FK must target the UNIQUE text column
-- works(work_id), not the INTEGER surrogate works(id).
FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
-- ============================================================================
-- CITATIONS TABLE - Citation graph with provenance
-- ============================================================================
-- Citation-graph edges between works, with per-edge source provenance.
CREATE TABLE IF NOT EXISTS citations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_work_id TEXT NOT NULL,                  -- citing work (works.work_id)
target_work_id TEXT NOT NULL,                  -- cited work (works.work_id)
relation_type TEXT NOT NULL,                   -- edge kind, e.g. 'cites'
source_type TEXT NOT NULL,                     -- source that reported the edge
source_label TEXT NOT NULL,
confidence REAL DEFAULT 1.0,                   -- edge confidence in [0, 1]; defaults to certain
is_verified BOOLEAN DEFAULT 0,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(source_work_id, target_work_id, relation_type),
-- FIX: both endpoint columns are TEXT keys, so the FKs must target the
-- UNIQUE text column works(work_id), not the INTEGER surrogate works(id).
FOREIGN KEY (source_work_id) REFERENCES works(work_id) ON DELETE CASCADE,
FOREIGN KEY (target_work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
-- ============================================================================
-- WORK EMBEDDINGS TABLE - Vector storage for semantic search
-- ============================================================================
-- Per-work embedding vectors for semantic search, one row per (work, model).
CREATE TABLE IF NOT EXISTS work_embeddings (
id INTEGER PRIMARY KEY AUTOINCREMENT,
work_id TEXT NOT NULL,                         -- canonical work key (works.work_id)
embedding TEXT NOT NULL,                       -- serialized vector (format set by callers; presumably JSON — confirm)
model_name TEXT NOT NULL,                      -- embedding model identifier
model_version TEXT,
dimension INTEGER NOT NULL,                    -- vector dimensionality
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(work_id, model_name),
-- FIX: work_id is a TEXT key, so the FK must target the UNIQUE text column
-- works(work_id), not the INTEGER surrogate works(id).
FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
-- ============================================================================
-- INDEXES - For performance optimization
-- ============================================================================
-- Work identifiers indexes
-- Work identifiers indexes: support lookup by scheme, raw value, normalized
-- value, and reverse lookup of all identifiers for one work.
CREATE INDEX IF NOT EXISTS idx_work_identifiers_scheme ON work_identifiers(scheme);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_value ON work_identifiers(value);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_work_id ON work_identifiers(work_id);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_normalized ON work_identifiers(normalized_value);
-- Source records indexes: per-work provenance listing and per-source scans.
CREATE INDEX IF NOT EXISTS idx_source_records_work_id ON source_records(work_id);
CREATE INDEX IF NOT EXISTS idx_source_records_source_type ON source_records(source_type);
-- Citations indexes: forward/backward graph traversal plus filtering by
-- relation kind and reporting source.
CREATE INDEX IF NOT EXISTS idx_citations_source_work ON citations(source_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_target_work ON citations(target_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_relation_type ON citations(relation_type);
CREATE INDEX IF NOT EXISTS idx_citations_source_type ON citations(source_type);
-- Works indexes: direct lookup by each external identifier column, OA
-- filtering, and recency scans.
CREATE INDEX IF NOT EXISTS idx_works_doi ON works(doi);
CREATE INDEX IF NOT EXISTS idx_works_pmid ON works(pmid);
CREATE INDEX IF NOT EXISTS idx_works_pmcid ON works(pmcid);
CREATE INDEX IF NOT EXISTS idx_works_arxiv ON works(arxiv_id);
CREATE INDEX IF NOT EXISTS idx_works_openalex ON works(openalex_id);
CREATE INDEX IF NOT EXISTS idx_works_is_open_access ON works(is_open_access);
CREATE INDEX IF NOT EXISTS idx_works_created_at ON works(created_at);
-- Embeddings indexes: per-work and per-model retrieval.
CREATE INDEX IF NOT EXISTS idx_embeddings_work_id ON work_embeddings(work_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_model_name ON work_embeddings(model_name);
-- ============================================================================
-- PostgreSQL-specific extensions and vector indexing
-- ============================================================================
-- Note: The following are PostgreSQL-specific and should be run when using pgvector
-- Uncomment these when using PostgreSQL with pgvector extension:
-- CREATE EXTENSION IF NOT EXISTS vector;
-- CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON work_embeddings
-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
-- ============================================================================
-- TRIGGERS - For automatic timestamp updates
-- ============================================================================
-- Works table update trigger
-- Keep works.updated_at current on every UPDATE that did not set it itself.
-- FIX: the previous condition WHEN (new.updated_at IS NULL) could never fire,
-- because updated_at is declared NOT NULL (an UPDATE setting it to NULL fails
-- the constraint before the trigger runs). Fire instead when the updating
-- statement left updated_at unchanged.
CREATE TRIGGER IF NOT EXISTS works_updated_at
AFTER UPDATE ON works
FOR EACH ROW
WHEN (new.updated_at = old.updated_at)
BEGIN
UPDATE works SET updated_at = CURRENT_TIMESTAMP WHERE id = old.id;
END;
-- Work identifiers update trigger
-- Preserve work_identifiers.created_at (the first-insert timestamp) across
-- UPDATEs; the table has no updated_at column to maintain.
-- FIX: the previous condition WHEN (new.created_at IS NULL) could never fire,
-- because created_at is declared NOT NULL — and resetting created_at to
-- CURRENT_TIMESTAMP on update would destroy provenance. Instead, restore the
-- original value whenever an UPDATE attempts to overwrite it.
CREATE TRIGGER IF NOT EXISTS work_identifiers_updated_at
AFTER UPDATE ON work_identifiers
FOR EACH ROW
WHEN (new.created_at <> old.created_at)
BEGIN
UPDATE work_identifiers SET created_at = old.created_at WHERE id = old.id;
END;
-- ============================================================================
-- VIEWS - For simplified queries
-- ============================================================================
-- View to join works with their identifiers
-- One row per work, with all its identifiers concatenated as a
-- comma-separated "scheme:value" list.
CREATE VIEW IF NOT EXISTS works_with_identifiers AS
SELECT
w.id,
w.work_id,
w.title,
w.abstract,
w.publication_year,
w.journal_name,
w.publisher,
w.doi,
w.pmid,
w.pmcid,
w.arxiv_id,
w.dblp_key,
w.openalex_id,
-- FIX: SQLite rejects GROUP_CONCAT(DISTINCT x, sep) with "DISTINCT
-- aggregates must have exactly one argument"; use the single-argument form
-- (default ',' separator).
GROUP_CONCAT(DISTINCT wi.scheme || ':' || wi.value) AS identifiers
FROM works w
-- FIX: work_identifiers.work_id stores the TEXT work key, so join on
-- w.work_id rather than the INTEGER surrogate w.id.
LEFT JOIN work_identifiers wi ON w.work_id = wi.work_id
GROUP BY w.id, w.work_id, w.title, w.abstract, w.publication_year, w.journal_name, w.publisher, w.doi, w.pmid, w.pmcid, w.arxiv_id, w.dblp_key, w.openalex_id;

103
docs/README.md Normal file
View File

@ -0,0 +1,103 @@
# CiteGeist Source Planning Documentation
Welcome to the source-planning documentation for CiteGeist.
## Quick Overview
The immediate planning question is which additional open bibliographic sources should be incorporated next.
This documentation therefore emphasizes:
- the current source baseline already present in the repository
- the next highest-value open sources to add
- a smaller, more realistic source-layer abstraction
- explicit deferral of unrelated database/vector ambitions
## Documentation Files
### Planning and Status
- **[source-landscape.md](./source-landscape.md)** - recommended next open bibliographic sources
- **[implementation-progress.md](./implementation-progress.md)** - sources-first progress tracker
- **[phase-completion.md](./phase-completion.md)** - short status summary
- **[file-structure.md](./file-structure.md)** - file structure and module notes
### Existing Architecture References
- **[architecture-current.md](./architecture-current.md)** - current architecture overview
- **[schema-current.sql](./schema-current.sql)** - existing database schema
## Current Status
### Current Baseline
1. Crossref, OpenAlex, PubMed, Europe PMC, Semantic Scholar, DataCite, DBLP, arXiv, and OAI-PMH are already in play.
2. OpenCitations and Unpaywall are now integrated as source-layer additions.
3. The SQLite-based local workflow remains the baseline.
### Recommended Next Sources
1. OpenAIRE only if repository-acquisition scope expands
### Explicitly Deferred
1. Database redesign
2. pgvector / embedding-first work
## Source Layer
The source-layer code now provides:
- `BibliographicSource` as the common interface
- `SourceRegistry` for known concrete source classes
- `CrossRefSource` as the repaired first concrete plugin
- `OpenCitationsSource` plus DOI-based graph expansion
- `UnpaywallSource` plus DOI-based OA-link enrichment
- `EuropePmcSource` plus biomedical resolver/search support
- `SemanticScholarSource` plus broader biological/physical sciences resolver/search support
- a source catalog with current status and priority order
- compatibility with the existing `SourceClient`-based resolver and expander code
## Quick Start
```python
from citegeist.sources import (
CrossRefSource,
EuropePmcSource,
OpenCitationsSource,
SemanticScholarSource,
SourceRegistry,
UnpaywallSource,
list_source_catalog,
prioritized_source_keys,
)
registry = SourceRegistry()
registry.register(CrossRefSource, name="crossref", config={})
registry.register(EuropePmcSource, name="europepmc", config={})
registry.register(OpenCitationsSource, name="opencitations", config={})
registry.register(SemanticScholarSource, name="semanticscholar", config={})
registry.register(UnpaywallSource, name="unpaywall", config={"email": "you@example.org"})
source = registry.get("crossref")
catalog = list_source_catalog()
priority = prioritized_source_keys()
```
## Tests
Relevant tests for the refocused source work:
- `tests/test_sources_plugin.py`
- `tests/test_sources_catalog.py`
The existing broader repository test suite should continue to pass as the source-layer changes are integrated.
## Next Steps
1. Decide whether `OpenAIRE` is worth adding for repository-acquisition breadth.
2. Keep database/vector redesign work deferred unless a source need forces it.
## License
Same as the CiteGeist project.
---
**Last Updated:** 2026-04-25
**Status:** Sources-first plan in effect

View File

@ -0,0 +1,87 @@
# CiteGeist Current Architecture
## Overview
CiteGeist is currently designed as a local BibTeX-native tooling system with:
- BibTeX parsing and storage
- Local text search (FTS5)
- Entry provenance tracking
- Citation graph traversal
- Topic-based expansion
## Core Modules
### Source Management
- **sources.py**: `SourceClient` class for HTTP requests with caching and retry logic
- Base HTTP client with JSON/XML/text support
- Built-in retry with exponential backoff
- Cache directory support
### Metadata Resolution
- **resolve.py**: `MetadataResolver` class for entry resolution
- DOI → CrossRef lookup
- PMID → PubMed lookup
- arXiv, DBLP, OpenAlex lookup
- Title search fallback with best-match selection
- DataCite integration
- Returns `Resolution` objects with provenance
### Storage
- **storage.py**: `BibliographyStore` class (SQLite)
- Tables: entries, creators, entry_creators, identifiers, relations, topics, entry_topics, field_provenance, relation_provenance
- FTS5 text search integration
- Field-level provenance tracking
- Citation graph support (cites, cited_by edges)
### BibTeX Processing
- **bibtex.py**: BibEntry dataclass and parsing/rendering
- BibTeX → BibEntry conversion
- BibEntry → BibTeX rendering
- Citation key generation
### CLI and Server
- **cli.py**: Command-line interface
- **app_server.py**: Local HTTP server for UI/JSON API
- **app_api.py**: JSON API adapter surface
### Expansion and Discovery
- **expand.py**: Citation graph expansion workflows
- **extract.py**: Plaintext reference extraction
- **bootstrap.py**: Topic bootstrap and expansion
## Current State Summary
**Completed/Usable:**
- BibTeX parsing and storage
- Identifier-based resolution (DOI, PMID, arXiv, DBLP, OpenAlex)
- Title search with best-match selection
- Citation graph traversal and expansion
- Field provenance tracking
- Local search with FTS5
- Topic-based discovery workflows
**Not Yet Implemented (from new roadmap):**
- Plugin-based source architecture
- Multi-source record merging
- PGVector embeddings
- Full-text OA link retrieval
- Semantic Scholar integration
- OpenCitations integration
- Unified API endpoints for multi-source queries
## Data Flow
1. **Ingest**: BibTeX file → parse → store in entries table
2. **Resolve**: Entry → resolve_doi/resolve_pmid/resolve_arxiv → fetch metadata → merge with existing
3. **Expand**: Start from entry → traverse citation edges → discover new entries
4. **Search**: Query FTS5 index → retrieve relevant entries
5. **Export**: Entries → render BibTeX → output file
## Database Schema
SQLite-based storage with:
- Normalized entry fields
- Creator relationships
- Identifier mapping
- Citation relations
- Topic associations
- Field provenance metadata

165
docs/file-structure.md Normal file
View File

@ -0,0 +1,165 @@
# CiteGeist Multi-Source File Structure
**Date:** 2026-04-25
## Project Structure
```
/home/netuser/dev/CiteGeist/
├── db/
│ └── migrations/
│ └── 0001_multisource.sql ✅ NEW - Multi-source schema
├── docs/
│ ├── architecture-current.md ✅ NEW - Current architecture docs
│ ├── implementation-progress.md ✅ NEW - Implementation progress tracker
│ ├── schema-current.sql ✅ NEW - Current schema SQL
│ └── file-structure.md ✅ NEW - This file
├── src/citegeist/
│ ├── sources/ ✅ NEW - Source plugin architecture
│ │ ├── __init__.py ✅ NEW - Package exports
│ │ ├── __all__.py ✅ NEW - Public API
│ │ ├── base.py ✅ NEW - Base BibliographicSource class
│ │ ├── registry.py ✅ NEW - SourceRegistry implementation
│ │ ├── crossref.py ✅ NEW - CrossRef source plugin
│ │ └── _old_sources_compat.py ✅ NEW - Backward compatibility
│ │
│ ├── resolver/ ✅ NEW - Identifier resolution
│ │ ├── __init__.py ✅ NEW - Module exports
│ │ └── identifiers.py ✅ NEW - Extract, normalize, resolve
│ │
│ ├── db/ ✅ NEW - Database operations
│ │ └── __init__.py 🚧 TO DO - Database client
│ │
│ ├── ... (existing files)
│ ├── sources.py 📦 Existing - Old SourceClient
│ ├── resolve.py 📦 Existing - MetadataResolver
│ └── storage.py 📦 Existing - BibliographyStore
└── tests/
├── test_sources_plugin.py ✅ NEW - Source plugin tests
└── test_resolver_identifiers.py ✅ NEW - Identifier tests
```
## Module Documentation
### New Modules
#### `src/citegeist/sources/`
Plugin architecture for bibliographic sources.
**Classes:**
- `BibliographicSource` - Abstract base class for source plugins
- `SourceRecord` - Raw source record dataclass
- `CitationEdge` - Citation relationship dataclass
- `SourceRegistry` - Manages source plugins
**Plugin:**
- `CrossRefSource` - CrossRef API implementation
#### `src/citegeist/resolver/`
Identifier extraction, normalization, and resolution.
**Classes:**
- `IdentifierExtractor` - Extract identifiers from entry fields
- `IdentifierNormalizer` - Normalize identifiers to canonical form
- `IdentifierResolver` - Resolve identifiers with lookup priority
**Functions:**
- `extract_identifiers()` - Quick identifier extraction
- `normalize_identifier()` - Quick normalization
- `get_primary_identifier()` - Get primary identifier
- `resolve_identifiers()` - Resolve all identifiers
#### `src/citegeist/db/`
Database operations (to be implemented).
**Planned:**
- Database client for works table
- Migration runner
- Query builders
#### `db/migrations/0001_multisource.sql`
Multi-source database schema migration.
**Tables:**
1. `works` - Canonical work metadata
2. `work_identifiers` - Multi-scheme identifiers
3. `source_records` - Raw API responses
4. `citations` - Citation graph
5. `work_embeddings` - Vector embeddings
### Existing Modules (Preserved)
- `src/citegeist/sources.py` - Old SourceClient (backward compatible)
- `src/citegeist/resolve.py` - Old MetadataResolver
- `src/citegeist/storage.py` - Old BibliographyStore
## Test Coverage
**New Tests:**
- `tests/test_sources_plugin.py` (7 tests)
- `tests/test_resolver_identifiers.py` (17 tests)
**Total:** 24 tests passing
## Dependencies
**New Dependencies Required:**
- No new Python packages (uses stdlib only)
**Planned Dependencies (Future phases):**
- `pgvector` - PostgreSQL vector extension
- `sentence-transformers` - Local embedding model
- `fastapi` - API framework
- `unpaywall` - OA link retrieval (if needed)
## Implementation Status
### Completed (100%)
- ✅ Phase 0: Baseline Audit
- ✅ Phase 1: Source Plugin Architecture
- ✅ Phase 2: Identifier Resolution Layer
### In Progress (50%)
- 🚧 Phase 3: Database Schema Upgrade
### Pending (0%)
- ⏳ Phase 4: High-Value Source Integrations
- ⏳ Phase 5: Merge & Deduplication Engine
- ⏳ Phase 6: Citation Graph Construction
- ⏳ Phase 7: Embedding Pipeline
- ⏳ Phase 8: Full-Text Retrieval Layer
- ⏳ Phase 9: API Layer
- ⏳ Phase 10: Ranking & Relevance
- ⏳ Phase 12: Observability & QA
- ⏳ Phase 13: Performance Optimization
## Quick Start
```python
# Register a source
from citegeist.sources import SourceRegistry, CrossRefSource
registry = SourceRegistry()
registry.register(CrossRefSource, name='crossref', config={})
# Get source instance
source = registry.get('crossref')
entry = source.lookup_by_doi('10.1234/example')
# Resolve identifiers
from citegeist.resolver import resolve_identifiers
fields = {'doi': '10.1234/example', 'title': 'Test'}
resolved = resolve_identifiers(fields)
# Returns [('doi', '10.1234/example'), ('title', 'test title')]
```
## Next Steps
1. ✅ Phase 0-2: Complete
2. 🚧 Phase 3: Implement Python interface for database operations
3. ⏳ Phase 4: Add Unpaywall, Semantic Scholar, OpenCitations integrations
4. ⏳ Phase 5: Build merge engine

View File

@ -0,0 +1,122 @@
# CiteGeist Sources-First Progress
**Last Updated:** 2026-04-25
This document tracks the refocused plan for source incorporation. The working question is which additional open bibliographic sources CiteGeist should integrate next, not whether it needs a new storage platform first.
---
## Phase 0: Scope Reframe ✅ COMPLETE
**Status:** Completed
**Deliverables:**
- ✅ `/docs/source-landscape.md` - source inventory and recommendation document
- ✅ `/src/citegeist/sources/catalog.py` - code-backed source catalog
**Completed:**
- Identified which source integrations already exist in the repository
- Split source-expansion planning from database/vector-search ambitions
- Prioritized open-source additions by workflow value
---
## Phase 1: Source Layer Tightening ✅ COMPLETE
**Status:** Completed
**Deliverables:**
- ✅ `/src/citegeist/sources/base.py` - Base `BibliographicSource` interface
- ✅ `/src/citegeist/sources/registry.py` - Registry for known concrete sources
- ✅ `/src/citegeist/sources/crossref.py` - Repaired CrossRef source implementation
- ✅ `/src/citegeist/sources/catalog.py` - Open-source inventory
- ✅ `/src/citegeist/sources/__init__.py` - Package initialization
- ✅ `/tests/test_sources_plugin.py` - Source plugin tests
- ✅ `/tests/test_sources_catalog.py` - Source catalog and registry tests
**Completed:**
- ✅ Created `BibliographicSource` abstract base class
- ✅ Repaired `SourceRegistry` so config-backed loading resolves real source classes
- ✅ Fixed `CrossRefSource` normalization for direct lookup and search-style payloads
- ✅ Replaced path-specific compatibility loading with repo-relative loading
- ✅ Added a source catalog that captures current status and next-priority sources
**Features:**
- Abstract interface for source plugins
- Registry for known source discovery and instantiation
- Config-driven enable/disable for known source types
- Source prioritization metadata
- Compatibility with the existing `SourceClient`-based resolver/expander code
---
## Current Integrated Sources ✅ AVAILABLE
- `Crossref`
- `OpenAlex`
- `OpenCitations`
- `Unpaywall`
- `PubMed`
- `Europe PMC`
- `Semantic Scholar`
- `DataCite`
- `DBLP`
- `arXiv`
- `OAI-PMH`
These are already sufficient for a credible local enrichment-and-discovery workflow. The next work should complement them rather than restart infrastructure underneath them.
---
## Phase 2: Next Source Additions 🚧 IN PROGRESS
**Status:** In Progress
**Priority Order:**
1. `OpenAIRE` only if repository-acquisition scope expands
**Completed Deliverables:**
- ✅ OpenCitations adapter for DOI citation/reference lookup
- ✅ OpenCitations graph expansion support in CLI and topic expansion flows
- ✅ Unpaywall adapter for DOI OA-link enrichment
- ✅ `enrich-oa` CLI flow for applying OA metadata to stored entries
- ✅ Europe PMC biomedical resolver/search integration
- ✅ Semantic Scholar broad-science resolver/search integration
**Planned Deliverables:**
- ⏳ Decide whether repository-acquisition breadth needs another dedicated source
**Rationale:**
- `OpenCitations` now improves open citation-edge coverage
- `Unpaywall` now improves access-link enrichment
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage
- `Semantic Scholar` now improves broader biological and physical sciences coverage
- neither requires a new database architecture to become useful
---
## Phase 3: Optional Source Evaluation ⏳ PLANNED
**Status:** Planned
- `OpenAIRE`
**Decision Rule:**
- add them only if they solve a concrete discovery or acquisition gap that current open sources do not already cover well
---
## Explicitly Deferred
- second-schema redesign work
- pgvector integration
- embedding-first retrieval
- broad canonical-work reconstruction
---
## Summary
**Completed:** scope reframe and source-layer cleanup
**Planned next:** `OpenAIRE` reevaluation
**Deferred:** database/vector expansion work not required by the source question

111
docs/phase-completion.md Normal file
View File

@ -0,0 +1,111 @@
# Sources-First Status
**Current Focus:** identify and prioritize the next open bibliographic sources to add, using the existing SQLite-based workflow as the baseline.
---
## Phase Matrix
| Phase | Title | Status | Outcome |
|-------|-------|--------|---------|
| **0** | Scope Reframe | ✅ Complete | Planning now answers the source question directly |
| **1** | Source Layer Tightening | ✅ Complete | Registry, CrossRef plugin, compatibility seam, and source catalog repaired |
| **2** | Next Open Source Additions | 🚧 In Progress | OpenCitations, Unpaywall, Europe PMC, and Semantic Scholar integrated |
| **3** | Optional Source Evaluation | ⏳ Planned | OpenAIRE evaluated later if acquisition breadth matters |
| **D** | Database / Vector Expansion | ⏸ Deferred | Not required for the current source-incorporation decision |
---
## Test Coverage Summary
```
✅ test_sources_plugin.py
✅ test_sources_catalog.py
✅ existing full suite still expected to pass
```
---
## Key Artifacts
### Documentation
```
docs/
├── source-landscape.md ✅ Source inventory and recommendations
├── implementation-progress.md ✅ Sources-first progress tracker
└── phase-completion.md ✅ Short status summary
```
### Source Layer
```
src/citegeist/sources/
├── base.py ✅ Base source interface
├── catalog.py ✅ Source inventory in code
├── registry.py ✅ Registry for known source classes
├── crossref.py ✅ Repaired CrossRef plugin
└── _old_sources_compat.py ✅ Repo-relative compatibility bridge
```
### Tests
```
tests/
├── test_sources_plugin.py ✅ Source plugin tests
└── test_sources_catalog.py ✅ Source catalog/registry tests
```
---
## Key Features Implemented
- ✅ Source catalog covering current and candidate open sources
- ✅ Config-driven registry loading for known real source classes
- ✅ CrossRef normalization that works for both single-record and search-result payloads
- ✅ Compatibility bridge that no longer depends on one checkout path
- ✅ OpenCitations DOI-based graph expansion with CLI support
- ✅ Unpaywall OA-link enrichment with CLI support
- ✅ Europe PMC biomedical resolver/search support
- ✅ Semantic Scholar broad-science resolver/search support
---
## Next Milestones
### Immediate
1. Decide whether repository-acquisition scope justifies `OpenAIRE`
2. Keep the OA-enrichment flow aligned with review/export needs
3. Keep graph-source scope disciplined as broader coverage grows
### Later
1. Monitor `Semantic Scholar` rate-limit and policy constraints now that it is integrated
2. Evaluate `OpenAIRE`
3. Revisit database/vector work only if a concrete source need demands it
---
## Success Metrics
### Completed
- ✅ Planning now matches the actual source question
- ✅ Source-layer defects from the first pass have been corrected
- ✅ OpenCitations is now a working integrated source
- ✅ Unpaywall is now a working integrated source
- ✅ Europe PMC is now a working integrated source
- ✅ Semantic Scholar is now a working integrated source
- ✅ The next source priorities are explicit
### Planned
- ⏳ Better source selection discipline before adding more integrations
---
## Recommendations
1. Treat the current SQLite/FTS workflow as the baseline, not as a blocker.
2. Add source integrations only when they materially improve bibliographic coverage, citation coverage, or open-access linkage.
3. Keep database/vector work explicitly subordinate to source-incorporation needs.
---
**Last Updated:** 2026-04-25
**Status:** Sources-first plan in effect
**Confidence:** High

131
docs/schema-current.sql Normal file
View File

@ -0,0 +1,131 @@
-- CiteGeist Current Schema (SQLite)
-- Entries table
-- One row per bibliographic (BibTeX) entry.
CREATE TABLE IF NOT EXISTS entries (
id INTEGER PRIMARY KEY,
citation_key TEXT NOT NULL UNIQUE,             -- BibTeX citation key
entry_type TEXT NOT NULL,                      -- BibTeX entry type (article, book, ...)
review_status TEXT NOT NULL DEFAULT 'draft',   -- review workflow state; starts as 'draft'
title TEXT,
year TEXT,                                     -- kept as TEXT (BibTeX years are free-form)
journal TEXT,
booktitle TEXT,
publisher TEXT,
abstract TEXT,
keywords TEXT,
url TEXT,
doi TEXT,
isbn TEXT,
fulltext TEXT,
raw_bibtex TEXT,                               -- original BibTeX source for the entry
extra_fields_json TEXT NOT NULL DEFAULT '{}',  -- JSON object for fields not modeled as columns
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Creators table
-- Deduplicated creators (authors/editors), keyed by full name.
CREATE TABLE IF NOT EXISTS creators (
id INTEGER PRIMARY KEY,
full_name TEXT NOT NULL UNIQUE,    -- UNIQUE: one row per distinct name string
family_name TEXT,
given_names TEXT
);
-- Entry-Creators relationship
-- Ordered entry-to-creator links, one row per (entry, role, position).
CREATE TABLE IF NOT EXISTS entry_creators (
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE,
role TEXT NOT NULL,                -- creator role (presumably 'author'/'editor' — confirm against callers)
ordinal INTEGER NOT NULL,          -- position within the role's ordered list
-- PK is by position, not by creator, so the same creator may appear at
-- multiple ordinals for one entry.
PRIMARY KEY (entry_id, role, ordinal)
);
-- Identifiers table
-- External identifiers attached to entries.
CREATE TABLE IF NOT EXISTS identifiers (
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
scheme TEXT NOT NULL,              -- identifier scheme, e.g. 'doi', 'pmid'
value TEXT NOT NULL,
-- PK is (scheme, value): a given identifier maps to exactly one entry.
PRIMARY KEY (scheme, value)
);
-- Relations table (citation graph)
-- Citation-graph edges. Targets are stored by citation key (plain TEXT, not
-- a foreign key), so an edge may point at an entry not yet ingested.
CREATE TABLE IF NOT EXISTS relations (
source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
target_citation_key TEXT NOT NULL,
relation_type TEXT NOT NULL,       -- edge kind (e.g. 'cites'/'cited_by' per architecture docs)
PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
);
-- Topics table
-- Discovery topics used for bootstrap and expansion workflows.
CREATE TABLE IF NOT EXISTS topics (
id INTEGER PRIMARY KEY,
slug TEXT NOT NULL UNIQUE,         -- URL/key-safe unique topic handle
name TEXT NOT NULL,
source_type TEXT NOT NULL,         -- where the topic came from
source_url TEXT,
expansion_phrase TEXT,             -- phrase actually used for expansion queries
suggested_phrase TEXT,             -- machine-suggested phrase awaiting review
phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed',
phrase_review_notes TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Entry-Topics relationship
-- Entry-to-topic assignments with the labeling source and its confidence.
CREATE TABLE IF NOT EXISTS entry_topics (
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE,
source_label TEXT NOT NULL,        -- which process/source assigned the topic
confidence REAL,                   -- optional assignment confidence
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (entry_id, topic_id)
);
-- Field Provenance table
-- Append-only audit log of where each entry field value came from.
CREATE TABLE IF NOT EXISTS field_provenance (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
field_name TEXT NOT NULL,
field_value TEXT,                  -- value recorded at the time of the operation
source_type TEXT NOT NULL,         -- source family that supplied the value
source_label TEXT NOT NULL,
operation TEXT NOT NULL,           -- what was done (set/merge/etc. — confirm against callers)
confidence REAL,
recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Relation Provenance table
-- Provenance for citation edges: which source asserted each relation.
-- Keyed by the same (source entry, target key, type) triple as relations.
CREATE TABLE IF NOT EXISTS relation_provenance (
id INTEGER PRIMARY KEY,
source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
target_citation_key TEXT NOT NULL,
relation_type TEXT NOT NULL,
source_type TEXT NOT NULL,
source_label TEXT NOT NULL,
confidence REAL,
recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Full-text Search (FTS5)
-- FTS5 index over entries. This is an *external content* table
-- (content='entries'): the indexed text lives in entries, and the index must
-- be kept in sync by triggers.
CREATE VIRTUAL TABLE IF NOT EXISTS entries_fts USING fts5(
title,
abstract,
keywords,
content='entries',
content_rowid='id'
);
-- Trigger to sync entries with FTS
-- Keep the external-content FTS5 index in sync with entries.
CREATE TRIGGER IF NOT EXISTS entries_ai AFTER INSERT ON entries BEGIN
INSERT INTO entries_fts(rowid, title, abstract, keywords)
VALUES (new.id, new.title, new.abstract, new.keywords);
END;
-- FIX: entries_fts uses content='entries', so plain DELETE/UPDATE statements
-- against it corrupt the index — FTS5 needs the OLD column values to remove
-- index terms, and cannot recover them once the content row has changed.
-- External-content tables must be updated via the special 'delete' command
-- carrying the old values (SQLite FTS5, "External Content Tables").
CREATE TRIGGER IF NOT EXISTS entries_ad AFTER DELETE ON entries BEGIN
INSERT INTO entries_fts(entries_fts, rowid, title, abstract, keywords)
VALUES ('delete', old.id, old.title, old.abstract, old.keywords);
END;
CREATE TRIGGER IF NOT EXISTS entries_au AFTER UPDATE ON entries BEGIN
INSERT INTO entries_fts(entries_fts, rowid, title, abstract, keywords)
VALUES ('delete', old.id, old.title, old.abstract, old.keywords);
INSERT INTO entries_fts(rowid, title, abstract, keywords)
VALUES (new.id, new.title, new.abstract, new.keywords);
END;

131
docs/source-landscape.md Normal file
View File

@ -0,0 +1,131 @@
# Open Bibliographic Source Landscape
This document answers the question that prompted the recent planning work: which additional open bibliographic sources are worth incorporating into CiteGeist, given the sources it already uses?
## Current Baseline
CiteGeist already has useful source coverage for a local BibTeX-first workflow:
- `Crossref`: DOI lookup, title search, and reference-list expansion.
- `OpenAlex`: work lookup, title/discovery search, and citation-graph expansion.
- `PubMed`: authoritative biomedical metadata lookup by PMID and title search fallback.
- `Europe PMC`: biomedical metadata/fulltext complement to PubMed.
- `Semantic Scholar`: broad cross-domain metadata with strong biological and physical sciences coverage.
- `DataCite`: DOI-backed dataset/report/non-article metadata.
- `DBLP`: strong computer-science metadata.
- `arXiv`: preprint metadata.
- `OAI-PMH`: repository harvesting for theses, dissertations, and institutional collections.
That means the immediate gap is no longer “get any scholarly metadata at all.” The immediate gap is to add the next highest-value open sources without destabilizing the existing ingest, review, and export pipeline.
## Recommended Priorities
### OpenCitations
Why:
- It directly improves open citation-edge coverage.
- It fits CiteGeist's graph-discovery workflow better than another generic metadata source.
- It complements OpenAlex rather than replacing it.
Expected role:
- DOI-to-citations lookup
- DOI-to-references lookup
- provenance for citation edges
Status:
- now integrated as a DOI-based citation/reference source in the source layer and graph expansion flow
Main risk:
- coverage is narrower than OpenAlex, so merge rules need to treat it as an additional edge source rather than a primary metadata authority.
### Unpaywall
Why:
- It solves a different problem from Crossref/OpenAlex: full-text access and OA status.
- It improves the “can I get the paper?” part of the workflow without forcing a storage redesign.
Expected role:
- DOI-to-best-open-access-link lookup
- OA status enrichment
Status:
- now integrated as an OA-link enrichment source with a dedicated `enrich-oa` CLI flow
Main risk:
- It should remain an access-link enrichment layer, not become entangled with identity resolution logic.
### Europe PMC
Why:
- It is valuable for biomedical and life-sciences use cases.
- It complements PubMed with richer open-access and citation-related information.
Expected role:
- domain-specific metadata enrichment
- biomedical search
- OA/full-text linkage
Status:
- now integrated as a biomedical resolver/search complement to `PubMed`
Main risk:
- this should remain a domain-specific source, not be treated as a universal resolver.
### Semantic Scholar
Pros:
- good graph and relevance signals
- useful for discovery quality
Status:
- now integrated as a broad resolver/search complement with good biological and physical sciences coverage
Main risk:
- rate limits and product-policy changes still matter more here than for the more explicitly open bibliographic sources
## Evaluate But Do Not Make Core Yet
### OpenAIRE
Pros:
- strong repository and OA/project linkage
- good for European repository acquisition
Cons:
- better suited to corpus acquisition than first-line metadata resolution
Recommendation:
- treat as an acquisition adapter, not an immediate resolver target
## What Not To Prioritize Right Now
### Database Redesign
The repository already has a working SQLite storage model and FTS-backed local workflow. A second schema track should not lead the next phase of work unless a concrete source integration is blocked on it.
### Vector Search
Optional semantic ranking may become useful later, but it was not the motivating question and does not need to be a prerequisite for source incorporation.
## Suggested Execution Order
1. Keep the source abstraction aligned with sources already in use.
2. Revisit `OpenAIRE` after the current source additions settle.

113
new-roadmap.md Normal file
View File

@ -0,0 +1,113 @@
# CiteGeist Roadmap: Sources-First Expansion
## Purpose
The primary question is not “how do we redesign CiteGeist around a new storage engine?” The primary question is “which additional open bibliographic sources should CiteGeist incorporate next?”
This roadmap treats the current SQLite-based local workflow as the baseline and focuses on source evaluation, source integration order, and reviewable source behavior.
## Baseline
Already present in the repository:
- local BibTeX ingest, review, export, and graph traversal
- metadata resolution from `Crossref`, `PubMed`, `Europe PMC`, `OpenAlex`, `Semantic Scholar`, `DBLP`, `arXiv`, and `DataCite`
- citation-graph expansion using `Crossref` and `OpenAlex`
- repository harvesting via `OAI-PMH`
That means the next planning step is source prioritization, not another platform pivot.
## Phase 0: Reframe Scope
Goal:
Put source-incorporation decisions ahead of database and vector-search ambitions.
Tasks:
- [x] identify which source integrations already exist
- [x] separate “source expansion” work from “new database/vector stack” work
- [x] document the source landscape and recommended order
Deliverables:
- `/docs/source-landscape.md`
- `/src/citegeist/sources/catalog.py`
## Phase 1: Tighten The Source Layer
Goal:
Make the new source abstraction useful for the repository that already exists, rather than speculative infrastructure.
Tasks:
- [x] keep the compatibility bridge to the existing `SourceClient`
- [x] fix the initial `CrossRefSource` implementation so normalization works
- [x] make config-driven registry loading work for known concrete sources
- [x] add a code-backed source catalog for planning and prioritization
Deliverables:
- `/src/citegeist/sources/base.py`
- `/src/citegeist/sources/registry.py`
- `/src/citegeist/sources/crossref.py`
- `/src/citegeist/sources/catalog.py`
## Phase 2: Highest-Value Open Source Additions
Goal:
Incorporate the next open sources that materially improve the current workflow.
Priority order:
1. `OpenAIRE` only if repository-acquisition scope expands
Tasks:
- [x] add `OpenCitations` DOI-to-citation and DOI-to-reference lookup
- [x] merge `OpenCitations` edges into the existing graph-expansion workflow with provenance
- [x] add `Unpaywall` DOI-to-OA-link enrichment
- [x] expose OA-link enrichment in a dedicated CLI flow
- [x] add `Europe PMC` as a biomedical metadata/fulltext complement to `PubMed`
- [x] add `Semantic Scholar` as a broader scientific metadata complement across biological and physical sciences
Why these first:
- `OpenCitations` directly answers the open-citation-coverage gap
- `Unpaywall` now solves access-link enrichment without forcing a storage redesign
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage without changing the storage model
- `Semantic Scholar` now improves broader biological and physical sciences coverage without changing the storage model
## Phase 3: Evaluate Optional Sources, Do Not Commit Prematurely
Goal:
Assess sources that may be useful, but are not clearly the next source-first move.
Candidates:
- `OpenAIRE`
Tasks:
- [ ] document API limits, openness constraints, and integration risk
- [ ] decide whether each source belongs in core resolution, graph expansion, or corpus acquisition
- [ ] avoid adding sources that duplicate existing coverage without a clear payoff
## Deferred Work
These are valid future ideas, but they are not the current planning driver:
- a second database schema
- pgvector integration
- embedding-first search
- large-scale canonical-work reconstruction
The repository already has a working local storage/search path. Those ideas should only return to the front of the plan if a concrete source-integration need forces them there.
## Immediate Next Steps
1. Land the source inventory and source-layer cleanup.
2. Reassess whether `OpenAIRE` is worth adding for repository-acquisition breadth.

View File

@ -2,7 +2,7 @@ from .app_api import LiteratureExplorerApi
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
from .bibtex import BibEntry, parse_bibtex from .bibtex import BibEntry, parse_bibtex
from .bootstrap import BootstrapResult, Bootstrapper from .bootstrap import BootstrapResult, Bootstrapper
from .expand import CrossrefExpander, OpenAlexExpander from .expand import CrossrefExpander, OpenAlexExpander, OpenCitationsExpander
from .extract import ( from .extract import (
available_extraction_backends, available_extraction_backends,
check_extraction_comparison_summary, check_extraction_comparison_summary,
@ -16,6 +16,10 @@ from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
from .llm_verify import VerificationLlmClient, VerificationLlmConfig from .llm_verify import VerificationLlmClient, VerificationLlmConfig
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
from .sources import SourceClient from .sources import SourceClient
from .sources import EuropePmcSource
from .sources import OpenLibrarySource
from .sources import SemanticScholarSource
from .sources import UnpaywallSource
from .storage import BibliographyStore from .storage import BibliographyStore
from .verify import BibliographyVerifier, VerificationResult, VerificationMatch from .verify import BibliographyVerifier, VerificationResult, VerificationMatch
@ -31,10 +35,15 @@ __all__ = [
"LiteratureExplorerApi", "LiteratureExplorerApi",
"MetadataResolver", "MetadataResolver",
"OpenAlexExpander", "OpenAlexExpander",
"OpenCitationsExpander",
"OaiPmhHarvester", "OaiPmhHarvester",
"OaiMetadataFormat", "OaiMetadataFormat",
"OaiSet", "OaiSet",
"SourceClient", "SourceClient",
"EuropePmcSource",
"OpenLibrarySource",
"SemanticScholarSource",
"UnpaywallSource",
"VerificationLlmClient", "VerificationLlmClient",
"VerificationLlmConfig", "VerificationLlmConfig",
"VerificationMatch", "VerificationMatch",

View File

@ -173,6 +173,13 @@ def build_parser() -> argparse.ArgumentParser:
resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources") resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich") resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
enrich_oa_parser = subparsers.add_parser(
"enrich-oa",
help="Enrich DOI-bearing entries with Unpaywall OA link metadata",
)
enrich_oa_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
enrich_oa_parser.add_argument("--email", help="Email address required by the Unpaywall API")
resolve_stubs_parser = subparsers.add_parser( resolve_stubs_parser = subparsers.add_parser(
"resolve-stubs", "resolve-stubs",
help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates", help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates",
@ -237,7 +244,7 @@ def build_parser() -> argparse.ArgumentParser:
expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand") expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand")
expand_parser.add_argument( expand_parser.add_argument(
"--source", "--source",
choices=["crossref", "openalex"], choices=["crossref", "openalex", "opencitations"],
default="crossref", default="crossref",
help="Graph expansion source", help="Graph expansion source",
) )
@ -260,7 +267,7 @@ def build_parser() -> argparse.ArgumentParser:
) )
expand_topic_parser.add_argument( expand_topic_parser.add_argument(
"--source", "--source",
choices=["crossref", "openalex"], choices=["crossref", "openalex", "opencitations"],
default="openalex", default="openalex",
help="Topic graph expansion source", help="Topic graph expansion source",
) )
@ -749,6 +756,8 @@ def main(argv: list[str] | None = None) -> int:
) )
if args.command == "resolve": if args.command == "resolve":
return _run_resolve(store, args.citation_keys) return _run_resolve(store, args.citation_keys)
if args.command == "enrich-oa":
return _run_enrich_oa(store, args.citation_keys, args.email)
if args.command == "resolve-stubs": if args.command == "resolve-stubs":
return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview) return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview)
if args.command == "graph": if args.command == "graph":
@ -1215,6 +1224,72 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
return exit_code return exit_code
def _run_enrich_oa(store: BibliographyStore, citation_keys: list[str], email: str | None) -> int:
    """Enrich DOI-bearing stored entries with Unpaywall open-access link metadata.

    Looks up each citation key in the store, queries Unpaywall by DOI, and
    replaces the stored entry with a merged version.  Emits a JSON report of
    per-key outcomes to stdout.  Returns 1 when Unpaywall is not configured
    (no email), otherwise 0.
    """
    from .sources import UnpaywallSource

    oa_source = UnpaywallSource(config={"email": email} if email else {})
    if not oa_source.is_available():
        print("Unpaywall enrichment requires --email or UNPAYWALL_EMAIL", file=sys.stderr)
        return 1
    report: list[dict[str, object]] = []
    key_count = len(citation_keys)
    for position, key in enumerate(citation_keys, start=1):
        _print_progress("enriching OA", position, key_count, key)
        stored = store.get_entry(key)
        if stored is None:
            report.append({"citation_key": key, "status": "missing"})
            continue
        doi = str(stored.get("doi") or "").strip()
        if not doi:
            report.append({"citation_key": key, "status": "no_doi"})
            continue
        oa_entry = oa_source.lookup_by_doi(doi)
        if oa_entry is None:
            report.append({"citation_key": key, "status": "no_record", "doi": doi})
            continue
        # Start from the existing string fields, overlay the Unpaywall fields,
        # then restore core bibliographic fields so OA enrichment never
        # clobbers curated metadata.
        combined: dict[str, str] = {
            name: value for name, value in stored.items() if isinstance(value, str)
        }
        combined.update(oa_entry.fields)
        for protected in ("title", "year", "author", "journal", "booktitle", "publisher", "abstract", "keywords"):
            kept = str(stored.get(protected) or "").strip()
            if kept:
                combined[protected] = kept
        store.replace_entry(
            key,
            BibEntry(
                entry_type=str(stored.get("entry_type") or "misc"),
                citation_key=key,
                fields=combined,
            ),
            source_type="oa_enrich",
            source_label=f"unpaywall:doi:{doi}",
            review_status=str(stored.get("review_status") or "enriched"),
        )
        refreshed = store.get_entry(key) or {}
        report.append(
            {
                "citation_key": key,
                "status": "enriched",
                "doi": doi,
                "is_oa": refreshed.get("is_oa"),
                "oa_status": refreshed.get("oa_status"),
                "best_oa_url": refreshed.get("best_oa_url"),
                "best_oa_pdf_url": refreshed.get("best_oa_pdf_url"),
            }
        )
    print(json.dumps(report, indent=2))
    return 0
def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool: def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool:
existing = store.get_entry(citation_key) existing = store.get_entry(citation_key)
if existing is None: if existing is None:
@ -1664,6 +1739,15 @@ def _run_expand(
for relation_name in _expand_relation_types(relation) for relation_name in _expand_relation_types(relation)
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit) for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
] ]
elif source == "opencitations":
from .expand import OpenCitationsExpander
expander = OpenCitationsExpander()
expand_fn = lambda key: [
item
for relation_name in _expand_relation_types(relation)
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
]
else: else:
print(f"Unsupported expansion source: {source}", file=sys.stderr) print(f"Unsupported expansion source: {source}", file=sys.stderr)
return 1 return 1

View File

@ -9,6 +9,7 @@ from urllib.parse import quote, urlencode
from .bibtex import BibEntry, parse_bibtex from .bibtex import BibEntry, parse_bibtex
from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob
from .resolve import MetadataResolver, merge_entries from .resolve import MetadataResolver, merge_entries
from .sources import OpenCitationsSource
from .storage import BibliographyStore from .storage import BibliographyStore
@ -219,14 +220,94 @@ class OpenAlexExpander:
return _normalize_openalex_id(results[0].get("id", "")) return _normalize_openalex_id(results[0].get("id", ""))
class OpenCitationsExpander:
    """Expand a stored entry's citation graph using OpenCitations DOI edges."""

    def __init__(self, resolver: MetadataResolver | None = None, source: OpenCitationsSource | None = None) -> None:
        self.resolver = resolver or MetadataResolver()
        # Reuse the resolver's HTTP client so request behavior stays consistent
        # across sources.
        self.source = source or OpenCitationsSource(config={"source_client": self.resolver.source_client})

    def expand_entry(
        self,
        store: BibliographyStore,
        citation_key: str,
        relation_type: str = "cites",
        limit: int = 25,
    ) -> list[ExpansionResult]:
        """Discover works linked to *citation_key* via OpenCitations and persist them.

        For each edge, reuse an existing stored entry when one matches the
        discovered work; otherwise upsert a resolved (or stub) entry. A
        "cites" relation is recorded for every edge, oriented according to
        *relation_type*. Returns one ExpansionResult per edge; returns []
        when the seed entry is missing or has no DOI.
        """
        entry = store.get_entry(citation_key)
        if entry is None:
            return []
        doi = str(entry.get("doi") or "")
        if not doi:
            return []
        edges = self.source.get_citations(doi, relation_type=relation_type, limit=limit)
        results: list[ExpansionResult] = []
        for edge in edges:
            # Edge work ids carry a 4-character prefix before the bare DOI —
            # presumably "doi:"; TODO confirm against OpenCitationsSource.
            discovered_doi = edge.target_work_id[4:] if relation_type == "cites" else edge.source_work_id[4:]
            discovered = self._lookup_discovered_entry(discovered_doi)
            if discovered is None:
                discovered = _opencitations_stub_entry(discovered_doi, citation_key)
            existing_key = _existing_entry_key_for_discovered_work(store, discovered)
            target_key = existing_key or discovered.citation_key
            created = False
            # Only insert when the work is unknown under both the matched key
            # and the generated key.
            if existing_key is None and store.get_entry(discovered.citation_key) is None:
                store.upsert_entry(
                    discovered,
                    raw_bibtex=None,
                    source_type="graph_expand",
                    source_label=edge.source_label,
                    review_status="draft",
                )
                store.connection.commit()
                created = True
            # Relations are always stored as "cites"; for non-"cites"
            # expansion the discovered work is the citing side.
            if relation_type == "cites":
                source_key = citation_key
                relation_target_key = target_key
            else:
                source_key = target_key
                relation_target_key = citation_key
            store.add_relation(
                source_key,
                relation_target_key,
                "cites",
                source_type="graph_expand",
                source_label=edge.source_label,
                confidence=edge.confidence,
            )
            results.append(
                ExpansionResult(
                    source_citation_key=source_key,
                    discovered_citation_key=target_key,
                    created_entry=created,
                    relation_type=relation_type,
                    source_label=edge.source_label,
                )
            )
        return results

    def _lookup_discovered_entry(self, doi: str) -> BibEntry | None:
        """Resolve a discovered DOI: try the resolver's DOI lookup, then
        DataCite, then fall back to OpenCitations' own metadata."""
        resolution = self.resolver.resolve_doi(doi)
        if resolution is not None:
            return resolution.entry
        resolution = self.resolver.resolve_datacite_doi(doi)
        if resolution is not None:
            return resolution.entry
        return self.source.lookup_by_doi(doi)
class TopicExpander: class TopicExpander:
def __init__( def __init__(
self, self,
crossref_expander: CrossrefExpander | None = None, crossref_expander: CrossrefExpander | None = None,
openalex_expander: OpenAlexExpander | None = None, openalex_expander: OpenAlexExpander | None = None,
opencitations_expander: OpenCitationsExpander | None = None,
) -> None: ) -> None:
self.crossref_expander = crossref_expander or CrossrefExpander() self.crossref_expander = crossref_expander or CrossrefExpander()
self.openalex_expander = openalex_expander or OpenAlexExpander() self.openalex_expander = openalex_expander or OpenAlexExpander()
self.opencitations_expander = opencitations_expander or OpenCitationsExpander()
self.last_run_meta: dict[str, object] = {} self.last_run_meta: dict[str, object] = {}
def expand_topic( def expand_topic(
@ -362,6 +443,17 @@ class TopicExpander:
) -> list[tuple[ExpansionResult, dict[str, object] | None]]: ) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
if source == "crossref": if source == "crossref":
expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key) expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
elif source == "opencitations":
expansion_rows = []
for relation_name in _expand_relation_types(relation_type):
expansion_rows.extend(
self.opencitations_expander.expand_entry(
store,
citation_key,
relation_type=relation_name,
limit=limit,
)
)
else: else:
expansion_rows: list[ExpansionResult] = [] expansion_rows: list[ExpansionResult] = []
for relation_name in _expand_relation_types(relation_type): for relation_name in _expand_relation_types(relation_type):
@ -385,6 +477,11 @@ class TopicExpander:
) -> list[tuple[ExpansionResult, dict[str, object]]]: ) -> list[tuple[ExpansionResult, dict[str, object]]]:
if source == "crossref": if source == "crossref":
return self._preview_crossref_discoveries(store, citation_key, limit) return self._preview_crossref_discoveries(store, citation_key, limit)
if source == "opencitations":
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for relation_name in _expand_relation_types(relation_type):
rows.extend(self._preview_opencitations_discoveries(store, citation_key, relation_name, limit))
return rows
rows: list[tuple[ExpansionResult, dict[str, object]]] = [] rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for relation_name in _expand_relation_types(relation_type): for relation_name in _expand_relation_types(relation_type):
rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit)) rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit))
@ -467,6 +564,40 @@ class TopicExpander:
) )
return rows return rows
def _preview_opencitations_discoveries(
    self,
    store: BibliographyStore,
    citation_key: str,
    relation_type: str,
    limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
    """Preview OpenCitations discoveries for one entry without writing to the store.

    Returns (result, fields) pairs describing what an expansion would do:
    which key each discovered work would land under, and whether a new
    entry would be created.  Returns [] when the seed entry is missing or
    has no DOI.
    """
    seed = store.get_entry(citation_key)
    if seed is None or not seed.get("doi"):
        return []
    seed_doi = str(seed["doi"])
    expander = self.opencitations_expander
    preview_rows: list[tuple[ExpansionResult, dict[str, object]]] = []
    for edge in expander.source.get_citations(seed_doi, relation_type=relation_type, limit=limit):
        if relation_type == "cites":
            found_doi = edge.target_work_id[4:]
        else:
            found_doi = edge.source_work_id[4:]
        candidate = expander._lookup_discovered_entry(found_doi)
        if candidate is None:
            candidate = _opencitations_stub_entry(found_doi, citation_key)
        known_key = _existing_entry_key_for_discovered_work(store, candidate)
        discovered_key = known_key or candidate.citation_key
        would_create = known_key is None and store.get_entry(candidate.citation_key) is None
        preview = ExpansionResult(
            source_citation_key=citation_key if relation_type == "cites" else discovered_key,
            discovered_citation_key=discovered_key,
            created_entry=would_create,
            relation_type=relation_type,
            source_label=edge.source_label,
        )
        preview_rows.append((preview, dict(candidate.fields)))
    return preview_rows
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
title = _crossref_reference_title(reference, ordinal) title = _crossref_reference_title(reference, ordinal)
@ -567,6 +698,20 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int
return f"{family}{year or 'nd'}{first_word}{ordinal}" return f"{family}{year or 'nd'}{first_word}{ordinal}"
def _opencitations_stub_entry(doi: str, source_citation_key: str) -> BibEntry:
    """Build a minimal placeholder entry for a DOI discovered via OpenCitations.

    The citation key is derived from the DOI with non-alphanumeric
    characters stripped; the note field records which entry led here.
    """
    key_suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
    stub_fields = {
        "title": f"Referenced work for DOI {doi}",
        "doi": doi,
        "url": f"https://doi.org/{doi}",
        "note": f"discovered_from = {{{source_citation_key}}}",
    }
    return BibEntry(entry_type="misc", citation_key=f"doi{key_suffix}", fields=stub_fields)
def _normalize_text(value: str) -> str: def _normalize_text(value: str) -> str:
without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value)) without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value))
normalized = " ".join(without_tags.split()) normalized = " ".join(without_tags.split())

View File

@ -7,17 +7,38 @@ import re
import urllib.error import urllib.error
import urllib.parse import urllib.parse
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from dataclasses import dataclass from dataclasses import dataclass, field
from .bibtex import BibEntry, parse_bibtex from .bibtex import BibEntry, parse_bibtex
from .sources.europepmc import EuropePmcSource
from .sources.openlibrary import OpenLibrarySource
from .sources.semanticscholar import SemanticScholarSource
from .sources import SourceClient from .sources import SourceClient
@dataclass(slots=True)
class ResolutionAttempt:
    """One recorded attempt to resolve an entry against a single source."""

    source_name: str  # source identifier, e.g. "crossref", "pubmed"
    strategy: str  # lookup style, e.g. "doi_lookup" or "title_search"
    query_value: str  # identifier or title text sent to the source
    matched: bool  # True when the attempt produced a resolution
    candidate_count: int | None = None  # number of search candidates considered, when known
    source_label: str = ""  # provenance label from a successful resolution
    error: str = ""  # stringified exception when the attempt raised
@dataclass(slots=True) @dataclass(slots=True)
class Resolution: class Resolution:
entry: BibEntry entry: BibEntry
source_type: str source_type: str
source_label: str source_label: str
attempts: list[ResolutionAttempt] = field(default_factory=list)
@dataclass(slots=True)
class ResolutionOutcome:
    """Result of a traced resolution: the winning Resolution (or None) plus
    every attempt made along the way."""

    resolution: Resolution | None  # None when no source matched
    attempts: list[ResolutionAttempt]  # ordered trace of all attempts
class MetadataResolver: class MetadataResolver:
@ -31,70 +52,109 @@ class MetadataResolver:
) -> None: ) -> None:
self.user_agent = user_agent self.user_agent = user_agent
self.source_client = source_client or SourceClient(user_agent=user_agent) self.source_client = source_client or SourceClient(user_agent=user_agent)
self.europepmc = EuropePmcSource(config={"source_client": self.source_client, "user_agent": user_agent})
self.openlibrary = OpenLibrarySource(config={"source_client": self.source_client, "user_agent": user_agent})
self.semanticscholar = SemanticScholarSource(config={"user_agent": user_agent})
self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "") self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "")
self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist") self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist")
self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "") self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "")
def resolve_entry(self, entry: BibEntry) -> Resolution | None: def resolve_entry(self, entry: BibEntry) -> Resolution | None:
return self.resolve_entry_with_trace(entry).resolution
def resolve_entry_with_trace(self, entry: BibEntry) -> ResolutionOutcome:
attempts: list[ResolutionAttempt] = []
if doi := entry.fields.get("doi"): if doi := entry.fields.get("doi"):
resolved = self.resolve_doi(doi) resolved = self._attempt_direct_resolution(attempts, "crossref", "doi_lookup", doi, self.resolve_doi)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self.resolve_datacite_doi(doi) resolved = self._attempt_direct_resolution(
attempts, "datacite", "doi_lookup", doi, self.resolve_datacite_doi
)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self._attempt_direct_resolution(
attempts, "europepmc", "doi_lookup", doi, self.resolve_europepmc_doi
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self._attempt_direct_resolution(
attempts, "semanticscholar", "doi_lookup", doi, self.resolve_semanticscholar_doi
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
if pmid := entry.fields.get("pmid"): if pmid := entry.fields.get("pmid"):
resolved = self.resolve_pmid(pmid) resolved = self._attempt_direct_resolution(attempts, "pubmed", "pmid_lookup", pmid, self.resolve_pmid)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
if openalex_id := entry.fields.get("openalex"): if openalex_id := entry.fields.get("openalex"):
resolved = self.resolve_openalex(openalex_id) resolved = self._attempt_direct_resolution(
attempts, "openalex", "work_lookup", openalex_id, self.resolve_openalex
)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
if dblp_key := entry.fields.get("dblp"): if dblp_key := entry.fields.get("dblp"):
resolved = self.resolve_dblp(dblp_key) resolved = self._attempt_direct_resolution(attempts, "dblp", "key_lookup", dblp_key, self.resolve_dblp)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
if arxiv_id := entry.fields.get("arxiv"): if arxiv_id := entry.fields.get("arxiv"):
resolved = self.resolve_arxiv(arxiv_id) resolved = self._attempt_direct_resolution(
attempts, "arxiv", "id_lookup", arxiv_id, self.resolve_arxiv
)
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
if title := entry.fields.get("title"): if title := entry.fields.get("title"):
resolved = self.search_crossref_best_match( author_text = entry.fields.get("author", "")
title=title, year = entry.fields.get("year", "")
author_text=entry.fields.get("author", ""), resolved = self._attempt_title_search_resolution(
year=entry.fields.get("year", ""), attempts, "crossref", title, author_text, year, self.search_crossref
) )
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self.search_datacite_best_match( resolved = self._attempt_title_search_resolution(
title=title, attempts, "datacite", title, author_text, year, self.search_datacite
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
) )
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self.search_openalex_best_match( resolved = self._attempt_title_search_resolution(
title=title, attempts, "openalex", title, author_text, year, self.search_openalex
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
) )
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self.search_pubmed_best_match( resolved = self._attempt_title_search_resolution(
title=title, attempts, "pubmed", title, author_text, year, self.search_pubmed
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
) )
if resolved is not None: if resolved is not None:
return resolved return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self._attempt_title_search_resolution(
attempts, "europepmc", title, author_text, year, self.search_europepmc
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
resolved = self._attempt_title_search_resolution(
attempts, "semanticscholar", title, author_text, year, self.search_semanticscholar
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
if _entry_prefers_catalog_search(entry):
resolved = self._attempt_title_search_resolution(
attempts,
"openlibrary",
title,
author_text,
year,
self.search_openlibrary,
selector=_select_best_catalog_title_match,
)
if resolved is not None:
return ResolutionOutcome(resolution=resolved, attempts=attempts)
return None return ResolutionOutcome(resolution=None, attempts=attempts)
def resolve_doi(self, doi: str) -> Resolution | None: def resolve_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="") encoded = urllib.parse.quote(doi, safe="")
@ -124,19 +184,7 @@ class MetadataResolver:
author_text: str = "", author_text: str = "",
year: str = "", year: str = "",
) -> Resolution | None: ) -> Resolution | None:
candidate = _select_best_title_match( return self._search_best_match_resolution("crossref", title, author_text, year, self.search_crossref)
self.search_crossref(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"crossref:search:{title}",
)
def resolve_dblp(self, dblp_key: str) -> Resolution | None: def resolve_dblp(self, dblp_key: str) -> Resolution | None:
encoded_key = urllib.parse.quote(dblp_key, safe="/:") encoded_key = urllib.parse.quote(dblp_key, safe="/:")
@ -245,19 +293,7 @@ class MetadataResolver:
author_text: str = "", author_text: str = "",
year: str = "", year: str = "",
) -> Resolution | None: ) -> Resolution | None:
candidate = _select_best_title_match( return self._search_best_match_resolution("datacite", title, author_text, year, self.search_datacite)
self.search_datacite(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"datacite:search:{title}",
)
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]: def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"search": title, "per-page": limit}) query = urllib.parse.urlencode({"search": title, "per-page": limit})
@ -290,6 +326,35 @@ class MetadataResolver:
return [] return []
return self._fetch_pubmed_entries(ids[:limit]) return self._fetch_pubmed_entries(ids[:limit])
def resolve_europepmc_doi(self, doi: str) -> Resolution | None:
    """Resolve a DOI through Europe PMC; returns None when it has no record."""
    record = self.europepmc.lookup_by_doi(doi)
    if record is not None:
        return Resolution(
            entry=record,
            source_type="resolver",
            source_label=f"europepmc:doi:{doi}",
        )
    return None
def search_europepmc(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Delegate a title search to the Europe PMC source adapter."""
    return self.europepmc.search(title, limit=limit)
def search_openlibrary(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Delegate a title search to the Open Library source adapter."""
    return self.openlibrary.search(title, limit=limit)
def resolve_semanticscholar_doi(self, doi: str) -> Resolution | None:
    """Resolve a DOI through Semantic Scholar; returns None when it has no record."""
    record = self.semanticscholar.lookup_by_doi(doi)
    if record is not None:
        return Resolution(
            entry=record,
            source_type="resolver",
            source_label=f"semanticscholar:doi:{doi}",
        )
    return None
def search_semanticscholar(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Delegate a title search to the Semantic Scholar source adapter."""
    return self.semanticscholar.search(title, limit=limit)
def _safe_get_json(self, url: str) -> dict | None: def _safe_get_json(self, url: str) -> dict | None:
try: try:
return self.source_client.get_json(url) return self.source_client.get_json(url)
@ -333,19 +398,7 @@ class MetadataResolver:
author_text: str = "", author_text: str = "",
year: str = "", year: str = "",
) -> Resolution | None: ) -> Resolution | None:
candidate = _select_best_title_match( return self._search_best_match_resolution("openalex", title, author_text, year, self.search_openalex)
self.search_openalex(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"openalex:search:{title}",
)
def search_pubmed_best_match( def search_pubmed_best_match(
self, self,
@ -353,19 +406,122 @@ class MetadataResolver:
author_text: str = "", author_text: str = "",
year: str = "", year: str = "",
) -> Resolution | None: ) -> Resolution | None:
candidate = _select_best_title_match( return self._search_best_match_resolution("pubmed", title, author_text, year, self.search_pubmed)
self.search_pubmed(title, limit=5),
title=title, def search_europepmc_best_match(
author_text=author_text, self,
year=year, title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
return self._search_best_match_resolution("europepmc", title, author_text, year, self.search_europepmc)
def search_semanticscholar_best_match(
self,
title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
return self._search_best_match_resolution(
"semanticscholar", title, author_text, year, self.search_semanticscholar
) )
def search_openlibrary_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Best title match from Open Library, wrapped as a Resolution (None if no match)."""
    return self._search_best_match_resolution("openlibrary", title, author_text, year, self.search_openlibrary)
def _search_best_match_resolution(
    self, source_name: str, title: str, author_text: str, year: str, search_func
) -> Resolution | None:
    """Shared title-search flow used by the per-source *_best_match helpers.

    Runs *search_func* (capped at 5 candidates), scores the candidates with
    _select_best_title_match, and wraps the winner in a Resolution labeled
    "<source>:search:<title>". Returns None when nothing is good enough.
    """
    candidates = search_func(title, limit=5)
    candidate = _select_best_title_match(candidates, title=title, author_text=author_text, year=year)
    if candidate is None:
        return None
    return Resolution(entry=candidate, source_type="resolver", source_label=f"{source_name}:search:{title}")
def _attempt_direct_resolution(
    self,
    attempts: list[ResolutionAttempt],
    source_name: str,
    strategy: str,
    query_value: str,
    resolver_func,
) -> Resolution | None:
    """Run one identifier-based resolver and record the attempt.

    Appends a ResolutionAttempt to *attempts* whether the lookup raised,
    missed, or matched. On a match whose Resolution has no attempt trace
    yet, a snapshot of *attempts* is attached to it.
    """
    try:
        resolution = resolver_func(query_value)
    except Exception as exc:
        # Failed lookups are recorded with the error text, never re-raised.
        attempts.append(
            ResolutionAttempt(
                source_name=source_name,
                strategy=strategy,
                query_value=query_value,
                matched=False,
                error=str(exc),
            )
        )
        return None
    matched = resolution is not None
    attempts.append(
        ResolutionAttempt(
            source_name=source_name,
            strategy=strategy,
            query_value=query_value,
            matched=matched,
            source_label=resolution.source_label if matched else "",
        )
    )
    if matched and not resolution.attempts:
        resolution.attempts = list(attempts)
    return resolution
def _attempt_title_search_resolution(
    self,
    attempts: list[ResolutionAttempt],
    source_name: str,
    title: str,
    author_text: str,
    year: str,
    search_func,
    selector=None,
) -> Resolution | None:
    """Run one title-search resolver and record the attempt.

    Searches via *search_func* (capped at 5 candidates), picks the best
    candidate with *selector* (default _select_best_title_match), and
    appends a ResolutionAttempt — including the candidate count — to
    *attempts*. Errors are recorded, not raised. On a match whose
    Resolution carries no attempt trace yet, a snapshot of *attempts*
    is attached to it.
    """
    try:
        candidates = search_func(title, limit=5)
    except Exception as exc:
        attempts.append(
            ResolutionAttempt(
                source_name=source_name,
                strategy="title_search",
                query_value=title,
                matched=False,
                error=str(exc),
            )
        )
        return None
    match_selector = selector or _select_best_title_match
    candidate = match_selector(candidates, title=title, author_text=author_text, year=year)
    resolution = None
    if candidate is not None:
        resolution = Resolution(
            entry=candidate,
            source_type="resolver",
            source_label=f"{source_name}:search:{title}",
        )
    attempts.append(
        ResolutionAttempt(
            source_name=source_name,
            strategy="title_search",
            query_value=title,
            matched=resolution is not None,
            candidate_count=len(candidates),
            source_label=resolution.source_label if resolution is not None else "",
        )
    )
    if resolution is not None and not resolution.attempts:
        resolution.attempts = list(attempts)
    return resolution
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]: def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid] ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid]
@ -768,6 +924,42 @@ def _select_best_title_match(
return None return None
def _select_best_catalog_title_match(
    candidates: list[BibEntry],
    title: str,
    author_text: str = "",
    year: str = "",
) -> BibEntry | None:
    """Pick the catalog candidate whose title best overlaps *title*.

    Candidates are Jaccard-scored on catalog title tokens and must reach
    0.6; candidates whose year or authors conflict with the query are
    dropped. Ties break on the lexically smallest citation key. Returns
    None when nothing qualifies.
    """
    if not candidates:
        return None
    query_tokens = _catalog_title_tokens(title)
    wanted_authors = _author_match_tokens(author_text)
    wanted_year = str(year or "").strip()
    acceptable: list[tuple[float, BibEntry]] = []
    for entry in candidates:
        entry_tokens = _catalog_title_tokens(entry.fields.get("title", ""))
        if not entry_tokens:
            continue
        union = query_tokens | entry_tokens
        score = len(query_tokens & entry_tokens) / len(union) if union else 0.0
        if score < 0.6:
            continue
        entry_year = str(entry.fields.get("year", "") or "").strip()
        if wanted_year and entry_year and wanted_year != entry_year:
            continue
        if wanted_authors and not _candidate_matches_author_tokens(entry, wanted_authors):
            continue
        acceptable.append((score, entry))
    if not acceptable:
        return None
    # Highest score wins; equal scores fall back to citation-key order.
    return min(acceptable, key=lambda pair: (-pair[0], pair[1].citation_key))[1]
def _author_match_tokens(author_text: str) -> set[str]: def _author_match_tokens(author_text: str) -> set[str]:
normalized = _normalize_match_text(author_text) normalized = _normalize_match_text(author_text)
if not normalized: if not normalized:
@ -788,6 +980,39 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str
return bool(author_tokens & candidate_tokens) return bool(author_tokens & candidate_tokens)
def _catalog_title_tokens(value: str) -> set[str]:
    """Tokenize a title for fuzzy catalog matching.

    Normalizes the text, keeps only alphanumeric tokens of length >= 4 that
    are not stopwords, and folds an "-ical" suffix to "-ic" (e.g.
    "historical" -> "historic") so close adjective variants still overlap.
    """
    normalized = _normalize_match_text(value)
    stopwords = {"the", "and", "for", "with", "from", "into", "after", "all"}
    return {
        f"{token[:-4]}ic" if token.endswith("ical") and len(token) > 6 else token
        for token in re.findall(r"[a-z0-9]+", normalized)
        if len(token) >= 4 and token not in stopwords
    }
def _entry_prefers_catalog_search(entry: BibEntry) -> bool:
    """Return True when *entry* should be resolved via book/catalog search.

    Book-like entry types always prefer the catalog. Beyond that, only
    "misc" entries qualify, and only when their publisher/venue or title
    looks like reference-work material.
    """
    if entry.entry_type in {"book", "incollection", "phdthesis", "mastersthesis"}:
        return True
    # Guard first: only "misc" entries are considered further. Checking this
    # before normalizing title/venue avoids pointless work for every other
    # entry type (the original computed both strings and then discarded them).
    if entry.entry_type != "misc":
        return False
    title = _normalize_match_text(entry.fields.get("title", ""))
    venue = _normalize_match_text(
        " ".join(
            filter(
                None,
                [
                    entry.fields.get("publisher", ""),
                    entry.fields.get("howpublished", ""),
                    entry.fields.get("booktitle", ""),
                ],
            )
        )
    )
    if any(token in venue for token in ("press", "university", "house", "dictionary", "christendom")):
        return True
    return any(token in title for token in ("dictionary", "history", "world", "universe", "record"))
def _normalize_pmid(value: str) -> str: def _normalize_pmid(value: str) -> str:
return "".join(ch for ch in str(value) if ch.isdigit()) return "".join(ch for ch in str(value) if ch.isdigit())

View File

@ -0,0 +1,27 @@
"""
Identifier resolution and normalization module.
Provides functions for extracting, normalizing, and resolving
bibliographic identifiers across multiple schemes.
"""
from __future__ import annotations
from citegeist.resolver.identifiers import (
IdentifierExtractor,
IdentifierNormalizer,
IdentifierResolver,
extract_identifiers,
normalize_identifier,
get_primary_identifier,
resolve_identifiers,
)
__all__ = [
'IdentifierExtractor',
'IdentifierNormalizer',
'IdentifierResolver',
'extract_identifiers',
'normalize_identifier',
'get_primary_identifier',
'resolve_identifiers',
]

View File

@ -0,0 +1,418 @@
"""
Identifier resolution and normalization module.
This module provides functions for extracting, normalizing, and resolving
bibliographic identifiers across multiple schemes (DOI, PMID, arXiv, ORCID, etc.).
"""
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# Identifier scheme patterns
DOI_PATTERN = re.compile(
    r'^10\.\d{4,9}/\S+$',
    re.IGNORECASE
)
# PMIDs are plain integers; current assignments run up to 8 digits, and
# short historical PMIDs can have fewer than 5 (the old {5,7} range
# rejected both ends).
PMID_PATTERN = re.compile(r'^\d{1,8}$')
PMCID_PATTERN = re.compile(
    r'^PMC\d+$|^PMC[0-9a-f]+$',
    re.IGNORECASE
)
# New-style arXiv identifiers: YYMM.NNNNN with an optional version suffix.
ARXIV_PATTERN = re.compile(
    r'^\d{4}\.\d{4,5}(v\d+)?$',
    re.IGNORECASE
)
ORCID_PATTERN = re.compile(
    r'^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$',
    re.IGNORECASE
)
# ROR IDs are lowercase: a leading "0", six base-32 characters, then two
# check digits (e.g. "02mhbdp94"). The previous uppercase-only class could
# never match the lowercased input produced by normalize_ror.
ROR_PATTERN = re.compile(
    r'^https?://ror\.org/0[a-z0-9]{6}[0-9]{2}$'
)
# DBLP record keys are slash-separated (e.g. "conf/nips/Vaswani17"); the
# colon-separated form is kept for backward compatibility.
DBLP_PATTERN = re.compile(
    r'^[a-zA-Z0-9_]+([/:][a-zA-Z0-9_.-]+)+$',
    re.IGNORECASE
)
# OpenAlex work IDs are "W" followed by digits (e.g. "W2741809807"); the
# previous "W\d{4}-[A-F0-9]{4}" shape matches no real OpenAlex ID.
OPENALEX_PATTERN = re.compile(
    r'^W\d+$',
    re.IGNORECASE
)
class IdentifierExtractor:
    """Extract identifiers from BibEntry fields."""

    # Entry field names that double as identifier scheme names, in the
    # order they should appear in the extracted mapping.
    _SCHEME_FIELDS = ('doi', 'pmid', 'pmcid', 'arxiv', 'dblp', 'openalex', 'isbn', 'issn')

    @staticmethod
    def extract(entry_fields: Dict[str, str]) -> Dict[str, str]:
        """Extract all identifier schemes from entry fields.

        Args:
            entry_fields: Dictionary of entry fields

        Returns:
            Dictionary mapping scheme names to their (non-empty) values
        """
        return {
            scheme: value
            for scheme in IdentifierExtractor._SCHEME_FIELDS
            if (value := entry_fields.get(scheme))
        }
class IdentifierNormalizer:
    """Normalize identifiers to canonical form.

    Every normalize_* method returns the canonical string on success and
    None for empty or malformed input.
    """

    @staticmethod
    def _checked(candidate: str, pattern) -> Optional[str]:
        """Return *candidate* when it matches *pattern*, else None."""
        return candidate if pattern.match(candidate) else None

    @staticmethod
    def normalize_doi(doi: str) -> Optional[str]:
        """Lowercase and validate a DOI."""
        if not doi:
            return None
        return IdentifierNormalizer._checked(doi.strip().lower(), DOI_PATTERN)

    @staticmethod
    def normalize_pmid(pmid: str) -> Optional[str]:
        """Validate a PMID and return it as a string."""
        if not pmid:
            return None
        return IdentifierNormalizer._checked(str(pmid).strip(), PMID_PATTERN)

    @staticmethod
    def normalize_pmcid(pmcid: str) -> Optional[str]:
        """Lowercase and validate a PMCID."""
        if not pmcid:
            return None
        return IdentifierNormalizer._checked(pmcid.strip().lower(), PMCID_PATTERN)

    @staticmethod
    def normalize_arxiv(arxiv: str) -> Optional[str]:
        """Lowercase an arXiv ID, drop any version suffix, and validate."""
        if not arxiv:
            return None
        candidate = arxiv.strip().lower()
        # "2101.12345v2" -> "2101.12345". Splitting at the first 'v' also
        # truncates non-numeric input, which then simply fails validation.
        if 'v' in candidate:
            candidate = candidate.split('v')[0]
        return IdentifierNormalizer._checked(candidate, ARXIV_PATTERN)

    @staticmethod
    def normalize_orcid(orcid: str) -> Optional[str]:
        """Uppercase an ORCID, strip spaces, and validate the XXXX-XXXX-XXXX-XXX[0-9X] shape."""
        if not orcid:
            return None
        candidate = orcid.strip().upper().replace(' ', '')
        return IdentifierNormalizer._checked(candidate, ORCID_PATTERN)

    @staticmethod
    def normalize_ror(ror_url: str) -> Optional[str]:
        """Lowercase and validate a ROR URL."""
        if not ror_url:
            return None
        return IdentifierNormalizer._checked(ror_url.strip().lower(), ROR_PATTERN)

    @staticmethod
    def normalize_dblp(dblp_key: str) -> Optional[str]:
        """Trim and validate a DBLP key."""
        if not dblp_key:
            return None
        return IdentifierNormalizer._checked(dblp_key.strip(), DBLP_PATTERN)

    @staticmethod
    def normalize_openalex(openalex_id: str) -> Optional[str]:
        """Uppercase and validate an OpenAlex ID."""
        if not openalex_id:
            return None
        return IdentifierNormalizer._checked(openalex_id.strip().upper(), OPENALEX_PATTERN)

    @staticmethod
    def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
        """Normalize *value* under *scheme*.

        Args:
            scheme: Identifier scheme name (case-insensitive)
            value: Identifier value

        Returns:
            (scheme, normalized_value) on success, else None. Schemes with
            no registered normalizer (e.g. isbn, issn) always return None.
        """
        key = scheme.lower()
        dispatch = {
            'doi': IdentifierNormalizer.normalize_doi,
            'pmid': IdentifierNormalizer.normalize_pmid,
            'pmcid': IdentifierNormalizer.normalize_pmcid,
            'arxiv': IdentifierNormalizer.normalize_arxiv,
            'orcid': IdentifierNormalizer.normalize_orcid,
            'ror': IdentifierNormalizer.normalize_ror,
            'dblp': IdentifierNormalizer.normalize_dblp,
            'openalex': IdentifierNormalizer.normalize_openalex,
        }
        normalize = dispatch.get(key)
        if normalize is None:
            return None
        normalized = normalize(value)
        return (key, normalized) if normalized else None
class IdentifierResolver:
    """Resolve identifiers across multiple schemes."""

    # Lookup priority: schemes should be checked in this order
    LOOKUP_PRIORITY = [
        ('doi', IdentifierNormalizer.normalize_doi),
        ('pmid', IdentifierNormalizer.normalize_pmid),
        ('pmcid', IdentifierNormalizer.normalize_pmcid),
        ('arxiv', IdentifierNormalizer.normalize_arxiv),
        ('dblp', IdentifierNormalizer.normalize_dblp),
        ('openalex', IdentifierNormalizer.normalize_openalex),
    ]

    @staticmethod
    def resolve(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
        """Collect every normalizable identifier from *entry_fields*.

        Returns (scheme, normalized_value) pairs in extraction order, with a
        ('title', fingerprint) pair appended as a last-resort lookup key.
        """
        extracted = IdentifierExtractor.extract(entry_fields)
        resolved = [
            pair
            for scheme, value in extracted.items()
            if (pair := IdentifierNormalizer.normalize_identifier(scheme, value))
        ]
        if (title := entry_fields.get('title')) and (
            fingerprint := IdentifierResolver._create_title_fingerprint(title)
        ):
            resolved.append(('title', fingerprint))
        return resolved

    @staticmethod
    def _create_title_fingerprint(title: str) -> Optional[str]:
        """Normalize *title* into a lowercase, punctuation-free lookup key."""
        if not title:
            return None
        # Word runs joined by single spaces: equivalent to replacing
        # punctuation with spaces, collapsing whitespace, and stripping.
        return ' '.join(re.findall(r'\w+', title.lower()))

    @staticmethod
    def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
        """Return the highest-priority resolved identifier, or None."""
        resolved = IdentifierResolver.resolve(entry_fields)
        for scheme, _ in IdentifierResolver.LOOKUP_PRIORITY:
            hit = next((pair for pair in resolved if pair[0] == scheme), None)
            if hit is not None:
                return hit
        return None

    @staticmethod
    def get_scheme_value(scheme: str, entry_fields: Dict[str, str]) -> Optional[str]:
        """Return the normalized value of *scheme* from *entry_fields*, or None."""
        value = entry_fields.get(scheme)
        if not value:
            return None
        normalized = IdentifierNormalizer.normalize_identifier(scheme, value)
        return normalized[1] if normalized else None
# Convenience functions
def extract_identifiers(entry_fields: Dict[str, str]) -> Dict[str, str]:
    """Extract all identifiers from entry fields.

    Module-level convenience wrapper around IdentifierExtractor.extract.

    Args:
        entry_fields: Dictionary of entry fields

    Returns:
        Dictionary mapping scheme names to values
    """
    return IdentifierExtractor.extract(entry_fields)
def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
    """Normalize an identifier.

    Module-level convenience wrapper around
    IdentifierNormalizer.normalize_identifier.

    Args:
        scheme: Identifier scheme name
        value: Identifier value

    Returns:
        Tuple of (scheme, normalized_value), or None if invalid
    """
    return IdentifierNormalizer.normalize_identifier(scheme, value)
def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
    """Get the primary identifier.

    Module-level convenience wrapper around
    IdentifierResolver.get_primary_identifier.

    Args:
        entry_fields: Dictionary of entry fields

    Returns:
        Tuple of (scheme, value), or None if no identifier found
    """
    return IdentifierResolver.get_primary_identifier(entry_fields)
def resolve_identifiers(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
    """Resolve identifiers from entry fields.

    Module-level convenience wrapper around IdentifierResolver.resolve.

    Args:
        entry_fields: Dictionary of entry fields

    Returns:
        List of (scheme, value) tuples
    """
    return IdentifierResolver.resolve(entry_fields)

View File

@ -0,0 +1,29 @@
"""Export all source plugins."""
from __future__ import annotations
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
from citegeist.sources.registry import SourceRegistry, get_registry
from citegeist.sources.crossref import CrossRefSource
from citegeist.sources.europepmc import EuropePmcSource
from citegeist.sources.opencitations import OpenCitationsSource
from citegeist.sources.openlibrary import OpenLibrarySource
from citegeist.sources.semanticscholar import SemanticScholarSource
from citegeist.sources.unpaywall import UnpaywallSource
__all__ = [
'BibliographicSource',
'SourceRecord',
'CitationEdge',
'SourceCatalogEntry',
'SourceRegistry',
'get_registry',
'list_source_catalog',
'prioritized_source_keys',
'CrossRefSource',
'EuropePmcSource',
'OpenCitationsSource',
'OpenLibrarySource',
'SemanticScholarSource',
'UnpaywallSource',
]

View File

@ -0,0 +1,44 @@
"""
Bibliographic source plugins.
This package provides a plugin architecture for integrating multiple
bibliographic data sources (CrossRef, PubMed, Semantic Scholar, etc.).
"""
# Import old sources module for backward compatibility
from . import _old_sources_compat
# Import new plugin architecture
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
from citegeist.sources.registry import SourceRegistry, get_registry
from citegeist.sources.crossref import CrossRefSource
from citegeist.sources.europepmc import EuropePmcSource
from citegeist.sources.opencitations import OpenCitationsSource
from citegeist.sources.openlibrary import OpenLibrarySource
from citegeist.sources.semanticscholar import SemanticScholarSource
from citegeist.sources.unpaywall import UnpaywallSource
# Re-export old classes for compatibility
__all__ = [
# New plugin architecture
'BibliographicSource',
'SourceRecord',
'CitationEdge',
'SourceCatalogEntry',
'SourceRegistry',
'get_registry',
'list_source_catalog',
'prioritized_source_keys',
'CrossRefSource',
'EuropePmcSource',
'OpenCitationsSource',
'OpenLibrarySource',
'SemanticScholarSource',
'UnpaywallSource',
# Old API (for backward compatibility)
'SourceClient',
]
# Backward compatibility - make SourceClient available from this module
SourceClient = _old_sources_compat.SourceClient

View File

@ -0,0 +1,25 @@
"""
Backward compatibility module for old sources module.
This module re-exports the old SourceClient class for compatibility.
"""
from pathlib import Path
import importlib.util
from .base import BibliographicSource, SourceRecord, CitationEdge
from .registry import SourceRegistry, get_registry
from .crossref import CrossRefSource
# Load the old sources.py module from the citegeist package root
_OLD_SOURCES_PATH = Path(__file__).resolve().parents[1] / "sources.py"
spec = importlib.util.spec_from_file_location(
    "citegeist.sources_old",
    _OLD_SOURCES_PATH
)
try:
    if spec and spec.loader:
        old_sources = importlib.util.module_from_spec(spec)
        # exec_module raises FileNotFoundError when sources.py has been
        # removed; the previous code only guarded against a missing
        # spec/loader, so the documented "file doesn't exist" fallback
        # never actually triggered.
        spec.loader.exec_module(old_sources)
        SourceClient = old_sources.SourceClient
    else:
        SourceClient = None
except FileNotFoundError:
    # Fallback if old sources.py doesn't exist
    SourceClient = None

View File

@ -0,0 +1,189 @@
"""
Base interface for bibliographic sources.
This module defines the abstract base class that all source plugins must implement.
Plugins can register themselves with the SourceRegistry for dynamic loading.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
@dataclass(slots=True)
class SourceRecord:
    """Represents a raw record from a source API.

    Captures provenance for one payload: the raw data, which source
    produced it, a human-readable label, when it was captured, and a
    confidence score.
    """

    raw: Dict[str, Any]  # already-parsed payload from the source API
    source_type: str  # producing source kind, e.g. a plugin class name
    source_label: str  # human-readable provenance label
    timestamp: str  # capture time; format set by the producer
    confidence: float  # trust score — presumably in [0, 1]; confirm with callers
@dataclass(slots=True)
class CitationEdge:
    """Represents a citation relationship between two works."""

    source_work_id: str  # work the edge starts from
    target_work_id: str  # work the edge points to
    relation_type: str  # "cites" or "cited_by"
    source_type: str  # which bibliographic source reported the edge
    source_label: str  # human-readable provenance label
    confidence: float  # trust score — presumably in [0, 1]; confirm with callers
class BibliographicSource(ABC):
    """Abstract base class for bibliographic data sources.

    All source plugins must inherit from this class and implement the
    required lookup/search/normalize methods. Optional capabilities
    (citations, related works, fulltext links, embeddings) default to
    "not supported" and may be overridden per source.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source with optional configuration.

        Args:
            config: Source-specific configuration dictionary. Recognized
                keys: 'enabled' (bool, default True) and 'source_type'
                (str, defaults to the concrete class name).
        """
        self.config = config or {}
        self.enabled = self.config.get('enabled', True)
        self.source_type = self.config.get('source_type', self.__class__.__name__)

    @abstractmethod
    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibEntry if found, None otherwise
        """

    @abstractmethod
    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Look up a work by title.

        Args:
            title: Work title

        Returns:
            BibEntry if found, None otherwise
        """

    @abstractmethod
    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search for works matching the query.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching BibEntry objects
        """

    @abstractmethod
    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw API record to a canonical BibEntry.

        Args:
            record: Raw record from source API

        Returns:
            BibEntry if normalization succeeds, None otherwise
        """

    def get_citations(self, work_id: str, relation_type: str = 'cites', limit: int = 10) -> List[CitationEdge]:
        """Get citations for a work. Default: capability not supported.

        Args:
            work_id: Work identifier (DOI, PMID, etc.)
            relation_type: Type of relation ('cites' or 'cited_by')
            limit: Maximum number of results

        Returns:
            List of CitationEdge objects (empty by default)
        """
        return []

    def get_related(self, work_id: str, limit: int = 10) -> List[BibEntry]:
        """Get works related to a work. Default: capability not supported.

        Args:
            work_id: Work identifier
            limit: Maximum number of results

        Returns:
            List of related BibEntry objects (empty by default)
        """
        return []

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Get full-text URL for a work. Default: capability not supported.

        Args:
            doi: Digital Object Identifier

        Returns:
            Full-text URL if available, None otherwise
        """
        return None

    def get_embedding(self, work_id: str) -> Optional[List[float]]:
        """Get embedding vector for a work. Default: capability not supported.

        Args:
            work_id: Work identifier

        Returns:
            Embedding vector if available, None otherwise
        """
        return None

    def get_identifier_scheme(self) -> str:
        """Get the identifier scheme used by this source.

        Returns:
            Identifier scheme (e.g., 'doi', 'pmid', 'openalex')
        """
        return self.source_type.lower()

    def record_source_metadata(self, entry: BibEntry, operation: str = 'ingest') -> SourceRecord:
        """Create a source record for provenance tracking.

        Args:
            entry: The BibEntry to record
            operation: Operation type (e.g., 'ingest', 'enrich').
                NOTE(review): currently unused; kept for interface stability.

        Returns:
            SourceRecord with metadata and a UTC capture timestamp
        """
        from datetime import datetime, timezone

        return SourceRecord(
            raw=self._entry_to_dict(entry),
            source_type=self.source_type,
            source_label=f"{self.source_type}:{self.config.get('name', self.__class__.__name__)}",
            # Previously hard-coded to '', which defeated provenance tracking.
            timestamp=datetime.now(timezone.utc).isoformat(),
            confidence=1.0
        )

    def _entry_to_dict(self, entry: BibEntry) -> Dict[str, Any]:
        """Convert BibEntry to dictionary for source records."""
        return {
            'entry_type': entry.entry_type,
            'citation_key': entry.citation_key,
            'fields': entry.fields
        }

    def is_available(self) -> bool:
        """Check if the source is available and enabled.

        Returns:
            True if enabled and available, False otherwise
        """
        return self.enabled

View File

@ -0,0 +1,173 @@
"""Open bibliographic source inventory and prioritization helpers."""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class SourceCatalogEntry:
    """One row of the source inventory: what a source offers and its rollout status."""

    key: str  # stable machine key, e.g. "crossref"
    label: str  # human-readable name
    category: str  # broad kind, e.g. "metadata", "graph", or combinations
    access: str  # access model, e.g. "open API"
    capabilities: tuple[str, ...]  # machine-readable capability tags
    strengths: str  # prose: why this source is useful
    caveats: str  # prose: known limitations
    current_status: str  # integration state, e.g. "integrated", "planned"
    priority: str  # rollout priority: "now", "next", "selective", "evaluate"
_CATALOG: tuple[SourceCatalogEntry, ...] = (
SourceCatalogEntry(
key="crossref",
label="Crossref",
category="metadata",
access="open API",
capabilities=("doi_lookup", "title_search", "reference_lists"),
strengths="Broad DOI coverage and good article-level metadata.",
caveats="Citation coverage is incomplete and some references are unstructured blobs.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="openalex",
label="OpenAlex",
category="metadata+graph",
access="open API",
capabilities=("work_lookup", "title_search", "citations", "cited_by", "topics"),
strengths="Best current open source for citation graph expansion and work-level discovery.",
caveats="Occasional noisy secondary records require conservative admission rules.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="pubmed",
label="PubMed / NCBI E-utilities",
category="metadata",
access="open API",
capabilities=("pmid_lookup", "title_search", "biomedical_metadata"),
strengths="High-value authoritative metadata for biomedical literature.",
caveats="Domain-specific coverage outside biomedicine is limited.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="datacite",
label="DataCite",
category="metadata",
access="open API",
capabilities=("doi_lookup", "title_search", "datasets"),
strengths="Important for datasets, reports, and non-traditional DOI-bearing objects.",
caveats="Less useful than Crossref/OpenAlex for mainstream article citation graphs.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="dblp",
label="DBLP",
category="metadata",
access="open API",
capabilities=("key_lookup", "search", "computer_science"),
strengths="Excellent computer-science coverage and clean bibliographic records.",
caveats="Discipline-specific rather than general-purpose.",
current_status="integrated",
priority="selective",
),
SourceCatalogEntry(
key="arxiv",
label="arXiv",
category="metadata+fulltext",
access="open API",
capabilities=("id_lookup", "search", "preprints"),
strengths="Useful for preprint-first fields and free full-text links.",
caveats="Not a general citation graph source.",
current_status="integrated",
priority="selective",
),
SourceCatalogEntry(
key="open_citations",
label="OpenCitations",
category="graph",
access="open API",
capabilities=("doi_citations", "doi_references", "provenance"),
strengths="Directly aligned with open citation-edge expansion.",
caveats="Coverage is narrower than OpenAlex and needs merge discipline.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="semantic_scholar",
label="Semantic Scholar",
category="metadata+graph",
access="free API with limits",
capabilities=("work_lookup", "search", "citations", "references"),
strengths="Strong graph and relevance signals, especially for discovery workflows.",
caveats="Not fully open data in the same sense as Crossref/OpenAlex/OpenCitations.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="unpaywall",
label="Unpaywall",
category="access-links",
access="open API",
capabilities=("doi_fulltext_links", "oa_status"),
strengths="Best open source for landing-page and OA-link enrichment.",
caveats="Improves access, not bibliographic identity or graph completeness.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="europe_pmc",
label="Europe PMC",
category="metadata+fulltext",
access="open API",
capabilities=("search", "citations", "fulltext_links", "biomedical"),
strengths="Valuable biomedical complement to PubMed with richer open-access linkage.",
caveats="Domain-specific and partially overlapping with PubMed/OpenAlex.",
current_status="integrated",
priority="now",
),
SourceCatalogEntry(
key="open_library",
label="Open Library",
category="metadata",
access="open API",
capabilities=("title_search", "book_metadata", "author_catalog", "isbn_hints"),
strengths="Useful general-purpose open catalog coverage for books, monographs, and reference works.",
caveats="Catalog metadata is not a citation graph and can be noisy for article-like titles.",
current_status="integrated",
priority="selective",
),
SourceCatalogEntry(
key="openaire",
label="OpenAIRE",
category="metadata+repository",
access="open API",
capabilities=("repository_metadata", "oa_links", "project_links"),
strengths="Good for repository, project, and European OA discovery.",
caveats="May be better treated as a corpus-acquisition adapter than a first-line resolver.",
current_status="planned",
priority="evaluate",
),
SourceCatalogEntry(
key="oai_pmh",
label="OAI-PMH Repositories",
category="repository",
access="open protocol",
capabilities=("repository_harvest", "set_discovery", "metadata_formats"),
strengths="Already useful for theses, dissertations, and institutional repositories.",
caveats="Heterogeneous metadata quality; not a single canonical bibliographic source.",
current_status="integrated",
priority="selective",
),
)
def list_source_catalog() -> list[SourceCatalogEntry]:
    """Return a fresh list copy of the full source catalog."""
    return list(_CATALOG)
def prioritized_source_keys() -> list[str]:
    """Catalog keys ordered by rollout priority, then alphabetically by label.

    Unknown priority strings sort after all known ones instead of raising
    (the previous direct ``order[...]`` indexing raised KeyError as soon as
    a catalog entry used a new priority value).
    """
    order = {"now": 0, "next": 1, "selective": 2, "evaluate": 3}
    return [
        entry.key
        for entry in sorted(
            _CATALOG,
            key=lambda entry: (order.get(entry.priority, len(order)), entry.label.lower()),
        )
    ]

View File

@ -0,0 +1,210 @@
"""
CrossRef source plugin.
CrossRef provides metadata for DOIs for scholarly works.
"""
from __future__ import annotations
import json
import urllib.request
import urllib.parse
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources.base import BibliographicSource
class CrossRefSource(BibliographicSource):
"""CrossRef source for DOI-based metadata lookup."""
BASE_URL = "https://api.crossref.org"
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""Initialize CrossRef source.
Args:
config: Configuration with optional 'api_key'
"""
super().__init__(config)
self.api_key = self.config.get('api_key', '')
self.user_agent = self.config.get(
'user_agent',
'citegeist/0.1 (local research tool)',
)
def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
"""Look up a work by DOI.
Args:
doi: Digital Object Identifier
Returns:
BibEntry if found, None otherwise
"""
if not doi:
return None
encoded = urllib.parse.quote(doi, safe="")
url = f"{self.BASE_URL}/works/{encoded}"
headers = {'User-Agent': self.user_agent}
if self.api_key:
headers['X-Api-Key'] = self.api_key
try:
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
payload = json.loads(data)
return self._normalize_crossref(payload)
except Exception:
return None
def lookup_by_title(self, title: str) -> Optional[BibEntry]:
"""CrossRef doesn't support title-only lookup.
Returns None as this is not a supported operation.
"""
return None
def search(self, query: str, limit: int = 10) -> List[BibEntry]:
"""Search CrossRef for works.
Args:
query: Search query string
limit: Maximum number of results
Returns:
List of matching BibEntry objects
"""
if not query:
return []
encoded_query = urllib.parse.quote(query, safe="")
url = f"{self.BASE_URL}/works?query={encoded_query}&rows={limit}"
headers = {'User-Agent': self.user_agent}
if self.api_key:
headers['X-Api-Key'] = self.api_key
try:
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
payload = json.loads(data)
items = payload.get('message', {}).get('items', [])
return [entry for item in items if (entry := self._normalize_crossref(item)) is not None]
except Exception:
return []
def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
"""Normalize a raw CrossRef record to a BibEntry.
Args:
record: Raw record from CrossRef API
Returns:
BibEntry if normalization succeeds
"""
return self._normalize_crossref(record)
def get_identifier_scheme(self) -> str:
"""Return 'doi' as the identifier scheme."""
return 'doi'
def _normalize_crossref(self, payload: Dict[str, Any]) -> Optional[BibEntry]:
"""Normalize a CrossRef payload to a BibEntry.
Args:
payload: Raw JSON payload from CrossRef
Returns:
BibEntry object
"""
message = payload.get('message', payload)
if not message:
return None
# Extract basic fields
doi = str(message.get('DOI', ''))
title = ' '.join(message.get('title', [])) if message.get('title') else ''
author_data = message.get('author', [])
year = self._extract_year(message)
# Format authors
authors = []
for author in author_data:
given = str(author.get('given', ''))
family = str(author.get('family', ''))
if given and family:
authors.append(f"{given} {family}")
elif family:
authors.append(family)
# Get publisher
publisher = str(message.get('publisher', ''))
# Get journal info
container_title = message.get('container-title', [])
journal = container_title[0] if container_title else ''
# Get URL
url = str(message.get('URL', ''))
# Get abstract
abstract = self._extract_abstract(message.get('abstract'))
# Map to BibEntry
fields: Dict[str, str] = {}
if title:
fields['title'] = title
if authors:
fields['author'] = ' and '.join(authors)
if year:
fields['year'] = year
if doi:
fields['doi'] = doi
if journal:
fields['journal'] = journal
if publisher:
fields['publisher'] = publisher
if url:
fields['url'] = url
if abstract:
fields['abstract'] = abstract
citation_key = f"{authors[0] if authors else 'crossref'}_{year or 'n.d.'}_{title or doi}"
return BibEntry(
entry_type='article',
citation_key=citation_key,
fields=fields
)
def _extract_year(self, message: Dict[str, Any]) -> str:
    """Return the first publication year found, checking date fields in priority order."""
    date_fields = ('published-print', 'published-online', 'issued', 'created')
    candidates = (
        self._extract_year_from_date_parts(message.get(name, {}))
        for name in date_fields
    )
    return next((year for year in candidates if year), '')
def _extract_year_from_date_parts(self, field: Dict[str, Any]) -> str:
date_parts = field.get('date-parts', [])
if not date_parts:
return ''
first_part = date_parts[0]
if not first_part:
return ''
year = first_part[0]
return str(year) if year else ''
def _extract_abstract(self, raw_abstract: Any) -> str:
if isinstance(raw_abstract, str):
return raw_abstract.strip()
if isinstance(raw_abstract, list):
for item in raw_abstract:
if isinstance(item, dict):
text = str(item.get('value', '')).strip()
if text:
return text
elif isinstance(item, str) and item.strip():
return item.strip()
return ''

View File

@ -0,0 +1,157 @@
"""Europe PMC source plugin."""
from __future__ import annotations
import urllib.parse
from typing import Any, Dict, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources._old_sources_compat import SourceClient
from citegeist.sources.base import BibliographicSource
class EuropePmcSource(BibliographicSource):
    """Europe PMC source for biomedical metadata and OA/fulltext links."""

    # REST search endpoint; all queries go through this single module.
    BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent`` and an
                injectable ``source_client`` (useful for testing).
        """
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a single record by DOI.

        Args:
            doi: DOI string; surrounding whitespace is ignored.

        Returns:
            Normalized BibEntry, or None when the DOI is blank or unmatched.
        """
        normalized = doi.strip()
        if not normalized:
            return None
        query = f'DOI:"{normalized}"'
        row = self._search_one(query)
        return self.normalize(row) if row else None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Look up a single record by quoted title phrase.

        Args:
            title: Title text; internal whitespace is collapsed.

        Returns:
            Normalized BibEntry for the first hit, or None.
        """
        query_text = " ".join(title.split())
        if not query_text:
            return None
        query = f'TITLE:"{query_text}"'
        row = self._search_one(query)
        return self.normalize(row) if row else None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Search Europe PMC by title phrase.

        Args:
            query: Free-text query; collapsed and quoted into a TITLE search.
            limit: Maximum number of results requested (floored at 1).

        Returns:
            Normalized entries for every result row that normalizes cleanly.
        """
        query_text = " ".join(query.split())
        if not query_text:
            return []
        payload = self._search_payload(f'TITLE:"{query_text}"', max(1, limit))
        results = payload.get("resultList", {}).get("result", []) if payload else []
        return [entry for row in results if (entry := self.normalize(row)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw Europe PMC result row into a BibEntry.

        Args:
            record: A single ``resultList.result`` row (resultType=core).

        Returns:
            BibEntry, or None when the row has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None
        doi = str(record.get("doi") or "").strip()
        # For MED-sourced rows the Europe PMC "id" IS the PMID, so fall back to it.
        pmid = str(record.get("pmid") or record.get("id") or "").strip() if str(record.get("source") or "") == "MED" else str(record.get("pmid") or "").strip()
        pmcid = str(record.get("pmcid") or "").strip()
        year = str(record.get("pubYear") or "").strip()
        author_text = self._normalize_author_string(str(record.get("authorString") or "").strip())
        journal_title = str(record.get("journalTitle") or "").strip()
        abstract = str(record.get("abstractText") or "").strip()
        # Only non-empty values are emitted as fields.
        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if pmid:
            fields["pmid"] = pmid
        if pmcid:
            fields["pmcid"] = pmcid
        if year:
            fields["year"] = year
        if author_text:
            fields["author"] = author_text
        if journal_title:
            fields["journal"] = journal_title
        if volume := str(record.get("journalVolume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("pageInfo") or "").strip():
            fields["pages"] = pages
        if abstract:
            fields["abstract"] = abstract
        # Prefer a fulltext link; otherwise fall back to the article landing page.
        if fulltext_url := self._fulltext_url(record):
            fields["url"] = fulltext_url
        elif article_url := self._article_url(record):
            fields["url"] = article_url
        # Europe PMC encodes open-access status as "Y"/"N".
        if str(record.get("isOpenAccess") or "").strip():
            fields["is_oa"] = "true" if str(record.get("isOpenAccess")).upper() == "Y" else "false"
        if cited_by := str(record.get("citedByCount") or "").strip():
            fields["europepmc_cited_by_count"] = cited_by
        if source := str(record.get("source") or "").strip():
            fields["europepmc_source"] = source
        citation_key = self._citation_key(doi, pmid, author_text, year, title)
        return BibEntry(entry_type="article", citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the best fulltext (or landing-page) URL for a DOI, if any.

        Args:
            doi: DOI string; surrounding whitespace is ignored.

        Returns:
            URL string, or None when the DOI is blank or unmatched.
        """
        normalized = doi.strip()
        if not normalized:
            return None
        payload = self._search_payload(f'DOI:"{normalized}"', 1)
        results = payload.get("resultList", {}).get("result", []) if payload else []
        if not results:
            return None
        return self._fulltext_url(results[0]) or self._article_url(results[0])

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the primary identifier scheme."""
        return "doi"

    def _search_one(self, query: str) -> Dict[str, Any] | None:
        """Run a search and return only the first result row, if any."""
        payload = self._search_payload(query, 1)
        results = payload.get("resultList", {}).get("result", []) if payload else []
        return results[0] if results else None

    def _search_payload(self, query: str, page_size: int) -> Dict[str, Any] | None:
        """Fetch the raw JSON search payload (resultType=core) for a query."""
        params = {
            "query": query,
            "format": "json",
            "resultType": "core",
            "pageSize": max(1, page_size),
        }
        return self.source_client.try_get_json(f"{self.BASE_URL}?{urllib.parse.urlencode(params)}")

    def _fulltext_url(self, record: Dict[str, Any]) -> str:
        """Return the first fulltext URL in the record, or '' when none exist."""
        candidates = record.get("fullTextUrlList", {})
        if isinstance(candidates, dict):
            urls = candidates.get("fullTextUrl", [])
            # A single entry may arrive as a bare dict rather than a list.
            if isinstance(urls, dict):
                urls = [urls]
            if isinstance(urls, list):
                for item in urls:
                    if not isinstance(item, dict):
                        continue
                    url = str(item.get("url") or "").strip()
                    if url:
                        return url
        return ""

    def _article_url(self, record: Dict[str, Any]) -> str:
        """Build the Europe PMC landing-page URL from source + id, or ''."""
        source = str(record.get("source") or "").strip()
        identifier = str(record.get("id") or "").strip()
        if source and identifier:
            return f"https://europepmc.org/article/{source}/{identifier}"
        return ""

    def _normalize_author_string(self, value: str) -> str:
        """Convert Europe PMC's comma-joined author string to BibTeX 'and' form.

        Trailing periods on initials are stripped (e.g. "Doe J." -> "Doe J").
        """
        if not value:
            return ""
        authors = [part.strip().rstrip(".") for part in value.split(",") if part.strip()]
        return " and ".join(authors)

    def _citation_key(self, doi: str, pmid: str, author_text: str, year: str, title: str) -> str:
        """Build an alphanumeric citation key, preferring DOI, then PMID."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if pmid:
            return f"pmid{pmid}"
        # Fall back to first author's family name + year + first title word.
        family = author_text.split(" and ")[0].split()[-1] if author_text else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

View File

@ -0,0 +1,178 @@
"""OpenCitations source plugin."""
from __future__ import annotations
import urllib.parse
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources.base import BibliographicSource, CitationEdge
from citegeist.sources._old_sources_compat import SourceClient
class OpenCitationsSource(BibliographicSource):
    """OpenCitations source for DOI metadata and citation edges."""

    # Index API serves citation edges; Meta API serves bibliographic metadata.
    INDEX_BASE_URL = "https://api.opencitations.net/index/v2"
    META_BASE_URL = "https://api.opencitations.net/meta/v1"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent`` and an
                injectable ``source_client`` (useful for testing).
        """
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up metadata for a DOI via the OpenCitations Meta API.

        Args:
            doi: DOI string, with or without a 'doi:' prefix.

        Returns:
            Normalized BibEntry, or None when the DOI is blank or unmatched.
        """
        normalized = self._normalize_doi_pid(doi)
        if not normalized:
            return None
        rows = self.source_client.try_get_json(f"{self.META_BASE_URL}/metadata/{normalized}")
        if not rows:
            return None
        return self.normalize(rows[0])

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Title lookup is unsupported by OpenCitations; always returns None."""
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Free-text search is unsupported by OpenCitations; always returns []."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize an OpenCitations Meta row into a BibEntry.

        Args:
            record: A single metadata row from the Meta API.

        Returns:
            BibEntry, or None when the row lacks identifiers or a title.
        """
        ids = str(record.get("id") or "")
        title = str(record.get("title") or "").strip()
        if not ids or not title:
            return None
        doi = self._extract_id_value(ids, "doi")
        openalex = self._extract_id_value(ids, "openalex")
        year = self._extract_year(str(record.get("pub_date") or ""))
        authors = self._normalize_author_field(str(record.get("author") or ""))
        venue, venue_ids = self._parse_venue_field(str(record.get("venue") or ""))
        entry_type = self._map_entry_type(str(record.get("type") or ""))
        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
            fields["url"] = f"https://doi.org/{doi}"
        if openalex:
            fields["openalex"] = openalex
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if venue:
            # Articles carry the venue as 'journal'; everything else as 'booktitle'.
            if entry_type == "article":
                fields["journal"] = venue
            else:
                fields["booktitle"] = venue
        if volume := str(record.get("volume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("page") or "").strip():
            fields["pages"] = pages
        if publisher := self._strip_bracketed_ids(str(record.get("publisher") or "")):
            fields["publisher"] = publisher
        if venue_ids:
            fields["note"] = f"opencitations_venue_ids = {{{venue_ids}}}"
        citation_key = self._citation_key(doi, openalex, authors, year, title)
        return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)

    def get_citations(self, work_id: str, relation_type: str = "cites", limit: int = 10) -> List[CitationEdge]:
        """Fetch citation edges for a work from the OpenCitations Index.

        Args:
            work_id: DOI (with or without 'doi:' prefix) identifying the work.
            relation_type: 'cites' fetches the work's references; anything else
                fetches incoming citations.
            limit: Maximum number of edges to return.

        Returns:
            List of CitationEdge objects, each oriented as citing -> cited.
        """
        normalized = self._normalize_doi_pid(work_id)
        if not normalized:
            return []
        path = "references" if relation_type == "cites" else "citations"
        rows = self.source_client.try_get_json(f"{self.INDEX_BASE_URL}/{path}/{normalized}")
        if not rows:
            return []
        edges: List[CitationEdge] = []
        for row in rows[:limit]:
            citing = self._extract_id_value(str(row.get("citing") or ""), "doi")
            cited = self._extract_id_value(str(row.get("cited") or ""), "doi")
            if not citing or not cited:
                continue
            # Both Index endpoints already orient each row as a citing/cited
            # pair for a "cites" relation, so no swap is needed for either
            # direction. (An earlier version branched on relation_type with
            # two identical assignments; that dead branch has been removed.)
            edges.append(
                CitationEdge(
                    source_work_id=f"doi:{citing}",
                    target_work_id=f"doi:{cited}",
                    relation_type="cites",
                    source_type="opencitations",
                    source_label=f"opencitations:{path}:{normalized}",
                    confidence=0.85,
                )
            )
        return edges

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the primary identifier scheme."""
        return "doi"

    def _normalize_doi_pid(self, value: str) -> str:
        """Normalize a DOI into the 'doi:<value>' PID form the APIs expect."""
        doi = value.strip()
        if not doi:
            return ""
        if doi.lower().startswith("doi:"):
            doi = doi[4:]
        return f"doi:{doi}"

    def _extract_id_value(self, identifiers: str, scheme: str) -> str:
        """Extract the value for a scheme from a space-separated PID string."""
        prefix = f"{scheme}:"
        for token in identifiers.split():
            if token.startswith(prefix):
                return token[len(prefix):]
        return ""

    def _extract_year(self, pub_date: str) -> str:
        """Return the leading 4-digit year of a pub_date string, or ''."""
        pub_date = pub_date.strip()
        if len(pub_date) >= 4 and pub_date[:4].isdigit():
            return pub_date[:4]
        return ""

    def _normalize_author_field(self, raw_authors: str) -> str:
        """Convert a semicolon-joined author field (with bracketed IDs) to BibTeX 'and' form."""
        authors: List[str] = []
        for part in raw_authors.split(";"):
            cleaned = self._strip_bracketed_ids(part)
            cleaned = " ".join(cleaned.split())
            if cleaned:
                authors.append(cleaned)
        return " and ".join(authors)

    def _parse_venue_field(self, raw_venue: str) -> tuple[str, str]:
        """Split a venue field into (title, bracketed identifier string)."""
        raw_venue = raw_venue.strip()
        if not raw_venue:
            return "", ""
        if "[" not in raw_venue:
            return raw_venue, ""
        title, _, remainder = raw_venue.partition("[")
        return title.strip(), remainder.rstrip("] ").strip()

    def _strip_bracketed_ids(self, value: str) -> str:
        """Drop a trailing '[...]' identifier block from a field value."""
        return value.split("[", 1)[0].strip()

    def _map_entry_type(self, raw_type: str) -> str:
        """Map an OpenCitations type label to a BibTeX entry type."""
        lowered = raw_type.casefold()
        if lowered == "journal article":
            return "article"
        if lowered == "book":
            return "book"
        if lowered == "book chapter":
            return "incollection"
        if lowered in {"proceedings article", "conference paper"}:
            return "inproceedings"
        if "thesis" in lowered or "dissertation" in lowered:
            return "phdthesis"
        return "misc"

    def _citation_key(self, doi: str, openalex: str, authors: str, year: str, title: str) -> str:
        """Build an alphanumeric citation key, preferring DOI, then OpenAlex ID."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if openalex:
            return "openalex" + "".join(ch for ch in openalex.lower() if ch.isalnum())
        family = authors.split(" and ")[0].split(",")[0].split()[-1] if authors else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

View File

@ -0,0 +1,100 @@
"""Open Library source plugin."""
from __future__ import annotations
import urllib.parse
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources.base import BibliographicSource
from citegeist.sources._old_sources_compat import SourceClient
class OpenLibrarySource(BibliographicSource):
    """Open Library source for broad book and monograph metadata."""

    SEARCH_URL = "https://openlibrary.org/search.json"
    WORK_URL = "https://openlibrary.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source; honors 'user_agent' and injectable 'source_client' config keys."""
        super().__init__(config)
        agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Open Library has no DOI lookup; always returns None."""
        return None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best title match, if any."""
        matches = self.search(title, limit=1)
        return matches[0] if matches else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search Open Library by title and normalize the matching docs."""
        cleaned = " ".join(query.split())
        if not cleaned:
            return []
        encoded = urllib.parse.urlencode({"title": cleaned, "limit": max(1, limit), "fields": "*"})
        payload = self.source_client.try_get_json(f"{self.SEARCH_URL}?{encoded}")
        docs = payload.get("docs", []) if payload else []
        if not isinstance(docs, list):
            return []
        entries: List[BibEntry] = []
        for doc in docs:
            if not isinstance(doc, dict):
                continue
            entry = self.normalize(doc)
            if entry is not None:
                entries.append(entry)
        return entries

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize an Open Library search doc into a book BibEntry, or None without a title."""
        title = str(record.get("title") or "").strip()
        if not title:
            return None
        author_text = self._join_list(record.get("author_name"))
        year_text = self._extract_year(record)
        publisher_text = self._join_list(record.get("publisher"))
        work_key = str(record.get("key") or "").strip()
        fields: Dict[str, str] = {"title": title}
        if author_text:
            fields["author"] = author_text
        if year_text:
            fields["year"] = year_text
        if publisher_text:
            fields["publisher"] = publisher_text
        if work_key:
            fields["openlibrary_work"] = work_key
            fields["url"] = f"{self.WORK_URL}{work_key}"
        editions = record.get("edition_key") or []
        if isinstance(editions, list) and editions:
            fields["openlibrary_edition"] = str(editions[0])
        isbns = record.get("isbn") or []
        if isinstance(isbns, list) and isbns:
            fields["isbn"] = str(isbns[0])
        return BibEntry(
            entry_type="book",
            citation_key=self._citation_key(work_key, author_text, year_text, title),
            fields=fields,
        )

    def get_identifier_scheme(self) -> str:
        """Return 'openlibrary' as the identifier scheme."""
        return "openlibrary"

    def _extract_year(self, record: Dict[str, Any]) -> str:
        """Prefer first_publish_year; otherwise the first entry of publish_year."""
        first = record.get("first_publish_year")
        if first:
            return str(first)
        years = record.get("publish_year")
        return str(years[0]) if isinstance(years, list) and years else ""

    def _join_list(self, value: Any) -> str:
        """Join a list of values into BibTeX 'and' form, skipping blanks."""
        if not isinstance(value, list):
            return ""
        return " and ".join(text for item in value if (text := str(item).strip()))

    def _citation_key(self, work_key: str, authors: str, year: str, title: str) -> str:
        """Build an alphanumeric key, preferring the Open Library work key."""
        if work_key:
            sanitized = "".join(ch for ch in work_key.lower() if ch.isalnum())
            return f"ol{sanitized}"
        family = authors.split(" and ")[0].split()[-1] if authors else "book"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "book"
        words = title.split()
        lead = "".join(ch for ch in (words[0] if words else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{lead or 'untitled'}"

View File

@ -0,0 +1,253 @@
"""
Source registry for managing bibliographic source plugins.
This module provides a registry that can discover, load, and manage
multiple bibliographic source plugins.
"""
from __future__ import annotations
import importlib.util
import inspect
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Type
from citegeist.sources.base import BibliographicSource
@dataclass(slots=True)
class SourceRegistration:
    """Registration information for a source plugin."""

    # Name under which the source was registered.
    name: str
    # Class implementing BibliographicSource; instantiated lazily by the registry.
    source_class: Type[BibliographicSource]
    # Configuration dict handed to the source constructor.
    config: Dict[str, Any]
    # Whether the registry may instantiate this source.
    enabled: bool
class SourceRegistry:
    """Registry for bibliographic source plugins.

    This class manages the discovery, registration, and instantiation
    of bibliographic source plugins. Instances are created lazily and
    cached per registered name.
    """

    def __init__(self) -> None:
        """Initialize the source registry."""
        # name -> registration metadata
        self._registrations: Dict[str, SourceRegistration] = {}
        # name -> lazily created, cached source instance
        self._instances: Dict[str, BibliographicSource] = {}

    def register(
        self,
        source_class: Type[BibliographicSource],
        name: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Register a source class.

        Re-registering a name replaces the previous registration (but note
        that an already-cached instance is not evicted).

        Args:
            source_class: The source class to register (must inherit from BibliographicSource)
            name: Optional name for the source (uses class name if not provided)
            config: Optional configuration dictionary; its 'enabled' key
                (default True) controls whether the source may be instantiated

        Raises:
            ValueError: If source_class is not a BibliographicSource subclass
        """
        if not inspect.isclass(source_class) or not issubclass(source_class, BibliographicSource):
            raise ValueError(f"{source_class} must be a subclass of BibliographicSource")
        source_name = name or source_class.__name__
        self._registrations[source_name] = SourceRegistration(
            name=source_name,
            source_class=source_class,
            config=config or {},
            enabled=config.get('enabled', True) if config else True
        )

    def get(self, name: str) -> Optional[BibliographicSource]:
        """Get a source instance by name.

        Args:
            name: Name of the source

        Returns:
            Source instance if registered and enabled, None otherwise
        """
        if name not in self._registrations:
            return None
        registration = self._registrations[name]
        # Return cached instance if available.
        # NOTE(review): the cache is consulted before the enabled flag, so an
        # instance created while enabled is still returned after a re-register
        # with enabled=False — confirm this is intended.
        if name in self._instances:
            return self._instances[name]
        # Create new instance
        if not registration.enabled:
            return None
        instance = registration.source_class(config=registration.config)
        self._instances[name] = instance
        return instance

    def list_sources(self, enabled_only: bool = False) -> List[str]:
        """List registered source names.

        Args:
            enabled_only: Only return enabled sources

        Returns:
            List of source names
        """
        sources = list(self._registrations.keys())
        if enabled_only:
            return [name for name, reg in self._registrations.items() if reg.enabled]
        return sources

    def get_config(self, name: str) -> Optional[Dict[str, Any]]:
        """Get configuration for a source.

        Args:
            name: Name of the source

        Returns:
            Configuration dictionary, or None if not found
        """
        registration = self._registrations.get(name)
        return registration.config if registration else None

    def load_from_file(self, filepath: str) -> None:
        """Load source plugins from a Python file.

        Every BibliographicSource subclass found in the module is registered
        under its class name with an empty configuration.

        Args:
            filepath: Path to Python file containing source classes

        Raises:
            ImportError: If the file cannot be loaded as a module
        """
        spec = importlib.util.spec_from_file_location("module.sources", filepath)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load module from {filepath}")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        # Find all classes that inherit from BibliographicSource
        # (excluding the base class itself, which plugin modules import).
        for name, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, BibliographicSource) and obj is not BibliographicSource:
                self.register(obj)

    def load_from_directory(self, directory: str) -> None:
        """Load source plugins from a directory.

        Only top-level '*.py' files are considered; underscore-prefixed
        files (e.g. __init__.py, private helpers) are skipped.

        Args:
            directory: Path to directory containing source plugin files
        """
        import os
        for filename in os.listdir(directory):
            if filename.endswith('.py') and not filename.startswith('_'):
                filepath = os.path.join(directory, filename)
                self.load_from_file(filepath)

    def from_config_dict(self, config: Dict[str, Any]) -> None:
        """Load sources from a configuration dictionary.

        Example config format:
            {
                "sources": {
                    "crossref": {
                        "source_type": "crossref",
                        "enabled": true
                    },
                    "semantic_scholar": {
                        "source_type": "semantic_scholar",
                        "enabled": true,
                        "api_key": "..."
                    }
                }
            }

        Args:
            config: Configuration dictionary; ignored when it has no 'sources' key

        Raises:
            ValueError: If an entry's source_type is not a known source
        """
        if 'sources' not in config:
            return
        for name, source_config in config['sources'].items():
            source_name = str(name)
            # The entry name doubles as the source type when none is given.
            source_type = str(source_config.get('source_type', source_name))
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=source_config
            )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize registry to dictionary.

        Returns:
            Dictionary representation of registry
        """
        return {
            name: {
                'enabled': reg.enabled,
                'config': reg.config
            }
            for name, reg in self._registrations.items()
        }

    def from_dict(self, data: Dict[str, Any]) -> None:
        """Load registry from dictionary.

        Args:
            data: Dictionary representation of registry; each entry may carry
                'source_type' (defaults to the entry name) and 'config'
                (defaults to the entry itself)
        """
        for name, source_data in data.items():
            source_name = str(name)
            source_type = str(source_data.get('source_type', source_name))
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=source_data.get('config', source_data)
            )

    def get_registered_sources(self) -> List[SourceRegistration]:
        """Get all registered source registrations.

        Returns:
            List of SourceRegistration objects
        """
        return list(self._registrations.values())

    def _resolve_source_class(self, source_type: str) -> Type[BibliographicSource]:
        """Map a source-type string to its plugin class.

        Imports are deferred to avoid loading every plugin module up front.

        Raises:
            ValueError: If the source type is not recognized
        """
        normalized = source_type.strip().lower().replace('-', '_')
        if normalized in {'crossref', 'cross_ref'}:
            from citegeist.sources.crossref import CrossRefSource
            return CrossRefSource
        if normalized in {'opencitations', 'open_citations'}:
            from citegeist.sources.opencitations import OpenCitationsSource
            return OpenCitationsSource
        if normalized == 'unpaywall':
            from citegeist.sources.unpaywall import UnpaywallSource
            return UnpaywallSource
        if normalized in {'europepmc', 'europe_pmc'}:
            from citegeist.sources.europepmc import EuropePmcSource
            return EuropePmcSource
        if normalized in {'semanticscholar', 'semantic_scholar'}:
            from citegeist.sources.semanticscholar import SemanticScholarSource
            return SemanticScholarSource
        if normalized in {"openlibrary", "open_library"}:
            from citegeist.sources.openlibrary import OpenLibrarySource
            return OpenLibrarySource
        raise ValueError(f"Unknown source type: {source_type}")
# Global registry instance, created at import time and shared process-wide.
_global_registry = SourceRegistry()


def get_registry() -> SourceRegistry:
    """Get the global source registry instance.

    Returns:
        The global SourceRegistry instance
    """
    return _global_registry

View File

@ -0,0 +1,140 @@
"""Semantic Scholar source plugin."""
from __future__ import annotations
import json
import os
import urllib.parse
import urllib.request
from typing import Any, Dict, List, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources.base import BibliographicSource
class SemanticScholarSource(BibliographicSource):
    """Semantic Scholar source for broad scientific metadata coverage."""

    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    # Fields requested from the Graph API on every call.
    DEFAULT_FIELDS = (
        "paperId,title,year,abstract,authors,externalIds,journal,venue,url,"
        "openAccessPdf,citationCount,publicationTypes"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``api_key`` (falling
                back to the SEMANTIC_SCHOLAR_API_KEY environment variable)
                and ``user_agent``.
        """
        super().__init__(config)
        self.api_key = str(
            self.config.get("api_key")
            or os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
            or ""
        ).strip()
        self.user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a paper by DOI via the Graph API.

        Args:
            doi: DOI string; surrounding whitespace is ignored.

        Returns:
            Normalized BibEntry, or None on blank DOI or request failure.
        """
        normalized = doi.strip()
        if not normalized:
            return None
        encoded = urllib.parse.quote(f"DOI:{normalized}", safe="")
        payload = self._get_json(f"{self.BASE_URL}/paper/{encoded}?fields={self.DEFAULT_FIELDS}")
        if not payload:
            return None
        return self.normalize(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best search match for a title, if any."""
        matches = self.search(title, limit=1)
        return matches[0] if matches else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search the Graph API for papers matching a free-text query.

        Args:
            query: Free-text query; internal whitespace is collapsed.
            limit: Maximum number of results requested (floored at 1).

        Returns:
            Normalized entries for every result row that normalizes cleanly.
        """
        query_text = " ".join(query.split())
        if not query_text:
            return []
        params = urllib.parse.urlencode(
            {"query": query_text, "limit": max(1, limit), "fields": self.DEFAULT_FIELDS}
        )
        payload = self._get_json(f"{self.BASE_URL}/paper/search?{params}")
        if not payload:
            return []
        return [entry for row in payload.get("data", []) if (entry := self.normalize(row)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a Semantic Scholar paper record into a BibEntry.

        Args:
            record: A paper object from the Graph API.

        Returns:
            BibEntry, or None when the record has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None
        external_ids = record.get("externalIds") or {}
        doi = str(external_ids.get("DOI") or "").strip()
        authors = " and ".join(
            str(author.get("name") or "").strip()
            for author in record.get("authors", [])
            if str(author.get("name") or "").strip()
        )
        year = str(record.get("year") or "").strip()
        abstract = str(record.get("abstract") or "").strip()
        journal = record.get("journal") or {}
        # Prefer the journal name; fall back to the venue string.
        journal_name = str(journal.get("name") or record.get("venue") or "").strip()
        open_access_pdf = record.get("openAccessPdf") or {}
        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if paper_id := str(record.get("paperId") or "").strip():
            fields["semanticscholar_id"] = paper_id
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if abstract:
            fields["abstract"] = abstract
        if journal_name:
            # Conference papers carry the venue as 'booktitle'.
            if self._entry_type(record) == "inproceedings":
                fields["booktitle"] = journal_name
            else:
                fields["journal"] = journal_name
        # Prefer the OA PDF link over the paper landing page.
        if url := str(open_access_pdf.get("url") or record.get("url") or "").strip():
            fields["url"] = url
        if open_access_pdf:
            fields["is_oa"] = "true"
        # NOTE(review): a falsy citationCount (0) is skipped here — confirm
        # that zero counts should be omitted rather than recorded as "0".
        if citation_count := record.get("citationCount"):
            fields["semanticscholar_citation_count"] = str(citation_count)
        citation_key = self._citation_key(doi, str(record.get("paperId") or ""), authors, year, title)
        return BibEntry(entry_type=self._entry_type(record), citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the URL recorded for a DOI's entry, if the lookup succeeds."""
        entry = self.lookup_by_doi(doi)
        if entry is None:
            return None
        return entry.fields.get("url")

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the primary identifier scheme."""
        return "doi"

    def _entry_type(self, record: Dict[str, Any]) -> str:
        """Derive a BibTeX entry type from publicationTypes / venue data."""
        publication_types = [str(item).lower() for item in (record.get("publicationTypes") or [])]
        if any("conference" in item for item in publication_types):
            return "inproceedings"
        if any("review" in item for item in publication_types):
            return "article"
        if record.get("journal") or record.get("venue"):
            return "article"
        return "misc"

    def _citation_key(self, doi: str, paper_id: str, authors: str, year: str, title: str) -> str:
        """Build an alphanumeric citation key, preferring DOI, then the S2 paper id."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if paper_id:
            return "s2" + "".join(ch for ch in paper_id.lower() if ch.isalnum())
        family = authors.split(" and ")[0].split()[-1] if authors else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

    def _get_json(self, url: str) -> Dict[str, Any] | None:
        """Fetch JSON from the API, attaching the API key header when configured.

        Any failure (network, HTTP error, bad JSON) deliberately degrades to
        None so callers treat it as 'no result'.
        """
        headers = {"User-Agent": self.user_agent}
        if self.api_key:
            headers["x-api-key"] = self.api_key
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read().decode("utf-8"))
        except Exception:
            return None

View File

@ -0,0 +1,116 @@
"""Unpaywall source plugin."""
from __future__ import annotations
import os
import urllib.parse
from typing import Any, Dict, Optional
from citegeist.bibtex import BibEntry
from citegeist.sources._old_sources_compat import SourceClient
from citegeist.sources.base import BibliographicSource
class UnpaywallSource(BibliographicSource):
    """Unpaywall source for DOI-based OA link enrichment."""

    BASE_URL = "https://api.unpaywall.org/v2"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent``, an
                injectable ``source_client``, and ``email`` (falling back to
                the UNPAYWALL_EMAIL or NCBI_EMAIL environment variables —
                Unpaywall requires an email on every request).
        """
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)
        self.email = str(
            self.config.get("email")
            or os.environ.get("UNPAYWALL_EMAIL")
            or os.environ.get("NCBI_EMAIL")
            or ""
        ).strip()

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up OA metadata for a DOI.

        Args:
            doi: DOI string; surrounding whitespace is ignored.

        Returns:
            Normalized BibEntry, or None when unavailable.
        """
        payload = self.lookup_oa_record(doi)
        if not payload:
            return None
        return self.normalize(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Unpaywall has no title lookup; always returns None."""
        return None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Unpaywall has no search endpoint; always returns []."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize an Unpaywall record into a BibEntry of OA fields.

        Args:
            record: Raw Unpaywall JSON record.

        Returns:
            BibEntry (entry_type 'misc'), or None when the record has no DOI.
        """
        doi = str(record.get("doi") or "").strip()
        # Synthesize a title from the DOI when the record carries none.
        title = str(record.get("title") or "").strip() or (doi and f"OA record for DOI {doi}")
        if not doi or not title:
            return None
        fields: Dict[str, str] = {
            "title": title,
            "doi": doi,
        }
        if year := str(record.get("year") or "").strip():
            fields["year"] = year
        if landing_url := self._best_landing_url(record):
            fields["url"] = landing_url
            fields["best_oa_url"] = landing_url
        if pdf_url := self._best_pdf_url(record):
            fields["best_oa_pdf_url"] = pdf_url
        if oa_status := str(record.get("oa_status") or "").strip():
            fields["oa_status"] = oa_status
        if license_name := self._best_license(record):
            fields["oa_license"] = license_name
        if host_type := self._best_host_type(record):
            fields["oa_host_type"] = host_type
        if version := self._best_version(record):
            fields["oa_version"] = version
        if evidence := self._best_evidence(record):
            fields["oa_evidence"] = evidence
        # is_oa is a tri-state: absent/None means unknown, so only emit when set.
        if record.get("is_oa") is not None:
            fields["is_oa"] = "true" if bool(record.get("is_oa")) else "false"
        citation_key = "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        return BibEntry(entry_type="misc", citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the best OA PDF URL for a DOI, falling back to the landing page."""
        payload = self.lookup_oa_record(doi)
        if not payload:
            return None
        return self._best_pdf_url(payload) or self._best_landing_url(payload)

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the identifier scheme."""
        return "doi"

    def is_available(self) -> bool:
        """Usable only when enabled and a contact email is configured."""
        return self.enabled and bool(self.email)

    def lookup_oa_record(self, doi: str) -> Dict[str, Any] | None:
        """Fetch the raw Unpaywall record for a DOI, or None when not possible."""
        normalized = doi.strip()
        if not normalized or not self.email:
            return None
        encoded = urllib.parse.quote(normalized, safe="")
        query = urllib.parse.urlencode({"email": self.email})
        return self.source_client.try_get_json(f"{self.BASE_URL}/{encoded}?{query}")

    def _best_location_field(self, payload: Dict[str, Any], key: str, fallback_key: str = "") -> str:
        """Read one field from best_oa_location, with an optional fallback key.

        Shared by the _best_* accessors below, which previously each
        duplicated this extraction. Returns '' when absent.
        """
        location = payload.get("best_oa_location") or {}
        value = location.get(key)
        if not value and fallback_key:
            value = location.get(fallback_key)
        return str(value or "").strip()

    def _best_landing_url(self, payload: Dict[str, Any]) -> str:
        """Landing-page URL of the best OA location, or ''."""
        return self._best_location_field(payload, "url", "url_for_landing_page")

    def _best_pdf_url(self, payload: Dict[str, Any]) -> str:
        """Direct PDF URL of the best OA location, or ''."""
        return self._best_location_field(payload, "url_for_pdf")

    def _best_license(self, payload: Dict[str, Any]) -> str:
        """License of the best OA location, or ''."""
        return self._best_location_field(payload, "license")

    def _best_host_type(self, payload: Dict[str, Any]) -> str:
        """Host type (e.g. publisher/repository) of the best OA location, or ''."""
        return self._best_location_field(payload, "host_type")

    def _best_version(self, payload: Dict[str, Any]) -> str:
        """Manuscript version of the best OA location, or ''."""
        return self._best_location_field(payload, "version")

    def _best_evidence(self, payload: Dict[str, Any]) -> str:
        """Evidence string of the best OA location, or ''."""
        return self._best_location_field(payload, "evidence")

View File

@ -138,6 +138,7 @@ class TalkOriginsEnrichmentResult:
applied: bool applied: bool
source_label: str = "" source_label: str = ""
weak_reasons_after: list[str] | None = None weak_reasons_after: list[str] | None = None
resolution_attempts: list[dict[str, object]] | None = None
conflicts: list[dict[str, str]] | None = None conflicts: list[dict[str, str]] | None = None
error: str = "" error: str = ""
@ -545,8 +546,28 @@ class TalkOriginsScraper:
if not weak_reasons_before: if not weak_reasons_before:
continue continue
resolution = None resolution = None
attempts: list[dict[str, object]] = []
error = "" error = ""
try: try:
resolver_with_trace = getattr(self.resolver, "resolve_entry_with_trace", None)
resolver_plain = getattr(self.resolver, "resolve_entry", None)
plain_func = getattr(resolver_plain, "__func__", None)
trace_func = getattr(resolver_with_trace, "__func__", None)
use_trace = (
resolver_with_trace is not None
and (
trace_func is None
or (
plain_func is MetadataResolver.resolve_entry
and trace_func is MetadataResolver.resolve_entry_with_trace
)
)
)
if use_trace:
outcome = self.resolver.resolve_entry_with_trace(canonical)
resolution = outcome.resolution
attempts = [asdict(attempt) for attempt in outcome.attempts]
else:
resolution = self.resolver.resolve_entry(canonical) resolution = self.resolver.resolve_entry(canonical)
except Exception as exc: except Exception as exc:
error = str(exc) error = str(exc)
@ -559,6 +580,7 @@ class TalkOriginsScraper:
applied=False, applied=False,
source_label=resolution.source_label if resolution is not None else "", source_label=resolution.source_label if resolution is not None else "",
error=error, error=error,
resolution_attempts=attempts,
) )
if resolution is not None: if resolution is not None:

123
tests/test_europepmc.py Normal file
View File

@ -0,0 +1,123 @@
from __future__ import annotations
from citegeist.resolve import MetadataResolver
from citegeist.sources import EuropePmcSource, SourceRegistry, list_source_catalog
def test_europepmc_source_normalizes_core_record() -> None:
    """A raw Europe PMC MED record maps onto the expected BibTeX-style fields."""
    raw_record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "pmcid": "PMC10000001",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
        "journalVolume": "16",
        "issue": "1",
        "pageInfo": "10-20",
        "abstractText": "Abstract text.",
        "isOpenAccess": "Y",
        "citedByCount": 12,
        "fullTextUrlList": {"fullTextUrl": [{"url": "https://europepmc.org/articles/PMC10000001?pdf=render"}]},
    }
    normalized = EuropePmcSource(config={}).normalize(raw_record)
    assert normalized is not None
    expected = {
        "doi": "10.1000/example",
        "pmid": "37158217",
        "pmcid": "PMC10000001",
        "journal": "Biomed Journal",
        "url": "https://europepmc.org/articles/PMC10000001?pdf=render",
        "is_oa": "true",
    }
    for field, value in expected.items():
        assert normalized.fields[field] == value
def test_europepmc_registry_and_catalog() -> None:
    """The registry builds a EuropePmcSource and the catalog marks it integrated."""
    config = {"sources": {"europepmc": {"source_type": "europepmc", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("europepmc"), EuropePmcSource)
    by_key = {item.key: item for item in list_source_catalog()}
    europe_pmc = by_key["europe_pmc"]
    assert europe_pmc.current_status == "integrated"
    assert europe_pmc.priority == "now"
def test_metadata_resolver_uses_europepmc_doi_after_primary_lookups_fail() -> None:
    """Resolver falls back to the Europe PMC DOI lookup once the Crossref and
    DataCite DOI lookups both report "not found"."""
    resolver = MetadataResolver()
    # Stub the higher-priority DOI lookups so they miss.
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
    # Europe PMC lookup returns one normalized record for the requested DOI.
    resolver.europepmc.lookup_by_doi = lambda _doi: resolver.europepmc.normalize(  # type: ignore[method-assign]
        {
            "id": "37158217",
            "source": "MED",
            "pmid": "37158217",
            "doi": "10.1000/example",
            "title": "Biomedical Example",
            "authorString": "Doe J, Roe A",
            "journalTitle": "Biomed Journal",
            "pubYear": "2024",
        }
    )
    from citegeist.bibtex import BibEntry
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"doi": "10.1000/example", "title": "Biomedical Example"},
        )
    )
    assert result is not None
    # The winning label records both the provider and the DOI used.
    assert result.source_label == "europepmc:doi:10.1000/example"
    assert result.entry.fields["pmid"] == "37158217"
def test_metadata_resolver_uses_europepmc_title_search_after_pubmed() -> None:
    """Title search falls through the Crossref/DataCite/OpenAlex/PubMed stubs
    and lands on the Europe PMC search."""
    resolver = MetadataResolver()
    # Every higher-priority best-match search is stubbed to "no match".
    resolver.search_crossref_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_datacite_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_openalex_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_pubmed_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    # Europe PMC search yields exactly one normalized candidate.
    resolver.europepmc.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.europepmc.normalize(
            {
                "id": "37158217",
                "source": "MED",
                "pmid": "37158217",
                "doi": "10.1000/example",
                "title": "Biomedical Example",
                "authorString": "Doe J, Roe A",
                "journalTitle": "Biomed Journal",
                "pubYear": "2024",
            }
        )
    ]
    from citegeist.bibtex import BibEntry
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"title": "Biomedical Example", "author": "Doe J", "year": "2024"},
        )
    )
    assert result is not None
    # Label records that the Europe PMC title search produced the match.
    assert result.source_label == "europepmc:search:Biomedical Example"

137
tests/test_opencitations.py Normal file
View File

@ -0,0 +1,137 @@
from __future__ import annotations
from citegeist.expand import OpenCitationsExpander
from citegeist.sources import OpenCitationsSource
from citegeist.storage import BibliographyStore
def test_opencitations_source_normalizes_metadata_row() -> None:
    """An OpenCitations Meta row is split into clean bibliographic fields."""
    meta_row = {
        "id": "doi:10.1000/example openalex:W1234567890 omid:br/06123",
        "title": "Example Work",
        "author": "Doe, Jane [omid:ra/1]; Roe, Alex [omid:ra/2]",
        "pub_date": "2024-05",
        "venue": "Journal of Examples [issn:1234-5678]",
        "volume": "12",
        "issue": "3",
        "page": "10-20",
        "type": "journal article",
        "publisher": "Example Press [crossref:123]",
    }
    entry = OpenCitationsSource(config={}).normalize(meta_row)
    assert entry is not None
    fields = entry.fields
    # Identifiers are pulled out of the space-separated "id" column, and the
    # bracketed identifier annotations are stripped from author/venue/publisher.
    assert fields["doi"] == "10.1000/example"
    assert fields["openalex"] == "W1234567890"
    assert fields["author"] == "Doe, Jane and Roe, Alex"
    assert fields["journal"] == "Journal of Examples"
    assert fields["publisher"] == "Example Press"
    assert fields["year"] == "2024"
def test_opencitations_source_builds_edges_for_references() -> None:
    """COCI citation rows become edges keyed by doi:-scheme work ids."""
    source = OpenCitationsSource(config={})
    # Canned COCI response: one citing -> cited pair carrying omid + doi ids.
    source.source_client.get_json = lambda _url: [  # type: ignore[method-assign]
        {
            "oci": "1-2",
            "citing": "omid:br/1 doi:10.1000/source",
            "cited": "omid:br/2 doi:10.1000/target",
            "creation": "2024-01-01",
        }
    ]
    edges = source.get_citations("10.1000/source", relation_type="cites", limit=10)
    assert len(edges) == 1
    # Work ids are normalized to the doi: scheme (omid dropped).
    assert edges[0].source_work_id == "doi:10.1000/source"
    assert edges[0].target_work_id == "doi:10.1000/target"
def test_opencitations_expander_creates_reference_nodes_and_relations() -> None:
    """Expanding a seed entry along 'cites' creates the referenced work in the
    store and records a seed -> target relation."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/source}
            }
            """
        )
        expander = OpenCitationsExpander()
        # One fake client serves both endpoints: the COCI /references/ call
        # returns the outgoing edge; any other URL returns the Meta row for
        # the target work.
        expander.source.source_client.get_json = lambda url: [  # type: ignore[method-assign]
            {
                "oci": "1-2",
                "citing": "omid:br/1 doi:10.1000/source",
                "cited": "omid:br/2 doi:10.1000/target",
                "creation": "2024-01-01",
            }
        ] if "/references/" in url else [
            {
                "id": "doi:10.1000/target omid:br/2",
                "title": "Target Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2023",
                "venue": "Journal of Targets [issn:1111-1111]",
                "type": "journal article",
            }
        ]
        # DOI resolution is disabled so metadata comes from OpenCitations only.
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
        results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=10)
        # Discovered key is the slugified DOI of the target work.
        assert [item.discovered_citation_key for item in results] == ["doi101000target"]
        discovered = store.get_entry("doi101000target")
        assert discovered is not None
        assert discovered["title"] == "Target Work"
        assert store.get_relations("seed2024") == ["doi101000target"]
    finally:
        store.close()
def test_opencitations_expander_supports_cited_by_direction() -> None:
    """Expanding along 'cited_by' creates the citing work and records the
    relation in the reversed (citing -> seed) direction."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/seed}
            }
            """
        )
        expander = OpenCitationsExpander()
        # /citations/ returns the inbound edge; other URLs return the Meta row
        # for the citing work.
        expander.source.source_client.get_json = lambda url: [  # type: ignore[method-assign]
            {
                "oci": "2-1",
                "citing": "omid:br/2 doi:10.1000/citing",
                "cited": "omid:br/1 doi:10.1000/seed",
                "creation": "2024-01-01",
            }
        ] if "/citations/" in url else [
            {
                "id": "doi:10.1000/citing omid:br/2",
                "title": "Citing Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2025",
                "venue": "Journal of Citers [issn:1111-1111]",
                "type": "journal article",
            }
        ]
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
        results = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=10)
        assert [item.discovered_citation_key for item in results] == ["doi101000citing"]
        # The relation points from the citing work to the seed.
        assert store.get_relations("doi101000citing") == ["seed2024"]
    finally:
        store.close()

188
tests/test_openlibrary.py Normal file
View File

@ -0,0 +1,188 @@
from __future__ import annotations
from citegeist.bibtex import BibEntry
from citegeist.resolve import MetadataResolver
from citegeist.sources import OpenLibrarySource, SourceRegistry, list_source_catalog
class FakeSourceClient:
    """Minimal HTTP-client stand-in: answers every request with one canned payload."""

    def __init__(self, payload: dict[str, object]) -> None:
        # Kept by reference; a fresh shallow copy is handed out per request.
        self.payload = payload

    def try_get_json(self, _url: str) -> dict[str, object]:
        """Ignore the URL and return a shallow copy of the canned payload."""
        return {**self.payload}
def test_openlibrary_source_normalizes_book_record() -> None:
    """An Open Library search doc becomes a @book entry with catalog identifiers."""
    doc = {
        "title": "The Nature of the Stratigraphic Record",
        "author_name": ["D. V. Ager"],
        "first_publish_year": 1973,
        "publisher": ["Macmillan"],
        "key": "/works/OL82563W",
        "edition_key": ["OL12345M"],
        "isbn": ["9781234567890"],
    }
    source = OpenLibrarySource(config={"source_client": FakeSourceClient({})})
    entry = source.normalize(doc)
    assert entry is not None
    assert entry.entry_type == "book"
    expected_fields = {
        "title": "The Nature of the Stratigraphic Record",
        "author": "D. V. Ager",
        "year": "1973",
        "publisher": "Macmillan",
        "openlibrary_work": "/works/OL82563W",
        "openlibrary_edition": "OL12345M",
        "isbn": "9781234567890",
    }
    for name, value in expected_fields.items():
        assert entry.fields[name] == value
def test_openlibrary_registry_and_catalog() -> None:
    """The registry builds an OpenLibrarySource; the catalog lists its capabilities."""
    config = {"sources": {"openlibrary": {"source_type": "openlibrary", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("openlibrary"), OpenLibrarySource)
    by_key = {item.key: item for item in list_source_catalog()}
    open_library = by_key["open_library"]
    assert open_library.current_status == "integrated"
    assert "book_metadata" in open_library.capabilities
def test_metadata_resolver_uses_openlibrary_after_other_searches_fail() -> None:
    """For book entries, Open Library is the last title-search fallback."""
    resolver = MetadataResolver()
    # Every earlier search tier returns no candidates.
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]
    # Open Library returns a matching catalog record.
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The Nature of the Stratigraphic Record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="book",
            citation_key="seed1973",
            fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
        )
    )
    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
def test_metadata_resolver_trace_records_fallback_attempts() -> None:
    """resolve_entry_with_trace logs one attempt per consulted source, in order."""
    resolver = MetadataResolver()
    # All search tiers before Open Library return nothing.
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    ]
    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="book",
            citation_key="seed1980",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    )
    assert outcome.resolution is not None
    assert outcome.resolution.source_label == "openlibrary:search:Example Book"
    # The final two attempts are Semantic Scholar (miss) then Open Library (hit).
    assert [attempt.source_name for attempt in outcome.attempts[-2:]] == ["semanticscholar", "openlibrary"]
    assert outcome.attempts[-1].matched is True
    assert outcome.attempts[-1].candidate_count == 1
def test_metadata_resolver_uses_fuzzy_catalog_match_for_book_titles() -> None:
    """A catalog title differing in spelling and case still matches the seed book."""
    resolver = MetadataResolver()
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]
    # Candidate title uses "stratigraphical" (lower-case) vs the query's
    # "Stratigraphic" — the fuzzy matcher must still accept it.
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The nature of the stratigraphical record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="book",
            citation_key="seed1973",
            fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
        )
    )
    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
def test_metadata_resolver_skips_openlibrary_for_article_like_entries() -> None:
    """Journal-article entries never consult the Open Library catalog."""
    resolver = MetadataResolver()
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]
    # Flag records whether Open Library was (wrongly) consulted.
    called = {"openlibrary": False}

    def fake_openlibrary(_title: str, limit: int = 5) -> list[BibEntry]:
        called["openlibrary"] = True
        return []

    resolver.search_openlibrary = fake_openlibrary  # type: ignore[method-assign]
    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="article",
            citation_key="seed1977",
            fields={
                "title": "Fast locomotion of some African ungulates",
                "author": "Alexander, R. M.",
                "year": "1977",
                "journal": "Journal of Zoology",
            },
        )
    )
    assert outcome.resolution is None
    assert called["openlibrary"] is False
    # The trace must not even contain an openlibrary attempt.
    assert all(attempt.source_name != "openlibrary" for attempt in outcome.attempts)

View File

@ -0,0 +1,201 @@
"""Tests for identifier resolution and normalization."""
from __future__ import annotations
import pytest
from citegeist.resolver import (
IdentifierExtractor,
IdentifierNormalizer,
IdentifierResolver,
extract_identifiers,
normalize_identifier,
get_primary_identifier,
resolve_identifiers,
)
class TestIdentifierExtractor:
    """Behavior of IdentifierExtractor.extract."""

    def test_extract_from_entry(self):
        """Only identifier-bearing fields survive extraction."""
        extracted = IdentifierExtractor.extract(
            {
                'doi': '10.1234/example',
                'title': 'Test Title',
                'author': 'John Doe',
                'pmid': '123456',
            }
        )
        assert extracted.get('doi') == '10.1234/example'
        assert extracted.get('pmid') == '123456'
        # Descriptive fields such as the title are not identifiers.
        assert 'title' not in extracted

    def test_extract_multiple_identifiers(self):
        """Every supported scheme present in the fields is extracted."""
        fields = {
            'doi': '10.1234/example',
            'pmid': '123456',
            'arxiv': '2310.12345',
            'isbn': '978-0-123456-78-9',
        }
        extracted = IdentifierExtractor.extract(fields)
        assert len(extracted) == 4
        assert extracted == fields
class TestIdentifierNormalizer:
    """Per-scheme normalization rules of IdentifierNormalizer."""

    def test_normalize_doi(self):
        """DOIs are lower-cased; malformed input yields None."""
        normalize = IdentifierNormalizer.normalize_doi
        assert normalize('10.1234/EXAMPLE') == '10.1234/example'
        assert normalize('10.1234/test') == '10.1234/test'
        assert normalize('invalid') is None

    def test_normalize_pmid(self):
        """PMIDs must be all-digit strings."""
        normalize = IdentifierNormalizer.normalize_pmid
        assert normalize('12345') == '12345'
        assert normalize('1234567') == '1234567'
        assert normalize('invalid') is None

    def test_normalize_pmcid(self):
        """PMCIDs keep the pmc prefix, lower-cased."""
        normalize = IdentifierNormalizer.normalize_pmcid
        assert normalize('PMC12345') == 'pmc12345'
        assert normalize('PMCabcdef') == 'pmcabcdef'
        assert normalize('invalid') is None

    def test_normalize_arxiv(self):
        """arXiv ids drop any version suffix; non-ids yield None."""
        normalize = IdentifierNormalizer.normalize_arxiv
        assert normalize('2310.12345') == '2310.12345'
        assert normalize('2310.12345v1') == '2310.12345'
        assert normalize('INVALID') is None

    def test_normalize_orcid(self):
        """Only the canonical hyphenated 16-digit ORCID form is accepted."""
        normalize = IdentifierNormalizer.normalize_orcid
        assert normalize('0000-0001-2345-6789') == '0000-0001-2345-6789'
        # Space-separated groups are not the canonical format.
        assert normalize('0000 0001 2345 6789') is None
        assert normalize('invalid') is None

    def test_normalize_identifier(self):
        """The generic dispatcher returns (scheme, value), or None for unknown schemes."""
        assert IdentifierNormalizer.normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')
        assert IdentifierNormalizer.normalize_identifier('pmid', '12345') == ('pmid', '12345')
        assert IdentifierNormalizer.normalize_identifier('invalid', 'value') is None
class TestIdentifierResolver:
    """Resolution of entry fields into (scheme, value) pairs."""

    def test_resolve_with_doi(self):
        """A DOI-bearing entry resolves to at least one doi pair."""
        pairs = IdentifierResolver.resolve({'doi': '10.1234/example', 'title': 'Test Title'})
        assert pairs
        assert any(pair[0] == 'doi' for pair in pairs)

    def test_resolve_with_multiple_identifiers(self):
        """Multiple identifier schemes produce multiple resolved pairs."""
        pairs = IdentifierResolver.resolve(
            {'doi': '10.1234/example', 'pmid': '12345', 'arxiv': '2310.12345'}
        )
        assert len(pairs) >= 2
        assert any(pair[0] == 'doi' for pair in pairs)

    def test_resolve_without_identifiers(self):
        """Without identifiers a title fingerprint is still produced."""
        pairs = IdentifierResolver.resolve({'title': 'Test Title', 'author': 'John Doe'})
        assert pairs
        assert any(pair[0] == 'title' for pair in pairs)

    def test_get_primary_identifier(self):
        """The DOI outranks other schemes as the primary identifier."""
        primary = IdentifierResolver.get_primary_identifier(
            {
                'doi': '10.1234/example',
                'pmid': '12345',
                'title': 'Test Title',
            }
        )
        assert primary is not None
        assert primary[0] == 'doi'

    def test_get_scheme_value(self):
        """Scheme-specific lookup returns the value, or None when absent."""
        fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
        }
        assert IdentifierResolver.get_scheme_value('doi', fields) == '10.1234/example'
        assert IdentifierResolver.get_scheme_value('pmid', fields) == '12345'
        assert IdentifierResolver.get_scheme_value('isbn', fields) is None
class TestConvenienceFunctions:
    """The module-level wrappers mirror the class APIs."""

    def test_extract_identifiers(self):
        extracted = extract_identifiers({'doi': '10.1234/example', 'pmid': '12345'})
        assert 'doi' in extracted
        assert 'pmid' in extracted

    def test_normalize_identifier(self):
        assert normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')

    def test_get_primary_identifier(self):
        assert get_primary_identifier({'doi': '10.1234/example'}) == ('doi', '10.1234/example')

    def test_resolve_identifiers(self):
        # Truthiness: a non-empty list of resolved pairs.
        assert resolve_identifiers({'doi': '10.1234/example'})

View File

@ -0,0 +1,117 @@
from __future__ import annotations
from citegeist.resolve import MetadataResolver
from citegeist.sources import SemanticScholarSource, SourceRegistry, list_source_catalog
def test_semanticscholar_source_normalizes_record() -> None:
    """A Graph-API paper record maps onto bibliographic fields plus OA extras."""
    record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "abstract": "Abstract text.",
        "authors": [{"name": "Jane Doe"}, {"name": "Alex Roe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "openAccessPdf": {"url": "https://example.org/paper.pdf"},
        "citationCount": 42,
        "publicationTypes": ["JournalArticle"],
    }
    entry = SemanticScholarSource(config={}).normalize(record)
    assert entry is not None
    expected = {
        "doi": "10.1000/physics",
        "author": "Jane Doe and Alex Roe",
        "journal": "Physical Review Example",
        "url": "https://example.org/paper.pdf",
        "is_oa": "true",
        "semanticscholar_citation_count": "42",
    }
    for field, value in expected.items():
        assert entry.fields[field] == value
def test_semanticscholar_registry_and_catalog() -> None:
    """The registry builds a SemanticScholarSource and the catalog marks it live."""
    config = {"sources": {"semanticscholar": {"source_type": "semanticscholar", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("semanticscholar"), SemanticScholarSource)
    by_key = {item.key: item for item in list_source_catalog()}
    semantic = by_key["semantic_scholar"]
    assert semantic.current_status == "integrated"
    assert semantic.priority == "now"
def test_metadata_resolver_uses_semanticscholar_doi_after_other_lookups_fail() -> None:
    """DOI lookup falls back to Semantic Scholar once Crossref, DataCite and
    Europe PMC all miss."""
    resolver = MetadataResolver()
    # Stub the higher-priority DOI lookups to "not found".
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_europepmc_doi = lambda _doi: None  # type: ignore[method-assign]
    # Semantic Scholar lookup returns one normalized record.
    resolver.semanticscholar.lookup_by_doi = lambda _doi: resolver.semanticscholar.normalize(  # type: ignore[method-assign]
        {
            "paperId": "abcdef123456",
            "title": "Physics Example",
            "year": 2024,
            "authors": [{"name": "Jane Doe"}],
            "externalIds": {"DOI": "10.1000/physics"},
            "journal": {"name": "Physical Review Example"},
            "publicationTypes": ["JournalArticle"],
        }
    )
    from citegeist.bibtex import BibEntry
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"doi": "10.1000/physics", "title": "Physics Example"},
        )
    )
    assert result is not None
    assert result.source_label == "semanticscholar:doi:10.1000/physics"
    assert result.entry.fields["journal"] == "Physical Review Example"
def test_metadata_resolver_uses_semanticscholar_title_search_after_other_searches_fail() -> None:
    """Title search falls through all earlier best-match stubs and lands on
    the Semantic Scholar search."""
    resolver = MetadataResolver()
    # Every higher-priority best-match search returns no match.
    resolver.search_crossref_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_datacite_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_openalex_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_pubmed_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_europepmc_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    # Semantic Scholar search yields exactly one normalized candidate.
    resolver.semanticscholar.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.semanticscholar.normalize(
            {
                "paperId": "abcdef123456",
                "title": "Physics Example",
                "year": 2024,
                "authors": [{"name": "Jane Doe"}],
                "externalIds": {"DOI": "10.1000/physics"},
                "journal": {"name": "Physical Review Example"},
                "publicationTypes": ["JournalArticle"],
            }
        )
    ]
    from citegeist.bibtex import BibEntry
    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"title": "Physics Example", "author": "Jane Doe", "year": "2024"},
        )
    )
    assert result is not None
    assert result.source_label == "semanticscholar:search:Physics Example"

View File

@ -0,0 +1,60 @@
from __future__ import annotations
from citegeist.sources import CrossRefSource, OpenCitationsSource, SourceRegistry, list_source_catalog, prioritized_source_keys
def test_catalog_prioritizes_existing_core_sources() -> None:
    """The first six prioritized keys are the already-integrated core sources."""
    expected = ["crossref", "datacite", "europe_pmc", "openalex", "open_citations", "pubmed"]
    assert prioritized_source_keys()[:6] == expected
def test_catalog_includes_open_citation_and_access_sources() -> None:
    """OpenCitations and Unpaywall both appear in the catalog with expected traits."""
    catalog = {item.key: item for item in list_source_catalog()}
    assert "open_citations" in catalog
    assert "unpaywall" in catalog
    open_citations = catalog["open_citations"]
    assert open_citations.priority == "now"
    assert "doi_citations" in open_citations.capabilities
def test_registry_loads_known_source_from_config() -> None:
    """A crossref config stanza yields a live CrossRefSource instance."""
    config = {"sources": {"crossref": {"source_type": "crossref", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("crossref"), CrossRefSource)
def test_registry_rejects_unknown_source_type() -> None:
    """An unrecognized source_type raises a ValueError naming the problem."""
    registry = SourceRegistry()
    raised = False
    try:
        registry.from_config_dict({"sources": {"mystery": {"source_type": "mystery"}}})
    except ValueError as exc:
        raised = True
        assert "Unknown source type" in str(exc)
    if not raised:
        raise AssertionError("expected ValueError for unknown source type")
def test_registry_loads_opencitations_from_config() -> None:
    """An opencitations config stanza yields a live OpenCitationsSource instance."""
    config = {"sources": {"opencitations": {"source_type": "opencitations", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("opencitations"), OpenCitationsSource)

View File

@ -0,0 +1,171 @@
"""Tests for the source plugin architecture."""
from __future__ import annotations
import pytest
from citegeist.sources import BibliographicSource, SourceRegistry, CrossRefSource
class MockSource(BibliographicSource):
    """Test double that records every lookup and never finds anything."""

    def __init__(self, config: dict | None = None):
        super().__init__(config)
        # Chronological (kind, argument) record of lookup calls.
        self.lookup_calls = []

    def lookup_by_doi(self, doi: str) -> None:
        """Record the DOI lookup; always report "not found"."""
        self.lookup_calls.append(('doi', doi))
        return None

    def lookup_by_title(self, title: str) -> None:
        """Record the title lookup; always report "not found"."""
        self.lookup_calls.append(('title', title))
        return None

    def search(self, query: str, limit: int = 10) -> list:
        """No search results, ever."""
        return []

    def normalize(self, record: dict) -> None:
        """The mock cannot normalize anything."""
        return None
def test_source_base_interface():
    """Defaults inherited from BibliographicSource behave sensibly."""
    source = MockSource()
    assert source.is_available()
    # The default identifier scheme is derived from the class name.
    assert source.get_identifier_scheme() == 'mocksource'
    # Optional capabilities default to "not supported".
    for probe in (source.get_fulltext_url, source.get_embedding):
        assert probe('doi:test') is None
def test_mock_source():
    """The mock records each lookup, in call order."""
    source = MockSource()
    source.lookup_by_doi('10.1234/test')
    source.lookup_by_title('Test Title')
    expected = [('doi', '10.1234/test'), ('title', 'Test Title')]
    assert source.lookup_calls == expected
def test_source_registry():
    """Registered sources are listed and retrievable as live instances."""
    registry = SourceRegistry()
    registry.register(MockSource, name='mock_source', config={'enabled': True})
    assert 'mock_source' in registry.list_sources()
    instance = registry.get('mock_source')
    assert instance is not None
    assert isinstance(instance, MockSource)
    assert instance.is_available()
def test_source_registry_disabled():
    """Disabled sources stay listed but are not handed out by get()."""
    registry = SourceRegistry()
    registry.register(MockSource, name='disabled_source', config={'enabled': False})
    assert 'disabled_source' in registry.list_sources()
    assert registry.get('disabled_source') is None
def test_crossref_source():
    """The CrossRef plugin registers and normalizes a works-API message."""
    registry = SourceRegistry()
    registry.register(CrossRefSource, name='crossref', config={})
    source = registry.get('crossref')
    assert source is not None
    assert source.is_available()
    assert source.get_identifier_scheme() == 'doi'
    message = {
        'DOI': '10.1234/example',
        'title': ['Test Title'],
        'author': [{'given': 'Jane', 'family': 'Doe'}],
        'published-print': {'date-parts': [[2024]]},
        'container-title': ['Journal of Tests'],
        'publisher': 'Test Publisher',
        'URL': 'https://doi.org/10.1234/example',
        'abstract': '<jats:p>Example abstract</jats:p>',
    }
    entry = source.normalize({'message': message})
    assert entry is not None
    assert entry.fields['doi'] == '10.1234/example'
    assert entry.fields['title'] == 'Test Title'
    assert entry.fields['year'] == '2024'
    assert entry.fields['journal'] == 'Journal of Tests'
def test_crossref_search_item_normalization():
    """Bare search items (no 'message' wrapper) also normalize."""
    item = {
        'DOI': '10.1234/example',
        'title': ['Search Result'],
        'author': [{'family': 'Doe'}],
        'issued': {'date-parts': [[2023]]},
    }
    entry = CrossRefSource().normalize(item)
    assert entry is not None
    assert entry.fields['doi'] == '10.1234/example'
    # Year comes from the 'issued' date-parts for search items.
    assert entry.fields['year'] == '2023'
def test_source_record():
    """SourceRecord keeps the raw payload together with its provenance."""
    from citegeist.sources import SourceRecord
    payload = {'test': 'data'}
    record = SourceRecord(
        raw=payload,
        source_type='test',
        source_label='test_source',
        timestamp='2024-01-01',
        confidence=1.0,
    )
    assert (record.source_type, record.source_label) == ('test', 'test_source')
    assert record.confidence == 1.0
    assert record.raw == payload
def test_citation_edge():
    """CitationEdge captures a typed, scored link between two works."""
    from citegeist.sources import CitationEdge
    edge = CitationEdge(
        source_work_id='doi:10.1234',
        target_work_id='doi:10.5678',
        relation_type='cites',
        source_type='crossref',
        source_label='crossref:test',
        confidence=0.9,
    )
    assert edge.relation_type == 'cites'
    assert edge.confidence == 0.9

View File

@ -530,6 +530,88 @@ def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Pat
assert results[0].weak_reasons_after == [] assert results[0].weak_reasons_after == []
def test_talkorigins_enrich_weak_canonicals_includes_resolution_attempts(tmp_path: Path):
    """enrich_weak_canonicals surfaces the resolver's per-source attempt trace
    (serialized to plain dicts) on each enrichment result."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Two near-duplicate weak entries force a canonical that needs enrichment.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
        @misc{weak1,
          author = "Smith, Jane",
          year = "1999",
          title = "Weak Duplicate"
        }
        @misc{weak2,
          author = "Smith, Jane",
          year = "1999",
          title = "Weak Duplicate",
          note = "Copied from legacy source"
        }
        """,
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")
    from citegeist.resolve import Resolution, ResolutionAttempt, ResolutionOutcome
    # Stub the traced resolver with one successful crossref title-search attempt.
    scraper.resolver.resolve_entry_with_trace = lambda entry: ResolutionOutcome(  # type: ignore[method-assign]
        resolution=Resolution(
            entry=BibEntry(
                entry_type="article",
                citation_key="resolved",
                fields={
                    "author": entry.fields["author"],
                    "title": entry.fields["title"],
                    "year": entry.fields["year"],
                    "doi": "10.1000/weak",
                    "journal": "Journal of Better Metadata",
                },
            ),
            source_type="resolver",
            source_label="crossref:search:Weak Duplicate",
        ),
        attempts=[
            ResolutionAttempt(
                source_name="crossref",
                strategy="title_search",
                query_value="Weak Duplicate",
                matched=True,
                candidate_count=1,
                source_label="crossref:search:Weak Duplicate",
            )
        ],
    )
    store = BibliographyStore()
    try:
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False)
    finally:
        store.close()
    assert len(results) == 1
    # The attempt dataclass is serialized via asdict, including its default error="".
    assert results[0].resolution_attempts == [
        {
            "source_name": "crossref",
            "strategy": "title_search",
            "query_value": "Weak Duplicate",
            "matched": True,
            "candidate_count": 1,
            "source_label": "crossref:search:Weak Duplicate",
            "error": "",
        }
    ]
def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path): def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
base_url = "https://www.talkorigins.org/origins/biblio/" base_url = "https://www.talkorigins.org/origins/biblio/"
scraper = TalkOriginsScraper( scraper = TalkOriginsScraper(
@ -799,6 +881,7 @@ def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_pat
assert review.items[0]["canonical"]["citation_key"] == "weak2" assert review.items[0]["canonical"]["citation_key"] == "weak2"
assert review.items[0]["enrichment"]["resolved"] is True assert review.items[0]["enrichment"]["resolved"] is True
assert review.items[0]["enrichment"]["applied"] is False assert review.items[0]["enrichment"]["applied"] is False
assert review.items[0]["enrichment"]["resolution_attempts"] == []
def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path): def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):

117
tests/test_unpaywall.py Normal file
View File

@ -0,0 +1,117 @@
from __future__ import annotations
from citegeist.cli import _run_enrich_oa
from citegeist.sources import SourceRegistry, UnpaywallSource, list_source_catalog, prioritized_source_keys
from citegeist.storage import BibliographyStore
def test_unpaywall_source_normalizes_oa_record() -> None:
    """Normalizing a raw Unpaywall payload maps OA metadata onto flat entry fields."""
    raw_record = {
        "doi": "10.1000/example",
        "title": "Example Article",
        "year": 2024,
        "is_oa": True,
        "oa_status": "gold",
        "best_oa_location": {
            "url": "https://example.org/article",
            "url_for_pdf": "https://example.org/article.pdf",
            "license": "cc-by",
            "host_type": "publisher",
            "version": "publishedVersion",
            "evidence": "open (via free pdf)",
        },
    }
    entry = UnpaywallSource(config={"email": "tester@example.org"}).normalize(raw_record)
    assert entry is not None
    # Nested best_oa_location data is flattened; booleans become lowercase strings.
    expected_fields = {
        "doi": "10.1000/example",
        "best_oa_url": "https://example.org/article",
        "best_oa_pdf_url": "https://example.org/article.pdf",
        "oa_status": "gold",
        "oa_license": "cc-by",
        "is_oa": "true",
    }
    for field_name, expected_value in expected_fields.items():
        assert entry.fields[field_name] == expected_value
def test_unpaywall_registry_and_catalog() -> None:
    """Unpaywall is constructible from config and advertised in the source catalog."""
    config = {
        "sources": {
            "unpaywall": {
                "source_type": "unpaywall",
                "enabled": True,
                "email": "tester@example.org",
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(config)
    # The registry must instantiate the concrete Unpaywall adapter for this key.
    assert isinstance(registry.get("unpaywall"), UnpaywallSource)
    by_key = {item.key: item for item in list_source_catalog()}
    unpaywall_info = by_key["unpaywall"]
    assert unpaywall_info.current_status == "integrated"
    assert unpaywall_info.priority == "now"
    assert "unpaywall" in prioritized_source_keys()
def test_run_enrich_oa_updates_entry() -> None:
    """_run_enrich_oa writes OA fields onto the stored entry and records provenance.

    Patches ``UnpaywallSource.lookup_by_doi`` with a canned green-OA response so no
    network traffic occurs, then verifies the store entry and its field provenance.
    """
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/example}
}
"""
        )

        saved_lookup = UnpaywallSource.lookup_by_doi

        def fake_lookup(self: UnpaywallSource, doi: str):
            # Canned repository-hosted (green) OA record for any requested DOI.
            return self.normalize(
                {
                    "doi": doi,
                    "title": "Seed Paper",
                    "year": 2024,
                    "is_oa": True,
                    "oa_status": "green",
                    "best_oa_location": {
                        "url": "https://repository.example.org/seed",
                        "url_for_pdf": "https://repository.example.org/seed.pdf",
                        "license": "cc-by",
                        "host_type": "repository",
                        "version": "acceptedVersion",
                        "evidence": "oa repository",
                    },
                }
            )

        UnpaywallSource.lookup_by_doi = fake_lookup  # type: ignore[method-assign]
        try:
            # Exit code 0 signals a successful enrichment run.
            assert _run_enrich_oa(store, ["seed2024"], "tester@example.org") == 0
        finally:
            UnpaywallSource.lookup_by_doi = saved_lookup  # type: ignore[method-assign]

        entry = store.get_entry("seed2024")
        assert entry is not None
        assert entry["best_oa_url"] == "https://repository.example.org/seed"
        assert entry["best_oa_pdf_url"] == "https://repository.example.org/seed.pdf"
        assert entry["oa_status"] == "green"
        assert entry["oa_host_type"] == "repository"
        # Enrichment must be traceable: at least one provenance row tagged oa_enrich.
        provenance = store.get_field_provenance("seed2024")
        assert any(item["source_type"] == "oa_enrich" for item in provenance)
    finally:
        store.close()
def test_run_enrich_oa_requires_email() -> None:
    """Without a contact email, OA enrichment must refuse to run (exit code 1)."""
    store = BibliographyStore()
    try:
        exit_code = _run_enrich_oa(store, ["missing"], None)
    finally:
        store.close()
    assert exit_code == 1