Add source tracing and broader open source coverage
This commit is contained in:
parent
39fe5ea86c
commit
0497e18f04
|
|
@ -0,0 +1,185 @@
|
|||
-- Migration: Multi-source bibliographic schema
|
||||
-- Description: Add multi-source support with works, identifiers, source records, citations, and embeddings
|
||||
|
||||
-- ============================================================================
|
||||
-- WORKS TABLE - Canonical metadata for works
|
||||
-- ============================================================================
|
||||
-- Canonical, merged metadata for a scholarly work.
-- One row per deduplicated work; per-source raw payloads live in source_records.
CREATE TABLE IF NOT EXISTS works (
    id INTEGER PRIMARY KEY AUTOINCREMENT,           -- surrogate key targeted by child-table FKs
    work_id TEXT NOT NULL UNIQUE,                   -- canonical external work identifier
    title TEXT,
    abstract TEXT,
    publication_year INTEGER,
    publication_date TEXT,                          -- stored as text; format not enforced here
    journal_name TEXT,
    publisher TEXT,
    volume TEXT,
    issue TEXT,
    pages TEXT,
    -- Convenience copies of common identifiers (also mapped in work_identifiers)
    doi TEXT,
    pmid TEXT,
    pmcid TEXT,
    arxiv_id TEXT,
    dblp_key TEXT,
    openalex_id TEXT,
    isbn TEXT,
    issn TEXT,
    entry_type TEXT NOT NULL DEFAULT 'article',     -- BibTeX-style entry type
    -- Citation metrics; presumably aggregated from sources — confirm which source wins
    citation_count INTEGER DEFAULT 0,
    cited_by_count INTEGER DEFAULT 0,
    influential_citations INTEGER DEFAULT 0,
    is_open_access BOOLEAN DEFAULT 0,               -- SQLite stores BOOLEAN as 0/1
    best_oa_url TEXT,                               -- best open-access link for the work
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP  -- see works_updated_at trigger below
);
|
||||
|
||||
-- ============================================================================
|
||||
-- WORK IDENTIFIERS TABLE - Mapping scheme + value to works
|
||||
-- ============================================================================
|
||||
-- Maps (scheme, value) identifier pairs onto works.
CREATE TABLE IF NOT EXISTS work_identifiers (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    -- NOTE(review): work_id is declared TEXT but the FK below targets works(id),
    -- which is INTEGER. SQLite's affinity rules make numeric strings match, but
    -- confirm whether this should instead reference works(work_id).
    work_id TEXT NOT NULL,
    scheme TEXT NOT NULL,                           -- identifier scheme (e.g. 'doi', 'pmid' — verify naming)
    value TEXT NOT NULL,                            -- identifier value as received
    is_primary BOOLEAN DEFAULT 0,                   -- marks the preferred identifier for a work
    normalized_value TEXT,                          -- canonicalized value used for matching
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(work_id, scheme, value),
    FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE
);
|
||||
|
||||
-- ============================================================================
|
||||
-- SOURCE RECORDS TABLE - Raw API responses with provenance
|
||||
-- ============================================================================
|
||||
-- Raw per-source API responses, kept for provenance.
CREATE TABLE IF NOT EXISTS source_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    -- NOTE(review): TEXT child column vs INTEGER works(id) FK target — see
    -- the same concern on work_identifiers; confirm the intended key.
    work_id TEXT NOT NULL,
    source_type TEXT NOT NULL,                      -- kind of source that produced the record
    source_label TEXT NOT NULL,                     -- human-readable source name
    raw_data_json TEXT NOT NULL,                    -- unmodified API payload, JSON-encoded
    raw_record_id TEXT,                             -- source-native record identifier, if any
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- At most one stored record per (work, source_type, source_label)
    UNIQUE(work_id, source_type, source_label),
    FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE
);
|
||||
|
||||
-- ============================================================================
|
||||
-- CITATIONS TABLE - Citation graph with provenance
|
||||
-- ============================================================================
|
||||
-- Citation graph edges between works, with provenance of the claiming source.
CREATE TABLE IF NOT EXISTS citations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    -- NOTE(review): TEXT child columns vs INTEGER works(id) FK target — see
    -- the same concern on work_identifiers; confirm the intended key.
    source_work_id TEXT NOT NULL,                   -- citing work
    target_work_id TEXT NOT NULL,                   -- cited work
    relation_type TEXT NOT NULL,                    -- edge kind (e.g. cites/cited_by — verify vocabulary)
    source_type TEXT NOT NULL,                      -- provenance: kind of source asserting the edge
    source_label TEXT NOT NULL,                     -- provenance: source name
    confidence REAL DEFAULT 1.0,                    -- 1.0 = fully trusted edge
    is_verified BOOLEAN DEFAULT 0,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- One edge per (source, target, relation); later sources do not duplicate it
    UNIQUE(source_work_id, target_work_id, relation_type),
    FOREIGN KEY (source_work_id) REFERENCES works(id) ON DELETE CASCADE,
    FOREIGN KEY (target_work_id) REFERENCES works(id) ON DELETE CASCADE
);
|
||||
|
||||
-- ============================================================================
|
||||
-- WORK EMBEDDINGS TABLE - Vector storage for semantic search
|
||||
-- ============================================================================
|
||||
-- Vector embeddings for semantic search, one row per (work, model).
CREATE TABLE IF NOT EXISTS work_embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    -- NOTE(review): TEXT child column vs INTEGER works(id) FK target — see
    -- the same concern on work_identifiers; confirm the intended key.
    work_id TEXT NOT NULL,
    embedding TEXT NOT NULL,                        -- serialized vector; text here, pgvector on PostgreSQL
    model_name TEXT NOT NULL,                       -- embedding model that produced the vector
    model_version TEXT,
    dimension INTEGER NOT NULL,                     -- vector length
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- One embedding per work per model (re-embedding replaces, not appends)
    UNIQUE(work_id, model_name),
    FOREIGN KEY (work_id) REFERENCES works(id) ON DELETE CASCADE
);
|
||||
|
||||
-- ============================================================================
|
||||
-- INDEXES - For performance optimization
|
||||
-- ============================================================================
|
||||
-- Work identifiers indexes: support lookup by scheme, by raw value, by owning
-- work, and by normalized value (used for identifier matching).
CREATE INDEX IF NOT EXISTS idx_work_identifiers_scheme ON work_identifiers(scheme);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_value ON work_identifiers(value);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_work_id ON work_identifiers(work_id);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_normalized ON work_identifiers(normalized_value);

-- Source records indexes: per-work provenance lookups and per-source scans.
CREATE INDEX IF NOT EXISTS idx_source_records_work_id ON source_records(work_id);
CREATE INDEX IF NOT EXISTS idx_source_records_source_type ON source_records(source_type);

-- Citations indexes: both graph directions plus edge-type and provenance filters.
CREATE INDEX IF NOT EXISTS idx_citations_source_work ON citations(source_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_target_work ON citations(target_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_relation_type ON citations(relation_type);
CREATE INDEX IF NOT EXISTS idx_citations_source_type ON citations(source_type);

-- Works indexes: identifier-column lookups, OA filtering, and recency scans.
-- (works.work_id needs no explicit index: its UNIQUE constraint already creates one.)
CREATE INDEX IF NOT EXISTS idx_works_doi ON works(doi);
CREATE INDEX IF NOT EXISTS idx_works_pmid ON works(pmid);
CREATE INDEX IF NOT EXISTS idx_works_pmcid ON works(pmcid);
CREATE INDEX IF NOT EXISTS idx_works_arxiv ON works(arxiv_id);
CREATE INDEX IF NOT EXISTS idx_works_openalex ON works(openalex_id);
CREATE INDEX IF NOT EXISTS idx_works_is_open_access ON works(is_open_access);
CREATE INDEX IF NOT EXISTS idx_works_created_at ON works(created_at);

-- Embeddings indexes: per-work and per-model lookups.
CREATE INDEX IF NOT EXISTS idx_embeddings_work_id ON work_embeddings(work_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_model_name ON work_embeddings(model_name);
|
||||
|
||||
-- ============================================================================
|
||||
-- PostgreSQL-specific extensions and vector indexing
|
||||
-- ============================================================================
|
||||
-- Note: The following are PostgreSQL-specific and should be run when using pgvector
|
||||
|
||||
-- Uncomment these when using PostgreSQL with pgvector extension:
|
||||
-- CREATE EXTENSION IF NOT EXISTS vector;
|
||||
-- CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON work_embeddings
|
||||
-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
|
||||
|
||||
-- ============================================================================
|
||||
-- TRIGGERS - For automatic timestamp updates
|
||||
-- ============================================================================
|
||||
-- Works table update trigger: keeps works.updated_at current on row updates.
-- Fix: the original condition WHEN (new.updated_at IS NULL) could never be
-- true — updated_at is declared NOT NULL with a default — so the trigger was
-- dead code. Fire instead whenever the caller did not set updated_at itself
-- (i.e. the column value is unchanged by the UPDATE).
CREATE TRIGGER IF NOT EXISTS works_updated_at
AFTER UPDATE ON works
FOR EACH ROW
WHEN new.updated_at = old.updated_at
BEGIN
    UPDATE works SET updated_at = CURRENT_TIMESTAMP WHERE id = old.id;
END;
|
||||
|
||||
-- Work identifiers update trigger.
-- NOTE(review): this trigger is effectively dead code — created_at is declared
-- NOT NULL with a default, so the WHEN clause below can never be true. It also
-- rewrites created_at (a creation timestamp) on UPDATE, which looks unintended
-- since work_identifiers has no updated_at column. Confirm whether this trigger
-- should be removed or retargeted before relying on it.
CREATE TRIGGER IF NOT EXISTS work_identifiers_updated_at
AFTER UPDATE ON work_identifiers
FOR EACH ROW
WHEN (new.created_at IS NULL)
BEGIN
    UPDATE work_identifiers SET created_at = CURRENT_TIMESTAMP WHERE id = old.id;
END;
|
||||
|
||||
-- ============================================================================
|
||||
-- VIEWS - For simplified queries
|
||||
-- ============================================================================
|
||||
-- View to join works with their identifiers.
-- One row per work, with all known identifiers flattened into a single
-- comma-separated "scheme:value" list.
-- Fix: SQLite rejects GROUP_CONCAT(DISTINCT x, ', ') with the error
-- "DISTINCT aggregates must have exactly one argument", so the custom ', '
-- separator is dropped in favour of group_concat's default ',' separator.
CREATE VIEW IF NOT EXISTS works_with_identifiers AS
SELECT
    w.id,
    w.work_id,
    w.title,
    w.abstract,
    w.publication_year,
    w.journal_name,
    w.publisher,
    w.doi,
    w.pmid,
    w.pmcid,
    w.arxiv_id,
    w.dblp_key,
    w.openalex_id,
    GROUP_CONCAT(DISTINCT wi.scheme || ':' || wi.value) AS identifiers
FROM works w
-- NOTE(review): this compares works.id (INTEGER) with work_identifiers.work_id
-- (TEXT); SQLite's affinity rules make numeric strings match, but confirm
-- whether the join key should be works.work_id instead.
LEFT JOIN work_identifiers wi ON w.id = wi.work_id
GROUP BY w.id, w.work_id, w.title, w.abstract, w.publication_year, w.journal_name, w.publisher, w.doi, w.pmid, w.pmcid, w.arxiv_id, w.dblp_key, w.openalex_id;
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
# CiteGeist Source Planning Documentation
|
||||
|
||||
Welcome to the source-planning documentation for CiteGeist.
|
||||
|
||||
## Quick Overview
|
||||
|
||||
The immediate planning question is which additional open bibliographic sources should be incorporated next.
|
||||
|
||||
This documentation therefore emphasizes:
|
||||
|
||||
- the current source baseline already present in the repository
|
||||
- the next highest-value open sources to add
|
||||
- a smaller, more realistic source-layer abstraction
|
||||
- explicit deferral of unrelated database/vector ambitions
|
||||
|
||||
## Documentation Files
|
||||
|
||||
### Planning and Status
|
||||
- **[source-landscape.md](./source-landscape.md)** - recommended next open bibliographic sources
|
||||
- **[implementation-progress.md](./implementation-progress.md)** - sources-first progress tracker
|
||||
- **[phase-completion.md](./phase-completion.md)** - short status summary
|
||||
- **[file-structure.md](./file-structure.md)** - file structure and module notes
|
||||
|
||||
### Existing Architecture References
|
||||
- **[architecture-current.md](./architecture-current.md)** - current architecture overview
|
||||
- **[schema-current.sql](./schema-current.sql)** - existing database schema
|
||||
|
||||
## Current Status
|
||||
|
||||
### Current Baseline
|
||||
1. Crossref, OpenAlex, PubMed, Europe PMC, Semantic Scholar, DataCite, DBLP, arXiv, and OAI-PMH are already in play.
|
||||
2. OpenCitations and Unpaywall are now integrated as source-layer additions.
|
||||
3. The SQLite-based local workflow remains the baseline.
|
||||
|
||||
### Recommended Next Sources
|
||||
1. OpenAIRE only if repository-acquisition scope expands
|
||||
|
||||
### Explicitly Deferred
|
||||
1. Database redesign
|
||||
2. pgvector / embedding-first work
|
||||
|
||||
## Source Layer
|
||||
|
||||
The source-layer code now provides:
|
||||
|
||||
- `BibliographicSource` as the common interface
|
||||
- `SourceRegistry` for known concrete source classes
|
||||
- `CrossRefSource` as the repaired first concrete plugin
|
||||
- `OpenCitationsSource` plus DOI-based graph expansion
|
||||
- `UnpaywallSource` plus DOI-based OA-link enrichment
|
||||
- `EuropePmcSource` plus biomedical resolver/search support
|
||||
- `SemanticScholarSource` plus broader biological/physical sciences resolver/search support
|
||||
- a source catalog with current status and priority order
|
||||
- compatibility with the existing `SourceClient`-based resolver and expander code
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
from citegeist.sources import (
|
||||
CrossRefSource,
|
||||
EuropePmcSource,
|
||||
OpenCitationsSource,
|
||||
SemanticScholarSource,
|
||||
SourceRegistry,
|
||||
UnpaywallSource,
|
||||
list_source_catalog,
|
||||
prioritized_source_keys,
|
||||
)
|
||||
|
||||
registry = SourceRegistry()
|
||||
registry.register(CrossRefSource, name="crossref", config={})
|
||||
registry.register(EuropePmcSource, name="europepmc", config={})
|
||||
registry.register(OpenCitationsSource, name="opencitations", config={})
|
||||
registry.register(SemanticScholarSource, name="semanticscholar", config={})
|
||||
registry.register(UnpaywallSource, name="unpaywall", config={"email": "you@example.org"})
|
||||
|
||||
source = registry.get("crossref")
|
||||
catalog = list_source_catalog()
|
||||
priority = prioritized_source_keys()
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
Relevant tests for the refocused source work:
|
||||
|
||||
- `tests/test_sources_plugin.py`
|
||||
- `tests/test_sources_catalog.py`
|
||||
|
||||
The existing broader repository test suite should continue to pass as the source-layer changes are integrated.
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Decide whether `OpenAIRE` is worth adding for repository-acquisition breadth.
|
||||
2. Keep database/vector redesign work deferred unless a source need forces it.
|
||||
|
||||
## License
|
||||
|
||||
Same as the CiteGeist project.
|
||||
|
||||
---
|
||||
|
||||
**Last Updated:** 2026-04-25
|
||||
**Status:** Sources-first plan in effect
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
# CiteGeist Current Architecture
|
||||
|
||||
## Overview
|
||||
CiteGeist is currently designed as a local BibTeX-native tooling system with:
|
||||
- BibTeX parsing and storage
|
||||
- Local text search (FTS5)
|
||||
- Entry provenance tracking
|
||||
- Citation graph traversal
|
||||
- Topic-based expansion
|
||||
|
||||
## Core Modules
|
||||
|
||||
### Source Management
|
||||
- **sources.py**: `SourceClient` class for HTTP requests with caching and retry logic
|
||||
- Base HTTP client with JSON/XML/text support
|
||||
- Built-in retry with exponential backoff
|
||||
- Cache directory support
|
||||
|
||||
### Metadata Resolution
|
||||
- **resolve.py**: `MetadataResolver` class for entry resolution
|
||||
- DOI → CrossRef lookup
|
||||
- PMID → PubMed lookup
|
||||
- arXiv, DBLP, OpenAlex lookup
|
||||
- Title search fallback with best-match selection
|
||||
- DataCite integration
|
||||
- Returns `Resolution` objects with provenance
|
||||
|
||||
### Storage
|
||||
- **storage.py**: `BibliographyStore` class (SQLite)
|
||||
- Tables: entries, creators, entry_creators, identifiers, relations, topics, entry_topics, field_provenance, relation_provenance
|
||||
- FTS5 text search integration
|
||||
- Field-level provenance tracking
|
||||
- Citation graph support (cites, cited_by edges)
|
||||
|
||||
### BibTeX Processing
|
||||
- **bibtex.py**: BibEntry dataclass and parsing/rendering
|
||||
- BibTeX → BibEntry conversion
|
||||
- BibEntry → BibTeX rendering
|
||||
- Citation key generation
|
||||
|
||||
### CLI and Server
|
||||
- **cli.py**: Command-line interface
|
||||
- **app_server.py**: Local HTTP server for UI/JSON API
|
||||
- **app_api.py**: JSON API adapter surface
|
||||
|
||||
### Expansion and Discovery
|
||||
- **expand.py**: Citation graph expansion workflows
|
||||
- **extract.py**: Plaintext reference extraction
|
||||
- **bootstrap.py**: Topic bootstrap and expansion
|
||||
|
||||
## Current State Summary
|
||||
|
||||
**Completed/Usable:**
|
||||
- BibTeX parsing and storage
|
||||
- Identifier-based resolution (DOI, PMID, arXiv, DBLP, OpenAlex)
|
||||
- Title search with best-match selection
|
||||
- Citation graph traversal and expansion
|
||||
- Field provenance tracking
|
||||
- Local search with FTS5
|
||||
- Topic-based discovery workflows
|
||||
|
||||
**Not Yet Implemented (from new roadmap):**
|
||||
- Plugin-based source architecture
|
||||
- Multi-source record merging
|
||||
- PGVector embeddings
|
||||
- Full-text OA link retrieval
|
||||
- Semantic Scholar integration
|
||||
- OpenCitations integration
|
||||
- Unified API endpoints for multi-source queries
|
||||
|
||||
## Data Flow
|
||||
|
||||
1. **Ingest**: BibTeX file → parse → store in entries table
|
||||
2. **Resolve**: Entry → resolve_doi/resolve_pmid/resolve_arxiv → fetch metadata → merge with existing
|
||||
3. **Expand**: Start from entry → traverse citation edges → discover new entries
|
||||
4. **Search**: Query FTS5 index → retrieve relevant entries
|
||||
5. **Export**: Entries → render BibTeX → output file
|
||||
|
||||
## Database Schema
|
||||
|
||||
SQLite-based storage with:
|
||||
- Normalized entry fields
|
||||
- Creator relationships
|
||||
- Identifier mapping
|
||||
- Citation relations
|
||||
- Topic associations
|
||||
- Field provenance metadata
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
# CiteGeist Multi-Source File Structure
|
||||
|
||||
**Date:** 2026-04-25
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
/home/netuser/dev/CiteGeist/
|
||||
├── db/
|
||||
│ └── migrations/
|
||||
│ └── 0001_multisource.sql ✅ NEW - Multi-source schema
|
||||
│
|
||||
├── docs/
|
||||
│ ├── architecture-current.md ✅ NEW - Current architecture docs
|
||||
│ ├── implementation-progress.md ✅ NEW - Implementation progress tracker
|
||||
│ ├── schema-current.sql ✅ NEW - Current schema SQL
|
||||
│ └── file-structure.md ✅ NEW - This file
|
||||
│
|
||||
├── src/citegeist/
|
||||
│ ├── sources/ ✅ NEW - Source plugin architecture
|
||||
│ │ ├── __init__.py ✅ NEW - Package exports
|
||||
│ │ ├── __all__.py ✅ NEW - Public API
|
||||
│ │ ├── base.py ✅ NEW - Base BibliographicSource class
|
||||
│ │ ├── registry.py ✅ NEW - SourceRegistry implementation
|
||||
│ │ ├── crossref.py ✅ NEW - CrossRef source plugin
|
||||
│ │ └── _old_sources_compat.py ✅ NEW - Backward compatibility
|
||||
│ │
|
||||
│ ├── resolver/ ✅ NEW - Identifier resolution
|
||||
│ │ ├── __init__.py ✅ NEW - Module exports
|
||||
│ │ └── identifiers.py ✅ NEW - Extract, normalize, resolve
|
||||
│ │
|
||||
│ ├── db/ ✅ NEW - Database operations
|
||||
│ │ └── __init__.py 🚧 TO DO - Database client
|
||||
│ │
|
||||
│ ├── ... (existing files)
|
||||
│ ├── sources.py 📦 Existing - Old SourceClient
|
||||
│ ├── resolve.py 📦 Existing - MetadataResolver
|
||||
│ └── storage.py 📦 Existing - BibliographyStore
|
||||
│
|
||||
└── tests/
|
||||
├── test_sources_plugin.py ✅ NEW - Source plugin tests
|
||||
└── test_resolver_identifiers.py ✅ NEW - Identifier tests
|
||||
```
|
||||
|
||||
## Module Documentation
|
||||
|
||||
### New Modules
|
||||
|
||||
#### `src/citegeist/sources/`
|
||||
Plugin architecture for bibliographic sources.
|
||||
|
||||
**Classes:**
|
||||
- `BibliographicSource` - Abstract base class for source plugins
|
||||
- `SourceRecord` - Raw source record dataclass
|
||||
- `CitationEdge` - Citation relationship dataclass
|
||||
- `SourceRegistry` - Manages source plugins
|
||||
|
||||
**Plugin:**
|
||||
- `CrossRefSource` - CrossRef API implementation
|
||||
|
||||
#### `src/citegeist/resolver/`
|
||||
Identifier extraction, normalization, and resolution.
|
||||
|
||||
**Classes:**
|
||||
- `IdentifierExtractor` - Extract identifiers from entry fields
|
||||
- `IdentifierNormalizer` - Normalize identifiers to canonical form
|
||||
- `IdentifierResolver` - Resolve identifiers with lookup priority
|
||||
|
||||
**Functions:**
|
||||
- `extract_identifiers()` - Quick identifier extraction
|
||||
- `normalize_identifier()` - Quick normalization
|
||||
- `get_primary_identifier()` - Get primary identifier
|
||||
- `resolve_identifiers()` - Resolve all identifiers
|
||||
|
||||
#### `src/citegeist/db/`
|
||||
Database operations (to be implemented).
|
||||
|
||||
**Planned:**
|
||||
- Database client for works table
|
||||
- Migration runner
|
||||
- Query builders
|
||||
|
||||
#### `db/migrations/0001_multisource.sql`
|
||||
Multi-source database schema migration.
|
||||
|
||||
**Tables:**
|
||||
1. `works` - Canonical work metadata
|
||||
2. `work_identifiers` - Multi-scheme identifiers
|
||||
3. `source_records` - Raw API responses
|
||||
4. `citations` - Citation graph
|
||||
5. `work_embeddings` - Vector embeddings
|
||||
|
||||
### Existing Modules (Preserved)
|
||||
|
||||
- `src/citegeist/sources.py` - Old SourceClient (backward compatible)
|
||||
- `src/citegeist/resolve.py` - Old MetadataResolver
|
||||
- `src/citegeist/storage.py` - Old BibliographyStore
|
||||
|
||||
## Test Coverage
|
||||
|
||||
**New Tests:**
|
||||
- `tests/test_sources_plugin.py` (7 tests)
|
||||
- `tests/test_resolver_identifiers.py` (17 tests)
|
||||
|
||||
**Total:** 24 tests passing
|
||||
|
||||
## Dependencies
|
||||
|
||||
**New Dependencies Required:**
|
||||
- No new Python packages (uses stdlib only)
|
||||
|
||||
**Planned Dependencies (Future phases):**
|
||||
- `pgvector` - PostgreSQL vector extension
|
||||
- `sentence-transformers` - Local embedding model
|
||||
- `fastapi` - API framework
|
||||
- `unpaywall` - OA link retrieval (if needed)
|
||||
|
||||
## Implementation Status
|
||||
|
||||
### Completed (100%)
|
||||
- ✅ Phase 0: Baseline Audit
|
||||
- ✅ Phase 1: Source Plugin Architecture
|
||||
- ✅ Phase 2: Identifier Resolution Layer
|
||||
|
||||
### In Progress (50%)
|
||||
- 🚧 Phase 3: Database Schema Upgrade
|
||||
|
||||
### Pending (0%)
|
||||
- ⏳ Phase 4: High-Value Source Integrations
|
||||
- ⏳ Phase 5: Merge & Deduplication Engine
|
||||
- ⏳ Phase 6: Citation Graph Construction
|
||||
- ⏳ Phase 7: Embedding Pipeline
|
||||
- ⏳ Phase 8: Full-Text Retrieval Layer
|
||||
- ⏳ Phase 9: API Layer
|
||||
- ⏳ Phase 10: Ranking & Relevance
|
||||
- ⏳ Phase 12: Observability & QA
|
||||
- ⏳ Phase 13: Performance Optimization
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
# Register a source
|
||||
from citegeist.sources import SourceRegistry, CrossRefSource
|
||||
|
||||
registry = SourceRegistry()
|
||||
registry.register(CrossRefSource, name='crossref', config={})
|
||||
|
||||
# Get source instance
|
||||
source = registry.get('crossref')
|
||||
entry = source.lookup_by_doi('10.1234/example')
|
||||
|
||||
# Resolve identifiers
|
||||
from citegeist.resolver import resolve_identifiers
|
||||
|
||||
fields = {'doi': '10.1234/example', 'title': 'Test'}
|
||||
resolved = resolve_identifiers(fields)
|
||||
# Returns e.g. [('doi', '10.1234/example'), ('title', 'test')]
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. ✅ Phase 0-2: Complete
|
||||
2. 🚧 Phase 3: Implement Python interface for database operations
|
||||
3. ⏳ Phase 4: Add Unpaywall, Semantic Scholar, OpenCitations integrations
|
||||
4. ⏳ Phase 5: Build merge engine
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
# CiteGeist Sources-First Progress
|
||||
|
||||
**Last Updated:** 2026-04-25
|
||||
|
||||
This document tracks the refocused plan for source incorporation. The working question is which additional open bibliographic sources CiteGeist should integrate next, not whether it needs a new storage platform first.
|
||||
|
||||
---
|
||||
|
||||
## Phase 0: Scope Reframe ✅ COMPLETE
|
||||
|
||||
**Status:** Completed
|
||||
|
||||
**Deliverables:**
|
||||
- ✅ `/docs/source-landscape.md` - source inventory and recommendation document
|
||||
- ✅ `/src/citegeist/sources/catalog.py` - code-backed source catalog
|
||||
|
||||
**Completed:**
|
||||
- Identified which source integrations already exist in the repository
|
||||
- Split source-expansion planning from database/vector-search ambitions
|
||||
- Prioritized open-source additions by workflow value
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Source Layer Tightening ✅ COMPLETE
|
||||
|
||||
**Status:** Completed
|
||||
|
||||
**Deliverables:**
|
||||
- ✅ `/src/citegeist/sources/base.py` - Base `BibliographicSource` interface
|
||||
- ✅ `/src/citegeist/sources/registry.py` - Registry for known concrete sources
|
||||
- ✅ `/src/citegeist/sources/crossref.py` - Repaired CrossRef source implementation
|
||||
- ✅ `/src/citegeist/sources/catalog.py` - Open-source inventory
|
||||
- ✅ `/src/citegeist/sources/__init__.py` - Package initialization
|
||||
- ✅ `/tests/test_sources_plugin.py` - Source plugin tests
|
||||
- ✅ `/tests/test_sources_catalog.py` - Source catalog and registry tests
|
||||
|
||||
**Completed:**
|
||||
- ✅ Created `BibliographicSource` abstract base class
|
||||
- ✅ Repaired `SourceRegistry` so config-backed loading resolves real source classes
|
||||
- ✅ Fixed `CrossRefSource` normalization for direct lookup and search-style payloads
|
||||
- ✅ Replaced path-specific compatibility loading with repo-relative loading
|
||||
- ✅ Added a source catalog that captures current status and next-priority sources
|
||||
|
||||
**Features:**
|
||||
- Abstract interface for source plugins
|
||||
- Registry for known source discovery and instantiation
|
||||
- Config-driven enable/disable for known source types
|
||||
- Source prioritization metadata
|
||||
- Compatibility with the existing `SourceClient`-based resolver/expander code
|
||||
|
||||
---
|
||||
|
||||
## Current Integrated Sources ✅ AVAILABLE
|
||||
|
||||
- `Crossref`
|
||||
- `OpenAlex`
|
||||
- `OpenCitations`
|
||||
- `Unpaywall`
|
||||
- `PubMed`
|
||||
- `Europe PMC`
|
||||
- `Semantic Scholar`
|
||||
- `DataCite`
|
||||
- `DBLP`
|
||||
- `arXiv`
|
||||
- `OAI-PMH`
|
||||
|
||||
These are already sufficient for a credible local enrichment-and-discovery workflow. The next work should complement them rather than restart infrastructure underneath them.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Next Source Additions 🚧 IN PROGRESS
|
||||
|
||||
**Status:** In Progress
|
||||
|
||||
**Priority Order:**
|
||||
1. `OpenAIRE` only if repository-acquisition scope expands
|
||||
|
||||
**Completed Deliverables:**
|
||||
- ✅ OpenCitations adapter for DOI citation/reference lookup
|
||||
- ✅ OpenCitations graph expansion support in CLI and topic expansion flows
|
||||
- ✅ Unpaywall adapter for DOI OA-link enrichment
|
||||
- ✅ `enrich-oa` CLI flow for applying OA metadata to stored entries
|
||||
- ✅ Europe PMC biomedical resolver/search integration
|
||||
- ✅ Semantic Scholar broad-science resolver/search integration
|
||||
|
||||
**Planned Deliverables:**
|
||||
- ⏳ Decide whether repository-acquisition breadth needs another dedicated source
|
||||
|
||||
**Rationale:**
|
||||
- `OpenCitations` now improves open citation-edge coverage
|
||||
- `Unpaywall` now improves access-link enrichment
|
||||
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage
|
||||
- `Semantic Scholar` now improves broader biological and physical sciences coverage
|
||||
- neither requires a new database architecture to become useful
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Optional Source Evaluation ⏳ PLANNED
|
||||
|
||||
**Status:** Planned
|
||||
|
||||
- `OpenAIRE`
|
||||
|
||||
**Decision Rule:**
|
||||
- add them only if they solve a concrete discovery or acquisition gap that current open sources do not already cover well
|
||||
|
||||
---
|
||||
|
||||
## Explicitly Deferred
|
||||
|
||||
- second-schema redesign work
|
||||
- pgvector integration
|
||||
- embedding-first retrieval
|
||||
- broad canonical-work reconstruction
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
**Completed:** scope reframe and source-layer cleanup
|
||||
**Planned next:** `OpenAIRE` reevaluation
|
||||
**Deferred:** database/vector expansion work not required by the source question
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
# Sources-First Status
|
||||
|
||||
**Current Focus:** identify and prioritize the next open bibliographic sources to add, using the existing SQLite-based workflow as the baseline.
|
||||
|
||||
---
|
||||
|
||||
## Phase Matrix
|
||||
|
||||
| Phase | Title | Status | Outcome |
|
||||
|-------|-------|--------|---------|
|
||||
| **0** | Scope Reframe | ✅ Complete | Planning now answers the source question directly |
|
||||
| **1** | Source Layer Tightening | ✅ Complete | Registry, CrossRef plugin, compatibility seam, and source catalog repaired |
|
||||
| **2** | Next Open Source Additions | 🚧 In Progress | OpenCitations, Unpaywall, Europe PMC, and Semantic Scholar integrated |
|
||||
| **3** | Optional Source Evaluation | ⏳ Planned | OpenAIRE evaluated later if acquisition breadth matters |
|
||||
| **D** | Database / Vector Expansion | ⏸ Deferred | Not required for the current source-incorporation decision |
|
||||
|
||||
---
|
||||
|
||||
## Test Coverage Summary
|
||||
|
||||
```
|
||||
✅ test_sources_plugin.py
|
||||
✅ test_sources_catalog.py
|
||||
✅ existing full suite still expected to pass
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Artifacts
|
||||
|
||||
### Documentation
|
||||
```
|
||||
docs/
|
||||
├── source-landscape.md ✅ Source inventory and recommendations
|
||||
├── implementation-progress.md ✅ Sources-first progress tracker
|
||||
└── phase-completion.md ✅ Short status summary
|
||||
```
|
||||
|
||||
### Source Layer
|
||||
```
|
||||
src/citegeist/sources/
|
||||
├── base.py ✅ Base source interface
|
||||
├── catalog.py ✅ Source inventory in code
|
||||
├── registry.py ✅ Registry for known source classes
|
||||
├── crossref.py ✅ Repaired CrossRef plugin
|
||||
└── _old_sources_compat.py ✅ Repo-relative compatibility bridge
|
||||
```
|
||||
|
||||
### Tests
|
||||
```
|
||||
tests/
|
||||
├── test_sources_plugin.py ✅ Source plugin tests
|
||||
└── test_sources_catalog.py ✅ Source catalog/registry tests
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Features Implemented
|
||||
|
||||
- ✅ Source catalog covering current and candidate open sources
|
||||
- ✅ Config-driven registry loading for known real source classes
|
||||
- ✅ CrossRef normalization that works for both single-record and search-result payloads
|
||||
- ✅ Compatibility bridge that no longer depends on one checkout path
|
||||
- ✅ OpenCitations DOI-based graph expansion with CLI support
|
||||
- ✅ Unpaywall OA-link enrichment with CLI support
|
||||
- ✅ Europe PMC biomedical resolver/search support
|
||||
- ✅ Semantic Scholar broad-science resolver/search support
|
||||
|
||||
---
|
||||
|
||||
## Next Milestones
|
||||
|
||||
### Immediate
|
||||
1. Decide whether repository-acquisition scope justifies `OpenAIRE`
|
||||
2. Keep the OA-enrichment flow aligned with review/export needs
|
||||
3. Keep graph-source scope disciplined as broader coverage grows
|
||||
|
||||
### Later
|
||||
1. Deepen `Semantic Scholar` usage as needed (already integrated as a source)
|
||||
2. Evaluate `OpenAIRE`
|
||||
3. Revisit database/vector work only if a concrete source need demands it
|
||||
|
||||
---
|
||||
|
||||
## Success Metrics
|
||||
|
||||
### Completed
|
||||
- ✅ Planning now matches the actual source question
|
||||
- ✅ Source-layer defects from the first pass have been corrected
|
||||
- ✅ OpenCitations is now a working integrated source
|
||||
- ✅ Unpaywall is now a working integrated source
|
||||
- ✅ Europe PMC is now a working integrated source
|
||||
- ✅ Semantic Scholar is now a working integrated source
|
||||
- ✅ The next source priorities are explicit
|
||||
|
||||
### Planned
|
||||
- ⏳ Better source selection discipline before adding more integrations
|
||||
|
||||
---
|
||||
|
||||
## Recommendations
|
||||
|
||||
1. Treat the current SQLite/FTS workflow as the baseline, not as a blocker.
|
||||
2. Add source integrations only when they materially improve bibliographic coverage, citation coverage, or open-access linkage.
|
||||
3. Keep database/vector work explicitly subordinate to source-incorporation needs.
|
||||
|
||||
---
|
||||
|
||||
**Last Updated:** 2026-04-25
|
||||
**Status:** Sources-first plan in effect
|
||||
**Confidence:** High
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
-- CiteGeist Current Schema (SQLite)
|
||||
|
||||
-- Entries table: one row per stored bibliographic entry (the canonical local record).
CREATE TABLE IF NOT EXISTS entries (
    id INTEGER PRIMARY KEY,
    citation_key TEXT NOT NULL UNIQUE,              -- BibTeX citation key; primary lookup handle in the CLI
    entry_type TEXT NOT NULL,                       -- BibTeX entry type (article, misc, ...)
    review_status TEXT NOT NULL DEFAULT 'draft',    -- curation workflow state; new entries start as 'draft'
    title TEXT,
    year TEXT,
    journal TEXT,
    booktitle TEXT,
    publisher TEXT,
    abstract TEXT,
    keywords TEXT,
    url TEXT,
    doi TEXT,
    isbn TEXT,
    fulltext TEXT,                                  -- extracted full text, when available
    raw_bibtex TEXT,                                -- original BibTeX source, when the entry came from a file
    extra_fields_json TEXT NOT NULL DEFAULT '{}',   -- BibTeX fields without a dedicated column, as JSON
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Creators table: deduplicated people (authors, editors, ...), keyed by full name.
CREATE TABLE IF NOT EXISTS creators (
    id INTEGER PRIMARY KEY,
    full_name TEXT NOT NULL UNIQUE,
    family_name TEXT,
    given_names TEXT
);

-- Entry-Creators relationship: ordered creator list per entry and role.
CREATE TABLE IF NOT EXISTS entry_creators (
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE,
    role TEXT NOT NULL,                 -- creator role for this entry (e.g. author vs editor — confirm value set)
    ordinal INTEGER NOT NULL,           -- 1-based position within the role's ordered list
    PRIMARY KEY (entry_id, role, ordinal)
);

-- Identifiers table: external identifiers attached to entries.
-- PRIMARY KEY (scheme, value) means a given identifier maps to at most one entry.
CREATE TABLE IF NOT EXISTS identifiers (
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    scheme TEXT NOT NULL,               -- identifier scheme (doi, pmid, ... — confirm against ingest code)
    value TEXT NOT NULL,
    PRIMARY KEY (scheme, value)
);

-- Relations table (citation graph): edges from a stored entry to a citation key.
-- The target is a key, not a row id, so it may reference a not-yet-stored work.
CREATE TABLE IF NOT EXISTS relations (
    source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    target_citation_key TEXT NOT NULL,
    relation_type TEXT NOT NULL,        -- e.g. 'cites' (the value written by graph expansion)
    PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
);

-- Topics table: named topics used to group entries and drive topic expansion.
CREATE TABLE IF NOT EXISTS topics (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,          -- stable machine-friendly handle
    name TEXT NOT NULL,
    source_type TEXT NOT NULL,
    source_url TEXT,
    expansion_phrase TEXT,              -- phrase used when expanding this topic
    suggested_phrase TEXT,              -- candidate phrase awaiting review
    phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed',
    phrase_review_notes TEXT,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Entry-Topics relationship: topic membership per entry, with provenance.
CREATE TABLE IF NOT EXISTS entry_topics (
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE,
    source_label TEXT NOT NULL,         -- which source/process assigned this membership
    confidence REAL,                    -- optional assignment confidence
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (entry_id, topic_id)
);

-- Field Provenance table: append-only audit trail of where each field value came from.
CREATE TABLE IF NOT EXISTS field_provenance (
    id INTEGER PRIMARY KEY,
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    field_name TEXT NOT NULL,
    field_value TEXT,
    source_type TEXT NOT NULL,          -- category of source (e.g. oa_enrich, graph_expand)
    source_label TEXT NOT NULL,         -- specific source instance (e.g. unpaywall:doi:<doi>)
    operation TEXT NOT NULL,
    confidence REAL,
    recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Relation Provenance table: append-only audit trail for citation-graph edges.
CREATE TABLE IF NOT EXISTS relation_provenance (
    id INTEGER PRIMARY KEY,
    source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    target_citation_key TEXT NOT NULL,
    relation_type TEXT NOT NULL,
    source_type TEXT NOT NULL,
    source_label TEXT NOT NULL,
    confidence REAL,
    recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Full-text Search (FTS5)
-- External-content FTS5 table: only the index is stored here; the actual
-- row text lives in `entries` (content='entries', rowid mapped to entries.id).
-- The triggers defined after this table keep the index in sync.
CREATE VIRTUAL TABLE IF NOT EXISTS entries_fts USING fts5(
    title,
    abstract,
    keywords,
    content='entries',
    content_rowid='id'
);
|
||||
|
||||
-- Triggers to keep the external-content FTS5 index in sync with `entries`.
--
-- NOTE: for an FTS5 table declared with content='entries', plain
-- DELETE FROM / UPDATE against the FTS table does not remove the old index
-- terms (the FTS table has no stored content to diff against), which corrupts
-- the index. The documented pattern is the special
-- INSERT INTO entries_fts(entries_fts, rowid, ...) VALUES('delete', ...)
-- command, supplying the OLD column values; an update is a delete of the old
-- row followed by an insert of the new one.
CREATE TRIGGER IF NOT EXISTS entries_ai AFTER INSERT ON entries BEGIN
    INSERT INTO entries_fts(rowid, title, abstract, keywords)
    VALUES (new.id, new.title, new.abstract, new.keywords);
END;

CREATE TRIGGER IF NOT EXISTS entries_ad AFTER DELETE ON entries BEGIN
    INSERT INTO entries_fts(entries_fts, rowid, title, abstract, keywords)
    VALUES ('delete', old.id, old.title, old.abstract, old.keywords);
END;

CREATE TRIGGER IF NOT EXISTS entries_au AFTER UPDATE ON entries BEGIN
    INSERT INTO entries_fts(entries_fts, rowid, title, abstract, keywords)
    VALUES ('delete', old.id, old.title, old.abstract, old.keywords);
    INSERT INTO entries_fts(rowid, title, abstract, keywords)
    VALUES (new.id, new.title, new.abstract, new.keywords);
END;
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
# Open Bibliographic Source Landscape
|
||||
|
||||
This document answers the question that prompted the recent planning work: which additional open bibliographic sources are worth incorporating into CiteGeist, given the sources it already uses?
|
||||
|
||||
## Current Baseline
|
||||
|
||||
CiteGeist already has useful source coverage for a local BibTeX-first workflow:
|
||||
|
||||
- `Crossref`: DOI lookup, title search, and reference-list expansion.
|
||||
- `OpenAlex`: work lookup, title/discovery search, and citation-graph expansion.
|
||||
- `PubMed`: authoritative biomedical metadata lookup by PMID and title search fallback.
|
||||
- `Europe PMC`: biomedical metadata/fulltext complement to PubMed.
|
||||
- `Semantic Scholar`: broad cross-domain metadata with strong biological and physical sciences coverage.
|
||||
- `DataCite`: DOI-backed dataset/report/non-article metadata.
|
||||
- `DBLP`: strong computer-science metadata.
|
||||
- `arXiv`: preprint metadata.
|
||||
- `OAI-PMH`: repository harvesting for theses, dissertations, and institutional collections.
|
||||
|
||||
That means the immediate gap is no longer “get any scholarly metadata at all.” The immediate gap is to add the next highest-value open sources without destabilizing the existing ingest, review, and export pipeline.
|
||||
|
||||
## Recommended Priorities
|
||||
|
||||
### OpenCitations
|
||||
|
||||
Why:
|
||||
|
||||
- It directly improves open citation-edge coverage.
|
||||
- It fits CiteGeist's graph-discovery workflow better than another generic metadata source.
|
||||
- It complements OpenAlex rather than replacing it.
|
||||
|
||||
Expected role:
|
||||
|
||||
- DOI-to-citations lookup
|
||||
- DOI-to-references lookup
|
||||
- provenance for citation edges
|
||||
|
||||
Status:
|
||||
|
||||
- now integrated as a DOI-based citation/reference source in the source layer and graph expansion flow
|
||||
|
||||
Main risk:
|
||||
|
||||
- coverage is narrower than OpenAlex, so merge rules need to treat it as an additional edge source rather than a primary metadata authority.
|
||||
|
||||
### Unpaywall
|
||||
|
||||
Why:
|
||||
|
||||
- It solves a different problem from Crossref/OpenAlex: full-text access and OA status.
|
||||
- It improves the “can I get the paper?” part of the workflow without forcing a storage redesign.
|
||||
|
||||
Expected role:
|
||||
|
||||
- DOI-to-best-open-access-link lookup
|
||||
- OA status enrichment
|
||||
|
||||
Status:
|
||||
|
||||
- now integrated as an OA-link enrichment source with a dedicated `enrich-oa` CLI flow
|
||||
|
||||
Main risk:
|
||||
|
||||
- It should remain an access-link enrichment layer, not become entangled with identity resolution logic.
|
||||
|
||||
### Europe PMC
|
||||
|
||||
Why:
|
||||
|
||||
- It is valuable for biomedical and life-sciences use cases.
|
||||
- It complements PubMed with richer open-access and citation-related information.
|
||||
|
||||
Expected role:
|
||||
|
||||
- domain-specific metadata enrichment
|
||||
- biomedical search
|
||||
- OA/full-text linkage
|
||||
|
||||
Status:
|
||||
|
||||
- now integrated as a biomedical resolver/search complement to `PubMed`
|
||||
|
||||
Main risk:
|
||||
|
||||
- this should remain a domain-specific source, not be treated as a universal resolver.
|
||||
|
||||
### Semantic Scholar
|
||||
|
||||
Pros:
|
||||
|
||||
- good graph and relevance signals
|
||||
- useful for discovery quality
|
||||
|
||||
Status:
|
||||
|
||||
- now integrated as a broad resolver/search complement with good biological and physical sciences coverage
|
||||
|
||||
Main risk:
|
||||
|
||||
- rate limits and product-policy changes still matter more here than for the more explicitly open bibliographic sources
|
||||
|
||||
## Evaluate But Do Not Make Core Yet
|
||||
|
||||
### OpenAIRE
|
||||
|
||||
Pros:
|
||||
|
||||
- strong repository and OA/project linkage
|
||||
- good for European repository acquisition
|
||||
|
||||
Cons:
|
||||
|
||||
- better suited to corpus acquisition than first-line metadata resolution
|
||||
|
||||
Recommendation:
|
||||
|
||||
- treat as an acquisition adapter, not an immediate resolver target
|
||||
|
||||
## What Not To Prioritize Right Now
|
||||
|
||||
### Database Redesign
|
||||
|
||||
The repository already has a working SQLite storage model and FTS-backed local workflow. A second schema track should not lead the next phase of work unless a concrete source integration is blocked on it.
|
||||
|
||||
### Vector Search
|
||||
|
||||
Optional semantic ranking may become useful later, but it was not the motivating question and does not need to be a prerequisite for source incorporation.
|
||||
|
||||
## Suggested Execution Order
|
||||
|
||||
1. Keep the source abstraction aligned with sources already in use.
|
||||
2. Revisit `OpenAIRE` after the current source additions settle.
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
# CiteGeist Roadmap: Sources-First Expansion
|
||||
|
||||
## Purpose
|
||||
|
||||
The primary question is not “how do we redesign CiteGeist around a new storage engine?” The primary question is “which additional open bibliographic sources should CiteGeist incorporate next?”
|
||||
|
||||
This roadmap treats the current SQLite-based local workflow as the baseline and focuses on source evaluation, source integration order, and reviewable source behavior.
|
||||
|
||||
## Baseline
|
||||
|
||||
Already present in the repository:
|
||||
|
||||
- local BibTeX ingest, review, export, and graph traversal
|
||||
- metadata resolution from `Crossref`, `PubMed`, `Europe PMC`, `OpenAlex`, `Semantic Scholar`, `DBLP`, `arXiv`, and `DataCite`
|
||||
- citation-graph expansion using `Crossref` and `OpenAlex`
|
||||
- repository harvesting via `OAI-PMH`
|
||||
|
||||
That means the next planning step is source prioritization, not another platform pivot.
|
||||
|
||||
## Phase 0: Reframe Scope
|
||||
|
||||
Goal:
|
||||
|
||||
Put source-incorporation decisions ahead of database and vector-search ambitions.
|
||||
|
||||
Tasks:
|
||||
|
||||
- [x] identify which source integrations already exist
|
||||
- [x] separate “source expansion” work from “new database/vector stack” work
|
||||
- [x] document the source landscape and recommended order
|
||||
|
||||
Deliverables:
|
||||
|
||||
- `/docs/source-landscape.md`
|
||||
- `/src/citegeist/sources/catalog.py`
|
||||
|
||||
## Phase 1: Tighten The Source Layer
|
||||
|
||||
Goal:
|
||||
|
||||
Make the new source abstraction useful for the repository that already exists, rather than speculative infrastructure.
|
||||
|
||||
Tasks:
|
||||
|
||||
- [x] keep the compatibility bridge to the existing `SourceClient`
|
||||
- [x] fix the initial `CrossRefSource` implementation so normalization works
|
||||
- [x] make config-driven registry loading work for known concrete sources
|
||||
- [x] add a code-backed source catalog for planning and prioritization
|
||||
|
||||
Deliverables:
|
||||
|
||||
- `/src/citegeist/sources/base.py`
|
||||
- `/src/citegeist/sources/registry.py`
|
||||
- `/src/citegeist/sources/crossref.py`
|
||||
- `/src/citegeist/sources/catalog.py`
|
||||
|
||||
## Phase 2: Highest-Value Open Source Additions
|
||||
|
||||
Goal:
|
||||
|
||||
Incorporate the next open sources that materially improve the current workflow.
|
||||
|
||||
Priority order:
|
||||
|
||||
1. `OpenAIRE` only if repository-acquisition scope expands
|
||||
|
||||
Tasks:
|
||||
|
||||
- [x] add `OpenCitations` DOI-to-citation and DOI-to-reference lookup
|
||||
- [x] merge `OpenCitations` edges into the existing graph-expansion workflow with provenance
|
||||
- [x] add `Unpaywall` DOI-to-OA-link enrichment
|
||||
- [x] expose OA-link enrichment in a dedicated CLI flow
|
||||
- [x] add `Europe PMC` as a biomedical metadata/fulltext complement to `PubMed`
|
||||
- [x] add `Semantic Scholar` as a broader scientific metadata complement across biological and physical sciences
|
||||
|
||||
Why these first:
|
||||
|
||||
- `OpenCitations` directly answers the open-citation-coverage gap
|
||||
- `Unpaywall` now solves access-link enrichment without forcing a storage redesign
|
||||
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage without changing the storage model
|
||||
- `Semantic Scholar` now improves broader biological and physical sciences coverage without changing the storage model
|
||||
|
||||
## Phase 3: Evaluate Optional Sources, Do Not Commit Prematurely
|
||||
|
||||
Goal:
|
||||
|
||||
Assess sources that may be useful, but are not clearly the next source-first move.
|
||||
|
||||
Candidates:
|
||||
|
||||
- `OpenAIRE`
|
||||
|
||||
Tasks:
|
||||
|
||||
- [ ] document API limits, openness constraints, and integration risk
|
||||
- [ ] decide whether each source belongs in core resolution, graph expansion, or corpus acquisition
|
||||
- [ ] avoid adding sources that duplicate existing coverage without a clear payoff
|
||||
|
||||
## Deferred Work
|
||||
|
||||
These are valid future ideas, but they are not the current planning driver:
|
||||
|
||||
- a second database schema
|
||||
- pgvector integration
|
||||
- embedding-first search
|
||||
- large-scale canonical-work reconstruction
|
||||
|
||||
The repository already has a working local storage/search path. Those ideas should only return to the front of the plan if a concrete source-integration need forces them there.
|
||||
|
||||
## Immediate Next Steps
|
||||
|
||||
1. Land the source inventory and source-layer cleanup.
|
||||
2. Reassess whether `OpenAIRE` is worth adding for repository-acquisition breadth.
|
||||
|
|
@ -2,7 +2,7 @@ from .app_api import LiteratureExplorerApi
|
|||
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .bootstrap import BootstrapResult, Bootstrapper
|
||||
from .expand import CrossrefExpander, OpenAlexExpander
|
||||
from .expand import CrossrefExpander, OpenAlexExpander, OpenCitationsExpander
|
||||
from .extract import (
|
||||
available_extraction_backends,
|
||||
check_extraction_comparison_summary,
|
||||
|
|
@ -16,6 +16,10 @@ from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
|
|||
from .llm_verify import VerificationLlmClient, VerificationLlmConfig
|
||||
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
|
||||
from .sources import SourceClient
|
||||
from .sources import EuropePmcSource
|
||||
from .sources import OpenLibrarySource
|
||||
from .sources import SemanticScholarSource
|
||||
from .sources import UnpaywallSource
|
||||
from .storage import BibliographyStore
|
||||
from .verify import BibliographyVerifier, VerificationResult, VerificationMatch
|
||||
|
||||
|
|
@ -31,10 +35,15 @@ __all__ = [
|
|||
"LiteratureExplorerApi",
|
||||
"MetadataResolver",
|
||||
"OpenAlexExpander",
|
||||
"OpenCitationsExpander",
|
||||
"OaiPmhHarvester",
|
||||
"OaiMetadataFormat",
|
||||
"OaiSet",
|
||||
"SourceClient",
|
||||
"EuropePmcSource",
|
||||
"OpenLibrarySource",
|
||||
"SemanticScholarSource",
|
||||
"UnpaywallSource",
|
||||
"VerificationLlmClient",
|
||||
"VerificationLlmConfig",
|
||||
"VerificationMatch",
|
||||
|
|
|
|||
|
|
@ -173,6 +173,13 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
|
||||
resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
|
||||
|
||||
enrich_oa_parser = subparsers.add_parser(
|
||||
"enrich-oa",
|
||||
help="Enrich DOI-bearing entries with Unpaywall OA link metadata",
|
||||
)
|
||||
enrich_oa_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
|
||||
enrich_oa_parser.add_argument("--email", help="Email address required by the Unpaywall API")
|
||||
|
||||
resolve_stubs_parser = subparsers.add_parser(
|
||||
"resolve-stubs",
|
||||
help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates",
|
||||
|
|
@ -237,7 +244,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand")
|
||||
expand_parser.add_argument(
|
||||
"--source",
|
||||
choices=["crossref", "openalex"],
|
||||
choices=["crossref", "openalex", "opencitations"],
|
||||
default="crossref",
|
||||
help="Graph expansion source",
|
||||
)
|
||||
|
|
@ -260,7 +267,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
)
|
||||
expand_topic_parser.add_argument(
|
||||
"--source",
|
||||
choices=["crossref", "openalex"],
|
||||
choices=["crossref", "openalex", "opencitations"],
|
||||
default="openalex",
|
||||
help="Topic graph expansion source",
|
||||
)
|
||||
|
|
@ -749,6 +756,8 @@ def main(argv: list[str] | None = None) -> int:
|
|||
)
|
||||
if args.command == "resolve":
|
||||
return _run_resolve(store, args.citation_keys)
|
||||
if args.command == "enrich-oa":
|
||||
return _run_enrich_oa(store, args.citation_keys, args.email)
|
||||
if args.command == "resolve-stubs":
|
||||
return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview)
|
||||
if args.command == "graph":
|
||||
|
|
@ -1215,6 +1224,72 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
|
|||
return exit_code
|
||||
|
||||
|
||||
def _run_enrich_oa(store: BibliographyStore, citation_keys: list[str], email: str | None) -> int:
    """CLI handler: enrich DOI-bearing stored entries with Unpaywall OA metadata.

    Prints a JSON report with one item per citation key and returns a process
    exit code: 0 on completion, 1 when the Unpaywall source is not configured
    (no --email and no UNPAYWALL_EMAIL).
    """
    from .sources import UnpaywallSource

    source = UnpaywallSource(config={"email": email} if email else {})
    if not source.is_available():
        print("Unpaywall enrichment requires --email or UNPAYWALL_EMAIL", file=sys.stderr)
        return 1

    report: list[dict[str, object]] = []
    key_count = len(citation_keys)
    for position, key in enumerate(citation_keys, start=1):
        _print_progress("enriching OA", position, key_count, key)
        row = store.get_entry(key)
        if row is None:
            report.append({"citation_key": key, "status": "missing"})
            continue
        doi = str(row.get("doi") or "").strip()
        if not doi:
            report.append({"citation_key": key, "status": "no_doi"})
            continue

        oa_record = source.lookup_by_doi(doi)
        if oa_record is None:
            report.append({"citation_key": key, "status": "no_record", "doi": doi})
            continue

        # Start from the entry's current string fields, layer the Unpaywall
        # fields on top, then restore the core bibliographic fields so OA
        # enrichment never overwrites curated metadata.
        combined: dict[str, str] = {name: value for name, value in row.items() if isinstance(value, str)}
        combined.update(oa_record.fields)
        for protected in ("title", "year", "author", "journal", "booktitle", "publisher", "abstract", "keywords"):
            current = str(row.get(protected) or "").strip()
            if current:
                combined[protected] = current

        store.replace_entry(
            key,
            BibEntry(
                entry_type=str(row.get("entry_type") or "misc"),
                citation_key=key,
                fields=combined,
            ),
            source_type="oa_enrich",
            source_label=f"unpaywall:doi:{doi}",
            review_status=str(row.get("review_status") or "enriched"),
        )
        # Re-read the stored row so the report reflects what was persisted.
        refreshed = store.get_entry(key) or {}
        report.append(
            {
                "citation_key": key,
                "status": "enriched",
                "doi": doi,
                "is_oa": refreshed.get("is_oa"),
                "oa_status": refreshed.get("oa_status"),
                "best_oa_url": refreshed.get("best_oa_url"),
                "best_oa_pdf_url": refreshed.get("best_oa_pdf_url"),
            }
        )

    print(json.dumps(report, indent=2))
    return 0
|
||||
|
||||
|
||||
def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool:
|
||||
existing = store.get_entry(citation_key)
|
||||
if existing is None:
|
||||
|
|
@ -1664,6 +1739,15 @@ def _run_expand(
|
|||
for relation_name in _expand_relation_types(relation)
|
||||
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
|
||||
]
|
||||
elif source == "opencitations":
|
||||
from .expand import OpenCitationsExpander
|
||||
|
||||
expander = OpenCitationsExpander()
|
||||
expand_fn = lambda key: [
|
||||
item
|
||||
for relation_name in _expand_relation_types(relation)
|
||||
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
|
||||
]
|
||||
else:
|
||||
print(f"Unsupported expansion source: {source}", file=sys.stderr)
|
||||
return 1
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from urllib.parse import quote, urlencode
|
|||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob
|
||||
from .resolve import MetadataResolver, merge_entries
|
||||
from .sources import OpenCitationsSource
|
||||
from .storage import BibliographyStore
|
||||
|
||||
|
||||
|
|
@ -219,14 +220,94 @@ class OpenAlexExpander:
|
|||
return _normalize_openalex_id(results[0].get("id", ""))
|
||||
|
||||
|
||||
class OpenCitationsExpander:
    """Expand the local citation graph using OpenCitations DOI-based edges."""

    def __init__(self, resolver: MetadataResolver | None = None, source: OpenCitationsSource | None = None) -> None:
        # Share the resolver's HTTP client with the OpenCitations source by default.
        self.resolver = resolver or MetadataResolver()
        self.source = source or OpenCitationsSource(
            config={"source_client": self.resolver.source_client}
        )

    def expand_entry(
        self,
        store: BibliographyStore,
        citation_key: str,
        relation_type: str = "cites",
        limit: int = 25,
    ) -> list[ExpansionResult]:
        """Discover works linked to *citation_key* via OpenCitations and record them.

        Each discovered DOI is resolved to full metadata (or stubbed), stored
        when genuinely new, and a 'cites' relation is written in the correct
        direction. Returns one ExpansionResult per discovered edge; returns []
        when the seed entry is missing or has no DOI.
        """
        seed = store.get_entry(citation_key)
        if seed is None:
            return []
        seed_doi = str(seed.get("doi") or "")
        if not seed_doi:
            return []

        expansions: list[ExpansionResult] = []
        for edge in self.source.get_citations(seed_doi, relation_type=relation_type, limit=limit):
            # Edge work ids carry a 4-char prefix (presumably "doi:") — strip it
            # to get the bare DOI; confirm against OpenCitationsSource.
            if relation_type == "cites":
                other_doi = edge.target_work_id[4:]
            else:
                other_doi = edge.source_work_id[4:]

            candidate = self._lookup_discovered_entry(other_doi)
            if candidate is None:
                candidate = _opencitations_stub_entry(other_doi, citation_key)

            known_key = _existing_entry_key_for_discovered_work(store, candidate)
            final_key = known_key or candidate.citation_key

            was_created = False
            if known_key is None and store.get_entry(candidate.citation_key) is None:
                store.upsert_entry(
                    candidate,
                    raw_bibtex=None,
                    source_type="graph_expand",
                    source_label=edge.source_label,
                    review_status="draft",
                )
                store.connection.commit()
                was_created = True

            # Orient the stored relation so it always reads "A cites B".
            if relation_type == "cites":
                edge_source, edge_target = citation_key, final_key
            else:
                edge_source, edge_target = final_key, citation_key

            store.add_relation(
                edge_source,
                edge_target,
                "cites",
                source_type="graph_expand",
                source_label=edge.source_label,
                confidence=edge.confidence,
            )
            expansions.append(
                ExpansionResult(
                    source_citation_key=edge_source,
                    discovered_citation_key=final_key,
                    created_entry=was_created,
                    relation_type=relation_type,
                    source_label=edge.source_label,
                )
            )
        return expansions

    def _lookup_discovered_entry(self, doi: str) -> BibEntry | None:
        """Resolve *doi* to full metadata: Crossref first, then DataCite, then
        fall back to whatever OpenCitations itself can return."""
        for resolve in (self.resolver.resolve_doi, self.resolver.resolve_datacite_doi):
            outcome = resolve(doi)
            if outcome is not None:
                return outcome.entry
        return self.source.lookup_by_doi(doi)
|
||||
|
||||
|
||||
class TopicExpander:
|
||||
def __init__(
    self,
    crossref_expander: CrossrefExpander | None = None,
    openalex_expander: OpenAlexExpander | None = None,
    opencitations_expander: OpenCitationsExpander | None = None,
) -> None:
    """Coordinate topic expansion across Crossref, OpenAlex and OpenCitations.

    Any expander not injected is constructed with its defaults.
    """
    self.crossref_expander = crossref_expander or CrossrefExpander()
    self.openalex_expander = openalex_expander or OpenAlexExpander()
    self.opencitations_expander = opencitations_expander or OpenCitationsExpander()
    # Metadata about the most recent expansion run, for reporting/inspection.
    self.last_run_meta: dict[str, object] = {}
|
||||
|
||||
def expand_topic(
|
||||
|
|
@ -362,6 +443,17 @@ class TopicExpander:
|
|||
) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
|
||||
if source == "crossref":
|
||||
expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
|
||||
elif source == "opencitations":
|
||||
expansion_rows = []
|
||||
for relation_name in _expand_relation_types(relation_type):
|
||||
expansion_rows.extend(
|
||||
self.opencitations_expander.expand_entry(
|
||||
store,
|
||||
citation_key,
|
||||
relation_type=relation_name,
|
||||
limit=limit,
|
||||
)
|
||||
)
|
||||
else:
|
||||
expansion_rows: list[ExpansionResult] = []
|
||||
for relation_name in _expand_relation_types(relation_type):
|
||||
|
|
@ -385,6 +477,11 @@ class TopicExpander:
|
|||
) -> list[tuple[ExpansionResult, dict[str, object]]]:
|
||||
if source == "crossref":
|
||||
return self._preview_crossref_discoveries(store, citation_key, limit)
|
||||
if source == "opencitations":
|
||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
||||
for relation_name in _expand_relation_types(relation_type):
|
||||
rows.extend(self._preview_opencitations_discoveries(store, citation_key, relation_name, limit))
|
||||
return rows
|
||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
||||
for relation_name in _expand_relation_types(relation_type):
|
||||
rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit))
|
||||
|
|
@ -467,6 +564,40 @@ class TopicExpander:
|
|||
)
|
||||
return rows
|
||||
|
||||
def _preview_opencitations_discoveries(
    self,
    store: BibliographyStore,
    citation_key: str,
    relation_type: str,
    limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
    """Dry-run OpenCitations expansion for *citation_key*.

    Reports the would-be discoveries (ExpansionResult plus the candidate's
    fields) without writing entries or relations to the store. Returns []
    when the seed entry is missing or lacks a DOI.
    """
    seed = store.get_entry(citation_key)
    if seed is None or not seed.get("doi"):
        return []
    seed_doi = str(seed["doi"])

    expander = self.opencitations_expander
    previews: list[tuple[ExpansionResult, dict[str, object]]] = []
    for edge in expander.source.get_citations(seed_doi, relation_type=relation_type, limit=limit):
        # Edge work ids carry a 4-char prefix (presumably "doi:") — strip it.
        if relation_type == "cites":
            other_doi = edge.target_work_id[4:]
        else:
            other_doi = edge.source_work_id[4:]
        candidate = expander._lookup_discovered_entry(other_doi)
        if candidate is None:
            candidate = _opencitations_stub_entry(other_doi, citation_key)
        known_key = _existing_entry_key_for_discovered_work(store, candidate)
        final_key = known_key or candidate.citation_key
        preview = ExpansionResult(
            source_citation_key=citation_key if relation_type == "cites" else final_key,
            discovered_citation_key=final_key,
            # True when an actual run would insert a brand-new entry.
            created_entry=known_key is None and store.get_entry(candidate.citation_key) is None,
            relation_type=relation_type,
            source_label=edge.source_label,
        )
        previews.append((preview, dict(candidate.fields)))
    return previews
|
||||
|
||||
|
||||
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
|
||||
title = _crossref_reference_title(reference, ordinal)
|
||||
|
|
@ -567,6 +698,20 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int
|
|||
return f"{family}{year or 'nd'}{first_word}{ordinal}"
|
||||
|
||||
|
||||
def _opencitations_stub_entry(doi: str, source_citation_key: str) -> BibEntry:
    """Build a minimal placeholder entry for a DOI that could not be resolved.

    The citation key is "doi" plus the DOI with all non-alphanumerics removed,
    lowercased; the note records which stored entry led to the discovery.
    """
    compact = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
    stub_fields = {
        "title": f"Referenced work for DOI {doi}",
        "doi": doi,
        "url": f"https://doi.org/{doi}",
        "note": f"discovered_from = {{{source_citation_key}}}",
    }
    return BibEntry(entry_type="misc", citation_key=f"doi{compact}", fields=stub_fields)
|
||||
|
||||
|
||||
def _normalize_text(value: str) -> str:
|
||||
without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value))
|
||||
normalized = " ".join(without_tags.split())
|
||||
|
|
|
|||
|
|
@ -7,17 +7,38 @@ import re
|
|||
import urllib.error
|
||||
import urllib.parse
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .sources.europepmc import EuropePmcSource
|
||||
from .sources.openlibrary import OpenLibrarySource
|
||||
from .sources.semanticscholar import SemanticScholarSource
|
||||
from .sources import SourceClient
|
||||
|
||||
|
||||
@dataclass(slots=True)
class ResolutionAttempt:
    """Record of a single lookup attempt against one metadata source."""

    source_name: str  # source tried, e.g. "crossref", "datacite", "europepmc", "pubmed"
    strategy: str  # lookup style, e.g. "doi_lookup", "pmid_lookup", "work_lookup"
    query_value: str  # identifier value submitted to the source
    matched: bool  # True when the source produced a usable record
    candidate_count: int | None = None  # candidate count, when the source reports one
    source_label: str = ""  # provenance label of the matched record, if any
    error: str = ""  # error description when the attempt failed
|
||||
|
||||
|
||||
@dataclass(slots=True)
class Resolution:
    """A successful resolution: the resolved entry plus how it was obtained."""

    entry: BibEntry  # the resolved bibliographic entry
    source_type: str  # category of the source that produced the entry
    source_label: str  # specific source/identifier label for provenance
    attempts: list[ResolutionAttempt] = field(default_factory=list)  # trace of attempts made
|
||||
|
||||
|
||||
@dataclass(slots=True)
class ResolutionOutcome:
    """Full result of a traced resolution: the winner (if any) plus all attempts."""

    resolution: Resolution | None  # None when no source matched
    attempts: list[ResolutionAttempt]  # every attempt made, matched or not
|
||||
|
||||
|
||||
class MetadataResolver:
|
||||
|
|
@ -31,70 +52,109 @@ class MetadataResolver:
|
|||
) -> None:
|
||||
self.user_agent = user_agent
|
||||
self.source_client = source_client or SourceClient(user_agent=user_agent)
|
||||
self.europepmc = EuropePmcSource(config={"source_client": self.source_client, "user_agent": user_agent})
|
||||
self.openlibrary = OpenLibrarySource(config={"source_client": self.source_client, "user_agent": user_agent})
|
||||
self.semanticscholar = SemanticScholarSource(config={"user_agent": user_agent})
|
||||
self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "")
|
||||
self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist")
|
||||
self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "")
|
||||
|
||||
def resolve_entry(self, entry: BibEntry) -> Resolution | None:
    """Resolve *entry*, returning only the winning resolution (attempt trace discarded)."""
    outcome = self.resolve_entry_with_trace(entry)
    return outcome.resolution
|
||||
|
||||
def resolve_entry_with_trace(self, entry: BibEntry) -> ResolutionOutcome:
    """Resolve *entry* against every configured source, recording each attempt.

    Lookup order: direct identifier lookups first (a DOI is tried across
    several sources, then PMID, OpenAlex, DBLP, and arXiv ids), then
    title searches across the article-oriented sources, and finally the
    Open Library catalog for entries that look like books.  The first
    successful match wins and is returned with the full attempt trace.
    """
    attempts: list[ResolutionAttempt] = []

    if doi := entry.fields.get("doi"):
        # Several sources can resolve a DOI; try them in preference order.
        doi_resolvers = [
            ("crossref", self.resolve_doi),
            ("datacite", self.resolve_datacite_doi),
            ("europepmc", self.resolve_europepmc_doi),
            ("semanticscholar", self.resolve_semanticscholar_doi),
        ]
        for source_name, resolver in doi_resolvers:
            resolved = self._attempt_direct_resolution(attempts, source_name, "doi_lookup", doi, resolver)
            if resolved is not None:
                return ResolutionOutcome(resolution=resolved, attempts=attempts)

    # Single-source identifier lookups: (entry field, source, strategy, resolver).
    direct_lookups = [
        ("pmid", "pubmed", "pmid_lookup", self.resolve_pmid),
        ("openalex", "openalex", "work_lookup", self.resolve_openalex),
        ("dblp", "dblp", "key_lookup", self.resolve_dblp),
        ("arxiv", "arxiv", "id_lookup", self.resolve_arxiv),
    ]
    for field_key, source_name, strategy, resolver in direct_lookups:
        if value := entry.fields.get(field_key):
            resolved = self._attempt_direct_resolution(attempts, source_name, strategy, value, resolver)
            if resolved is not None:
                return ResolutionOutcome(resolution=resolved, attempts=attempts)

    if title := entry.fields.get("title"):
        author_text = entry.fields.get("author", "")
        year = entry.fields.get("year", "")
        title_sources = [
            ("crossref", self.search_crossref),
            ("datacite", self.search_datacite),
            ("openalex", self.search_openalex),
            ("pubmed", self.search_pubmed),
            ("europepmc", self.search_europepmc),
            ("semanticscholar", self.search_semanticscholar),
        ]
        for source_name, search_func in title_sources:
            resolved = self._attempt_title_search_resolution(
                attempts, source_name, title, author_text, year, search_func
            )
            if resolved is not None:
                return ResolutionOutcome(resolution=resolved, attempts=attempts)
        if _entry_prefers_catalog_search(entry):
            # Book-like entries get a dedicated catalog lookup with a
            # stricter title-matching heuristic.
            resolved = self._attempt_title_search_resolution(
                attempts,
                "openlibrary",
                title,
                author_text,
                year,
                self.search_openlibrary,
                selector=_select_best_catalog_title_match,
            )
            if resolved is not None:
                return ResolutionOutcome(resolution=resolved, attempts=attempts)

    return ResolutionOutcome(resolution=None, attempts=attempts)
|
||||
|
||||
def resolve_doi(self, doi: str) -> Resolution | None:
|
||||
encoded = urllib.parse.quote(doi, safe="")
|
||||
|
|
@ -124,19 +184,7 @@ class MetadataResolver:
|
|||
author_text: str = "",
|
||||
year: str = "",
|
||||
) -> Resolution | None:
|
||||
candidate = _select_best_title_match(
|
||||
self.search_crossref(title, limit=5),
|
||||
title=title,
|
||||
author_text=author_text,
|
||||
year=year,
|
||||
)
|
||||
if candidate is None:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=candidate,
|
||||
source_type="resolver",
|
||||
source_label=f"crossref:search:{title}",
|
||||
)
|
||||
return self._search_best_match_resolution("crossref", title, author_text, year, self.search_crossref)
|
||||
|
||||
def resolve_dblp(self, dblp_key: str) -> Resolution | None:
|
||||
encoded_key = urllib.parse.quote(dblp_key, safe="/:")
|
||||
|
|
@ -245,19 +293,7 @@ class MetadataResolver:
|
|||
author_text: str = "",
|
||||
year: str = "",
|
||||
) -> Resolution | None:
|
||||
candidate = _select_best_title_match(
|
||||
self.search_datacite(title, limit=5),
|
||||
title=title,
|
||||
author_text=author_text,
|
||||
year=year,
|
||||
)
|
||||
if candidate is None:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=candidate,
|
||||
source_type="resolver",
|
||||
source_label=f"datacite:search:{title}",
|
||||
)
|
||||
return self._search_best_match_resolution("datacite", title, author_text, year, self.search_datacite)
|
||||
|
||||
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
|
||||
query = urllib.parse.urlencode({"search": title, "per-page": limit})
|
||||
|
|
@ -290,6 +326,35 @@ class MetadataResolver:
|
|||
return []
|
||||
return self._fetch_pubmed_entries(ids[:limit])
|
||||
|
||||
def resolve_europepmc_doi(self, doi: str) -> Resolution | None:
    """Look up *doi* directly in Europe PMC; None when not found."""
    found = self.europepmc.lookup_by_doi(doi)
    if found is None:
        return None
    return Resolution(
        entry=found,
        source_type="resolver",
        source_label=f"europepmc:doi:{doi}",
    )

def search_europepmc(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Title search against Europe PMC, delegated to the source plugin."""
    return self.europepmc.search(title, limit=limit)

def search_openlibrary(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Title search against the Open Library catalog plugin."""
    return self.openlibrary.search(title, limit=limit)

def resolve_semanticscholar_doi(self, doi: str) -> Resolution | None:
    """Look up *doi* directly in Semantic Scholar; None when not found."""
    found = self.semanticscholar.lookup_by_doi(doi)
    if found is None:
        return None
    return Resolution(
        entry=found,
        source_type="resolver",
        source_label=f"semanticscholar:doi:{doi}",
    )

def search_semanticscholar(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Title search against Semantic Scholar, delegated to the source plugin."""
    return self.semanticscholar.search(title, limit=limit)
|
||||
|
||||
def _safe_get_json(self, url: str) -> dict | None:
|
||||
try:
|
||||
return self.source_client.get_json(url)
|
||||
|
|
@ -333,19 +398,7 @@ class MetadataResolver:
|
|||
author_text: str = "",
|
||||
year: str = "",
|
||||
) -> Resolution | None:
|
||||
candidate = _select_best_title_match(
|
||||
self.search_openalex(title, limit=5),
|
||||
title=title,
|
||||
author_text=author_text,
|
||||
year=year,
|
||||
)
|
||||
if candidate is None:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=candidate,
|
||||
source_type="resolver",
|
||||
source_label=f"openalex:search:{title}",
|
||||
)
|
||||
return self._search_best_match_resolution("openalex", title, author_text, year, self.search_openalex)
|
||||
|
||||
def search_pubmed_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search PubMed by title and return the best-matching resolution."""
    return self._search_best_match_resolution("pubmed", title, author_text, year, self.search_pubmed)

def search_europepmc_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search Europe PMC by title and return the best-matching resolution."""
    return self._search_best_match_resolution("europepmc", title, author_text, year, self.search_europepmc)

def search_semanticscholar_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search Semantic Scholar by title and return the best-matching resolution."""
    return self._search_best_match_resolution(
        "semanticscholar", title, author_text, year, self.search_semanticscholar
    )

def search_openlibrary_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search Open Library by title and return the best-matching resolution."""
    return self._search_best_match_resolution("openlibrary", title, author_text, year, self.search_openlibrary)

def _search_best_match_resolution(
    self, source_name: str, title: str, author_text: str, year: str, search_func
) -> Resolution | None:
    """Run *search_func*, select the best title match, wrap it in a Resolution.

    Shared implementation behind every ``search_*_best_match`` method;
    *search_func* must accept ``(title, limit=...)`` and return BibEntry
    candidates.  Returns None when no candidate passes the match heuristic.
    """
    candidates = search_func(title, limit=5)
    candidate = _select_best_title_match(candidates, title=title, author_text=author_text, year=year)
    if candidate is None:
        return None
    return Resolution(entry=candidate, source_type="resolver", source_label=f"{source_name}:search:{title}")
|
||||
|
||||
def _attempt_direct_resolution(
    self,
    attempts: list[ResolutionAttempt],
    source_name: str,
    strategy: str,
    query_value: str,
    resolver_func,
) -> Resolution | None:
    """Run one direct identifier lookup and append a trace record to *attempts*.

    Exceptions raised by *resolver_func* are recorded as a failed attempt
    (with the error message) rather than propagated, so one flaky source
    cannot abort the whole resolution chain.  Returns the resolution, or
    None on miss or error.
    """
    try:
        resolution = resolver_func(query_value)
    except Exception as exc:
        # Best-effort: record the failure and let the caller try the next source.
        attempts.append(
            ResolutionAttempt(
                source_name=source_name,
                strategy=strategy,
                query_value=query_value,
                matched=False,
                error=str(exc),
            )
        )
        return None
    attempts.append(
        ResolutionAttempt(
            source_name=source_name,
            strategy=strategy,
            query_value=query_value,
            matched=resolution is not None,
            source_label=resolution.source_label if resolution is not None else "",
        )
    )
    # Attach the trace so far, unless the resolver already supplied one.
    if resolution is not None and not resolution.attempts:
        resolution.attempts = list(attempts)
    return resolution
|
||||
|
||||
def _attempt_title_search_resolution(
    self,
    attempts: list[ResolutionAttempt],
    source_name: str,
    title: str,
    author_text: str,
    year: str,
    search_func,
    selector=None,
) -> Resolution | None:
    """Run one title search against a source and append a trace record.

    *selector* overrides the default best-match heuristic (used for
    catalog sources such as Open Library).  Search errors are recorded
    as a failed attempt instead of propagating.  Returns the resolution,
    or None on miss or error.
    """
    try:
        candidates = search_func(title, limit=5)
    except Exception as exc:
        # Best-effort: record the failure and let the caller try the next source.
        attempts.append(
            ResolutionAttempt(
                source_name=source_name,
                strategy="title_search",
                query_value=title,
                matched=False,
                error=str(exc),
            )
        )
        return None
    match_selector = selector or _select_best_title_match
    candidate = match_selector(candidates, title=title, author_text=author_text, year=year)
    resolution = None
    if candidate is not None:
        resolution = Resolution(
            entry=candidate,
            source_type="resolver",
            source_label=f"{source_name}:search:{title}",
        )
    attempts.append(
        ResolutionAttempt(
            source_name=source_name,
            strategy="title_search",
            query_value=title,
            matched=resolution is not None,
            candidate_count=len(candidates),
            source_label=resolution.source_label if resolution is not None else "",
        )
    )
    # Attach the trace so far, unless the resolution already carries one.
    if resolution is not None and not resolution.attempts:
        resolution.attempts = list(attempts)
    return resolution
|
||||
|
||||
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
|
||||
ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid]
|
||||
|
|
@ -768,6 +924,42 @@ def _select_best_title_match(
|
|||
return None
|
||||
|
||||
|
||||
def _select_best_catalog_title_match(
    candidates: list[BibEntry],
    title: str,
    author_text: str = "",
    year: str = "",
) -> BibEntry | None:
    """Pick the catalog candidate whose title best matches *title*.

    Scores candidates by Jaccard overlap of catalog title tokens with a
    0.6 floor, and rejects any candidate whose year or author conflicts
    with the query.  Returns None when nothing qualifies.
    """
    if not candidates:
        return None

    wanted_tokens = _catalog_title_tokens(title)
    wanted_authors = _author_match_tokens(author_text)
    wanted_year = str(year or "").strip()
    ranked: list[tuple[float, BibEntry]] = []

    for cand in candidates:
        cand_tokens = _catalog_title_tokens(cand.fields.get("title", ""))
        if not cand_tokens:
            continue
        union = wanted_tokens | cand_tokens
        score = len(wanted_tokens & cand_tokens) / len(union) if union else 0.0
        if score < 0.6:
            continue
        cand_year = str(cand.fields.get("year", "") or "").strip()
        if wanted_year and cand_year and wanted_year != cand_year:
            continue
        if wanted_authors and not _candidate_matches_author_tokens(cand, wanted_authors):
            continue
        ranked.append((score, cand))

    if not ranked:
        return None
    # Highest score wins; ties broken by citation key for determinism.
    return min(ranked, key=lambda pair: (-pair[0], pair[1].citation_key))[1]
|
||||
|
||||
|
||||
def _author_match_tokens(author_text: str) -> set[str]:
|
||||
normalized = _normalize_match_text(author_text)
|
||||
if not normalized:
|
||||
|
|
@ -788,6 +980,39 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str
|
|||
return bool(author_tokens & candidate_tokens)
|
||||
|
||||
|
||||
def _catalog_title_tokens(value: str) -> set[str]:
    """Tokenize a title for catalog matching.

    Keeps alphanumeric tokens of length >= 4, drops a small stopword
    set, and folds long "-ical" endings to "-ic" so that, e.g.,
    "historical" and "historic" compare equal.
    """
    stop = {"the", "and", "for", "with", "from", "into", "after", "all"}
    tokens: set[str] = set()
    for token in re.findall(r"[a-z0-9]+", _normalize_match_text(value)):
        if len(token) < 4 or token in stop:
            continue
        if token.endswith("ical") and len(token) > 6:
            token = token[:-4] + "ic"
        tokens.add(token)
    return tokens
|
||||
|
||||
|
||||
def _entry_prefers_catalog_search(entry: BibEntry) -> bool:
    """Return True when *entry* should be searched in a book catalog
    (Open Library) instead of the article-oriented sources.

    Book-like entry types always qualify.  Otherwise only ``misc``
    entries qualify, and only when the publisher/venue or the title
    contains book-ish keywords.
    """
    if entry.entry_type in {"book", "incollection", "phdthesis", "mastersthesis"}:
        return True
    # Only loosely-typed entries are worth a catalog lookup; bail out
    # before doing any text normalization for everything else.
    if entry.entry_type != "misc":
        return False
    venue = _normalize_match_text(
        " ".join(
            filter(
                None,
                [
                    entry.fields.get("publisher", ""),
                    entry.fields.get("howpublished", ""),
                    entry.fields.get("booktitle", ""),
                ],
            )
        )
    )
    if any(token in venue for token in ("press", "university", "house", "dictionary", "christendom")):
        return True
    title = _normalize_match_text(entry.fields.get("title", ""))
    return any(token in title for token in ("dictionary", "history", "world", "universe", "record"))
|
||||
|
||||
|
||||
def _normalize_pmid(value: str) -> str:
|
||||
return "".join(ch for ch in str(value) if ch.isdigit())
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,27 @@
|
|||
"""
|
||||
Identifier resolution and normalization module.
|
||||
|
||||
Provides functions for extracting, normalizing, and resolving
|
||||
bibliographic identifiers across multiple schemes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from citegeist.resolver.identifiers import (
|
||||
IdentifierExtractor,
|
||||
IdentifierNormalizer,
|
||||
IdentifierResolver,
|
||||
extract_identifiers,
|
||||
normalize_identifier,
|
||||
get_primary_identifier,
|
||||
resolve_identifiers,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'IdentifierExtractor',
|
||||
'IdentifierNormalizer',
|
||||
'IdentifierResolver',
|
||||
'extract_identifiers',
|
||||
'normalize_identifier',
|
||||
'get_primary_identifier',
|
||||
'resolve_identifiers',
|
||||
]
|
||||
|
|
@ -0,0 +1,418 @@
|
|||
"""
|
||||
Identifier resolution and normalization module.
|
||||
|
||||
This module provides functions for extracting, normalizing, and resolving
|
||||
bibliographic identifiers across multiple schemes (DOI, PMID, arXiv, ORCID, etc.).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Identifier scheme patterns.  Each pattern anchors the full string; the
# normalizers pre-clean the value (strip / case-fold) before matching.

DOI_PATTERN = re.compile(
    r'^10\.\d{4,9}/\S+$',
    re.IGNORECASE
)

# PMIDs are plain integers; current PubMed IDs run to 8 digits.
PMID_PATTERN = re.compile(r'^\d{1,8}$')

# PMCIDs are "PMC" followed by digits only (the previous hex alternative
# accepted invalid IDs such as "PMCdead").
PMCID_PATTERN = re.compile(
    r'^PMC\d+$',
    re.IGNORECASE
)

# New-style arXiv IDs (YYMM.NNNNN with optional version suffix).
# NOTE(review): old-style IDs such as "hep-th/9901001" are rejected —
# confirm whether callers ever pass those.
ARXIV_PATTERN = re.compile(
    r'^\d{4}\.\d{4,5}(v\d+)?$',
    re.IGNORECASE
)

ORCID_PATTERN = re.compile(
    r'^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$',
    re.IGNORECASE
)

# ROR IDs are lowercase alphanumerics; normalize_ror lowercases its input
# before matching, so the character class must be lowercase too (the
# previous [0-9A-Z] class could never match a letter after lowering).
ROR_PATTERN = re.compile(
    r'^https?://ror\.org/[0-9a-z]{4,10}$'
)

# NOTE(review): real DBLP keys look like "conf/nips/Vaswani17" (slashes),
# which this colon-based pattern rejects — confirm the expected key format.
DBLP_PATTERN = re.compile(
    r'^[a-zA-Z0-9_]+:[a-zA-Z0-9_]+$',
    re.IGNORECASE
)

# OpenAlex work IDs are "W" followed by an integer (e.g. W2741809807);
# the previous "Wnnnn-hhhh" shape matched no real OpenAlex ID.
OPENALEX_PATTERN = re.compile(
    r'^W\d{4,}$',
    re.IGNORECASE
)
|
||||
|
||||
|
||||
class IdentifierExtractor:
    """Extract identifiers from BibEntry fields."""

    # Entry-field names that double as identifier scheme names, in the
    # order they should appear in the extracted mapping.
    _SCHEMES = ('doi', 'pmid', 'pmcid', 'arxiv', 'dblp', 'openalex', 'isbn', 'issn')

    @staticmethod
    def extract(entry_fields: Dict[str, str]) -> Dict[str, str]:
        """Extract all identifier schemes from entry fields.

        Args:
            entry_fields: Dictionary of entry fields

        Returns:
            Dictionary mapping scheme names to their (non-empty) values
        """
        return {
            scheme: value
            for scheme in IdentifierExtractor._SCHEMES
            if (value := entry_fields.get(scheme))
        }
||||
|
||||
|
||||
class IdentifierNormalizer:
    """Normalize identifiers to canonical form.

    Every ``normalize_*`` helper returns the canonical string on success
    and ``None`` when the value is empty or fails scheme validation.
    """

    @staticmethod
    def normalize_doi(doi: str) -> Optional[str]:
        """Lowercase and validate a DOI."""
        if not doi:
            return None
        cleaned = doi.strip().lower()
        return cleaned if DOI_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_pmid(pmid: str) -> Optional[str]:
        """Validate a PMID, accepting non-string input."""
        if not pmid:
            return None
        cleaned = str(pmid).strip()
        return cleaned if PMID_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_pmcid(pmcid: str) -> Optional[str]:
        """Lowercase and validate a PMCID."""
        if not pmcid:
            return None
        cleaned = pmcid.strip().lower()
        return cleaned if PMCID_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_arxiv(arxiv: str) -> Optional[str]:
        """Validate an arXiv ID, stripping any trailing version suffix."""
        if not arxiv:
            return None
        cleaned = arxiv.strip().lower()
        # Drop the version suffix ("2101.00001v2" -> "2101.00001").
        if 'v' in cleaned:
            cleaned = cleaned.split('v')[0]
        return cleaned if ARXIV_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_orcid(orcid: str) -> Optional[str]:
        """Uppercase, de-space, and validate an ORCID (XXXX-XXXX-XXXX-XXXX)."""
        if not orcid:
            return None
        cleaned = orcid.strip().upper().replace(' ', '')
        return cleaned if ORCID_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_ror(ror_url: str) -> Optional[str]:
        """Lowercase and validate a ROR URL."""
        if not ror_url:
            return None
        cleaned = ror_url.strip().lower()
        return cleaned if ROR_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_dblp(dblp_key: str) -> Optional[str]:
        """Validate a DBLP key (no case folding applied)."""
        if not dblp_key:
            return None
        cleaned = dblp_key.strip()
        return cleaned if DBLP_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_openalex(openalex_id: str) -> Optional[str]:
        """Uppercase and validate an OpenAlex work ID."""
        if not openalex_id:
            return None
        cleaned = openalex_id.strip().upper()
        return cleaned if OPENALEX_PATTERN.match(cleaned) else None

    @staticmethod
    def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
        """Normalize *value* under *scheme*.

        Args:
            scheme: Identifier scheme name (case-insensitive)
            value: Identifier value

        Returns:
            ``(scheme, normalized_value)`` on success, ``None`` when the
            scheme is unknown or the value is invalid.
        """
        scheme = scheme.lower()
        normalizer = {
            'doi': IdentifierNormalizer.normalize_doi,
            'pmid': IdentifierNormalizer.normalize_pmid,
            'pmcid': IdentifierNormalizer.normalize_pmcid,
            'arxiv': IdentifierNormalizer.normalize_arxiv,
            'orcid': IdentifierNormalizer.normalize_orcid,
            'ror': IdentifierNormalizer.normalize_ror,
            'dblp': IdentifierNormalizer.normalize_dblp,
            'openalex': IdentifierNormalizer.normalize_openalex,
        }.get(scheme)
        if normalizer is None:
            return None
        normalized = normalizer(value)
        return (scheme, normalized) if normalized else None
|
||||
|
||||
|
||||
class IdentifierResolver:
    """Resolve identifiers across multiple schemes."""

    # Scheme lookup priority: most authoritative scheme first.
    LOOKUP_PRIORITY = [
        ('doi', IdentifierNormalizer.normalize_doi),
        ('pmid', IdentifierNormalizer.normalize_pmid),
        ('pmcid', IdentifierNormalizer.normalize_pmcid),
        ('arxiv', IdentifierNormalizer.normalize_arxiv),
        ('dblp', IdentifierNormalizer.normalize_dblp),
        ('openalex', IdentifierNormalizer.normalize_openalex),
    ]

    @staticmethod
    def resolve(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
        """Return normalized ``(scheme, value)`` pairs for *entry_fields*.

        Identifiers that fail validation are dropped.  A normalized title
        fingerprint is appended as a last-resort lookup key whenever a
        title is present.
        """
        resolved: List[Tuple[str, str]] = []
        for scheme, value in IdentifierExtractor.extract(entry_fields).items():
            pair = IdentifierNormalizer.normalize_identifier(scheme, value)
            if pair:
                resolved.append(pair)

        title = entry_fields.get('title')
        if title:
            fingerprint = IdentifierResolver._create_title_fingerprint(title)
            if fingerprint:
                resolved.append(('title', fingerprint))
        return resolved

    @staticmethod
    def _create_title_fingerprint(title: str) -> Optional[str]:
        """Collapse *title* to lowercase words separated by single spaces."""
        if not title:
            return None
        fingerprint = re.sub(r'[^\w\s]', ' ', title.lower())  # drop punctuation
        fingerprint = re.sub(r'\s+', ' ', fingerprint)        # squeeze whitespace
        return fingerprint.strip()

    @staticmethod
    def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
        """Return the highest-priority resolved identifier, or None."""
        resolved = IdentifierResolver.resolve(entry_fields)
        # First occurrence per scheme wins (reversed so earlier pairs overwrite).
        by_scheme = {scheme: (scheme, value) for scheme, value in reversed(resolved)}
        for scheme, _ in IdentifierResolver.LOOKUP_PRIORITY:
            if scheme in by_scheme:
                return by_scheme[scheme]
        return None

    @staticmethod
    def get_scheme_value(scheme: str, entry_fields: Dict[str, str]) -> Optional[str]:
        """Return the normalized value for one *scheme*, or None if absent/invalid."""
        raw = entry_fields.get(scheme)
        if not raw:
            return None
        pair = IdentifierNormalizer.normalize_identifier(scheme, raw)
        return pair[1] if pair else None
|
||||
|
||||
|
||||
# Convenience functions — thin module-level wrappers over the classes above.

def extract_identifiers(entry_fields: Dict[str, str]) -> Dict[str, str]:
    """Extract all identifier fields from *entry_fields* (scheme -> raw value)."""
    return IdentifierExtractor.extract(entry_fields)


def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
    """Normalize *value* under *scheme*; ``(scheme, normalized)`` or None if invalid."""
    return IdentifierNormalizer.normalize_identifier(scheme, value)


def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
    """Return the highest-priority normalized identifier, or None if none found."""
    return IdentifierResolver.get_primary_identifier(entry_fields)


def resolve_identifiers(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
    """Return every normalized ``(scheme, value)`` pair for *entry_fields*."""
    return IdentifierResolver.resolve(entry_fields)
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
"""Export all source plugins."""
|
||||
from __future__ import annotations
|
||||
|
||||
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
|
||||
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
|
||||
from citegeist.sources.registry import SourceRegistry, get_registry
|
||||
from citegeist.sources.crossref import CrossRefSource
|
||||
from citegeist.sources.europepmc import EuropePmcSource
|
||||
from citegeist.sources.opencitations import OpenCitationsSource
|
||||
from citegeist.sources.openlibrary import OpenLibrarySource
|
||||
from citegeist.sources.semanticscholar import SemanticScholarSource
|
||||
from citegeist.sources.unpaywall import UnpaywallSource
|
||||
|
||||
__all__ = [
|
||||
'BibliographicSource',
|
||||
'SourceRecord',
|
||||
'CitationEdge',
|
||||
'SourceCatalogEntry',
|
||||
'SourceRegistry',
|
||||
'get_registry',
|
||||
'list_source_catalog',
|
||||
'prioritized_source_keys',
|
||||
'CrossRefSource',
|
||||
'EuropePmcSource',
|
||||
'OpenCitationsSource',
|
||||
'OpenLibrarySource',
|
||||
'SemanticScholarSource',
|
||||
'UnpaywallSource',
|
||||
]
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
"""
|
||||
Bibliographic source plugins.
|
||||
|
||||
This package provides a plugin architecture for integrating multiple
|
||||
bibliographic data sources (CrossRef, PubMed, Semantic Scholar, etc.).
|
||||
"""
|
||||
|
||||
# Import old sources module for backward compatibility
|
||||
from . import _old_sources_compat
|
||||
|
||||
# Import new plugin architecture
|
||||
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
|
||||
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
|
||||
from citegeist.sources.registry import SourceRegistry, get_registry
|
||||
from citegeist.sources.crossref import CrossRefSource
|
||||
from citegeist.sources.europepmc import EuropePmcSource
|
||||
from citegeist.sources.opencitations import OpenCitationsSource
|
||||
from citegeist.sources.openlibrary import OpenLibrarySource
|
||||
from citegeist.sources.semanticscholar import SemanticScholarSource
|
||||
from citegeist.sources.unpaywall import UnpaywallSource
|
||||
|
||||
# Re-export old classes for compatibility
|
||||
__all__ = [
|
||||
# New plugin architecture
|
||||
'BibliographicSource',
|
||||
'SourceRecord',
|
||||
'CitationEdge',
|
||||
'SourceCatalogEntry',
|
||||
'SourceRegistry',
|
||||
'get_registry',
|
||||
'list_source_catalog',
|
||||
'prioritized_source_keys',
|
||||
'CrossRefSource',
|
||||
'EuropePmcSource',
|
||||
'OpenCitationsSource',
|
||||
'OpenLibrarySource',
|
||||
'SemanticScholarSource',
|
||||
'UnpaywallSource',
|
||||
# Old API (for backward compatibility)
|
||||
'SourceClient',
|
||||
]
|
||||
|
||||
# Backward compatibility - make SourceClient available from this module
|
||||
SourceClient = _old_sources_compat.SourceClient
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
"""
|
||||
Backward compatibility module for old sources module.
|
||||
|
||||
This module re-exports the old SourceClient class for compatibility.
|
||||
"""
|
||||
from pathlib import Path
|
||||
import importlib.util
|
||||
|
||||
from .base import BibliographicSource, SourceRecord, CitationEdge
|
||||
from .registry import SourceRegistry, get_registry
|
||||
from .crossref import CrossRefSource
|
||||
|
||||
# Load the old sources.py module from the citegeist package root
|
||||
_OLD_SOURCES_PATH = Path(__file__).resolve().parents[1] / "sources.py"
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
"citegeist.sources_old",
|
||||
_OLD_SOURCES_PATH
|
||||
)
|
||||
if spec and spec.loader:
|
||||
old_sources = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(old_sources)
|
||||
SourceClient = old_sources.SourceClient
|
||||
else:
|
||||
# Fallback if old sources.py doesn't exist
|
||||
SourceClient = None
|
||||
|
|
@ -0,0 +1,189 @@
|
|||
"""
|
||||
Base interface for bibliographic sources.
|
||||
|
||||
This module defines the abstract base class that all source plugins must implement.
|
||||
Plugins can register themselves with the SourceRegistry for dynamic loading.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
|
||||
|
||||
@dataclass(slots=True)
class SourceRecord:
    """Represents a raw record from a source API."""
    # Raw payload as returned by the source API (provenance snapshot).
    raw: Dict[str, Any]
    # Machine-readable source key (e.g. the plugin's source_type).
    source_type: str
    # Human-readable label identifying the source and operation context.
    source_label: str
    # When the record was captured; format is source-defined (ISO 8601 expected).
    timestamp: str
    # Source's confidence in this record, in [0.0, 1.0].
    confidence: float
|
||||
|
||||
|
||||
@dataclass(slots=True)
class CitationEdge:
    """Represents a citation relationship."""
    # Identifier of the citing work (scheme-prefixed, e.g. "doi:10.1/x").
    source_work_id: str
    # Identifier of the cited work.
    target_work_id: str
    relation_type: str  # "cites" or "cited_by"
    # Which source reported this edge (e.g. "opencitations").
    source_type: str
    # Human-readable provenance label (source + endpoint + query).
    source_label: str
    # Source's confidence in the edge, in [0.0, 1.0].
    confidence: float
|
||||
|
||||
|
||||
class BibliographicSource(ABC):
    """Abstract base class for bibliographic data sources.

    All source plugins must inherit from this class and implement the required
    methods (lookup_by_doi, lookup_by_title, search, normalize). The optional
    hooks (get_citations, get_related, get_fulltext_url, get_embedding) have
    safe no-op defaults so plugins only override what their API supports.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source with optional configuration.

        Args:
            config: Source-specific configuration dictionary
        """
        self.config = config or {}
        # Sources are enabled by default; config can opt out with enabled=False.
        self.enabled = self.config.get('enabled', True)
        # Machine-readable key used in provenance rows; defaults to class name.
        self.source_type = self.config.get('source_type', self.__class__.__name__)

    @abstractmethod
    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibEntry if found, None otherwise
        """
        pass

    @abstractmethod
    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Look up a work by title.

        Args:
            title: Work title

        Returns:
            BibEntry if found, None otherwise
        """
        pass

    @abstractmethod
    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search for works matching the query.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching BibEntry objects
        """
        pass

    @abstractmethod
    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw API record to a canonical BibEntry.

        Args:
            record: Raw record from source API

        Returns:
            BibEntry if normalization succeeds, None otherwise
        """
        pass

    def get_citations(self, work_id: str, relation_type: str = 'cites', limit: int = 10) -> List[CitationEdge]:
        """Get citations for a work.

        Default implementation returns no edges; override in sources that
        expose a citation graph.

        Args:
            work_id: Work identifier (DOI, PMID, etc.)
            relation_type: Type of relation ('cites' or 'cited_by')
            limit: Maximum number of results

        Returns:
            List of CitationEdge objects
        """
        return []

    def get_related(self, work_id: str, limit: int = 10) -> List[BibEntry]:
        """Get works related to a work.

        Default implementation returns no works; override where supported.

        Args:
            work_id: Work identifier
            limit: Maximum number of results

        Returns:
            List of related BibEntry objects
        """
        return []

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Get full-text URL for a work.

        Args:
            doi: Digital Object Identifier

        Returns:
            Full-text URL if available, None otherwise
        """
        return None

    def get_embedding(self, work_id: str) -> Optional[List[float]]:
        """Get embedding vector for a work.

        Args:
            work_id: Work identifier

        Returns:
            Embedding vector if available, None otherwise
        """
        return None

    def get_identifier_scheme(self) -> str:
        """Get the identifier scheme used by this source.

        Returns:
            Identifier scheme (e.g., 'doi', 'pmid', 'openalex')
        """
        return self.source_type.lower()

    def record_source_metadata(self, entry: BibEntry, operation: str = 'ingest') -> SourceRecord:
        """Create a source record for provenance tracking.

        Args:
            entry: The BibEntry to record
            operation: Operation type (e.g., 'ingest', 'enrich')
                NOTE(review): currently unused; kept for interface stability.

        Returns:
            SourceRecord with metadata

        BUG FIX: the record is now stamped with the current UTC time.
        Previously `timestamp` was left as '', which made provenance rows
        impossible to order — defeating the purpose of source tracing.
        """
        # Local import keeps the module's top-level dependencies unchanged.
        from datetime import datetime, timezone

        return SourceRecord(
            raw=self._entry_to_dict(entry),
            source_type=self.source_type,
            source_label=f"{self.source_type}:{self.config.get('name', self.__class__.__name__)}",
            timestamp=datetime.now(timezone.utc).isoformat(),
            confidence=1.0
        )

    def _entry_to_dict(self, entry: BibEntry) -> Dict[str, Any]:
        """Convert BibEntry to dictionary for source records."""
        return {
            'entry_type': entry.entry_type,
            'citation_key': entry.citation_key,
            'fields': entry.fields
        }

    def is_available(self) -> bool:
        """Check if the source is available and enabled.

        Returns:
            True if enabled and available, False otherwise
        """
        return self.enabled
|
||||
|
|
@ -0,0 +1,173 @@
|
|||
"""Open bibliographic source inventory and prioritization helpers."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
class SourceCatalogEntry:
    """One row of the open-source bibliographic inventory (immutable)."""
    # Stable machine key, used for prioritization and lookups.
    key: str
    # Human-readable display name.
    label: str
    # What the source provides (e.g. "metadata", "graph", "metadata+fulltext").
    category: str
    # Access model, e.g. "open API", "open protocol", "free API with limits".
    access: str
    # Capability tags describing supported operations.
    capabilities: tuple[str, ...]
    # Free-text summary of what the source is good at.
    strengths: str
    # Free-text summary of known limitations.
    caveats: str
    # Integration state, e.g. "integrated" or "planned".
    current_status: str
    # Integration priority: "now", "next", "selective", or "evaluate".
    priority: str
|
||||
|
||||
|
||||
# Static inventory of candidate open bibliographic sources. Treated as
# read-only; exposed to callers via list_source_catalog() and ranked by
# prioritized_source_keys().
_CATALOG: tuple[SourceCatalogEntry, ...] = (
    SourceCatalogEntry(
        key="crossref",
        label="Crossref",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "reference_lists"),
        strengths="Broad DOI coverage and good article-level metadata.",
        caveats="Citation coverage is incomplete and some references are unstructured blobs.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="openalex",
        label="OpenAlex",
        category="metadata+graph",
        access="open API",
        capabilities=("work_lookup", "title_search", "citations", "cited_by", "topics"),
        strengths="Best current open source for citation graph expansion and work-level discovery.",
        caveats="Occasional noisy secondary records require conservative admission rules.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="pubmed",
        label="PubMed / NCBI E-utilities",
        category="metadata",
        access="open API",
        capabilities=("pmid_lookup", "title_search", "biomedical_metadata"),
        strengths="High-value authoritative metadata for biomedical literature.",
        caveats="Domain-specific coverage outside biomedicine is limited.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="datacite",
        label="DataCite",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "datasets"),
        strengths="Important for datasets, reports, and non-traditional DOI-bearing objects.",
        caveats="Less useful than Crossref/OpenAlex for mainstream article citation graphs.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="dblp",
        label="DBLP",
        category="metadata",
        access="open API",
        capabilities=("key_lookup", "search", "computer_science"),
        strengths="Excellent computer-science coverage and clean bibliographic records.",
        caveats="Discipline-specific rather than general-purpose.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="arxiv",
        label="arXiv",
        category="metadata+fulltext",
        access="open API",
        capabilities=("id_lookup", "search", "preprints"),
        strengths="Useful for preprint-first fields and free full-text links.",
        caveats="Not a general citation graph source.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="open_citations",
        label="OpenCitations",
        category="graph",
        access="open API",
        capabilities=("doi_citations", "doi_references", "provenance"),
        strengths="Directly aligned with open citation-edge expansion.",
        caveats="Coverage is narrower than OpenAlex and needs merge discipline.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="semantic_scholar",
        label="Semantic Scholar",
        category="metadata+graph",
        access="free API with limits",
        capabilities=("work_lookup", "search", "citations", "references"),
        strengths="Strong graph and relevance signals, especially for discovery workflows.",
        caveats="Not fully open data in the same sense as Crossref/OpenAlex/OpenCitations.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="unpaywall",
        label="Unpaywall",
        category="access-links",
        access="open API",
        capabilities=("doi_fulltext_links", "oa_status"),
        strengths="Best open source for landing-page and OA-link enrichment.",
        caveats="Improves access, not bibliographic identity or graph completeness.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="europe_pmc",
        label="Europe PMC",
        category="metadata+fulltext",
        access="open API",
        capabilities=("search", "citations", "fulltext_links", "biomedical"),
        strengths="Valuable biomedical complement to PubMed with richer open-access linkage.",
        caveats="Domain-specific and partially overlapping with PubMed/OpenAlex.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="open_library",
        label="Open Library",
        category="metadata",
        access="open API",
        capabilities=("title_search", "book_metadata", "author_catalog", "isbn_hints"),
        strengths="Useful general-purpose open catalog coverage for books, monographs, and reference works.",
        caveats="Catalog metadata is not a citation graph and can be noisy for article-like titles.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="openaire",
        label="OpenAIRE",
        category="metadata+repository",
        access="open API",
        capabilities=("repository_metadata", "oa_links", "project_links"),
        strengths="Good for repository, project, and European OA discovery.",
        caveats="May be better treated as a corpus-acquisition adapter than a first-line resolver.",
        current_status="planned",
        priority="evaluate",
    ),
    SourceCatalogEntry(
        key="oai_pmh",
        label="OAI-PMH Repositories",
        category="repository",
        access="open protocol",
        capabilities=("repository_harvest", "set_discovery", "metadata_formats"),
        strengths="Already useful for theses, dissertations, and institutional repositories.",
        caveats="Heterogeneous metadata quality; not a single canonical bibliographic source.",
        current_status="integrated",
        priority="selective",
    ),
)
|
||||
|
||||
|
||||
def list_source_catalog() -> list[SourceCatalogEntry]:
    """Return the full source inventory as a fresh list (safe for callers to mutate)."""
    return list(_CATALOG)
|
||||
|
||||
|
||||
def prioritized_source_keys() -> list[str]:
    """Return catalog keys ordered by integration priority, then label.

    Priorities rank now < next < selective < evaluate. An unrecognized
    priority value now sorts last instead of raising KeyError (the previous
    `order[entry.priority]` crashed on any catalog entry with a new/typo'd
    priority string).

    Returns:
        Catalog keys, most urgent first; ties broken by case-insensitive label.
    """
    order = {"now": 0, "next": 1, "selective": 2, "evaluate": 3}
    unknown_rank = len(order)  # unknown priorities sort after all known ones
    ranked = sorted(
        _CATALOG,
        key=lambda entry: (order.get(entry.priority, unknown_rank), entry.label.lower()),
    )
    return [entry.key for entry in ranked]
|
||||
|
|
@ -0,0 +1,210 @@
|
|||
"""
|
||||
CrossRef source plugin.
|
||||
|
||||
CrossRef provides metadata for DOIs for scholarly works.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
class CrossRefSource(BibliographicSource):
    """CrossRef source for DOI-based metadata lookup.

    Uses the public CrossRef REST API (api.crossref.org). Network failures
    are treated as "not found": lookups return None and searches return [].
    """

    BASE_URL = "https://api.crossref.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize CrossRef source.

        Args:
            config: Configuration with optional 'api_key', 'user_agent', and
                'timeout' (seconds, default 30) keys.
        """
        super().__init__(config)
        self.api_key = self.config.get('api_key', '')
        self.user_agent = self.config.get(
            'user_agent',
            'citegeist/0.1 (local research tool)',
        )
        # Bounded network timeout so a stalled API call cannot hang the caller
        # (previously urlopen was called with no timeout at all).
        self.timeout = float(self.config.get('timeout', 30))

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibEntry if found, None otherwise
        """
        if not doi:
            return None
        encoded = urllib.parse.quote(doi, safe="")
        payload = self._request_json(f"{self.BASE_URL}/works/{encoded}")
        if payload is None:
            return None
        return self._normalize_crossref(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """CrossRef doesn't support title-only lookup.

        Returns None as this is not a supported operation.
        """
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search CrossRef for works.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching BibEntry objects
        """
        if not query:
            return []
        encoded_query = urllib.parse.quote(query, safe="")
        payload = self._request_json(f"{self.BASE_URL}/works?query={encoded_query}&rows={limit}")
        if payload is None:
            return []
        items = payload.get('message', {}).get('items', [])
        return [entry for item in items if (entry := self._normalize_crossref(item)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw CrossRef record to a BibEntry.

        Args:
            record: Raw record from CrossRef API

        Returns:
            BibEntry if normalization succeeds
        """
        return self._normalize_crossref(record)

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the identifier scheme."""
        return 'doi'

    def _request_json(self, url: str) -> Optional[Dict[str, Any]]:
        """Fetch *url* and decode the JSON body (best-effort).

        Returns None on any network/HTTP/decoding failure, matching the
        plugin's "missing, not fatal" contract. BUG FIX: the HTTP response is
        now closed via a context manager (it previously leaked), and the
        request carries a timeout.
        """
        headers = {'User-Agent': self.user_agent}
        if self.api_key:
            headers['X-Api-Key'] = self.api_key
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                return json.loads(response.read().decode('utf-8'))
        except Exception:
            return None

    def _normalize_crossref(self, payload: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a CrossRef payload to a BibEntry.

        Args:
            payload: Raw JSON payload from CrossRef (either a full /works
                response with a 'message' wrapper, or a bare item dict)

        Returns:
            BibEntry object, or None for an empty payload
        """
        message = payload.get('message', payload)
        if not message:
            return None

        # Extract basic fields
        doi = str(message.get('DOI', ''))
        title = ' '.join(message.get('title', [])) if message.get('title') else ''
        author_data = message.get('author', [])
        year = self._extract_year(message)

        # Format authors as "Given Family" strings
        authors = []
        for author in author_data:
            given = str(author.get('given', ''))
            family = str(author.get('family', ''))
            if given and family:
                authors.append(f"{given} {family}")
            elif family:
                authors.append(family)

        publisher = str(message.get('publisher', ''))

        # Journal info: CrossRef returns container-title as a list.
        container_title = message.get('container-title', [])
        journal = str(container_title[0]) if container_title else ''

        url = str(message.get('URL', ''))
        abstract = self._extract_abstract(message.get('abstract'))

        # Map to BibEntry fields, omitting anything empty
        fields: Dict[str, str] = {}
        if title:
            fields['title'] = title
        if authors:
            fields['author'] = ' and '.join(authors)
        if year:
            fields['year'] = year
        if doi:
            fields['doi'] = doi
        if journal:
            fields['journal'] = journal
        if publisher:
            fields['publisher'] = publisher
        if url:
            fields['url'] = url
        if abstract:
            fields['abstract'] = abstract

        citation_key = self._citation_key(doi, authors, year, title)

        return BibEntry(
            entry_type='article',
            citation_key=citation_key,
            fields=fields
        )

    def _citation_key(self, doi: str, authors: List[str], year: str, title: str) -> str:
        """Build an alphanumeric citation key.

        BUG FIX: the previous key embedded raw author names and the full
        title, producing keys with spaces and punctuation — invalid as BibTeX
        citation keys. This follows the same doi/family+year+word convention
        as the other source plugins in this package.
        """
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        family = authors[0].split()[-1] if authors else "crossref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "crossref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

    def _extract_year(self, message: Dict[str, Any]) -> str:
        """Return the first publication year found, preferring print dates."""
        for field_name in ('published-print', 'published-online', 'issued', 'created'):
            year = self._extract_year_from_date_parts(message.get(field_name, {}))
            if year:
                return year
        return ''

    def _extract_year_from_date_parts(self, field: Dict[str, Any]) -> str:
        """Pull the year out of a CrossRef 'date-parts' structure, or ''."""
        date_parts = field.get('date-parts', [])
        if not date_parts:
            return ''
        first_part = date_parts[0]
        if not first_part:
            return ''
        year = first_part[0]
        return str(year) if year else ''

    def _extract_abstract(self, raw_abstract: Any) -> str:
        """Coerce CrossRef's abstract field (str or list of str/dict) to text."""
        if isinstance(raw_abstract, str):
            return raw_abstract.strip()
        if isinstance(raw_abstract, list):
            for item in raw_abstract:
                if isinstance(item, dict):
                    text = str(item.get('value', '')).strip()
                    if text:
                        return text
                elif isinstance(item, str) and item.strip():
                    return item.strip()
        return ''
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
"""Europe PMC source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources._old_sources_compat import SourceClient
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
class EuropePmcSource(BibliographicSource):
    """Europe PMC source for biomedical metadata and OA/fulltext links.

    All requests go through the REST search endpoint with resultType=core;
    DOI and title lookups are expressed as field-restricted queries.
    """

    BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Set up the HTTP client, honoring an injected 'source_client' if given."""
        super().__init__(config)
        agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        injected = self.config.get("source_client")
        self.source_client = injected if injected else SourceClient(user_agent=agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Resolve a DOI to a BibEntry via an exact DOI field query."""
        cleaned = doi.strip()
        if not cleaned:
            return None
        hit = self._search_one(f'DOI:"{cleaned}"')
        if not hit:
            return None
        return self.normalize(hit)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Resolve a title (whitespace-normalized) to the top matching BibEntry."""
        text = " ".join(title.split())
        if not text:
            return None
        hit = self._search_one(f'TITLE:"{text}"')
        if not hit:
            return None
        return self.normalize(hit)

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Title-search Europe PMC and normalize every usable hit."""
        text = " ".join(query.split())
        if not text:
            return []
        payload = self._search_payload(f'TITLE:"{text}"', max(1, limit))
        entries: list[BibEntry] = []
        for row in self._rows(payload):
            entry = self.normalize(row)
            if entry is not None:
                entries.append(entry)
        return entries

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map a raw Europe PMC result row to a BibEntry (None when untitled)."""
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        doi = str(record.get("doi") or "").strip()
        # For MED (PubMed) rows the record id doubles as the PMID fallback.
        if str(record.get("source") or "") == "MED":
            pmid = str(record.get("pmid") or record.get("id") or "").strip()
        else:
            pmid = str(record.get("pmid") or "").strip()
        pmcid = str(record.get("pmcid") or "").strip()
        year = str(record.get("pubYear") or "").strip()
        author_text = self._normalize_author_string(str(record.get("authorString") or "").strip())
        journal_title = str(record.get("journalTitle") or "").strip()
        abstract = str(record.get("abstractText") or "").strip()

        fields: Dict[str, str] = {"title": title}
        # Insertion order matters for downstream serialization; keep it stable.
        optional = (
            ("doi", doi),
            ("pmid", pmid),
            ("pmcid", pmcid),
            ("year", year),
            ("author", author_text),
            ("journal", journal_title),
            ("volume", str(record.get("journalVolume") or "").strip()),
            ("number", str(record.get("issue") or "").strip()),
            ("pages", str(record.get("pageInfo") or "").strip()),
            ("abstract", abstract),
        )
        for name, value in optional:
            if value:
                fields[name] = value

        # Prefer a direct full-text link; fall back to the article landing page.
        link = self._fulltext_url(record) or self._article_url(record)
        if link:
            fields["url"] = link

        if str(record.get("isOpenAccess") or "").strip():
            open_access = str(record.get("isOpenAccess")).upper() == "Y"
            fields["is_oa"] = "true" if open_access else "false"
        cited_by = str(record.get("citedByCount") or "").strip()
        if cited_by:
            fields["europepmc_cited_by_count"] = cited_by
        origin = str(record.get("source") or "").strip()
        if origin:
            fields["europepmc_source"] = origin

        key = self._citation_key(doi, pmid, author_text, year, title)
        return BibEntry(entry_type="article", citation_key=key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Best full-text (or landing-page) URL for a DOI, if one is known."""
        cleaned = doi.strip()
        if not cleaned:
            return None
        rows = self._rows(self._search_payload(f'DOI:"{cleaned}"', 1))
        if not rows:
            return None
        return self._fulltext_url(rows[0]) or self._article_url(rows[0])

    def get_identifier_scheme(self) -> str:
        """DOI is the primary identifier scheme for this source."""
        return "doi"

    def _search_one(self, query: str) -> Dict[str, Any] | None:
        """Run *query* with pageSize=1 and return the single row, if any."""
        rows = self._rows(self._search_payload(query, 1))
        return rows[0] if rows else None

    def _rows(self, payload: Dict[str, Any] | None) -> list:
        """Extract the result rows from a search payload (empty list when absent)."""
        if not payload:
            return []
        return payload.get("resultList", {}).get("result", [])

    def _search_payload(self, query: str, page_size: int) -> Dict[str, Any] | None:
        """Issue a core-result JSON search and return the decoded payload."""
        encoded = urllib.parse.urlencode({
            "query": query,
            "format": "json",
            "resultType": "core",
            "pageSize": max(1, page_size),
        })
        return self.source_client.try_get_json(f"{self.BASE_URL}?{encoded}")

    def _fulltext_url(self, record: Dict[str, Any]) -> str:
        """First URL under fullTextUrlList, or '' when none is present."""
        container = record.get("fullTextUrlList", {})
        if not isinstance(container, dict):
            return ""
        urls = container.get("fullTextUrl", [])
        if isinstance(urls, dict):
            urls = [urls]
        if not isinstance(urls, list):
            return ""
        for candidate in urls:
            if not isinstance(candidate, dict):
                continue
            link = str(candidate.get("url") or "").strip()
            if link:
                return link
        return ""

    def _article_url(self, record: Dict[str, Any]) -> str:
        """Europe PMC landing-page URL built from source + id, or ''."""
        origin = str(record.get("source") or "").strip()
        ident = str(record.get("id") or "").strip()
        if not origin or not ident:
            return ""
        return f"https://europepmc.org/article/{origin}/{ident}"

    def _normalize_author_string(self, value: str) -> str:
        """Turn Europe PMC's comma-separated author string into BibTeX 'and' form."""
        if not value:
            return ""
        names = []
        for chunk in value.split(","):
            chunk = chunk.strip()
            if chunk:
                names.append(chunk.rstrip("."))
        return " and ".join(names)

    def _citation_key(self, doi: str, pmid: str, author_text: str, year: str, title: str) -> str:
        """Alphanumeric citation key: DOI-based, then PMID, then family+year+word."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if pmid:
            return f"pmid{pmid}"
        if author_text:
            family = author_text.split(" and ")[0].split()[-1]
        else:
            family = "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        title_words = title.split()
        first = title_words[0] if title_words else "untitled"
        first = "".join(ch for ch in first.lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first or 'untitled'}"
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
"""OpenCitations source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources.base import BibliographicSource, CitationEdge
|
||||
from citegeist.sources._old_sources_compat import SourceClient
|
||||
|
||||
|
||||
class OpenCitationsSource(BibliographicSource):
    """OpenCitations source for DOI metadata and citation edges.

    Wraps the OpenCitations Meta API (bibliographic metadata) and Index API
    (citation links). All lookups are DOI-keyed; title lookup and free-text
    search are not offered by the service.
    """

    INDEX_BASE_URL = "https://api.opencitations.net/index/v2"
    META_BASE_URL = "https://api.opencitations.net/meta/v1"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Set up the HTTP client, honoring an injected 'source_client' if given."""
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Fetch Meta API metadata for a DOI and normalize the first row.

        Args:
            doi: DOI, with or without a leading 'doi:' prefix

        Returns:
            BibEntry if found, None otherwise
        """
        normalized = self._normalize_doi_pid(doi)
        if not normalized:
            return None
        rows = self.source_client.try_get_json(f"{self.META_BASE_URL}/metadata/{normalized}")
        if not rows:
            return None
        return self.normalize(rows[0])

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Title lookup is not supported by the OpenCitations APIs."""
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Free-text search is not supported by the OpenCitations APIs."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map an OpenCitations Meta row to a BibEntry.

        Args:
            record: Raw Meta API row

        Returns:
            BibEntry, or None when the row lacks identifiers or a title
        """
        ids = str(record.get("id") or "")
        title = str(record.get("title") or "").strip()
        if not ids or not title:
            return None

        doi = self._extract_id_value(ids, "doi")
        openalex = self._extract_id_value(ids, "openalex")
        year = self._extract_year(str(record.get("pub_date") or ""))
        authors = self._normalize_author_field(str(record.get("author") or ""))
        venue, venue_ids = self._parse_venue_field(str(record.get("venue") or ""))
        entry_type = self._map_entry_type(str(record.get("type") or ""))

        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
            fields["url"] = f"https://doi.org/{doi}"
        if openalex:
            fields["openalex"] = openalex
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if venue:
            # Journal articles carry 'journal'; other entry types use 'booktitle'.
            if entry_type == "article":
                fields["journal"] = venue
            else:
                fields["booktitle"] = venue
        if volume := str(record.get("volume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("page") or "").strip():
            fields["pages"] = pages
        if publisher := self._strip_bracketed_ids(str(record.get("publisher") or "")):
            fields["publisher"] = publisher
        if venue_ids:
            # Keep the bracketed venue identifiers for downstream tracing.
            fields["note"] = f"opencitations_venue_ids = {{{venue_ids}}}"

        citation_key = self._citation_key(doi, openalex, authors, year, title)
        return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)

    def get_citations(self, work_id: str, relation_type: str = "cites", limit: int = 10) -> List[CitationEdge]:
        """Fetch citation edges for a DOI from the Index API.

        Both Index endpoints ('references' for outgoing, 'citations' for
        incoming) return rows with explicit 'citing' and 'cited' identifiers,
        so every edge is emitted in canonical citing->cited ("cites")
        direction regardless of which endpoint was queried. BUG FIX: the
        previous implementation had an if/else on relation_type whose two
        branches were byte-identical — collapsed into the single assignment.

        Args:
            work_id: DOI (with or without 'doi:' prefix)
            relation_type: 'cites' for outgoing references, anything else
                (e.g. 'cited_by') for incoming citations
            limit: Maximum number of edges returned

        Returns:
            List of CitationEdge objects (possibly empty)
        """
        normalized = self._normalize_doi_pid(work_id)
        if not normalized:
            return []
        path = "references" if relation_type == "cites" else "citations"
        rows = self.source_client.try_get_json(f"{self.INDEX_BASE_URL}/{path}/{normalized}")
        if not rows:
            return []

        edges: List[CitationEdge] = []
        for row in rows[:limit]:
            citing = self._extract_id_value(str(row.get("citing") or ""), "doi")
            cited = self._extract_id_value(str(row.get("cited") or ""), "doi")
            if not citing or not cited:
                continue
            edges.append(
                CitationEdge(
                    source_work_id=f"doi:{citing}",
                    target_work_id=f"doi:{cited}",
                    relation_type="cites",
                    source_type="opencitations",
                    source_label=f"opencitations:{path}:{normalized}",
                    confidence=0.85,
                )
            )
        return edges

    def get_identifier_scheme(self) -> str:
        """DOI is the primary identifier scheme for this source."""
        return "doi"

    def _normalize_doi_pid(self, value: str) -> str:
        """Return the OpenCitations PID form 'doi:<doi>' ('' for blank input)."""
        doi = value.strip()
        if not doi:
            return ""
        if doi.lower().startswith("doi:"):
            doi = doi[4:]
        return f"doi:{doi}"

    def _extract_id_value(self, identifiers: str, scheme: str) -> str:
        """Pull the value for *scheme* out of a space-separated id list, or ''."""
        prefix = f"{scheme}:"
        for token in identifiers.split():
            if token.startswith(prefix):
                return token[len(prefix):]
        return ""

    def _extract_year(self, pub_date: str) -> str:
        """First four digits of a pub_date string, or '' when absent."""
        pub_date = pub_date.strip()
        if len(pub_date) >= 4 and pub_date[:4].isdigit():
            return pub_date[:4]
        return ""

    def _normalize_author_field(self, raw_authors: str) -> str:
        """Convert the semicolon-separated Meta author field to BibTeX 'and' form."""
        authors: List[str] = []
        for part in raw_authors.split(";"):
            cleaned = self._strip_bracketed_ids(part)
            cleaned = " ".join(cleaned.split())
            if cleaned:
                authors.append(cleaned)
        return " and ".join(authors)

    def _parse_venue_field(self, raw_venue: str) -> tuple[str, str]:
        """Split a venue field into (title, bracketed identifier string)."""
        raw_venue = raw_venue.strip()
        if not raw_venue:
            return "", ""
        if "[" not in raw_venue:
            return raw_venue, ""
        title, _, remainder = raw_venue.partition("[")
        return title.strip(), remainder.rstrip("] ").strip()

    def _strip_bracketed_ids(self, value: str) -> str:
        """Drop any trailing '[...]' identifier block from a Meta field."""
        return value.split("[", 1)[0].strip()

    def _map_entry_type(self, raw_type: str) -> str:
        """Map an OpenCitations type string to the closest BibTeX entry type."""
        lowered = raw_type.casefold()
        if lowered == "journal article":
            return "article"
        if lowered == "book":
            return "book"
        if lowered == "book chapter":
            return "incollection"
        if lowered in {"proceedings article", "conference paper"}:
            return "inproceedings"
        if "thesis" in lowered or "dissertation" in lowered:
            return "phdthesis"
        return "misc"

    def _citation_key(self, doi: str, openalex: str, authors: str, year: str, title: str) -> str:
        """Alphanumeric citation key: DOI, then OpenAlex id, then family+year+word."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if openalex:
            return "openalex" + "".join(ch for ch in openalex.lower() if ch.isalnum())
        family = authors.split(" and ")[0].split(",")[0].split()[-1] if authors else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
"""Open Library source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
from citegeist.sources._old_sources_compat import SourceClient
|
||||
|
||||
|
||||
class OpenLibrarySource(BibliographicSource):
    """Open Library source for broad book and monograph metadata."""

    SEARCH_URL = "https://openlibrary.org/search.json"
    WORK_URL = "https://openlibrary.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent`` and
                ``source_client`` (injectable, e.g. for tests).
        """
        super().__init__(config)
        user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Open Library is not queried by DOI; always returns None."""
        return None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best match for a title search, or None."""
        matches = self.search(title, limit=1)
        return matches[0] if matches else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search Open Library by title and normalize the result docs.

        Args:
            query: Free-text title query (whitespace is collapsed).
            limit: Maximum number of results to request (floored at 1).

        Returns:
            Normalized entries; empty on blank query or request failure.
        """
        title = " ".join(query.split())
        if not title:
            return []
        params = urllib.parse.urlencode({"title": title, "limit": max(1, limit), "fields": "*"})
        payload = self.source_client.try_get_json(f"{self.SEARCH_URL}?{params}")
        if not payload:
            return []
        docs = payload.get("docs", [])
        if not isinstance(docs, list):
            return []
        return [entry for record in docs if isinstance(record, dict) and (entry := self.normalize(record)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Convert an Open Library search doc into a book BibEntry.

        Returns:
            A BibEntry, or None when the doc has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        authors = self._join_list(record.get("author_name"))
        year = self._extract_year(record)
        publishers = self._join_list(record.get("publisher"))
        work_key = str(record.get("key") or "").strip()
        edition_keys = record.get("edition_key") or []
        isbn_values = record.get("isbn") or []

        fields: Dict[str, str] = {"title": title}
        if authors:
            fields["author"] = authors
        if year:
            fields["year"] = year
        if publishers:
            fields["publisher"] = publishers
        if work_key:
            fields["openlibrary_work"] = work_key
            # Work keys look like "/works/OL...W", so simple concatenation
            # yields the public URL.
            fields["url"] = f"{self.WORK_URL}{work_key}"
        if isinstance(edition_keys, list) and edition_keys:
            fields["openlibrary_edition"] = str(edition_keys[0])
        if isinstance(isbn_values, list) and isbn_values:
            fields["isbn"] = str(isbn_values[0])

        return BibEntry(
            entry_type="book",
            citation_key=self._citation_key(work_key, authors, year, title),
            fields=fields,
        )

    def get_identifier_scheme(self) -> str:
        """Return the identifier scheme for this source ("openlibrary")."""
        return "openlibrary"

    def _extract_year(self, record: Dict[str, Any]) -> str:
        """Pick the first publish year, falling back to the publish_year list."""
        first_publish_year = record.get("first_publish_year")
        if first_publish_year:
            return str(first_publish_year)
        publish_year = record.get("publish_year")
        if isinstance(publish_year, list) and publish_year:
            return str(publish_year[0])
        return ""

    def _join_list(self, value: Any) -> str:
        """Join a list field with " and ", skipping blank items ("" if not a list)."""
        if not isinstance(value, list):
            return ""
        items = [str(item).strip() for item in value if str(item).strip()]
        return " and ".join(items)

    def _citation_key(self, work_key: str, authors: str, year: str, title: str) -> str:
        """Build a deterministic citation key, preferring the OL work key.

        Falls back to "<family><year><first-title-word>" with placeholders
        for missing components.
        """
        if work_key:
            return "ol" + "".join(ch for ch in work_key.lower() if ch.isalnum())
        # Guard against author strings that are non-empty but contain no
        # word tokens: bare [-1] indexing would raise IndexError.
        name_tokens = authors.split(" and ")[0].split() if authors else []
        family = name_tokens[-1] if name_tokens else "book"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "book"
        title_words = title.split()
        first_word = "".join(ch for ch in (title_words[0] if title_words else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"
|
||||
|
|
@ -0,0 +1,253 @@
|
|||
"""
|
||||
Source registry for managing bibliographic source plugins.
|
||||
|
||||
This module provides a registry that can discover, load, and manage
|
||||
multiple bibliographic source plugins.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import inspect
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
@dataclass(slots=True)
class SourceRegistration:
    """Registration information for a source plugin."""

    # Registry key the source is stored under (defaults to the class name).
    name: str
    # Concrete plugin class; must be a BibliographicSource subclass.
    source_class: Type[BibliographicSource]
    # Configuration dict passed to the class constructor on instantiation.
    config: Dict[str, Any]
    # When False, SourceRegistry.get() refuses to instantiate the source.
    enabled: bool
|
||||
|
||||
|
||||
class SourceRegistry:
    """Registry for bibliographic source plugins.

    This class manages the discovery, registration, and instantiation
    of bibliographic source plugins. Instances are created lazily on
    first lookup and cached per source name.
    """

    def __init__(self) -> None:
        """Initialize an empty source registry."""
        # Registration metadata keyed by source name.
        self._registrations: Dict[str, SourceRegistration] = {}
        # Lazily created source instances, cached by name.
        self._instances: Dict[str, BibliographicSource] = {}

    def register(
        self,
        source_class: Type[BibliographicSource],
        name: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Register a source class.

        Args:
            source_class: The source class to register (must inherit from BibliographicSource)
            name: Optional name for the source (uses class name if not provided)
            config: Optional configuration dictionary; its ``enabled`` key
                (default True) controls whether :meth:`get` will instantiate
                the source

        Raises:
            ValueError: If source_class is not a BibliographicSource subclass.
        """
        if not inspect.isclass(source_class) or not issubclass(source_class, BibliographicSource):
            raise ValueError(f"{source_class} must be a subclass of BibliographicSource")

        source_name = name or source_class.__name__
        # Drop any cached instance so a re-registration with new config is
        # not shadowed by a stale object built from the old config.
        self._instances.pop(source_name, None)
        self._registrations[source_name] = SourceRegistration(
            name=source_name,
            source_class=source_class,
            config=config or {},
            enabled=config.get('enabled', True) if config else True,
        )

    def get(self, name: str) -> Optional[BibliographicSource]:
        """Get a source instance by name.

        Args:
            name: Name of the source

        Returns:
            Source instance if registered and enabled, None otherwise
        """
        registration = self._registrations.get(name)
        if registration is None:
            return None

        # Return cached instance if available.
        if name in self._instances:
            return self._instances[name]

        # Disabled sources are never instantiated.
        if not registration.enabled:
            return None

        instance = registration.source_class(config=registration.config)
        self._instances[name] = instance
        return instance

    def list_sources(self, enabled_only: bool = False) -> List[str]:
        """List registered source names.

        Args:
            enabled_only: Only return enabled sources

        Returns:
            List of source names
        """
        if enabled_only:
            return [name for name, reg in self._registrations.items() if reg.enabled]
        return list(self._registrations)

    def get_config(self, name: str) -> Optional[Dict[str, Any]]:
        """Get configuration for a source.

        Args:
            name: Name of the source

        Returns:
            Configuration dictionary, or None if not found
        """
        registration = self._registrations.get(name)
        return registration.config if registration else None

    def load_from_file(self, filepath: str) -> None:
        """Load source plugins from a Python file.

        Every BibliographicSource subclass visible in the module (including
        imported ones) is registered under its class name.

        Args:
            filepath: Path to Python file containing source classes

        Raises:
            ImportError: If the file cannot be loaded as a module.
        """
        spec = importlib.util.spec_from_file_location("module.sources", filepath)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load module from {filepath}")

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Find all classes that inherit from BibliographicSource.
        for _member_name, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, BibliographicSource) and obj is not BibliographicSource:
                self.register(obj)

    def load_from_directory(self, directory: str) -> None:
        """Load source plugins from a directory.

        Args:
            directory: Path to directory containing source plugin files
        """
        import os
        # Sort for a deterministic registration order across filesystems.
        for filename in sorted(os.listdir(directory)):
            if filename.endswith('.py') and not filename.startswith('_'):
                self.load_from_file(os.path.join(directory, filename))

    def from_config_dict(self, config: Dict[str, Any]) -> None:
        """Load sources from a configuration dictionary.

        Example config format:
            {
                "sources": {
                    "crossref": {
                        "source_type": "crossref",
                        "enabled": true
                    },
                    "semantic_scholar": {
                        "source_type": "semantic_scholar",
                        "enabled": true,
                        "api_key": "..."
                    }
                }
            }

        Args:
            config: Configuration dictionary
        """
        if 'sources' not in config:
            return

        for name, source_config in config['sources'].items():
            source_name = str(name)
            source_type = str(source_config.get('source_type', source_name))
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=source_config,
            )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize registry to dictionary.

        Returns:
            Mapping of source name to {'enabled': ..., 'config': ...}.
        """
        return {
            name: {
                'enabled': reg.enabled,
                'config': reg.config,
            }
            for name, reg in self._registrations.items()
        }

    def from_dict(self, data: Dict[str, Any]) -> None:
        """Load registry from dictionary.

        Accepts both the nested shape produced by :meth:`to_dict`
        ({'enabled': ..., 'config': {...}}) and a flat per-source mapping.

        Args:
            data: Dictionary representation of registry
        """
        for name, source_data in data.items():
            source_name = str(name)
            config = dict(source_data.get('config', source_data))
            # Preserve a top-level 'enabled' flag: to_dict() stores it
            # outside 'config', so without this a disabled source would
            # round-trip as enabled.
            if 'enabled' in source_data and 'enabled' not in config:
                config['enabled'] = source_data['enabled']
            # Look for source_type at both levels for the same reason.
            source_type = str(source_data.get('source_type') or config.get('source_type') or source_name)
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=config,
            )

    def get_registered_sources(self) -> List[SourceRegistration]:
        """Get all registered source registrations.

        Returns:
            List of SourceRegistration objects
        """
        return list(self._registrations.values())

    def _resolve_source_class(self, source_type: str) -> Type[BibliographicSource]:
        """Map a source-type string onto its plugin class (lazy imports).

        Raises:
            ValueError: If the source type is not recognized.
        """
        normalized = source_type.strip().lower().replace('-', '_')
        if normalized in {'crossref', 'cross_ref'}:
            from citegeist.sources.crossref import CrossRefSource

            return CrossRefSource
        if normalized in {'opencitations', 'open_citations'}:
            from citegeist.sources.opencitations import OpenCitationsSource

            return OpenCitationsSource
        if normalized == 'unpaywall':
            from citegeist.sources.unpaywall import UnpaywallSource

            return UnpaywallSource
        if normalized in {'europepmc', 'europe_pmc'}:
            from citegeist.sources.europepmc import EuropePmcSource

            return EuropePmcSource
        if normalized in {'semanticscholar', 'semantic_scholar'}:
            from citegeist.sources.semanticscholar import SemanticScholarSource

            return SemanticScholarSource
        if normalized in {"openlibrary", "open_library"}:
            from citegeist.sources.openlibrary import OpenLibrarySource

            return OpenLibrarySource
        raise ValueError(f"Unknown source type: {source_type}")
|
||||
|
||||
|
||||
# Module-level singleton so all callers share one registry.
_global_registry = SourceRegistry()


def get_registry() -> SourceRegistry:
    """Get the global source registry instance.

    Returns:
        The module-level SourceRegistry shared across the process.
    """
    return _global_registry
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
"""Semantic Scholar source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
class SemanticScholarSource(BibliographicSource):
    """Semantic Scholar source for broad scientific metadata coverage."""

    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    # Fields requested from the Graph API for both lookup and search.
    DEFAULT_FIELDS = (
        "paperId,title,year,abstract,authors,externalIds,journal,venue,url,"
        "openAccessPdf,citationCount,publicationTypes"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``api_key`` (falls
                back to the SEMANTIC_SCHOLAR_API_KEY environment variable)
                and ``user_agent``.
        """
        super().__init__(config)
        self.api_key = str(
            self.config.get("api_key")
            or os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
            or ""
        ).strip()
        self.user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a single paper by DOI.

        Returns:
            A normalized BibEntry, or None on blank DOI, miss, or error.
        """
        normalized = doi.strip()
        if not normalized:
            return None
        encoded = urllib.parse.quote(f"DOI:{normalized}", safe="")
        payload = self._get_json(f"{self.BASE_URL}/paper/{encoded}?fields={self.DEFAULT_FIELDS}")
        if not payload:
            return None
        return self.normalize(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the best title-search match, or None."""
        matches = self.search(title, limit=1)
        return matches[0] if matches else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search papers by relevance.

        Args:
            query: Free-text query (whitespace is collapsed).
            limit: Maximum number of results to request (floored at 1).

        Returns:
            Normalized entries; empty on blank query or request failure.
        """
        query_text = " ".join(query.split())
        if not query_text:
            return []
        params = urllib.parse.urlencode(
            {"query": query_text, "limit": max(1, limit), "fields": self.DEFAULT_FIELDS}
        )
        payload = self._get_json(f"{self.BASE_URL}/paper/search?{params}")
        if not payload:
            return []
        return [entry for row in payload.get("data", []) if (entry := self.normalize(row)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Convert a Graph API paper record into a BibEntry.

        Returns:
            A BibEntry, or None when the record has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        external_ids = record.get("externalIds") or {}
        doi = str(external_ids.get("DOI") or "").strip()
        # "authors" may be present but null in API responses; coalesce to []
        # like the other nullable fields above.
        authors = " and ".join(
            str(author.get("name") or "").strip()
            for author in (record.get("authors") or [])
            if str(author.get("name") or "").strip()
        )
        year = str(record.get("year") or "").strip()
        abstract = str(record.get("abstract") or "").strip()
        journal = record.get("journal") or {}
        journal_name = str(journal.get("name") or record.get("venue") or "").strip()
        open_access_pdf = record.get("openAccessPdf") or {}

        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if paper_id := str(record.get("paperId") or "").strip():
            fields["semanticscholar_id"] = paper_id
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if abstract:
            fields["abstract"] = abstract
        if journal_name:
            # Conference venues go in "booktitle" per BibTeX convention.
            if self._entry_type(record) == "inproceedings":
                fields["booktitle"] = journal_name
            else:
                fields["journal"] = journal_name
        if url := str(open_access_pdf.get("url") or record.get("url") or "").strip():
            fields["url"] = url
        if open_access_pdf:
            fields["is_oa"] = "true"
        if citation_count := record.get("citationCount"):
            fields["semanticscholar_citation_count"] = str(citation_count)

        citation_key = self._citation_key(doi, str(record.get("paperId") or ""), authors, year, title)
        return BibEntry(entry_type=self._entry_type(record), citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the best URL (OA PDF preferred) for a DOI, or None."""
        entry = self.lookup_by_doi(doi)
        if entry is None:
            return None
        return entry.fields.get("url")

    def get_identifier_scheme(self) -> str:
        """Return the identifier scheme for this source ("doi")."""
        return "doi"

    def _entry_type(self, record: Dict[str, Any]) -> str:
        """Infer a BibTeX entry type from publicationTypes and venue data."""
        publication_types = [str(item).lower() for item in (record.get("publicationTypes") or [])]
        if any("conference" in item for item in publication_types):
            return "inproceedings"
        if any("review" in item for item in publication_types):
            return "article"
        if record.get("journal") or record.get("venue"):
            return "article"
        return "misc"

    def _citation_key(self, doi: str, paper_id: str, authors: str, year: str, title: str) -> str:
        """Build a deterministic citation key, preferring DOI, then paper id.

        Falls back to "<family><year><first-title-word>" with placeholders
        for missing components.
        """
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if paper_id:
            return "s2" + "".join(ch for ch in paper_id.lower() if ch.isalnum())
        # Guard against author strings that are non-empty but contain no
        # word tokens: bare [-1] indexing would raise IndexError.
        name_tokens = authors.split(" and ")[0].split() if authors else []
        family = name_tokens[-1] if name_tokens else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        title_words = title.split()
        first_word = "".join(ch for ch in (title_words[0] if title_words else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

    def _get_json(self, url: str) -> Dict[str, Any] | None:
        """Fetch and decode JSON from the API; best-effort (None on any error)."""
        headers = {"User-Agent": self.user_agent}
        if self.api_key:
            headers["x-api-key"] = self.api_key
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read().decode("utf-8"))
        except Exception:
            # Deliberate best-effort: network/HTTP/JSON failures all yield
            # None so callers degrade gracefully.
            return None
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
"""Unpaywall source plugin."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.sources._old_sources_compat import SourceClient
|
||||
from citegeist.sources.base import BibliographicSource
|
||||
|
||||
|
||||
class UnpaywallSource(BibliographicSource):
    """Unpaywall source for DOI-based OA link enrichment."""

    BASE_URL = "https://api.unpaywall.org/v2"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source.

        Args:
            config: Optional configuration; recognizes ``user_agent``,
                ``source_client`` (injectable, e.g. for tests), and
                ``email`` (falls back to UNPAYWALL_EMAIL, then NCBI_EMAIL).
        """
        super().__init__(config)
        # Coerce through str() for consistency with the other source plugins.
        user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)
        # Unpaywall requires a contact email on every request.
        self.email = str(
            self.config.get("email")
            or os.environ.get("UNPAYWALL_EMAIL")
            or os.environ.get("NCBI_EMAIL")
            or ""
        ).strip()

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Fetch and normalize the OA record for a DOI (None on miss)."""
        payload = self.lookup_oa_record(doi)
        if not payload:
            return None
        return self.normalize(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Unpaywall has no title lookup; always returns None."""
        return None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Unpaywall has no search endpoint; always returns []."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Convert an Unpaywall record into a BibEntry of OA metadata.

        Returns:
            A BibEntry, or None when the record lacks a DOI.
        """
        doi = str(record.get("doi") or "").strip()
        # Synthesize a title when the record lacks one but has a DOI.
        title = str(record.get("title") or "").strip() or (doi and f"OA record for DOI {doi}")
        if not doi or not title:
            return None

        fields: Dict[str, str] = {
            "title": title,
            "doi": doi,
        }
        if year := str(record.get("year") or "").strip():
            fields["year"] = year
        if landing_url := self._best_landing_url(record):
            fields["url"] = landing_url
            fields["best_oa_url"] = landing_url
        if pdf_url := self._best_pdf_url(record):
            fields["best_oa_pdf_url"] = pdf_url
        if oa_status := str(record.get("oa_status") or "").strip():
            fields["oa_status"] = oa_status
        if license_name := self._best_license(record):
            fields["oa_license"] = license_name
        if host_type := self._best_host_type(record):
            fields["oa_host_type"] = host_type
        if version := self._best_version(record):
            fields["oa_version"] = version
        if evidence := self._best_evidence(record):
            fields["oa_evidence"] = evidence
        if record.get("is_oa") is not None:
            fields["is_oa"] = "true" if bool(record.get("is_oa")) else "false"

        citation_key = "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        return BibEntry(entry_type="misc", citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return the best OA PDF (preferred) or landing-page URL for a DOI."""
        payload = self.lookup_oa_record(doi)
        if not payload:
            return None
        return self._best_pdf_url(payload) or self._best_landing_url(payload)

    def get_identifier_scheme(self) -> str:
        """Return the identifier scheme for this source ("doi")."""
        return "doi"

    def is_available(self) -> bool:
        """The source is usable only when enabled and a contact email is set."""
        return self.enabled and bool(self.email)

    def lookup_oa_record(self, doi: str) -> Dict[str, Any] | None:
        """Fetch the raw Unpaywall JSON record for a DOI.

        Returns:
            The decoded payload, or None on blank DOI, missing email, or
            request failure.
        """
        normalized = doi.strip()
        if not normalized or not self.email:
            return None
        encoded = urllib.parse.quote(normalized, safe="")
        query = urllib.parse.urlencode({"email": self.email})
        return self.source_client.try_get_json(f"{self.BASE_URL}/{encoded}?{query}")

    def _best_location_value(self, payload: Dict[str, Any], *keys: str) -> str:
        """Return the first non-empty value among ``keys`` in best_oa_location.

        Shared helper behind the ``_best_*`` accessors below; "" when the
        location is missing or no key has a non-blank value.
        """
        location = payload.get("best_oa_location") or {}
        for key in keys:
            value = str(location.get(key) or "").strip()
            if value:
                return value
        return ""

    def _best_landing_url(self, payload: Dict[str, Any]) -> str:
        """Best landing URL, preferring "url" over "url_for_landing_page"."""
        return self._best_location_value(payload, "url", "url_for_landing_page")

    def _best_pdf_url(self, payload: Dict[str, Any]) -> str:
        """Direct PDF URL of the best OA location, if any."""
        return self._best_location_value(payload, "url_for_pdf")

    def _best_license(self, payload: Dict[str, Any]) -> str:
        """License of the best OA location, if any."""
        return self._best_location_value(payload, "license")

    def _best_host_type(self, payload: Dict[str, Any]) -> str:
        """Host type of the best OA location, if any."""
        return self._best_location_value(payload, "host_type")

    def _best_version(self, payload: Dict[str, Any]) -> str:
        """Manuscript version of the best OA location, if any."""
        return self._best_location_value(payload, "version")

    def _best_evidence(self, payload: Dict[str, Any]) -> str:
        """Evidence string of the best OA location, if any."""
        return self._best_location_value(payload, "evidence")
|
||||
|
|
@ -138,6 +138,7 @@ class TalkOriginsEnrichmentResult:
|
|||
applied: bool
|
||||
source_label: str = ""
|
||||
weak_reasons_after: list[str] | None = None
|
||||
resolution_attempts: list[dict[str, object]] | None = None
|
||||
conflicts: list[dict[str, str]] | None = None
|
||||
error: str = ""
|
||||
|
||||
|
|
@ -545,8 +546,28 @@ class TalkOriginsScraper:
|
|||
if not weak_reasons_before:
|
||||
continue
|
||||
resolution = None
|
||||
attempts: list[dict[str, object]] = []
|
||||
error = ""
|
||||
try:
|
||||
resolver_with_trace = getattr(self.resolver, "resolve_entry_with_trace", None)
|
||||
resolver_plain = getattr(self.resolver, "resolve_entry", None)
|
||||
plain_func = getattr(resolver_plain, "__func__", None)
|
||||
trace_func = getattr(resolver_with_trace, "__func__", None)
|
||||
use_trace = (
|
||||
resolver_with_trace is not None
|
||||
and (
|
||||
trace_func is None
|
||||
or (
|
||||
plain_func is MetadataResolver.resolve_entry
|
||||
and trace_func is MetadataResolver.resolve_entry_with_trace
|
||||
)
|
||||
)
|
||||
)
|
||||
if use_trace:
|
||||
outcome = self.resolver.resolve_entry_with_trace(canonical)
|
||||
resolution = outcome.resolution
|
||||
attempts = [asdict(attempt) for attempt in outcome.attempts]
|
||||
else:
|
||||
resolution = self.resolver.resolve_entry(canonical)
|
||||
except Exception as exc:
|
||||
error = str(exc)
|
||||
|
|
@ -559,6 +580,7 @@ class TalkOriginsScraper:
|
|||
applied=False,
|
||||
source_label=resolution.source_label if resolution is not None else "",
|
||||
error=error,
|
||||
resolution_attempts=attempts,
|
||||
)
|
||||
|
||||
if resolution is not None:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,123 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.resolve import MetadataResolver
|
||||
from citegeist.sources import EuropePmcSource, SourceRegistry, list_source_catalog
|
||||
|
||||
|
||||
def test_europepmc_source_normalizes_core_record() -> None:
    """normalize() maps a Europe PMC core record onto BibEntry fields."""
    raw_record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "pmcid": "PMC10000001",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
        "journalVolume": "16",
        "issue": "1",
        "pageInfo": "10-20",
        "abstractText": "Abstract text.",
        "isOpenAccess": "Y",
        "citedByCount": 12,
        "fullTextUrlList": {"fullTextUrl": [{"url": "https://europepmc.org/articles/PMC10000001?pdf=render"}]},
    }

    entry = EuropePmcSource(config={}).normalize(raw_record)

    assert entry is not None
    expected_fields = {
        "doi": "10.1000/example",
        "pmid": "37158217",
        "pmcid": "PMC10000001",
        "journal": "Biomed Journal",
        "url": "https://europepmc.org/articles/PMC10000001?pdf=render",
        "is_oa": "true",
    }
    for field_name, expected_value in expected_fields.items():
        assert entry.fields[field_name] == expected_value
|
||||
|
||||
|
||||
def test_europepmc_registry_and_catalog() -> None:
    """Config-driven registration yields a EuropePmcSource; the catalog tracks it."""
    registry_config = {
        "sources": {
            "europepmc": {
                "source_type": "europepmc",
                "enabled": True,
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(registry_config)
    assert isinstance(registry.get("europepmc"), EuropePmcSource)

    catalog = {item.key: item for item in list_source_catalog()}
    europe_pmc_entry = catalog["europe_pmc"]
    assert europe_pmc_entry.current_status == "integrated"
    assert europe_pmc_entry.priority == "now"
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_europepmc_doi_after_primary_lookups_fail() -> None:
    """When Crossref/DataCite miss, the resolver falls back to Europe PMC by DOI."""
    from citegeist.bibtex import BibEntry

    europepmc_record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
    }

    resolver = MetadataResolver()
    # Force the primary DOI lookups to miss so the Europe PMC path runs.
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.europepmc.lookup_by_doi = lambda _doi: resolver.europepmc.normalize(europepmc_record)  # type: ignore[method-assign]

    seed_entry = BibEntry(
        entry_type="article",
        citation_key="seed2024",
        fields={"doi": "10.1000/example", "title": "Biomedical Example"},
    )
    result = resolver.resolve_entry(seed_entry)

    assert result is not None
    assert result.source_label == "europepmc:doi:10.1000/example"
    assert result.entry.fields["pmid"] == "37158217"
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_europepmc_title_search_after_pubmed() -> None:
    """When all earlier title searches miss, the Europe PMC search is used."""
    from citegeist.bibtex import BibEntry

    europepmc_record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
    }

    resolver = MetadataResolver()
    # Knock out every earlier title-search backend so Europe PMC is reached.
    resolver.search_crossref_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_datacite_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_openalex_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_pubmed_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.europepmc.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.europepmc.normalize(europepmc_record)
    ]

    seed_entry = BibEntry(
        entry_type="article",
        citation_key="seed2024",
        fields={"title": "Biomedical Example", "author": "Doe J", "year": "2024"},
    )
    result = resolver.resolve_entry(seed_entry)

    assert result is not None
    assert result.source_label == "europepmc:search:Biomedical Example"
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.expand import OpenCitationsExpander
|
||||
from citegeist.sources import OpenCitationsSource
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
def test_opencitations_source_normalizes_metadata_row() -> None:
    """normalize() splits OpenCitations compound ids, authors, and venue."""
    metadata_row = {
        "id": "doi:10.1000/example openalex:W1234567890 omid:br/06123",
        "title": "Example Work",
        "author": "Doe, Jane [omid:ra/1]; Roe, Alex [omid:ra/2]",
        "pub_date": "2024-05",
        "venue": "Journal of Examples [issn:1234-5678]",
        "volume": "12",
        "issue": "3",
        "page": "10-20",
        "type": "journal article",
        "publisher": "Example Press [crossref:123]",
    }

    entry = OpenCitationsSource(config={}).normalize(metadata_row)

    assert entry is not None
    assert entry.fields["doi"] == "10.1000/example"
    assert entry.fields["openalex"] == "W1234567890"
    assert entry.fields["author"] == "Doe, Jane and Roe, Alex"
    assert entry.fields["journal"] == "Journal of Examples"
    assert entry.fields["publisher"] == "Example Press"
    assert entry.fields["year"] == "2024"
|
||||
|
||||
|
||||
def test_opencitations_source_builds_edges_for_references() -> None:
    """A raw reference row from the API becomes a single DOI-keyed citation edge."""
    source = OpenCitationsSource(config={})
    stub_rows = [
        {
            "oci": "1-2",
            "citing": "omid:br/1 doi:10.1000/source",
            "cited": "omid:br/2 doi:10.1000/target",
            "creation": "2024-01-01",
        }
    ]
    # No network: every API call is answered with the canned row above.
    source.source_client.get_json = lambda _url: list(stub_rows)  # type: ignore[method-assign]

    edges = source.get_citations("10.1000/source", relation_type="cites", limit=10)

    assert len(edges) == 1
    edge = edges[0]
    assert edge.source_work_id == "doi:10.1000/source"
    assert edge.target_work_id == "doi:10.1000/target"
|
||||
|
||||
|
||||
def test_opencitations_expander_creates_reference_nodes_and_relations() -> None:
    """Expanding a seed entry's references stores the cited work and a cites-relation.

    The OpenCitations client is stubbed per endpoint: /references/ URLs return one
    citation row, any other URL returns the cited work's metadata record.  DOI
    resolvers are disabled so the discovered entry's metadata can only come from
    the stubbed metadata endpoint.
    """
    store = BibliographyStore()
    try:
        # Seed the store with the citing work (DOI 10.1000/source).
        store.ingest_bibtex(
            """
@article{seed2024,
    author = {Seed, Alice},
    title = {Seed Paper},
    year = {2024},
    doi = {10.1000/source}
}
"""
        )

        expander = OpenCitationsExpander()
        # Dispatch on the requested URL: citation rows vs. work metadata.
        expander.source.source_client.get_json = lambda url: [  # type: ignore[method-assign]
            {
                "oci": "1-2",
                "citing": "omid:br/1 doi:10.1000/source",
                "cited": "omid:br/2 doi:10.1000/target",
                "creation": "2024-01-01",
            }
        ] if "/references/" in url else [
            {
                "id": "doi:10.1000/target omid:br/2",
                "title": "Target Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2023",
                "venue": "Journal of Targets [issn:1111-1111]",
                "type": "journal article",
            }
        ]
        # Disable external DOI resolution so only the stub supplies metadata.
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

        results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=10)

        # The cited work is keyed by a slug derived from its DOI.
        assert [item.discovered_citation_key for item in results] == ["doi101000target"]
        discovered = store.get_entry("doi101000target")
        assert discovered is not None
        assert discovered["title"] == "Target Work"
        # The seed now points at the discovered work.
        assert store.get_relations("seed2024") == ["doi101000target"]
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_opencitations_expander_supports_cited_by_direction() -> None:
    """Expansion in the cited_by direction links the discovered citer back to the seed.

    Mirror of the references test: here the /citations/ endpoint is stubbed with
    one incoming citation, so the discovered "Citing Work" should end up with a
    relation pointing at the seed entry.
    """
    store = BibliographyStore()
    try:
        # Seed the store with the cited work (DOI 10.1000/seed).
        store.ingest_bibtex(
            """
@article{seed2024,
    author = {Seed, Alice},
    title = {Seed Paper},
    year = {2024},
    doi = {10.1000/seed}
}
"""
        )

        expander = OpenCitationsExpander()
        # Dispatch on the requested URL: incoming-citation rows vs. work metadata.
        expander.source.source_client.get_json = lambda url: [  # type: ignore[method-assign]
            {
                "oci": "2-1",
                "citing": "omid:br/2 doi:10.1000/citing",
                "cited": "omid:br/1 doi:10.1000/seed",
                "creation": "2024-01-01",
            }
        ] if "/citations/" in url else [
            {
                "id": "doi:10.1000/citing omid:br/2",
                "title": "Citing Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2025",
                "venue": "Journal of Citers [issn:1111-1111]",
                "type": "journal article",
            }
        ]
        # Disable external DOI resolution so only the stub supplies metadata.
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

        results = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=10)

        assert [item.discovered_citation_key for item in results] == ["doi101000citing"]
        # Edge direction is reversed relative to the "cites" case: citer -> seed.
        assert store.get_relations("doi101000citing") == ["seed2024"]
    finally:
        store.close()
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.resolve import MetadataResolver
|
||||
from citegeist.sources import OpenLibrarySource, SourceRegistry, list_source_catalog
|
||||
|
||||
|
||||
class FakeSourceClient:
    """Minimal HTTP-client stand-in that answers every request with one canned payload."""

    def __init__(self, payload: dict[str, object]) -> None:
        # Stored by reference; callers receive shallow copies from try_get_json.
        self.payload = payload

    def try_get_json(self, _url: str) -> dict[str, object]:
        """Ignore the URL and return a shallow copy of the configured payload."""
        return {**self.payload}
|
||||
|
||||
|
||||
def test_openlibrary_source_normalizes_book_record() -> None:
    """A raw Open Library search doc becomes a BibTeX book entry with mapped fields."""
    raw_doc = {
        "title": "The Nature of the Stratigraphic Record",
        "author_name": ["D. V. Ager"],
        "first_publish_year": 1973,
        "publisher": ["Macmillan"],
        "key": "/works/OL82563W",
        "edition_key": ["OL12345M"],
        "isbn": ["9781234567890"],
    }

    # No HTTP traffic: the fake client answers every request with an empty payload.
    source = OpenLibrarySource(config={"source_client": FakeSourceClient({})})
    entry = source.normalize(raw_doc)

    assert entry is not None
    assert entry.entry_type == "book"
    expected = {
        "title": "The Nature of the Stratigraphic Record",
        "author": "D. V. Ager",
        "year": "1973",
        "publisher": "Macmillan",
        "openlibrary_work": "/works/OL82563W",
        "openlibrary_edition": "OL12345M",
        "isbn": "9781234567890",
    }
    for field_name, value in expected.items():
        assert entry.fields[field_name] == value
|
||||
|
||||
|
||||
def test_openlibrary_registry_and_catalog() -> None:
    """The registry builds an OpenLibrarySource from config; the catalog marks it integrated."""
    config = {
        "sources": {
            "openlibrary": {
                "source_type": "openlibrary",
                "enabled": True,
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("openlibrary"), OpenLibrarySource)

    catalog = {item.key: item for item in list_source_catalog()}
    open_library = catalog["open_library"]
    assert open_library.current_status == "integrated"
    assert "book_metadata" in open_library.capabilities
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_openlibrary_after_other_searches_fail() -> None:
    """When every other title search is empty, the resolver falls back to Open Library."""
    resolver = MetadataResolver()
    # All higher-priority search backends report no candidates.
    for search_name in (
        "search_crossref",
        "search_datacite",
        "search_openalex",
        "search_pubmed",
        "search_europepmc",
        "search_semanticscholar",
    ):
        setattr(resolver, search_name, lambda _title, limit=5: [])

    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The Nature of the Stratigraphic Record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]

    seed = BibEntry(
        entry_type="book",
        citation_key="seed1973",
        fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
    )
    result = resolver.resolve_entry(seed)

    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
|
||||
|
||||
|
||||
def test_metadata_resolver_trace_records_fallback_attempts() -> None:
    """The resolution trace lists the failed backends before the Open Library match."""
    resolver = MetadataResolver()
    # Every higher-priority title search misses.
    for search_name in (
        "search_crossref",
        "search_datacite",
        "search_openalex",
        "search_pubmed",
        "search_europepmc",
        "search_semanticscholar",
    ):
        setattr(resolver, search_name, lambda _title, limit=5: [])

    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    ]

    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="book",
            citation_key="seed1980",
            fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
        )
    )

    assert outcome.resolution is not None
    assert outcome.resolution.source_label == "openlibrary:search:Example Book"
    # The final two attempts are the last miss and the successful fallback.
    trailing_sources = [attempt.source_name for attempt in outcome.attempts[-2:]]
    assert trailing_sources == ["semanticscholar", "openlibrary"]
    final_attempt = outcome.attempts[-1]
    assert final_attempt.matched is True
    assert final_attempt.candidate_count == 1
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_fuzzy_catalog_match_for_book_titles() -> None:
    """An Open Library title with a minor spelling variant still counts as a match."""
    resolver = MetadataResolver()
    # All higher-priority search backends report no candidates.
    for search_name in (
        "search_crossref",
        "search_datacite",
        "search_openalex",
        "search_pubmed",
        "search_europepmc",
        "search_semanticscholar",
    ):
        setattr(resolver, search_name, lambda _title, limit=5: [])

    # Note the variant spelling "stratigraphical" vs. the seed's "Stratigraphic".
    resolver.search_openlibrary = lambda _title, limit=5: [  # type: ignore[method-assign]
        BibEntry(
            entry_type="book",
            citation_key="olworks123",
            fields={
                "title": "The nature of the stratigraphical record",
                "author": "D. V. Ager",
                "year": "1973",
                "openlibrary_work": "/works/OL82563W",
            },
        )
    ]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="book",
            citation_key="seed1973",
            fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
        )
    )

    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
|
||||
|
||||
|
||||
def test_metadata_resolver_skips_openlibrary_for_article_like_entries() -> None:
    """Journal-article entries never trigger an Open Library lookup."""
    resolver = MetadataResolver()
    # All other search backends report no candidates.
    for search_name in (
        "search_crossref",
        "search_datacite",
        "search_openalex",
        "search_pubmed",
        "search_europepmc",
        "search_semanticscholar",
    ):
        setattr(resolver, search_name, lambda _title, limit=5: [])

    # Record every title Open Library is asked about; it should stay empty.
    openlibrary_queries: list[str] = []

    def fake_openlibrary(_title: str, limit: int = 5) -> list[BibEntry]:
        openlibrary_queries.append(_title)
        return []

    resolver.search_openlibrary = fake_openlibrary  # type: ignore[method-assign]

    outcome = resolver.resolve_entry_with_trace(
        BibEntry(
            entry_type="article",
            citation_key="seed1977",
            fields={
                "title": "Fast locomotion of some African ungulates",
                "author": "Alexander, R. M.",
                "year": "1977",
                "journal": "Journal of Zoology",
            },
        )
    )

    assert outcome.resolution is None
    assert openlibrary_queries == []
    assert all(attempt.source_name != "openlibrary" for attempt in outcome.attempts)
|
||||
|
|
@ -0,0 +1,201 @@
|
|||
"""Tests for identifier resolution and normalization."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from citegeist.resolver import (
|
||||
IdentifierExtractor,
|
||||
IdentifierNormalizer,
|
||||
IdentifierResolver,
|
||||
extract_identifiers,
|
||||
normalize_identifier,
|
||||
get_primary_identifier,
|
||||
resolve_identifiers,
|
||||
)
|
||||
|
||||
|
||||
class TestIdentifierExtractor:
    """Behaviour of IdentifierExtractor.extract on raw entry fields."""

    def test_extract_from_entry(self):
        """Identifier fields are picked out; descriptive fields are dropped."""
        entry_fields = {
            'doi': '10.1234/example',
            'title': 'Test Title',
            'author': 'John Doe',
            'pmid': '123456',
        }

        identifiers = IdentifierExtractor.extract(entry_fields)

        assert identifiers.get('doi') == '10.1234/example'
        assert identifiers.get('pmid') == '123456'
        # Descriptive metadata such as the title is not an identifier.
        assert 'title' not in identifiers

    def test_extract_multiple_identifiers(self):
        """Every supported identifier scheme present in the fields is extracted."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '123456',
            'arxiv': '2310.12345',
            'isbn': '978-0-123456-78-9',
        }

        identifiers = IdentifierExtractor.extract(entry_fields)

        # All four schemes survive extraction unchanged, and nothing extra appears.
        assert identifiers == entry_fields
|
||||
|
||||
|
||||
class TestIdentifierNormalizer:
    """Normalization rules for each supported identifier scheme."""

    def test_normalize_doi(self):
        """DOIs are lowercased; strings without a DOI shape are rejected."""
        for raw, expected in [
            ('10.1234/EXAMPLE', '10.1234/example'),
            ('10.1234/test', '10.1234/test'),
            ('invalid', None),
        ]:
            assert IdentifierNormalizer.normalize_doi(raw) == expected

    def test_normalize_pmid(self):
        """Numeric PMIDs pass through; non-numeric strings are rejected."""
        for raw, expected in [
            ('12345', '12345'),
            ('1234567', '1234567'),
            ('invalid', None),
        ]:
            assert IdentifierNormalizer.normalize_pmid(raw) == expected

    def test_normalize_pmcid(self):
        """PMCIDs are lowercased; strings without the PMC prefix are rejected."""
        for raw, expected in [
            ('PMC12345', 'pmc12345'),
            ('PMCabcdef', 'pmcabcdef'),
            ('invalid', None),
        ]:
            assert IdentifierNormalizer.normalize_pmcid(raw) == expected

    def test_normalize_arxiv(self):
        """Version suffixes are stripped from arXiv ids; malformed ids are rejected."""
        for raw, expected in [
            ('2310.12345', '2310.12345'),
            ('2310.12345v1', '2310.12345'),
            ('INVALID', None),
        ]:
            assert IdentifierNormalizer.normalize_arxiv(raw) == expected

    def test_normalize_orcid(self):
        """Only the canonical hyphenated ORCID form is accepted."""
        assert IdentifierNormalizer.normalize_orcid('0000-0001-2345-6789') == '0000-0001-2345-6789'
        # Space-separated groups are not the canonical format, nor is free text.
        for bad in ('0000 0001 2345 6789', 'invalid'):
            assert IdentifierNormalizer.normalize_orcid(bad) is None

    def test_normalize_identifier(self):
        """Scheme dispatch: known schemes yield a (scheme, value) pair, unknown yield None."""
        assert IdentifierNormalizer.normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')
        assert IdentifierNormalizer.normalize_identifier('pmid', '12345') == ('pmid', '12345')
        assert IdentifierNormalizer.normalize_identifier('invalid', 'value') is None
|
||||
|
||||
|
||||
class TestIdentifierResolver:
    """Resolution of entry fields into (scheme, value) identifier tuples."""

    def test_resolve_with_doi(self):
        """An entry carrying a DOI yields at least one doi-scheme tuple."""
        resolved = IdentifierResolver.resolve({'doi': '10.1234/example', 'title': 'Test Title'})

        assert len(resolved) >= 1
        assert any(item[0] == 'doi' for item in resolved)

    def test_resolve_with_multiple_identifiers(self):
        """Several identifier fields resolve to several tuples, DOI among them."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'arxiv': '2310.12345',
        }

        resolved = IdentifierResolver.resolve(entry_fields)

        assert len(resolved) >= 2
        assert any(item[0] == 'doi' for item in resolved)

    def test_resolve_without_identifiers(self):
        """Entries with no identifier fields still get a title fingerprint."""
        resolved = IdentifierResolver.resolve({'title': 'Test Title', 'author': 'John Doe'})

        assert len(resolved) >= 1
        assert any(item[0] == 'title' for item in resolved)

    def test_get_primary_identifier(self):
        """DOI outranks PMID when both identifiers are present."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'title': 'Test Title',
        }

        primary = IdentifierResolver.get_primary_identifier(entry_fields)

        assert primary is not None
        assert primary[0] == 'doi'

    def test_get_scheme_value(self):
        """Lookup by scheme returns the stored value, or None when absent."""
        entry_fields = {
            'doi': '10.1234/example',
            'pmid': '12345',
        }

        assert IdentifierResolver.get_scheme_value('doi', entry_fields) == '10.1234/example'
        assert IdentifierResolver.get_scheme_value('pmid', entry_fields) == '12345'
        assert IdentifierResolver.get_scheme_value('isbn', entry_fields) is None
|
||||
|
||||
|
||||
class TestConvenienceFunctions:
    """The module-level wrapper functions mirror the class-based API."""

    def test_extract_identifiers(self):
        """extract_identifiers surfaces every identifier field."""
        identifiers = extract_identifiers({'doi': '10.1234/example', 'pmid': '12345'})

        assert 'doi' in identifiers
        assert 'pmid' in identifiers

    def test_normalize_identifier(self):
        """normalize_identifier returns the normalized (scheme, value) pair."""
        assert normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')

    def test_get_primary_identifier(self):
        """get_primary_identifier picks the best available identifier."""
        assert get_primary_identifier({'doi': '10.1234/example'}) == ('doi', '10.1234/example')

    def test_resolve_identifiers(self):
        """resolve_identifiers yields at least one tuple for a DOI-bearing entry."""
        assert len(resolve_identifiers({'doi': '10.1234/example'})) > 0
|
||||
|
|
@ -0,0 +1,117 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.resolve import MetadataResolver
|
||||
from citegeist.sources import SemanticScholarSource, SourceRegistry, list_source_catalog
|
||||
|
||||
|
||||
def test_semanticscholar_source_normalizes_record() -> None:
    """A Semantic Scholar paper record is normalized into BibTeX-style fields."""
    raw_record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "abstract": "Abstract text.",
        "authors": [{"name": "Jane Doe"}, {"name": "Alex Roe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "openAccessPdf": {"url": "https://example.org/paper.pdf"},
        "citationCount": 42,
        "publicationTypes": ["JournalArticle"],
    }

    entry = SemanticScholarSource(config={}).normalize(raw_record)

    assert entry is not None
    expected = {
        "doi": "10.1000/physics",
        "author": "Jane Doe and Alex Roe",
        "journal": "Physical Review Example",
        "url": "https://example.org/paper.pdf",
        "is_oa": "true",
        "semanticscholar_citation_count": "42",
    }
    for field_name, value in expected.items():
        assert entry.fields[field_name] == value
|
||||
|
||||
|
||||
def test_semanticscholar_registry_and_catalog() -> None:
    """The registry builds a SemanticScholarSource; the catalog marks it integrated."""
    config = {
        "sources": {
            "semanticscholar": {
                "source_type": "semanticscholar",
                "enabled": True,
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("semanticscholar"), SemanticScholarSource)

    catalog = {item.key: item for item in list_source_catalog()}
    semantic_scholar = catalog["semantic_scholar"]
    assert semantic_scholar.current_status == "integrated"
    assert semantic_scholar.priority == "now"
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_semanticscholar_doi_after_other_lookups_fail() -> None:
    """When earlier DOI lookups miss, the Semantic Scholar DOI lookup resolves the entry."""
    from citegeist.bibtex import BibEntry

    resolver = MetadataResolver()
    # Every earlier DOI lookup in the chain reports a miss.
    for lookup_name in ("resolve_doi", "resolve_datacite_doi", "resolve_europepmc_doi"):
        setattr(resolver, lookup_name, lambda _doi: None)

    s2_record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "authors": [{"name": "Jane Doe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "publicationTypes": ["JournalArticle"],
    }

    def fake_lookup_by_doi(_doi):
        return resolver.semanticscholar.normalize(s2_record)

    resolver.semanticscholar.lookup_by_doi = fake_lookup_by_doi  # type: ignore[method-assign]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"doi": "10.1000/physics", "title": "Physics Example"},
        )
    )

    assert result is not None
    assert result.source_label == "semanticscholar:doi:10.1000/physics"
    assert result.entry.fields["journal"] == "Physical Review Example"
|
||||
|
||||
|
||||
def test_metadata_resolver_uses_semanticscholar_title_search_after_other_searches_fail() -> None:
    """When every best-match title search misses, the Semantic Scholar search wins."""
    from citegeist.bibtex import BibEntry

    resolver = MetadataResolver()
    # All higher-priority best-match searches report no result.
    for search_name in (
        "search_crossref_best_match",
        "search_datacite_best_match",
        "search_openalex_best_match",
        "search_pubmed_best_match",
        "search_europepmc_best_match",
    ):
        setattr(resolver, search_name, lambda *args, **kwargs: None)

    s2_record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "authors": [{"name": "Jane Doe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "publicationTypes": ["JournalArticle"],
    }
    resolver.semanticscholar.search = lambda _title, limit=5: [  # type: ignore[method-assign]
        resolver.semanticscholar.normalize(s2_record)
    ]

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"title": "Physics Example", "author": "Jane Doe", "year": "2024"},
        )
    )

    assert result is not None
    assert result.source_label == "semanticscholar:search:Physics Example"
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.sources import CrossRefSource, OpenCitationsSource, SourceRegistry, list_source_catalog, prioritized_source_keys
|
||||
|
||||
|
||||
def test_catalog_prioritizes_existing_core_sources() -> None:
    """The six already-integrated core sources occupy the first priority slots."""
    expected_core = ["crossref", "datacite", "europe_pmc", "openalex", "open_citations", "pubmed"]
    assert prioritized_source_keys()[:6] == expected_core
|
||||
|
||||
|
||||
def test_catalog_includes_open_citation_and_access_sources() -> None:
    """OpenCitations and Unpaywall are catalogued; OpenCitations is a 'now' priority."""
    catalog = {item.key: item for item in list_source_catalog()}

    assert "open_citations" in catalog
    assert "unpaywall" in catalog

    open_citations = catalog["open_citations"]
    assert open_citations.priority == "now"
    assert "doi_citations" in open_citations.capabilities
|
||||
|
||||
|
||||
def test_registry_loads_known_source_from_config() -> None:
    """A config dict with a known source_type yields an instance of that source class."""
    config = {
        "sources": {
            "crossref": {
                "source_type": "crossref",
                "enabled": True,
            }
        }
    }

    registry = SourceRegistry()
    registry.from_config_dict(config)

    assert isinstance(registry.get("crossref"), CrossRefSource)
|
||||
|
||||
|
||||
def test_registry_rejects_unknown_source_type() -> None:
    """Loading a config whose source_type is unrecognised raises ValueError."""
    registry = SourceRegistry()
    bad_config = {"sources": {"mystery": {"source_type": "mystery"}}}

    caught: ValueError | None = None
    try:
        registry.from_config_dict(bad_config)
    except ValueError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("expected ValueError for unknown source type")
    assert "Unknown source type" in str(caught)
|
||||
|
||||
|
||||
def test_registry_loads_opencitations_from_config() -> None:
    """The opencitations source_type is wired up in the registry's factory table."""
    config = {
        "sources": {
            "opencitations": {
                "source_type": "opencitations",
                "enabled": True,
            }
        }
    }

    registry = SourceRegistry()
    registry.from_config_dict(config)

    assert isinstance(registry.get("opencitations"), OpenCitationsSource)
|
||||
|
|
@ -0,0 +1,171 @@
|
|||
"""Tests for the source plugin architecture."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from citegeist.sources import BibliographicSource, SourceRegistry, CrossRefSource
|
||||
|
||||
|
||||
class MockSource(BibliographicSource):
    """Never-matching source double that records every lookup it receives."""

    def __init__(self, config: dict | None = None):
        super().__init__(config)
        # Chronological (kind, value) pairs, one per lookup call.
        self.lookup_calls: list[tuple[str, str]] = []

    def _record(self, kind: str, value: str) -> None:
        """Append one lookup to the call log."""
        self.lookup_calls.append((kind, value))

    def lookup_by_doi(self, doi: str) -> None:
        """Log the DOI lookup and report not-found (None)."""
        self._record('doi', doi)

    def lookup_by_title(self, title: str) -> None:
        """Log the title lookup and report not-found (None)."""
        self._record('title', title)

    def search(self, query: str, limit: int = 10) -> list:
        """Searches never yield results."""
        return []

    def normalize(self, record: dict) -> None:
        """Nothing can be normalized by this stub."""
        return None
|
||||
|
||||
|
||||
def test_source_base_interface():
    """BibliographicSource defaults: available, derived scheme name, no optional extras."""
    source = MockSource()

    assert source.is_available()
    # 'mocksource' matches the lowercased subclass name.
    assert source.get_identifier_scheme() == 'mocksource'
    # Optional capabilities are absent by default.
    assert source.get_fulltext_url('doi:test') is None
    assert source.get_embedding('doi:test') is None
|
||||
|
||||
|
||||
def test_mock_source():
    """MockSource logs lookups in call order as (kind, value) pairs."""
    source = MockSource()
    source.lookup_by_doi('10.1234/test')
    source.lookup_by_title('Test Title')

    expected_log = [('doi', '10.1234/test'), ('title', 'Test Title')]
    assert source.lookup_calls == expected_log
|
||||
|
||||
|
||||
def test_source_registry():
    """A registered, enabled source is listed and returned as a usable instance."""
    registry = SourceRegistry()
    registry.register(MockSource, name='mock_source', config={'enabled': True})

    assert 'mock_source' in registry.list_sources()

    instance = registry.get('mock_source')
    assert instance is not None
    assert isinstance(instance, MockSource)
    assert instance.is_available()
|
||||
|
||||
|
||||
def test_source_registry_disabled():
    """Disabled sources remain listed but cannot be obtained via get()."""
    registry = SourceRegistry()
    registry.register(
        MockSource,
        name='disabled_source',
        config={'enabled': False},
    )

    # Registration is visible regardless of the enabled flag...
    assert 'disabled_source' in registry.list_sources()
    # ...but fetching a disabled source yields nothing.
    assert registry.get('disabled_source') is None
|
||||
|
||||
|
||||
def test_crossref_source():
    """CrossRef plugin registers, reports the doi scheme, and normalizes a works message."""
    registry = SourceRegistry()
    registry.register(CrossRefSource, name='crossref', config={})

    source = registry.get('crossref')
    assert source is not None
    assert source.is_available()
    assert source.get_identifier_scheme() == 'doi'

    works_message = {
        'message': {
            'DOI': '10.1234/example',
            'title': ['Test Title'],
            'author': [{'given': 'Jane', 'family': 'Doe'}],
            'published-print': {'date-parts': [[2024]]},
            'container-title': ['Journal of Tests'],
            'publisher': 'Test Publisher',
            'URL': 'https://doi.org/10.1234/example',
            'abstract': '<jats:p>Example abstract</jats:p>',
        }
    }
    entry = source.normalize(works_message)

    assert entry is not None
    expected = {
        'doi': '10.1234/example',
        'title': 'Test Title',
        'year': '2024',
        'journal': 'Journal of Tests',
    }
    for field_name, value in expected.items():
        assert entry.fields[field_name] == value
|
||||
|
||||
|
||||
def test_crossref_search_item_normalization():
    """Bare search items (no 'message' wrapper) normalize too, using 'issued' for the year."""
    search_item = {
        'DOI': '10.1234/example',
        'title': ['Search Result'],
        'author': [{'family': 'Doe'}],
        'issued': {'date-parts': [[2023]]},
    }

    entry = CrossRefSource().normalize(search_item)

    assert entry is not None
    assert entry.fields['doi'] == '10.1234/example'
    assert entry.fields['year'] == '2023'
|
||||
|
||||
|
||||
def test_source_record():
    """SourceRecord keeps the raw payload and provenance fields exactly as given."""
    from citegeist.sources import SourceRecord

    record = SourceRecord(
        raw={'test': 'data'},
        source_type='test',
        source_label='test_source',
        timestamp='2024-01-01',
        confidence=1.0,
    )

    assert (record.source_type, record.source_label) == ('test', 'test_source')
    assert record.confidence == 1.0
    assert record.raw == {'test': 'data'}
|
||||
|
||||
|
||||
def test_citation_edge():
    """CitationEdge stores the relation type and confidence unchanged."""
    from citegeist.sources import CitationEdge

    edge = CitationEdge(
        source_work_id='doi:10.1234',
        target_work_id='doi:10.5678',
        relation_type='cites',
        source_type='crossref',
        source_label='crossref:test',
        confidence=0.9,
    )

    assert edge.relation_type == 'cites'
    assert edge.confidence == 0.9
|
||||
|
|
@ -530,6 +530,88 @@ def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Pat
|
|||
assert results[0].weak_reasons_after == []
|
||||
|
||||
|
||||
def test_talkorigins_enrich_weak_canonicals_includes_resolution_attempts(tmp_path: Path):
    """Preview-mode enrichment surfaces the resolver's per-source attempt trace.

    The scraper is wired to canned HTML pages, the first seed bib is overwritten
    with two weak duplicate entries (differing only by a note), and the tracing
    resolver is stubbed to return one matched crossref attempt.  The serialized
    attempt dicts — including the empty "error" field — must appear verbatim in
    the enrichment results.
    """
    base_url = "https://www.talkorigins.org/origins/biblio/"
    # All HTTP fetches are served from this canned page map; no network access.
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Replace the first seed set with two near-identical entries so that one
    # weak-duplicate canonical is produced; empty the second seed set.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
    author = "Smith, Jane",
    year = "1999",
    title = "Weak Duplicate"
}

@misc{weak2,
    author = "Smith, Jane",
    year = "1999",
    title = "Weak Duplicate",
    note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")

    from citegeist.resolve import Resolution, ResolutionAttempt, ResolutionOutcome

    # Stub the tracing resolver: echo the weak entry's author/title/year back
    # with a DOI and journal added, and report exactly one matched attempt.
    scraper.resolver.resolve_entry_with_trace = lambda entry: ResolutionOutcome(  # type: ignore[method-assign]
        resolution=Resolution(
            entry=BibEntry(
                entry_type="article",
                citation_key="resolved",
                fields={
                    "author": entry.fields["author"],
                    "title": entry.fields["title"],
                    "year": entry.fields["year"],
                    "doi": "10.1000/weak",
                    "journal": "Journal of Better Metadata",
                },
            ),
            source_type="resolver",
            source_label="crossref:search:Weak Duplicate",
        ),
        attempts=[
            ResolutionAttempt(
                source_name="crossref",
                strategy="title_search",
                query_value="Weak Duplicate",
                matched=True,
                candidate_count=1,
                source_label="crossref:search:Weak Duplicate",
            )
        ],
    )

    store = BibliographyStore()
    try:
        # apply=False: preview only — the store must not be modified.
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False)
    finally:
        store.close()

    assert len(results) == 1
    # The ResolutionAttempt round-trips into a plain dict, with error defaulting to "".
    assert results[0].resolution_attempts == [
        {
            "source_name": "crossref",
            "strategy": "title_search",
            "query_value": "Weak Duplicate",
            "matched": True,
            "candidate_count": 1,
            "source_label": "crossref:search:Weak Duplicate",
            "error": "",
        }
    ]
|
||||
|
||||
|
||||
def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
|
||||
base_url = "https://www.talkorigins.org/origins/biblio/"
|
||||
scraper = TalkOriginsScraper(
|
||||
|
|
@ -799,6 +881,7 @@ def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_pat
|
|||
assert review.items[0]["canonical"]["citation_key"] == "weak2"
|
||||
assert review.items[0]["enrichment"]["resolved"] is True
|
||||
assert review.items[0]["enrichment"]["applied"] is False
|
||||
assert review.items[0]["enrichment"]["resolution_attempts"] == []
|
||||
|
||||
|
||||
def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,117 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from citegeist.cli import _run_enrich_oa
|
||||
from citegeist.sources import SourceRegistry, UnpaywallSource, list_source_catalog, prioritized_source_keys
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
def test_unpaywall_source_normalizes_oa_record() -> None:
    """Normalizing a raw Unpaywall payload must surface the OA metadata fields."""
    payload = {
        "doi": "10.1000/example",
        "title": "Example Article",
        "year": 2024,
        "is_oa": True,
        "oa_status": "gold",
        "best_oa_location": {
            "url": "https://example.org/article",
            "url_for_pdf": "https://example.org/article.pdf",
            "license": "cc-by",
            "host_type": "publisher",
            "version": "publishedVersion",
            "evidence": "open (via free pdf)",
        },
    }

    source = UnpaywallSource(config={"email": "tester@example.org"})
    entry = source.normalize(payload)

    assert entry is not None
    # Every flattened field the normalizer is expected to produce.
    expected_fields = {
        "doi": "10.1000/example",
        "best_oa_url": "https://example.org/article",
        "best_oa_pdf_url": "https://example.org/article.pdf",
        "oa_status": "gold",
        "oa_license": "cc-by",
        "is_oa": "true",
    }
    for field_name, expected_value in expected_fields.items():
        assert entry.fields[field_name] == expected_value
|
||||
|
||||
|
||||
def test_unpaywall_registry_and_catalog() -> None:
    """Registry config should yield an UnpaywallSource; the catalog should rank it."""
    config = {
        "sources": {
            "unpaywall": {
                "source_type": "unpaywall",
                "enabled": True,
                "email": "tester@example.org",
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(config)
    assert isinstance(registry.get("unpaywall"), UnpaywallSource)

    # Catalog metadata: Unpaywall is integrated and scheduled in the "now" tier.
    catalog_by_key = {item.key: item for item in list_source_catalog()}
    unpaywall_entry = catalog_by_key["unpaywall"]
    assert unpaywall_entry.current_status == "integrated"
    assert unpaywall_entry.priority == "now"
    assert "unpaywall" in prioritized_source_keys()
|
||||
|
||||
|
||||
def test_run_enrich_oa_updates_entry() -> None:
    """_run_enrich_oa should write OA fields and provenance onto a stored entry."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/example}
}
"""
        )

        saved_lookup = UnpaywallSource.lookup_by_doi

        def stub_lookup(self: UnpaywallSource, doi: str):
            # Canned green-OA record so the test never touches the network.
            payload = {
                "doi": doi,
                "title": "Seed Paper",
                "year": 2024,
                "is_oa": True,
                "oa_status": "green",
                "best_oa_location": {
                    "url": "https://repository.example.org/seed",
                    "url_for_pdf": "https://repository.example.org/seed.pdf",
                    "license": "cc-by",
                    "host_type": "repository",
                    "version": "acceptedVersion",
                    "evidence": "oa repository",
                },
            }
            return self.normalize(payload)

        UnpaywallSource.lookup_by_doi = stub_lookup  # type: ignore[method-assign]
        try:
            assert _run_enrich_oa(store, ["seed2024"], "tester@example.org") == 0
        finally:
            # Always restore the real lookup so other tests are unaffected.
            UnpaywallSource.lookup_by_doi = saved_lookup  # type: ignore[method-assign]

        entry = store.get_entry("seed2024")
        assert entry is not None
        expected_fields = {
            "best_oa_url": "https://repository.example.org/seed",
            "best_oa_pdf_url": "https://repository.example.org/seed.pdf",
            "oa_status": "green",
            "oa_host_type": "repository",
        }
        for field_name, expected_value in expected_fields.items():
            assert entry[field_name] == expected_value
        provenance = store.get_field_provenance("seed2024")
        assert any(record["source_type"] == "oa_enrich" for record in provenance)
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_run_enrich_oa_requires_email() -> None:
    """Without an Unpaywall contact email the CLI helper must fail fast with exit code 1."""
    store = BibliographyStore()
    try:
        exit_code = _run_enrich_oa(store, ["missing"], None)
        assert exit_code == 1
    finally:
        store.close()
|
||||
Loading…
Reference in New Issue