Compare commits
No commits in common. "89bc56a7aa48fc5310f532281a6f36d8074cc8f0" and "39fe5ea86cac3f65128a304cda47f8bbf2e79869" have entirely different histories.
89bc56a7aa
...
39fe5ea86c
|
|
@ -1,185 +0,0 @@
|
||||||
-- Migration: Multi-source bibliographic schema
-- Description: Add multi-source support with works, identifiers, source
-- records, citations, and embeddings.

-- ============================================================================
-- WORKS TABLE - one row per canonical bibliographic work
-- ============================================================================
CREATE TABLE IF NOT EXISTS works (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id TEXT NOT NULL UNIQUE,                -- application-level canonical id (TEXT natural key)
    -- Core bibliographic metadata.
    title TEXT,
    abstract TEXT,
    publication_year INTEGER,
    publication_date TEXT,
    journal_name TEXT,
    publisher TEXT,
    volume TEXT,
    issue TEXT,
    pages TEXT,
    -- Per-scheme external identifiers (denormalized convenience columns;
    -- the multi-scheme mapping lives in work_identifiers).
    doi TEXT,
    pmid TEXT,
    pmcid TEXT,
    arxiv_id TEXT,
    dblp_key TEXT,
    openalex_id TEXT,
    isbn TEXT,
    issn TEXT,
    entry_type TEXT NOT NULL DEFAULT 'article',  -- BibTeX-style entry type
    -- Citation statistics.
    citation_count INTEGER DEFAULT 0,
    cited_by_count INTEGER DEFAULT 0,
    influential_citations INTEGER DEFAULT 0,
    -- Open-access information.
    is_open_access BOOLEAN DEFAULT 0,
    best_oa_url TEXT,
    -- Timestamps stored as TEXT in SQLite's CURRENT_TIMESTAMP format.
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
|
|
||||||
|
|
||||||
-- ============================================================================
-- WORK IDENTIFIERS TABLE - Mapping scheme + value to works
-- ============================================================================
CREATE TABLE IF NOT EXISTS work_identifiers (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id TEXT NOT NULL,          -- the owning work's TEXT key (works.work_id)
    scheme TEXT NOT NULL,           -- identifier scheme, e.g. 'doi', 'pmid', 'arxiv'
    value TEXT NOT NULL,            -- identifier value as received from the source
    is_primary BOOLEAN DEFAULT 0,   -- flags the preferred identifier for the work
    normalized_value TEXT,          -- canonicalized form of value, if computed
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(work_id, scheme, value),
    -- BUG FIX: this FK previously referenced works(id) (the INTEGER rowid)
    -- while work_id here is TEXT, so with foreign_keys=ON no row could ever
    -- satisfy the constraint. Reference the TEXT natural key instead.
    FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
|
|
||||||
|
|
||||||
-- ============================================================================
-- SOURCE RECORDS TABLE - Raw API responses with provenance
-- ============================================================================
CREATE TABLE IF NOT EXISTS source_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id TEXT NOT NULL,           -- the owning work's TEXT key (works.work_id)
    source_type TEXT NOT NULL,       -- source category, e.g. 'crossref', 'openalex'
    source_label TEXT NOT NULL,      -- human-readable label for this fetch
    raw_data_json TEXT NOT NULL,     -- verbatim API response, JSON-encoded
    raw_record_id TEXT,              -- the record's id within the remote source, if any
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(work_id, source_type, source_label),
    -- BUG FIX: previously referenced works(id) (INTEGER rowid) while work_id
    -- is TEXT; with foreign_keys=ON every insert would fail. Reference the
    -- TEXT natural key instead.
    FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
|
|
||||||
|
|
||||||
-- ============================================================================
-- CITATIONS TABLE - Citation graph with provenance
-- ============================================================================
CREATE TABLE IF NOT EXISTS citations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_work_id TEXT NOT NULL,    -- citing work (works.work_id)
    target_work_id TEXT NOT NULL,    -- cited work (works.work_id)
    relation_type TEXT NOT NULL,     -- edge kind, e.g. 'cites'
    source_type TEXT NOT NULL,       -- provenance: which source reported the edge
    source_label TEXT NOT NULL,
    confidence REAL DEFAULT 1.0,     -- edge confidence in [0, 1]
    is_verified BOOLEAN DEFAULT 0,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(source_work_id, target_work_id, relation_type),
    -- BUG FIX: both FKs previously referenced works(id) (INTEGER rowid)
    -- while the columns are TEXT; with foreign_keys=ON no edge could be
    -- inserted. Reference the TEXT natural key instead.
    FOREIGN KEY (source_work_id) REFERENCES works(work_id) ON DELETE CASCADE,
    FOREIGN KEY (target_work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
|
|
||||||
|
|
||||||
-- ============================================================================
-- WORK EMBEDDINGS TABLE - Vector storage for semantic search
-- ============================================================================
CREATE TABLE IF NOT EXISTS work_embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id TEXT NOT NULL,           -- the owning work's TEXT key (works.work_id)
    embedding TEXT NOT NULL,         -- serialized vector (TEXT in SQLite; pgvector elsewhere)
    model_name TEXT NOT NULL,        -- embedding model that produced the vector
    model_version TEXT,
    dimension INTEGER NOT NULL,      -- vector length; callers should validate on read
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(work_id, model_name),
    -- BUG FIX: previously referenced works(id) (INTEGER rowid) while work_id
    -- is TEXT; with foreign_keys=ON every insert would fail. Reference the
    -- TEXT natural key instead.
    FOREIGN KEY (work_id) REFERENCES works(work_id) ON DELETE CASCADE
);
|
|
||||||
|
|
||||||
-- ============================================================================
-- INDEXES - performance optimization for the lookup patterns used by the
-- resolver (identifier lookups), the expander (citation traversal), and the
-- embedding pipeline (per-work / per-model retrieval).
-- ============================================================================

-- work_identifiers: lookups by scheme, value, owning work, and normalized form.
CREATE INDEX IF NOT EXISTS idx_work_identifiers_scheme ON work_identifiers(scheme);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_value ON work_identifiers(value);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_work_id ON work_identifiers(work_id);
CREATE INDEX IF NOT EXISTS idx_work_identifiers_normalized ON work_identifiers(normalized_value);

-- source_records: per-work and per-source-type retrieval.
CREATE INDEX IF NOT EXISTS idx_source_records_work_id ON source_records(work_id);
CREATE INDEX IF NOT EXISTS idx_source_records_source_type ON source_records(source_type);

-- citations: forward/backward graph traversal plus edge filtering.
CREATE INDEX IF NOT EXISTS idx_citations_source_work ON citations(source_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_target_work ON citations(target_work_id);
CREATE INDEX IF NOT EXISTS idx_citations_relation_type ON citations(relation_type);
CREATE INDEX IF NOT EXISTS idx_citations_source_type ON citations(source_type);

-- works: per-scheme identifier lookup and common filters.
CREATE INDEX IF NOT EXISTS idx_works_doi ON works(doi);
CREATE INDEX IF NOT EXISTS idx_works_pmid ON works(pmid);
CREATE INDEX IF NOT EXISTS idx_works_pmcid ON works(pmcid);
CREATE INDEX IF NOT EXISTS idx_works_arxiv ON works(arxiv_id);
CREATE INDEX IF NOT EXISTS idx_works_openalex ON works(openalex_id);
CREATE INDEX IF NOT EXISTS idx_works_is_open_access ON works(is_open_access);
CREATE INDEX IF NOT EXISTS idx_works_created_at ON works(created_at);

-- work_embeddings: per-work and per-model retrieval.
CREATE INDEX IF NOT EXISTS idx_embeddings_work_id ON work_embeddings(work_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_model_name ON work_embeddings(model_name);
|
|
||||||
|
|
||||||
-- ============================================================================
|
|
||||||
-- PostgreSQL-specific extensions and vector indexing
|
|
||||||
-- ============================================================================
|
|
||||||
-- Note: The following are PostgreSQL-specific and should be run when using pgvector
|
|
||||||
|
|
||||||
-- Uncomment these when using PostgreSQL with pgvector extension:
|
|
||||||
-- CREATE EXTENSION IF NOT EXISTS vector;
|
|
||||||
-- CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON work_embeddings
|
|
||||||
-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
|
|
||||||
|
|
||||||
-- ============================================================================
-- TRIGGERS - For automatic timestamp updates
-- ============================================================================
-- Keep works.updated_at current whenever a row is modified.
-- BUG FIX: the original condition was WHEN (new.updated_at IS NULL), which can
-- never be true because updated_at is declared NOT NULL — the trigger was dead
-- code and updated_at never changed after insert. Fire instead whenever the
-- updating statement did not itself set updated_at; an explicit caller-supplied
-- timestamp is still respected.
CREATE TRIGGER IF NOT EXISTS works_updated_at
AFTER UPDATE ON works
FOR EACH ROW
WHEN new.updated_at = old.updated_at
BEGIN
    UPDATE works SET updated_at = CURRENT_TIMESTAMP WHERE id = old.id;
END;
|
|
||||||
|
|
||||||
-- Work identifiers backfill trigger.
-- NOTE(review): created_at is declared NOT NULL with a default, so the WHEN
-- clause below can never be true and this trigger never fires in practice.
-- It is preserved verbatim as a guard in case the NOT NULL constraint is ever
-- relaxed; confirm whether an updated_at column was intended here instead.
CREATE TRIGGER IF NOT EXISTS work_identifiers_updated_at
AFTER UPDATE ON work_identifiers
FOR EACH ROW
WHEN (new.created_at IS NULL)
BEGIN
    UPDATE work_identifiers SET created_at = CURRENT_TIMESTAMP WHERE id = old.id;
END;
|
|
||||||
|
|
||||||
-- ============================================================================
-- VIEWS - For simplified queries
-- ============================================================================
-- Works joined with a concatenated list of their multi-scheme identifiers.
-- BUG FIXES:
--   1. The join previously compared w.id (INTEGER rowid) with wi.work_id
--      (TEXT), which never matches in SQLite; join on the shared TEXT
--      work_id key instead.
--   2. SQLite rejects GROUP_CONCAT(DISTINCT expr, separator) — a DISTINCT
--      aggregate must take exactly one argument — so the custom ', '
--      separator is dropped; identifiers are now ','-separated.
CREATE VIEW IF NOT EXISTS works_with_identifiers AS
SELECT
    w.id,
    w.work_id,
    w.title,
    w.abstract,
    w.publication_year,
    w.journal_name,
    w.publisher,
    w.doi,
    w.pmid,
    w.pmcid,
    w.arxiv_id,
    w.dblp_key,
    w.openalex_id,
    GROUP_CONCAT(DISTINCT wi.scheme || ':' || wi.value) AS identifiers
FROM works w
LEFT JOIN work_identifiers wi ON w.work_id = wi.work_id
GROUP BY w.id, w.work_id, w.title, w.abstract, w.publication_year, w.journal_name, w.publisher, w.doi, w.pmid, w.pmcid, w.arxiv_id, w.dblp_key, w.openalex_id;
|
|
||||||
104
docs/README.md
104
docs/README.md
|
|
@ -1,104 +0,0 @@
|
||||||
# CiteGeist Source Planning Documentation
|
|
||||||
|
|
||||||
Welcome to the source-planning documentation for CiteGeist.
|
|
||||||
|
|
||||||
## Quick Overview
|
|
||||||
|
|
||||||
The immediate planning question is which additional open bibliographic sources should be incorporated next.
|
|
||||||
|
|
||||||
This documentation therefore emphasizes:
|
|
||||||
|
|
||||||
- the current source baseline already present in the repository
|
|
||||||
- the next highest-value open sources to add
|
|
||||||
- a smaller, more realistic source-layer abstraction
|
|
||||||
- explicit deferral of unrelated database/vector ambitions
|
|
||||||
|
|
||||||
## Documentation Files
|
|
||||||
|
|
||||||
### Planning and Status
|
|
||||||
- **[source-landscape.md](./source-landscape.md)** - recommended next open bibliographic sources
|
|
||||||
- **[implementation-progress.md](./implementation-progress.md)** - sources-first progress tracker
|
|
||||||
- **[phase-completion.md](./phase-completion.md)** - short status summary
|
|
||||||
- **[file-structure.md](./file-structure.md)** - file structure and module notes
|
|
||||||
|
|
||||||
### Existing Architecture References
|
|
||||||
- **[architecture-current.md](./architecture-current.md)** - current architecture overview
|
|
||||||
- **[schema-current.sql](./schema-current.sql)** - existing database schema
|
|
||||||
|
|
||||||
## Current Status
|
|
||||||
|
|
||||||
### Current Baseline
|
|
||||||
1. Crossref, OpenAlex, PubMed, Europe PMC, Semantic Scholar, DataCite, DBLP, arXiv, and OAI-PMH are already in play.
|
|
||||||
2. OpenCitations and Unpaywall are now integrated as source-layer additions.
|
|
||||||
3. The SQLite-based local workflow remains the baseline.
|
|
||||||
4. Notebook-ready topic bibliography bundles can now be exported with `export-notebook-topic` for downstream `Didactopus`/Notebook use.
|
|
||||||
|
|
||||||
### Recommended Next Sources
|
|
||||||
1. OpenAIRE only if repository-acquisition scope expands
|
|
||||||
|
|
||||||
### Explicitly Deferred
|
|
||||||
1. Database redesign
|
|
||||||
2. pgvector / embedding-first work
|
|
||||||
|
|
||||||
## Source Layer
|
|
||||||
|
|
||||||
The source-layer code now provides:
|
|
||||||
|
|
||||||
- `BibliographicSource` as the common interface
|
|
||||||
- `SourceRegistry` for known concrete source classes
|
|
||||||
- `CrossRefSource` as the repaired first concrete plugin
|
|
||||||
- `OpenCitationsSource` plus DOI-based graph expansion
|
|
||||||
- `UnpaywallSource` plus DOI-based OA-link enrichment
|
|
||||||
- `EuropePmcSource` plus biomedical resolver/search support
|
|
||||||
- `SemanticScholarSource` plus broader biological/physical sciences resolver/search support
|
|
||||||
- a source catalog with current status and priority order
|
|
||||||
- compatibility with the existing `SourceClient`-based resolver and expander code
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
```python
|
|
||||||
from citegeist.sources import (
|
|
||||||
CrossRefSource,
|
|
||||||
EuropePmcSource,
|
|
||||||
OpenCitationsSource,
|
|
||||||
SemanticScholarSource,
|
|
||||||
SourceRegistry,
|
|
||||||
UnpaywallSource,
|
|
||||||
list_source_catalog,
|
|
||||||
prioritized_source_keys,
|
|
||||||
)
|
|
||||||
|
|
||||||
registry = SourceRegistry()
|
|
||||||
registry.register(CrossRefSource, name="crossref", config={})
|
|
||||||
registry.register(EuropePmcSource, name="europepmc", config={})
|
|
||||||
registry.register(OpenCitationsSource, name="opencitations", config={})
|
|
||||||
registry.register(SemanticScholarSource, name="semanticscholar", config={})
|
|
||||||
registry.register(UnpaywallSource, name="unpaywall", config={"email": "you@example.org"})
|
|
||||||
|
|
||||||
source = registry.get("crossref")
|
|
||||||
catalog = list_source_catalog()
|
|
||||||
priority = prioritized_source_keys()
|
|
||||||
```
|
|
||||||
|
|
||||||
## Tests
|
|
||||||
|
|
||||||
Relevant tests for the refocused source work:
|
|
||||||
|
|
||||||
- `tests/test_sources_plugin.py`
|
|
||||||
- `tests/test_sources_catalog.py`
|
|
||||||
|
|
||||||
The existing broader repository test suite should continue to pass as the source-layer changes are integrated.
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
1. Decide whether `OpenAIRE` is worth adding for repository-acquisition breadth.
|
|
||||||
2. Keep database/vector redesign work deferred unless a source need forces it.
|
|
||||||
|
|
||||||
## License
|
|
||||||
|
|
||||||
Same as the CiteGeist project.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Last Updated:** 2026-04-25
|
|
||||||
**Status:** Sources-first plan in effect
|
|
||||||
|
|
@ -1,87 +0,0 @@
|
||||||
# CiteGeist Current Architecture
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
CiteGeist is currently designed as a local BibTeX-native tooling system with:
|
|
||||||
- BibTeX parsing and storage
|
|
||||||
- Local text search (FTS5)
|
|
||||||
- Entry provenance tracking
|
|
||||||
- Citation graph traversal
|
|
||||||
- Topic-based expansion
|
|
||||||
|
|
||||||
## Core Modules
|
|
||||||
|
|
||||||
### Source Management
|
|
||||||
- **sources.py**: `SourceClient` class for HTTP requests with caching and retry logic
|
|
||||||
- Base HTTP client with JSON/XML/text support
|
|
||||||
- Built-in retry with exponential backoff
|
|
||||||
- Cache directory support
|
|
||||||
|
|
||||||
### Metadata Resolution
|
|
||||||
- **resolve.py**: `MetadataResolver` class for entry resolution
|
|
||||||
- DOI → CrossRef lookup
|
|
||||||
- PMID → PubMed lookup
|
|
||||||
- arXiv, DBLP, OpenAlex lookup
|
|
||||||
- Title search fallback with best-match selection
|
|
||||||
- DataCite integration
|
|
||||||
- Returns `Resolution` objects with provenance
|
|
||||||
|
|
||||||
### Storage
|
|
||||||
- **storage.py**: `BibliographyStore` class (SQLite)
|
|
||||||
- Tables: entries, creators, entry_creators, identifiers, relations, topics, entry_topics, field_provenance, relation_provenance
|
|
||||||
- FTS5 text search integration
|
|
||||||
- Field-level provenance tracking
|
|
||||||
- Citation graph support (cites, cited_by edges)
|
|
||||||
|
|
||||||
### BibTeX Processing
|
|
||||||
- **bibtex.py**: BibEntry dataclass and parsing/rendering
|
|
||||||
- BibTeX → BibEntry conversion
|
|
||||||
- BibEntry → BibTeX rendering
|
|
||||||
- Citation key generation
|
|
||||||
|
|
||||||
### CLI and Server
|
|
||||||
- **cli.py**: Command-line interface
|
|
||||||
- **app_server.py**: Local HTTP server for UI/JSON API
|
|
||||||
- **app_api.py**: JSON API adapter surface
|
|
||||||
|
|
||||||
### Expansion and Discovery
|
|
||||||
- **expand.py**: Citation graph expansion workflows
|
|
||||||
- **extract.py**: Plaintext reference extraction
|
|
||||||
- **bootstrap.py**: Topic bootstrap and expansion
|
|
||||||
|
|
||||||
## Current State Summary
|
|
||||||
|
|
||||||
**Completed/Usable:**
|
|
||||||
- BibTeX parsing and storage
|
|
||||||
- Identifier-based resolution (DOI, PMID, arXiv, DBLP, OpenAlex)
|
|
||||||
- Title search with best-match selection
|
|
||||||
- Citation graph traversal and expansion
|
|
||||||
- Field provenance tracking
|
|
||||||
- Local search with FTS5
|
|
||||||
- Topic-based discovery workflows
|
|
||||||
|
|
||||||
**Not Yet Implemented (from new roadmap):**
|
|
||||||
- Plugin-based source architecture
|
|
||||||
- Multi-source record merging
|
|
||||||
- PGVector embeddings
|
|
||||||
- Full-text OA link retrieval
|
|
||||||
- Semantic Scholar integration
|
|
||||||
- OpenCitations integration
|
|
||||||
- Unified API endpoints for multi-source queries
|
|
||||||
|
|
||||||
## Data Flow
|
|
||||||
|
|
||||||
1. **Ingest**: BibTeX file → parse → store in entries table
|
|
||||||
2. **Resolve**: Entry → resolve_doi/resolve_pmid/resolve_arxiv → fetch metadata → merge with existing
|
|
||||||
3. **Expand**: Start from entry → traverse citation edges → discover new entries
|
|
||||||
4. **Search**: Query FTS5 index → retrieve relevant entries
|
|
||||||
5. **Export**: Entries → render BibTeX → output file
|
|
||||||
|
|
||||||
## Database Schema
|
|
||||||
|
|
||||||
SQLite-based storage with:
|
|
||||||
- Normalized entry fields
|
|
||||||
- Creator relationships
|
|
||||||
- Identifier mapping
|
|
||||||
- Citation relations
|
|
||||||
- Topic associations
|
|
||||||
- Field provenance metadata
|
|
||||||
|
|
@ -1,165 +0,0 @@
|
||||||
# CiteGeist Multi-Source File Structure
|
|
||||||
|
|
||||||
**Date:** 2026-04-25
|
|
||||||
|
|
||||||
## Project Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
/home/netuser/dev/CiteGeist/
|
|
||||||
├── db/
|
|
||||||
│ └── migrations/
|
|
||||||
│ └── 0001_multisource.sql ✅ NEW - Multi-source schema
|
|
||||||
│
|
|
||||||
├── docs/
|
|
||||||
│ ├── architecture-current.md ✅ NEW - Current architecture docs
|
|
||||||
│ ├── implementation-progress.md ✅ NEW - Implementation progress tracker
|
|
||||||
│ ├── schema-current.sql ✅ NEW - Current schema SQL
|
|
||||||
│ └── file-structure.md ✅ NEW - This file
|
|
||||||
│
|
|
||||||
├── src/citegeist/
|
|
||||||
│ ├── sources/ ✅ NEW - Source plugin architecture
|
|
||||||
│ │ ├── __init__.py ✅ NEW - Package exports
|
|
||||||
│ │ ├── __all__.py ✅ NEW - Public API
|
|
||||||
│ │ ├── base.py ✅ NEW - Base BibliographicSource class
|
|
||||||
│ │ ├── registry.py ✅ NEW - SourceRegistry implementation
|
|
||||||
│ │ ├── crossref.py ✅ NEW - CrossRef source plugin
|
|
||||||
│ │ └── _old_sources_compat.py ✅ NEW - Backward compatibility
|
|
||||||
│ │
|
|
||||||
│ ├── resolver/ ✅ NEW - Identifier resolution
|
|
||||||
│ │ ├── __init__.py ✅ NEW - Module exports
|
|
||||||
│ │ └── identifiers.py ✅ NEW - Extract, normalize, resolve
|
|
||||||
│ │
|
|
||||||
│ ├── db/ ✅ NEW - Database operations
|
|
||||||
│ │ └── __init__.py 🚧 TO DO - Database client
|
|
||||||
│ │
|
|
||||||
│ ├── ... (existing files)
|
|
||||||
│ ├── sources.py 📦 Existing - Old SourceClient
|
|
||||||
│ ├── resolve.py 📦 Existing - MetadataResolver
|
|
||||||
│ └── storage.py 📦 Existing - BibliographyStore
|
|
||||||
│
|
|
||||||
└── tests/
|
|
||||||
├── test_sources_plugin.py ✅ NEW - Source plugin tests
|
|
||||||
└── test_resolver_identifiers.py ✅ NEW - Identifier tests
|
|
||||||
```
|
|
||||||
|
|
||||||
## Module Documentation
|
|
||||||
|
|
||||||
### New Modules
|
|
||||||
|
|
||||||
#### `src/citegeist/sources/`
|
|
||||||
Plugin architecture for bibliographic sources.
|
|
||||||
|
|
||||||
**Classes:**
|
|
||||||
- `BibliographicSource` - Abstract base class for source plugins
|
|
||||||
- `SourceRecord` - Raw source record dataclass
|
|
||||||
- `CitationEdge` - Citation relationship dataclass
|
|
||||||
- `SourceRegistry` - Manages source plugins
|
|
||||||
|
|
||||||
**Plugin:**
|
|
||||||
- `CrossRefSource` - CrossRef API implementation
|
|
||||||
|
|
||||||
#### `src/citegeist/resolver/`
|
|
||||||
Identifier extraction, normalization, and resolution.
|
|
||||||
|
|
||||||
**Classes:**
|
|
||||||
- `IdentifierExtractor` - Extract identifiers from entry fields
|
|
||||||
- `IdentifierNormalizer` - Normalize identifiers to canonical form
|
|
||||||
- `IdentifierResolver` - Resolve identifiers with lookup priority
|
|
||||||
|
|
||||||
**Functions:**
|
|
||||||
- `extract_identifiers()` - Quick identifier extraction
|
|
||||||
- `normalize_identifier()` - Quick normalization
|
|
||||||
- `get_primary_identifier()` - Get primary identifier
|
|
||||||
- `resolve_identifiers()` - Resolve all identifiers
|
|
||||||
|
|
||||||
#### `src/citegeist/db/`
|
|
||||||
Database operations (to be implemented).
|
|
||||||
|
|
||||||
**Planned:**
|
|
||||||
- Database client for works table
|
|
||||||
- Migration runner
|
|
||||||
- Query builders
|
|
||||||
|
|
||||||
#### `db/migrations/0001_multisource.sql`
|
|
||||||
Multi-source database schema migration.
|
|
||||||
|
|
||||||
**Tables:**
|
|
||||||
1. `works` - Canonical work metadata
|
|
||||||
2. `work_identifiers` - Multi-scheme identifiers
|
|
||||||
3. `source_records` - Raw API responses
|
|
||||||
4. `citations` - Citation graph
|
|
||||||
5. `work_embeddings` - Vector embeddings
|
|
||||||
|
|
||||||
### Existing Modules (Preserved)
|
|
||||||
|
|
||||||
- `src/citegeist/sources.py` - Old SourceClient (backward compatible)
|
|
||||||
- `src/citegeist/resolve.py` - Old MetadataResolver
|
|
||||||
- `src/citegeist/storage.py` - Old BibliographyStore
|
|
||||||
|
|
||||||
## Test Coverage
|
|
||||||
|
|
||||||
**New Tests:**
|
|
||||||
- `tests/test_sources_plugin.py` (7 tests)
|
|
||||||
- `tests/test_resolver_identifiers.py` (17 tests)
|
|
||||||
|
|
||||||
**Total:** 24 tests passing
|
|
||||||
|
|
||||||
## Dependencies
|
|
||||||
|
|
||||||
**New Dependencies Required:**
|
|
||||||
- No new Python packages (uses stdlib only)
|
|
||||||
|
|
||||||
**Planned Dependencies (Future phases):**
|
|
||||||
- `pgvector` - PostgreSQL vector extension
|
|
||||||
- `sentence-transformers` - Local embedding model
|
|
||||||
- `fastapi` - API framework
|
|
||||||
- `unpaywall` - OA link retrieval (if needed)
|
|
||||||
|
|
||||||
## Implementation Status
|
|
||||||
|
|
||||||
### Completed (100%)
|
|
||||||
- ✅ Phase 0: Baseline Audit
|
|
||||||
- ✅ Phase 1: Source Plugin Architecture
|
|
||||||
- ✅ Phase 2: Identifier Resolution Layer
|
|
||||||
|
|
||||||
### In Progress (50%)
|
|
||||||
- 🚧 Phase 3: Database Schema Upgrade
|
|
||||||
|
|
||||||
### Pending (0%)
|
|
||||||
- ⏳ Phase 4: High-Value Source Integrations
|
|
||||||
- ⏳ Phase 5: Merge & Deduplication Engine
|
|
||||||
- ⏳ Phase 6: Citation Graph Construction
|
|
||||||
- ⏳ Phase 7: Embedding Pipeline
|
|
||||||
- ⏳ Phase 8: Full-Text Retrieval Layer
|
|
||||||
- ⏳ Phase 9: API Layer
|
|
||||||
- ⏳ Phase 10: Ranking & Relevance
|
|
||||||
- ⏳ Phase 12: Observability & QA
|
|
||||||
- ⏳ Phase 13: Performance Optimization
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Register a source
|
|
||||||
from citegeist.sources import SourceRegistry, CrossRefSource
|
|
||||||
|
|
||||||
registry = SourceRegistry()
|
|
||||||
registry.register(CrossRefSource, name='crossref', config={})
|
|
||||||
|
|
||||||
# Get source instance
|
|
||||||
source = registry.get('crossref')
|
|
||||||
entry = source.lookup_by_doi('10.1234/example')
|
|
||||||
|
|
||||||
# Resolve identifiers
|
|
||||||
from citegeist.resolver import resolve_identifiers
|
|
||||||
|
|
||||||
fields = {'doi': '10.1234/example', 'title': 'Test'}
|
|
||||||
resolved = resolve_identifiers(fields)
|
|
||||||
# Returns e.g. [('doi', '10.1234/example'), ('title', ...)] — exact title form depends on normalization
|
|
||||||
```
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
1. ✅ Phase 0-2: Complete
|
|
||||||
2. 🚧 Phase 3: Implement Python interface for database operations
|
|
||||||
3. ⏳ Phase 4: Add Unpaywall, Semantic Scholar, OpenCitations integrations
|
|
||||||
4. ⏳ Phase 5: Build merge engine
|
|
||||||
|
|
@ -1,122 +0,0 @@
|
||||||
# CiteGeist Sources-First Progress
|
|
||||||
|
|
||||||
**Last Updated:** 2026-04-25
|
|
||||||
|
|
||||||
This document tracks the refocused plan for source incorporation. The working question is which additional open bibliographic sources CiteGeist should integrate next, not whether it needs a new storage platform first.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Phase 0: Scope Reframe ✅ COMPLETE
|
|
||||||
|
|
||||||
**Status:** Completed
|
|
||||||
|
|
||||||
**Deliverables:**
|
|
||||||
- ✅ `/docs/source-landscape.md` - source inventory and recommendation document
|
|
||||||
- ✅ `/src/citegeist/sources/catalog.py` - code-backed source catalog
|
|
||||||
|
|
||||||
**Completed:**
|
|
||||||
- Identified which source integrations already exist in the repository
|
|
||||||
- Split source-expansion planning from database/vector-search ambitions
|
|
||||||
- Prioritized open-source additions by workflow value
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Phase 1: Source Layer Tightening ✅ COMPLETE
|
|
||||||
|
|
||||||
**Status:** Completed
|
|
||||||
|
|
||||||
**Deliverables:**
|
|
||||||
- ✅ `/src/citegeist/sources/base.py` - Base `BibliographicSource` interface
|
|
||||||
- ✅ `/src/citegeist/sources/registry.py` - Registry for known concrete sources
|
|
||||||
- ✅ `/src/citegeist/sources/crossref.py` - Repaired CrossRef source implementation
|
|
||||||
- ✅ `/src/citegeist/sources/catalog.py` - Open-source inventory
|
|
||||||
- ✅ `/src/citegeist/sources/__init__.py` - Package initialization
|
|
||||||
- ✅ `/tests/test_sources_plugin.py` - Source plugin tests
|
|
||||||
- ✅ `/tests/test_sources_catalog.py` - Source catalog and registry tests
|
|
||||||
|
|
||||||
**Completed:**
|
|
||||||
- ✅ Created `BibliographicSource` abstract base class
|
|
||||||
- ✅ Repaired `SourceRegistry` so config-backed loading resolves real source classes
|
|
||||||
- ✅ Fixed `CrossRefSource` normalization for direct lookup and search-style payloads
|
|
||||||
- ✅ Replaced path-specific compatibility loading with repo-relative loading
|
|
||||||
- ✅ Added a source catalog that captures current status and next-priority sources
|
|
||||||
|
|
||||||
**Features:**
|
|
||||||
- Abstract interface for source plugins
|
|
||||||
- Registry for known source discovery and instantiation
|
|
||||||
- Config-driven enable/disable for known source types
|
|
||||||
- Source prioritization metadata
|
|
||||||
- Compatibility with the existing `SourceClient`-based resolver/expander code
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Current Integrated Sources ✅ AVAILABLE
|
|
||||||
|
|
||||||
- `Crossref`
|
|
||||||
- `OpenAlex`
|
|
||||||
- `OpenCitations`
|
|
||||||
- `Unpaywall`
|
|
||||||
- `PubMed`
|
|
||||||
- `Europe PMC`
|
|
||||||
- `Semantic Scholar`
|
|
||||||
- `DataCite`
|
|
||||||
- `DBLP`
|
|
||||||
- `arXiv`
|
|
||||||
- `OAI-PMH`
|
|
||||||
|
|
||||||
These are already sufficient for a credible local enrichment-and-discovery workflow. The next work should complement them rather than restart infrastructure underneath them.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Phase 2: Next Source Additions 🚧 IN PROGRESS
|
|
||||||
|
|
||||||
**Status:** In Progress
|
|
||||||
|
|
||||||
**Priority Order:**
|
|
||||||
1. `OpenAIRE` only if repository-acquisition scope expands
|
|
||||||
|
|
||||||
**Completed Deliverables:**
|
|
||||||
- ✅ OpenCitations adapter for DOI citation/reference lookup
|
|
||||||
- ✅ OpenCitations graph expansion support in CLI and topic expansion flows
|
|
||||||
- ✅ Unpaywall adapter for DOI OA-link enrichment
|
|
||||||
- ✅ `enrich-oa` CLI flow for applying OA metadata to stored entries
|
|
||||||
- ✅ Europe PMC biomedical resolver/search integration
|
|
||||||
- ✅ Semantic Scholar broad-science resolver/search integration
|
|
||||||
|
|
||||||
**Planned Deliverables:**
|
|
||||||
- ⏳ Decide whether repository-acquisition breadth needs another dedicated source
|
|
||||||
|
|
||||||
**Rationale:**
|
|
||||||
- `OpenCitations` now improves open citation-edge coverage
|
|
||||||
- `Unpaywall` now improves access-link enrichment
|
|
||||||
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage
|
|
||||||
- `Semantic Scholar` now improves broader biological and physical sciences coverage
|
|
||||||
- neither requires a new database architecture to become useful
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Phase 3: Optional Source Evaluation ⏳ PLANNED
|
|
||||||
|
|
||||||
**Status:** Planned
|
|
||||||
|
|
||||||
- `OpenAIRE`
|
|
||||||
|
|
||||||
**Decision Rule:**
|
|
||||||
- add them only if they solve a concrete discovery or acquisition gap that current open sources do not already cover well
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Explicitly Deferred
|
|
||||||
|
|
||||||
- second-schema redesign work
|
|
||||||
- pgvector integration
|
|
||||||
- embedding-first retrieval
|
|
||||||
- broad canonical-work reconstruction
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
**Completed:** scope reframe and source-layer cleanup
|
|
||||||
**Planned next:** `OpenAIRE` reevaluation
|
|
||||||
**Deferred:** database/vector expansion work not required by the source question
|
|
||||||
|
|
@ -1,111 +0,0 @@
|
||||||
# Sources-First Status
|
|
||||||
|
|
||||||
**Current Focus:** identify and prioritize the next open bibliographic sources to add, using the existing SQLite-based workflow as the baseline.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Phase Matrix
|
|
||||||
|
|
||||||
| Phase | Title | Status | Outcome |
|
|
||||||
|-------|-------|--------|---------|
|
|
||||||
| **0** | Scope Reframe | ✅ Complete | Planning now answers the source question directly |
|
|
||||||
| **1** | Source Layer Tightening | ✅ Complete | Registry, CrossRef plugin, compatibility seam, and source catalog repaired |
|
|
||||||
| **2** | Next Open Source Additions | 🚧 In Progress | OpenCitations, Unpaywall, and Europe PMC integrated |
|
|
||||||
| **3** | Optional Source Evaluation | ⏳ Planned | OpenAIRE evaluated later if acquisition breadth matters |
|
|
||||||
| **D** | Database / Vector Expansion | ⏸ Deferred | Not required for the current source-incorporation decision |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Test Coverage Summary
|
|
||||||
|
|
||||||
```
|
|
||||||
✅ test_sources_plugin.py
|
|
||||||
✅ test_sources_catalog.py
|
|
||||||
✅ existing full suite still expected to pass
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Key Artifacts
|
|
||||||
|
|
||||||
### Documentation
|
|
||||||
```
|
|
||||||
docs/
|
|
||||||
├── source-landscape.md ✅ Source inventory and recommendations
|
|
||||||
├── implementation-progress.md ✅ Sources-first progress tracker
|
|
||||||
└── phase-completion.md ✅ Short status summary
|
|
||||||
```
|
|
||||||
|
|
||||||
### Source Layer
|
|
||||||
```
|
|
||||||
src/citegeist/sources/
|
|
||||||
├── base.py ✅ Base source interface
|
|
||||||
├── catalog.py ✅ Source inventory in code
|
|
||||||
├── registry.py ✅ Registry for known source classes
|
|
||||||
├── crossref.py ✅ Repaired CrossRef plugin
|
|
||||||
└── _old_sources_compat.py ✅ Repo-relative compatibility bridge
|
|
||||||
```
|
|
||||||
|
|
||||||
### Tests
|
|
||||||
```
|
|
||||||
tests/
|
|
||||||
├── test_sources_plugin.py ✅ Source plugin tests
|
|
||||||
└── test_sources_catalog.py ✅ Source catalog/registry tests
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Key Features Implemented
|
|
||||||
|
|
||||||
- ✅ Source catalog covering current and candidate open sources
|
|
||||||
- ✅ Config-driven registry loading for known real source classes
|
|
||||||
- ✅ CrossRef normalization that works for both single-record and search-result payloads
|
|
||||||
- ✅ Compatibility bridge that no longer depends on one checkout path
|
|
||||||
- ✅ OpenCitations DOI-based graph expansion with CLI support
|
|
||||||
- ✅ Unpaywall OA-link enrichment with CLI support
|
|
||||||
- ✅ Europe PMC biomedical resolver/search support
|
|
||||||
- ✅ Semantic Scholar broad-science resolver/search support
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Next Milestones
|
|
||||||
|
|
||||||
### Immediate
|
|
||||||
1. Decide whether repository-acquisition scope justifies `OpenAIRE`
|
|
||||||
2. Keep the OA-enrichment flow aligned with review/export needs
|
|
||||||
3. Keep graph-source scope disciplined as broader coverage grows
|
|
||||||
|
|
||||||
### Later
|
|
||||||
1. Evaluate `Semantic Scholar`
|
|
||||||
2. Evaluate `OpenAIRE`
|
|
||||||
3. Revisit database/vector work only if a concrete source need demands it
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Success Metrics
|
|
||||||
|
|
||||||
### Completed
|
|
||||||
- ✅ Planning now matches the actual source question
|
|
||||||
- ✅ Source-layer defects from the first pass have been corrected
|
|
||||||
- ✅ OpenCitations is now a working integrated source
|
|
||||||
- ✅ Unpaywall is now a working integrated source
|
|
||||||
- ✅ Europe PMC is now a working integrated source
|
|
||||||
- ✅ Semantic Scholar is now a working integrated source
|
|
||||||
- ✅ The next source priorities are explicit
|
|
||||||
|
|
||||||
### Planned
|
|
||||||
- ⏳ Better source selection discipline before adding more integrations
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Recommendations
|
|
||||||
|
|
||||||
1. Treat the current SQLite/FTS workflow as the baseline, not as a blocker.
|
|
||||||
2. Add source integrations only when they materially improve bibliographic coverage, citation coverage, or open-access linkage.
|
|
||||||
3. Keep database/vector work explicitly subordinate to source-incorporation needs.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Last Updated:** 2026-04-25
|
|
||||||
**Status:** Sources-first plan in effect
|
|
||||||
**Confidence:** High
|
|
||||||
|
|
@ -1,131 +0,0 @@
|
||||||
-- CiteGeist Current Schema (SQLite)
|
|
||||||
|
|
||||||
-- Entries table
|
|
||||||
CREATE TABLE IF NOT EXISTS entries (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
citation_key TEXT NOT NULL UNIQUE,
|
|
||||||
entry_type TEXT NOT NULL,
|
|
||||||
review_status TEXT NOT NULL DEFAULT 'draft',
|
|
||||||
title TEXT,
|
|
||||||
year TEXT,
|
|
||||||
journal TEXT,
|
|
||||||
booktitle TEXT,
|
|
||||||
publisher TEXT,
|
|
||||||
abstract TEXT,
|
|
||||||
keywords TEXT,
|
|
||||||
url TEXT,
|
|
||||||
doi TEXT,
|
|
||||||
isbn TEXT,
|
|
||||||
fulltext TEXT,
|
|
||||||
raw_bibtex TEXT,
|
|
||||||
extra_fields_json TEXT NOT NULL DEFAULT '{}',
|
|
||||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Creators table
|
|
||||||
CREATE TABLE IF NOT EXISTS creators (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
full_name TEXT NOT NULL UNIQUE,
|
|
||||||
family_name TEXT,
|
|
||||||
given_names TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Entry-Creators relationship
|
|
||||||
CREATE TABLE IF NOT EXISTS entry_creators (
|
|
||||||
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
|
|
||||||
creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE,
|
|
||||||
role TEXT NOT NULL,
|
|
||||||
ordinal INTEGER NOT NULL,
|
|
||||||
PRIMARY KEY (entry_id, role, ordinal)
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Identifiers table
|
|
||||||
CREATE TABLE IF NOT EXISTS identifiers (
|
|
||||||
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
|
|
||||||
scheme TEXT NOT NULL,
|
|
||||||
value TEXT NOT NULL,
|
|
||||||
PRIMARY KEY (scheme, value)
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Relations table (citation graph)
|
|
||||||
CREATE TABLE IF NOT EXISTS relations (
|
|
||||||
source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
|
|
||||||
target_citation_key TEXT NOT NULL,
|
|
||||||
relation_type TEXT NOT NULL,
|
|
||||||
PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Topics table
|
|
||||||
CREATE TABLE IF NOT EXISTS topics (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
slug TEXT NOT NULL UNIQUE,
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
source_type TEXT NOT NULL,
|
|
||||||
source_url TEXT,
|
|
||||||
expansion_phrase TEXT,
|
|
||||||
suggested_phrase TEXT,
|
|
||||||
phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed',
|
|
||||||
phrase_review_notes TEXT,
|
|
||||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Entry-Topics relationship
|
|
||||||
CREATE TABLE IF NOT EXISTS entry_topics (
|
|
||||||
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
|
|
||||||
topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE,
|
|
||||||
source_label TEXT NOT NULL,
|
|
||||||
confidence REAL,
|
|
||||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
PRIMARY KEY (entry_id, topic_id)
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Field Provenance table
|
|
||||||
CREATE TABLE IF NOT EXISTS field_provenance (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
|
|
||||||
field_name TEXT NOT NULL,
|
|
||||||
field_value TEXT,
|
|
||||||
source_type TEXT NOT NULL,
|
|
||||||
source_label TEXT NOT NULL,
|
|
||||||
operation TEXT NOT NULL,
|
|
||||||
confidence REAL,
|
|
||||||
recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Relation Provenance table
|
|
||||||
CREATE TABLE IF NOT EXISTS relation_provenance (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
|
|
||||||
target_citation_key TEXT NOT NULL,
|
|
||||||
relation_type TEXT NOT NULL,
|
|
||||||
source_type TEXT NOT NULL,
|
|
||||||
source_label TEXT NOT NULL,
|
|
||||||
confidence REAL,
|
|
||||||
recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Full-text Search (FTS5)
|
|
||||||
CREATE VIRTUAL TABLE IF NOT EXISTS entries_fts USING fts5(
|
|
||||||
title,
|
|
||||||
abstract,
|
|
||||||
keywords,
|
|
||||||
content='entries',
|
|
||||||
content_rowid='id'
|
|
||||||
);
|
|
||||||
|
|
||||||
-- Trigger to sync entries with FTS
|
|
||||||
CREATE TRIGGER IF NOT EXISTS entries_ai AFTER INSERT ON entries BEGIN
|
|
||||||
INSERT INTO entries_fts(rowid, title, abstract, keywords)
|
|
||||||
VALUES (new.id, new.title, new.abstract, new.keywords);
|
|
||||||
END;
|
|
||||||
|
|
||||||
CREATE TRIGGER IF NOT EXISTS entries_ad AFTER DELETE ON entries BEGIN
|
|
||||||
DELETE FROM entries_fts WHERE rowid = old.id;
|
|
||||||
END;
|
|
||||||
|
|
||||||
CREATE TRIGGER IF NOT EXISTS entries_au AFTER UPDATE ON entries BEGIN
|
|
||||||
UPDATE entries_fts SET title = new.title, abstract = new.abstract, keywords = new.keywords
|
|
||||||
WHERE rowid = new.id;
|
|
||||||
END;
|
|
||||||
|
|
@ -1,131 +0,0 @@
|
||||||
# Open Bibliographic Source Landscape
|
|
||||||
|
|
||||||
This document answers the question that prompted the recent planning work: which additional open bibliographic sources are worth incorporating into CiteGeist, given the sources it already uses?
|
|
||||||
|
|
||||||
## Current Baseline
|
|
||||||
|
|
||||||
CiteGeist already has useful source coverage for a local BibTeX-first workflow:
|
|
||||||
|
|
||||||
- `Crossref`: DOI lookup, title search, and reference-list expansion.
|
|
||||||
- `OpenAlex`: work lookup, title/discovery search, and citation-graph expansion.
|
|
||||||
- `PubMed`: authoritative biomedical metadata lookup by PMID and title search fallback.
|
|
||||||
- `Europe PMC`: biomedical metadata/fulltext complement to PubMed.
|
|
||||||
- `Semantic Scholar`: broad cross-domain metadata with strong biological and physical sciences coverage.
|
|
||||||
- `DataCite`: DOI-backed dataset/report/non-article metadata.
|
|
||||||
- `DBLP`: strong computer-science metadata.
|
|
||||||
- `arXiv`: preprint metadata.
|
|
||||||
- `OAI-PMH`: repository harvesting for theses, dissertations, and institutional collections.
|
|
||||||
|
|
||||||
That means the immediate gap is no longer “get any scholarly metadata at all.” The immediate gap is to add the next highest-value open sources without destabilizing the existing ingest, review, and export pipeline.
|
|
||||||
|
|
||||||
## Recommended Priorities
|
|
||||||
|
|
||||||
### OpenCitations
|
|
||||||
|
|
||||||
Why:
|
|
||||||
|
|
||||||
- It directly improves open citation-edge coverage.
|
|
||||||
- It fits CiteGeist's graph-discovery workflow better than another generic metadata source.
|
|
||||||
- It complements OpenAlex rather than replacing it.
|
|
||||||
|
|
||||||
Expected role:
|
|
||||||
|
|
||||||
- DOI-to-citations lookup
|
|
||||||
- DOI-to-references lookup
|
|
||||||
- provenance for citation edges
|
|
||||||
|
|
||||||
Status:
|
|
||||||
|
|
||||||
- now integrated as a DOI-based citation/reference source in the source layer and graph expansion flow
|
|
||||||
|
|
||||||
Main risk:
|
|
||||||
|
|
||||||
- coverage is narrower than OpenAlex, so merge rules need to treat it as an additional edge source rather than a primary metadata authority.
|
|
||||||
|
|
||||||
### Unpaywall
|
|
||||||
|
|
||||||
Why:
|
|
||||||
|
|
||||||
- It solves a different problem from Crossref/OpenAlex: full-text access and OA status.
|
|
||||||
- It improves the “can I get the paper?” part of the workflow without forcing a storage redesign.
|
|
||||||
|
|
||||||
Expected role:
|
|
||||||
|
|
||||||
- DOI-to-best-open-access-link lookup
|
|
||||||
- OA status enrichment
|
|
||||||
|
|
||||||
Status:
|
|
||||||
|
|
||||||
- now integrated as an OA-link enrichment source with a dedicated `enrich-oa` CLI flow
|
|
||||||
|
|
||||||
Main risk:
|
|
||||||
|
|
||||||
- It should remain an access-link enrichment layer, not become entangled with identity resolution logic.
|
|
||||||
|
|
||||||
### Europe PMC
|
|
||||||
|
|
||||||
Why:
|
|
||||||
|
|
||||||
- It is valuable for biomedical and life-sciences use cases.
|
|
||||||
- It complements PubMed with richer open-access and citation-related information.
|
|
||||||
|
|
||||||
Expected role:
|
|
||||||
|
|
||||||
- domain-specific metadata enrichment
|
|
||||||
- biomedical search
|
|
||||||
- OA/full-text linkage
|
|
||||||
|
|
||||||
Status:
|
|
||||||
|
|
||||||
- now integrated as a biomedical resolver/search complement to `PubMed`
|
|
||||||
|
|
||||||
Main risk:
|
|
||||||
|
|
||||||
- this should remain a domain-specific source, not be treated as a universal resolver.
|
|
||||||
|
|
||||||
### Semantic Scholar
|
|
||||||
|
|
||||||
Pros:
|
|
||||||
|
|
||||||
- good graph and relevance signals
|
|
||||||
- useful for discovery quality
|
|
||||||
|
|
||||||
Status:
|
|
||||||
|
|
||||||
- now integrated as a broad resolver/search complement with good biological and physical sciences coverage
|
|
||||||
|
|
||||||
Main risk:
|
|
||||||
|
|
||||||
- rate limits and product-policy changes still matter more here than for the more explicitly open bibliographic sources
|
|
||||||
|
|
||||||
## Evaluate But Do Not Make Core Yet
|
|
||||||
|
|
||||||
### OpenAIRE
|
|
||||||
|
|
||||||
Pros:
|
|
||||||
|
|
||||||
- strong repository and OA/project linkage
|
|
||||||
- good for European repository acquisition
|
|
||||||
|
|
||||||
Cons:
|
|
||||||
|
|
||||||
- better suited to corpus acquisition than first-line metadata resolution
|
|
||||||
|
|
||||||
Recommendation:
|
|
||||||
|
|
||||||
- treat as an acquisition adapter, not an immediate resolver target
|
|
||||||
|
|
||||||
## What Not To Prioritize Right Now
|
|
||||||
|
|
||||||
### Database Redesign
|
|
||||||
|
|
||||||
The repository already has a working SQLite storage model and FTS-backed local workflow. A second schema track should not lead the next phase of work unless a concrete source integration is blocked on it.
|
|
||||||
|
|
||||||
### Vector Search
|
|
||||||
|
|
||||||
Optional semantic ranking may become useful later, but it was not the motivating question and does not need to be a prerequisite for source incorporation.
|
|
||||||
|
|
||||||
## Suggested Execution Order
|
|
||||||
|
|
||||||
1. Keep the source abstraction aligned with sources already in use.
|
|
||||||
2. Revisit `OpenAIRE` after the current source additions settle.
|
|
||||||
113
new-roadmap.md
113
new-roadmap.md
|
|
@ -1,113 +0,0 @@
|
||||||
# CiteGeist Roadmap: Sources-First Expansion
|
|
||||||
|
|
||||||
## Purpose
|
|
||||||
|
|
||||||
The primary question is not “how do we redesign CiteGeist around a new storage engine?” The primary question is “which additional open bibliographic sources should CiteGeist incorporate next?”
|
|
||||||
|
|
||||||
This roadmap treats the current SQLite-based local workflow as the baseline and focuses on source evaluation, source integration order, and reviewable source behavior.
|
|
||||||
|
|
||||||
## Baseline
|
|
||||||
|
|
||||||
Already present in the repository:
|
|
||||||
|
|
||||||
- local BibTeX ingest, review, export, and graph traversal
|
|
||||||
- metadata resolution from `Crossref`, `PubMed`, `Europe PMC`, `OpenAlex`, `Semantic Scholar`, `DBLP`, `arXiv`, and `DataCite`
|
|
||||||
- citation-graph expansion using `Crossref` and `OpenAlex`
|
|
||||||
- repository harvesting via `OAI-PMH`
|
|
||||||
|
|
||||||
That means the next planning step is source prioritization, not another platform pivot.
|
|
||||||
|
|
||||||
## Phase 0: Reframe Scope
|
|
||||||
|
|
||||||
Goal:
|
|
||||||
|
|
||||||
Put source-incorporation decisions ahead of database and vector-search ambitions.
|
|
||||||
|
|
||||||
Tasks:
|
|
||||||
|
|
||||||
- [x] identify which source integrations already exist
|
|
||||||
- [x] separate “source expansion” work from “new database/vector stack” work
|
|
||||||
- [x] document the source landscape and recommended order
|
|
||||||
|
|
||||||
Deliverables:
|
|
||||||
|
|
||||||
- `/docs/source-landscape.md`
|
|
||||||
- `/src/citegeist/sources/catalog.py`
|
|
||||||
|
|
||||||
## Phase 1: Tighten The Source Layer
|
|
||||||
|
|
||||||
Goal:
|
|
||||||
|
|
||||||
Make the new source abstraction useful for the repository that already exists, rather than speculative infrastructure.
|
|
||||||
|
|
||||||
Tasks:
|
|
||||||
|
|
||||||
- [x] keep the compatibility bridge to the existing `SourceClient`
|
|
||||||
- [x] fix the initial `CrossRefSource` implementation so normalization works
|
|
||||||
- [x] make config-driven registry loading work for known concrete sources
|
|
||||||
- [x] add a code-backed source catalog for planning and prioritization
|
|
||||||
|
|
||||||
Deliverables:
|
|
||||||
|
|
||||||
- `/src/citegeist/sources/base.py`
|
|
||||||
- `/src/citegeist/sources/registry.py`
|
|
||||||
- `/src/citegeist/sources/crossref.py`
|
|
||||||
- `/src/citegeist/sources/catalog.py`
|
|
||||||
|
|
||||||
## Phase 2: Highest-Value Open Source Additions
|
|
||||||
|
|
||||||
Goal:
|
|
||||||
|
|
||||||
Incorporate the next open sources that materially improve the current workflow.
|
|
||||||
|
|
||||||
Priority order:
|
|
||||||
|
|
||||||
1. `OpenAIRE` only if repository-acquisition scope expands
|
|
||||||
|
|
||||||
Tasks:
|
|
||||||
|
|
||||||
- [x] add `OpenCitations` DOI-to-citation and DOI-to-reference lookup
|
|
||||||
- [x] merge `OpenCitations` edges into the existing graph-expansion workflow with provenance
|
|
||||||
- [x] add `Unpaywall` DOI-to-OA-link enrichment
|
|
||||||
- [x] expose OA-link enrichment in a dedicated CLI flow
|
|
||||||
- [x] add `Europe PMC` as a biomedical metadata/fulltext complement to `PubMed`
|
|
||||||
- [x] add `Semantic Scholar` as a broader scientific metadata complement across biological and physical sciences
|
|
||||||
|
|
||||||
Why these first:
|
|
||||||
|
|
||||||
- `OpenCitations` directly answers the open-citation-coverage gap
|
|
||||||
- `Unpaywall` now solves access-link enrichment without forcing a storage redesign
|
|
||||||
- `Europe PMC` now improves biomedical metadata and OA/fulltext coverage without changing the storage model
|
|
||||||
- `Semantic Scholar` now improves broader biological and physical sciences coverage without changing the storage model
|
|
||||||
|
|
||||||
## Phase 3: Evaluate Optional Sources, Do Not Commit Prematurely
|
|
||||||
|
|
||||||
Goal:
|
|
||||||
|
|
||||||
Assess sources that may be useful, but are not clearly the next source-first move.
|
|
||||||
|
|
||||||
Candidates:
|
|
||||||
|
|
||||||
- `OpenAIRE`
|
|
||||||
|
|
||||||
Tasks:
|
|
||||||
|
|
||||||
- [ ] document API limits, openness constraints, and integration risk
|
|
||||||
- [ ] decide whether each source belongs in core resolution, graph expansion, or corpus acquisition
|
|
||||||
- [ ] avoid adding sources that duplicate existing coverage without a clear payoff
|
|
||||||
|
|
||||||
## Deferred Work
|
|
||||||
|
|
||||||
These are valid future ideas, but they are not the current planning driver:
|
|
||||||
|
|
||||||
- a second database schema
|
|
||||||
- pgvector integration
|
|
||||||
- embedding-first search
|
|
||||||
- large-scale canonical-work reconstruction
|
|
||||||
|
|
||||||
The repository already has a working local storage/search path. Those ideas should only return to the front of the plan if a concrete source-integration need forces them there.
|
|
||||||
|
|
||||||
## Immediate Next Steps
|
|
||||||
|
|
||||||
1. Land the source inventory and source-layer cleanup.
|
|
||||||
2. Reassess whether `OpenAIRE` is worth adding for repository-acquisition breadth.
|
|
||||||
|
|
@ -2,7 +2,7 @@ from .app_api import LiteratureExplorerApi
|
||||||
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
|
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
|
||||||
from .bibtex import BibEntry, parse_bibtex
|
from .bibtex import BibEntry, parse_bibtex
|
||||||
from .bootstrap import BootstrapResult, Bootstrapper
|
from .bootstrap import BootstrapResult, Bootstrapper
|
||||||
from .expand import CrossrefExpander, OpenAlexExpander, OpenCitationsExpander
|
from .expand import CrossrefExpander, OpenAlexExpander
|
||||||
from .extract import (
|
from .extract import (
|
||||||
available_extraction_backends,
|
available_extraction_backends,
|
||||||
check_extraction_comparison_summary,
|
check_extraction_comparison_summary,
|
||||||
|
|
@ -16,10 +16,6 @@ from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
|
||||||
from .llm_verify import VerificationLlmClient, VerificationLlmConfig
|
from .llm_verify import VerificationLlmClient, VerificationLlmConfig
|
||||||
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
|
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
|
||||||
from .sources import SourceClient
|
from .sources import SourceClient
|
||||||
from .sources import EuropePmcSource
|
|
||||||
from .sources import OpenLibrarySource
|
|
||||||
from .sources import SemanticScholarSource
|
|
||||||
from .sources import UnpaywallSource
|
|
||||||
from .storage import BibliographyStore
|
from .storage import BibliographyStore
|
||||||
from .verify import BibliographyVerifier, VerificationResult, VerificationMatch
|
from .verify import BibliographyVerifier, VerificationResult, VerificationMatch
|
||||||
|
|
||||||
|
|
@ -35,15 +31,10 @@ __all__ = [
|
||||||
"LiteratureExplorerApi",
|
"LiteratureExplorerApi",
|
||||||
"MetadataResolver",
|
"MetadataResolver",
|
||||||
"OpenAlexExpander",
|
"OpenAlexExpander",
|
||||||
"OpenCitationsExpander",
|
|
||||||
"OaiPmhHarvester",
|
"OaiPmhHarvester",
|
||||||
"OaiMetadataFormat",
|
"OaiMetadataFormat",
|
||||||
"OaiSet",
|
"OaiSet",
|
||||||
"SourceClient",
|
"SourceClient",
|
||||||
"EuropePmcSource",
|
|
||||||
"OpenLibrarySource",
|
|
||||||
"SemanticScholarSource",
|
|
||||||
"UnpaywallSource",
|
|
||||||
"VerificationLlmClient",
|
"VerificationLlmClient",
|
||||||
"VerificationLlmConfig",
|
"VerificationLlmConfig",
|
||||||
"VerificationMatch",
|
"VerificationMatch",
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,6 @@ from .bibtex import BibEntry, parse_bibtex, render_bibtex
|
||||||
from .bootstrap import Bootstrapper
|
from .bootstrap import Bootstrapper
|
||||||
from .examples.talkorigins import TalkOriginsScraper
|
from .examples.talkorigins import TalkOriginsScraper
|
||||||
from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander, _expand_relation_types
|
from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander, _expand_relation_types
|
||||||
from .notebook_export import export_notebook_topic_bundle
|
|
||||||
from .extract import (
|
from .extract import (
|
||||||
available_extraction_backends,
|
available_extraction_backends,
|
||||||
check_extraction_comparison_summary,
|
check_extraction_comparison_summary,
|
||||||
|
|
@ -174,13 +173,6 @@ def build_parser() -> argparse.ArgumentParser:
|
||||||
resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
|
resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
|
||||||
resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
|
resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
|
||||||
|
|
||||||
enrich_oa_parser = subparsers.add_parser(
|
|
||||||
"enrich-oa",
|
|
||||||
help="Enrich DOI-bearing entries with Unpaywall OA link metadata",
|
|
||||||
)
|
|
||||||
enrich_oa_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
|
|
||||||
enrich_oa_parser.add_argument("--email", help="Email address required by the Unpaywall API")
|
|
||||||
|
|
||||||
resolve_stubs_parser = subparsers.add_parser(
|
resolve_stubs_parser = subparsers.add_parser(
|
||||||
"resolve-stubs",
|
"resolve-stubs",
|
||||||
help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates",
|
help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates",
|
||||||
|
|
@ -245,7 +237,7 @@ def build_parser() -> argparse.ArgumentParser:
|
||||||
expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand")
|
expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand")
|
||||||
expand_parser.add_argument(
|
expand_parser.add_argument(
|
||||||
"--source",
|
"--source",
|
||||||
choices=["crossref", "openalex", "opencitations"],
|
choices=["crossref", "openalex"],
|
||||||
default="crossref",
|
default="crossref",
|
||||||
help="Graph expansion source",
|
help="Graph expansion source",
|
||||||
)
|
)
|
||||||
|
|
@ -268,7 +260,7 @@ def build_parser() -> argparse.ArgumentParser:
|
||||||
)
|
)
|
||||||
expand_topic_parser.add_argument(
|
expand_topic_parser.add_argument(
|
||||||
"--source",
|
"--source",
|
||||||
choices=["crossref", "openalex", "opencitations"],
|
choices=["crossref", "openalex"],
|
||||||
default="openalex",
|
default="openalex",
|
||||||
help="Topic graph expansion source",
|
help="Topic graph expansion source",
|
||||||
)
|
)
|
||||||
|
|
@ -694,18 +686,6 @@ def build_parser() -> argparse.ArgumentParser:
|
||||||
help="Include DOI-only placeholder records in the topic export",
|
help="Include DOI-only placeholder records in the topic export",
|
||||||
)
|
)
|
||||||
|
|
||||||
export_notebook_topic_parser = subparsers.add_parser(
|
|
||||||
"export-notebook-topic",
|
|
||||||
help="Export a Notebook-ready bibliography bundle for one topic",
|
|
||||||
)
|
|
||||||
export_notebook_topic_parser.add_argument("topic_slug", help="Topic slug to export")
|
|
||||||
export_notebook_topic_parser.add_argument("--output-dir", required=True, help="Directory to write the Notebook bundle")
|
|
||||||
export_notebook_topic_parser.add_argument(
|
|
||||||
"--include-stubs",
|
|
||||||
action="store_true",
|
|
||||||
help="Include DOI-only placeholder records in the Notebook bibliography",
|
|
||||||
)
|
|
||||||
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -769,8 +749,6 @@ def main(argv: list[str] | None = None) -> int:
|
||||||
)
|
)
|
||||||
if args.command == "resolve":
|
if args.command == "resolve":
|
||||||
return _run_resolve(store, args.citation_keys)
|
return _run_resolve(store, args.citation_keys)
|
||||||
if args.command == "enrich-oa":
|
|
||||||
return _run_enrich_oa(store, args.citation_keys, args.email)
|
|
||||||
if args.command == "resolve-stubs":
|
if args.command == "resolve-stubs":
|
||||||
return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview)
|
return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview)
|
||||||
if args.command == "graph":
|
if args.command == "graph":
|
||||||
|
|
@ -925,8 +903,6 @@ def main(argv: list[str] | None = None) -> int:
|
||||||
return _run_topic_entries(store, args.topic_slug, args.limit)
|
return _run_topic_entries(store, args.topic_slug, args.limit)
|
||||||
if args.command == "export-topic":
|
if args.command == "export-topic":
|
||||||
return _run_export_topic(store, args.topic_slug, args.output, args.include_stubs)
|
return _run_export_topic(store, args.topic_slug, args.output, args.include_stubs)
|
||||||
if args.command == "export-notebook-topic":
|
|
||||||
return _run_export_notebook_topic(store, args.topic_slug, args.output_dir, args.include_stubs)
|
|
||||||
finally:
|
finally:
|
||||||
store.close()
|
store.close()
|
||||||
|
|
||||||
|
|
@ -1239,72 +1215,6 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
|
||||||
return exit_code
|
return exit_code
|
||||||
|
|
||||||
|
|
||||||
def _run_enrich_oa(store: BibliographyStore, citation_keys: list[str], email: str | None) -> int:
|
|
||||||
from .sources import UnpaywallSource
|
|
||||||
|
|
||||||
source = UnpaywallSource(config={"email": email} if email else {})
|
|
||||||
if not source.is_available():
|
|
||||||
print("Unpaywall enrichment requires --email or UNPAYWALL_EMAIL", file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
results: list[dict[str, object]] = []
|
|
||||||
total = len(citation_keys)
|
|
||||||
for index, citation_key in enumerate(citation_keys, start=1):
|
|
||||||
_print_progress("enriching OA", index, total, citation_key)
|
|
||||||
existing = store.get_entry(citation_key)
|
|
||||||
if existing is None:
|
|
||||||
results.append({"citation_key": citation_key, "status": "missing"})
|
|
||||||
continue
|
|
||||||
doi = str(existing.get("doi") or "").strip()
|
|
||||||
if not doi:
|
|
||||||
results.append({"citation_key": citation_key, "status": "no_doi"})
|
|
||||||
continue
|
|
||||||
|
|
||||||
enriched = source.lookup_by_doi(doi)
|
|
||||||
if enriched is None:
|
|
||||||
results.append({"citation_key": citation_key, "status": "no_record", "doi": doi})
|
|
||||||
continue
|
|
||||||
|
|
||||||
merged_fields: dict[str, str] = {}
|
|
||||||
for key, value in existing.items():
|
|
||||||
if isinstance(value, str):
|
|
||||||
merged_fields[key] = value
|
|
||||||
merged_fields.update(enriched.fields)
|
|
||||||
|
|
||||||
for field_name in ("title", "year", "author", "journal", "booktitle", "publisher", "abstract", "keywords"):
|
|
||||||
existing_value = str(existing.get(field_name) or "").strip()
|
|
||||||
if existing_value:
|
|
||||||
merged_fields[field_name] = existing_value
|
|
||||||
|
|
||||||
replacement = BibEntry(
|
|
||||||
entry_type=str(existing.get("entry_type") or "misc"),
|
|
||||||
citation_key=citation_key,
|
|
||||||
fields=merged_fields,
|
|
||||||
)
|
|
||||||
store.replace_entry(
|
|
||||||
citation_key,
|
|
||||||
replacement,
|
|
||||||
source_type="oa_enrich",
|
|
||||||
source_label=f"unpaywall:doi:{doi}",
|
|
||||||
review_status=str(existing.get("review_status") or "enriched"),
|
|
||||||
)
|
|
||||||
updated = store.get_entry(citation_key) or {}
|
|
||||||
results.append(
|
|
||||||
{
|
|
||||||
"citation_key": citation_key,
|
|
||||||
"status": "enriched",
|
|
||||||
"doi": doi,
|
|
||||||
"is_oa": updated.get("is_oa"),
|
|
||||||
"oa_status": updated.get("oa_status"),
|
|
||||||
"best_oa_url": updated.get("best_oa_url"),
|
|
||||||
"best_oa_pdf_url": updated.get("best_oa_pdf_url"),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
print(json.dumps(results, indent=2))
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool:
|
def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool:
|
||||||
existing = store.get_entry(citation_key)
|
existing = store.get_entry(citation_key)
|
||||||
if existing is None:
|
if existing is None:
|
||||||
|
|
@ -1754,15 +1664,6 @@ def _run_expand(
|
||||||
for relation_name in _expand_relation_types(relation)
|
for relation_name in _expand_relation_types(relation)
|
||||||
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
|
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
|
||||||
]
|
]
|
||||||
elif source == "opencitations":
|
|
||||||
from .expand import OpenCitationsExpander
|
|
||||||
|
|
||||||
expander = OpenCitationsExpander()
|
|
||||||
expand_fn = lambda key: [
|
|
||||||
item
|
|
||||||
for relation_name in _expand_relation_types(relation)
|
|
||||||
for item in expander.expand_entry(store, key, relation_type=relation_name, limit=limit)
|
|
||||||
]
|
|
||||||
else:
|
else:
|
||||||
print(f"Unsupported expansion source: {source}", file=sys.stderr)
|
print(f"Unsupported expansion source: {source}", file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
@ -2350,13 +2251,3 @@ def _run_export_topic(store: BibliographyStore, topic_slug: str, output: str | N
|
||||||
if rendered:
|
if rendered:
|
||||||
print(rendered)
|
print(rendered)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def _run_export_notebook_topic(store: BibliographyStore, topic_slug: str, output_dir: str, include_stubs: bool) -> int:
|
|
||||||
try:
|
|
||||||
payload = export_notebook_topic_bundle(store.path, topic_slug, output_dir, include_stubs=include_stubs)
|
|
||||||
except KeyError:
|
|
||||||
print(f"Topic not found: {topic_slug}", file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
print(json.dumps(payload, indent=2))
|
|
||||||
return 0
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from urllib.parse import quote, urlencode
|
||||||
from .bibtex import BibEntry, parse_bibtex
|
from .bibtex import BibEntry, parse_bibtex
|
||||||
from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob
|
from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob
|
||||||
from .resolve import MetadataResolver, merge_entries
|
from .resolve import MetadataResolver, merge_entries
|
||||||
from .sources import OpenCitationsSource
|
|
||||||
from .storage import BibliographyStore
|
from .storage import BibliographyStore
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -220,94 +219,14 @@ class OpenAlexExpander:
|
||||||
return _normalize_openalex_id(results[0].get("id", ""))
|
return _normalize_openalex_id(results[0].get("id", ""))
|
||||||
|
|
||||||
|
|
||||||
class OpenCitationsExpander:
|
|
||||||
def __init__(self, resolver: MetadataResolver | None = None, source: OpenCitationsSource | None = None) -> None:
|
|
||||||
self.resolver = resolver or MetadataResolver()
|
|
||||||
self.source = source or OpenCitationsSource(config={"source_client": self.resolver.source_client})
|
|
||||||
|
|
||||||
def expand_entry(
|
|
||||||
self,
|
|
||||||
store: BibliographyStore,
|
|
||||||
citation_key: str,
|
|
||||||
relation_type: str = "cites",
|
|
||||||
limit: int = 25,
|
|
||||||
) -> list[ExpansionResult]:
|
|
||||||
entry = store.get_entry(citation_key)
|
|
||||||
if entry is None:
|
|
||||||
return []
|
|
||||||
|
|
||||||
doi = str(entry.get("doi") or "")
|
|
||||||
if not doi:
|
|
||||||
return []
|
|
||||||
|
|
||||||
edges = self.source.get_citations(doi, relation_type=relation_type, limit=limit)
|
|
||||||
results: list[ExpansionResult] = []
|
|
||||||
for edge in edges:
|
|
||||||
discovered_doi = edge.target_work_id[4:] if relation_type == "cites" else edge.source_work_id[4:]
|
|
||||||
discovered = self._lookup_discovered_entry(discovered_doi)
|
|
||||||
if discovered is None:
|
|
||||||
discovered = _opencitations_stub_entry(discovered_doi, citation_key)
|
|
||||||
|
|
||||||
existing_key = _existing_entry_key_for_discovered_work(store, discovered)
|
|
||||||
target_key = existing_key or discovered.citation_key
|
|
||||||
created = False
|
|
||||||
if existing_key is None and store.get_entry(discovered.citation_key) is None:
|
|
||||||
store.upsert_entry(
|
|
||||||
discovered,
|
|
||||||
raw_bibtex=None,
|
|
||||||
source_type="graph_expand",
|
|
||||||
source_label=edge.source_label,
|
|
||||||
review_status="draft",
|
|
||||||
)
|
|
||||||
store.connection.commit()
|
|
||||||
created = True
|
|
||||||
|
|
||||||
if relation_type == "cites":
|
|
||||||
source_key = citation_key
|
|
||||||
relation_target_key = target_key
|
|
||||||
else:
|
|
||||||
source_key = target_key
|
|
||||||
relation_target_key = citation_key
|
|
||||||
|
|
||||||
store.add_relation(
|
|
||||||
source_key,
|
|
||||||
relation_target_key,
|
|
||||||
"cites",
|
|
||||||
source_type="graph_expand",
|
|
||||||
source_label=edge.source_label,
|
|
||||||
confidence=edge.confidence,
|
|
||||||
)
|
|
||||||
results.append(
|
|
||||||
ExpansionResult(
|
|
||||||
source_citation_key=source_key,
|
|
||||||
discovered_citation_key=target_key,
|
|
||||||
created_entry=created,
|
|
||||||
relation_type=relation_type,
|
|
||||||
source_label=edge.source_label,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return results
|
|
||||||
|
|
||||||
def _lookup_discovered_entry(self, doi: str) -> BibEntry | None:
|
|
||||||
resolution = self.resolver.resolve_doi(doi)
|
|
||||||
if resolution is not None:
|
|
||||||
return resolution.entry
|
|
||||||
resolution = self.resolver.resolve_datacite_doi(doi)
|
|
||||||
if resolution is not None:
|
|
||||||
return resolution.entry
|
|
||||||
return self.source.lookup_by_doi(doi)
|
|
||||||
|
|
||||||
|
|
||||||
class TopicExpander:
|
class TopicExpander:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
crossref_expander: CrossrefExpander | None = None,
|
crossref_expander: CrossrefExpander | None = None,
|
||||||
openalex_expander: OpenAlexExpander | None = None,
|
openalex_expander: OpenAlexExpander | None = None,
|
||||||
opencitations_expander: OpenCitationsExpander | None = None,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
self.crossref_expander = crossref_expander or CrossrefExpander()
|
self.crossref_expander = crossref_expander or CrossrefExpander()
|
||||||
self.openalex_expander = openalex_expander or OpenAlexExpander()
|
self.openalex_expander = openalex_expander or OpenAlexExpander()
|
||||||
self.opencitations_expander = opencitations_expander or OpenCitationsExpander()
|
|
||||||
self.last_run_meta: dict[str, object] = {}
|
self.last_run_meta: dict[str, object] = {}
|
||||||
|
|
||||||
def expand_topic(
|
def expand_topic(
|
||||||
|
|
@ -443,17 +362,6 @@ class TopicExpander:
|
||||||
) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
|
) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
|
||||||
if source == "crossref":
|
if source == "crossref":
|
||||||
expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
|
expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
|
||||||
elif source == "opencitations":
|
|
||||||
expansion_rows = []
|
|
||||||
for relation_name in _expand_relation_types(relation_type):
|
|
||||||
expansion_rows.extend(
|
|
||||||
self.opencitations_expander.expand_entry(
|
|
||||||
store,
|
|
||||||
citation_key,
|
|
||||||
relation_type=relation_name,
|
|
||||||
limit=limit,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
expansion_rows: list[ExpansionResult] = []
|
expansion_rows: list[ExpansionResult] = []
|
||||||
for relation_name in _expand_relation_types(relation_type):
|
for relation_name in _expand_relation_types(relation_type):
|
||||||
|
|
@ -477,11 +385,6 @@ class TopicExpander:
|
||||||
) -> list[tuple[ExpansionResult, dict[str, object]]]:
|
) -> list[tuple[ExpansionResult, dict[str, object]]]:
|
||||||
if source == "crossref":
|
if source == "crossref":
|
||||||
return self._preview_crossref_discoveries(store, citation_key, limit)
|
return self._preview_crossref_discoveries(store, citation_key, limit)
|
||||||
if source == "opencitations":
|
|
||||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
|
||||||
for relation_name in _expand_relation_types(relation_type):
|
|
||||||
rows.extend(self._preview_opencitations_discoveries(store, citation_key, relation_name, limit))
|
|
||||||
return rows
|
|
||||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
||||||
for relation_name in _expand_relation_types(relation_type):
|
for relation_name in _expand_relation_types(relation_type):
|
||||||
rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit))
|
rows.extend(self._preview_openalex_discoveries(store, citation_key, relation_name, limit))
|
||||||
|
|
@ -564,40 +467,6 @@ class TopicExpander:
|
||||||
)
|
)
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
def _preview_opencitations_discoveries(
|
|
||||||
self,
|
|
||||||
store: BibliographyStore,
|
|
||||||
citation_key: str,
|
|
||||||
relation_type: str,
|
|
||||||
limit: int,
|
|
||||||
) -> list[tuple[ExpansionResult, dict[str, object]]]:
|
|
||||||
entry = store.get_entry(citation_key)
|
|
||||||
if entry is None or not entry.get("doi"):
|
|
||||||
return []
|
|
||||||
doi = str(entry["doi"])
|
|
||||||
edges = self.opencitations_expander.source.get_citations(doi, relation_type=relation_type, limit=limit)
|
|
||||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
|
||||||
for edge in edges:
|
|
||||||
discovered_doi = edge.target_work_id[4:] if relation_type == "cites" else edge.source_work_id[4:]
|
|
||||||
discovered = self.opencitations_expander._lookup_discovered_entry(discovered_doi)
|
|
||||||
if discovered is None:
|
|
||||||
discovered = _opencitations_stub_entry(discovered_doi, citation_key)
|
|
||||||
existing_key = _existing_entry_key_for_discovered_work(store, discovered)
|
|
||||||
target_key = existing_key or discovered.citation_key
|
|
||||||
rows.append(
|
|
||||||
(
|
|
||||||
ExpansionResult(
|
|
||||||
source_citation_key=citation_key if relation_type == "cites" else target_key,
|
|
||||||
discovered_citation_key=target_key,
|
|
||||||
created_entry=existing_key is None and store.get_entry(discovered.citation_key) is None,
|
|
||||||
relation_type=relation_type,
|
|
||||||
source_label=edge.source_label,
|
|
||||||
),
|
|
||||||
dict(discovered.fields),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return rows
|
|
||||||
|
|
||||||
|
|
||||||
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
|
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
|
||||||
title = _crossref_reference_title(reference, ordinal)
|
title = _crossref_reference_title(reference, ordinal)
|
||||||
|
|
@ -698,20 +567,6 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int
|
||||||
return f"{family}{year or 'nd'}{first_word}{ordinal}"
|
return f"{family}{year or 'nd'}{first_word}{ordinal}"
|
||||||
|
|
||||||
|
|
||||||
def _opencitations_stub_entry(doi: str, source_citation_key: str) -> BibEntry:
|
|
||||||
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
|
|
||||||
return BibEntry(
|
|
||||||
entry_type="misc",
|
|
||||||
citation_key=f"doi{suffix}",
|
|
||||||
fields={
|
|
||||||
"title": f"Referenced work for DOI {doi}",
|
|
||||||
"doi": doi,
|
|
||||||
"url": f"https://doi.org/{doi}",
|
|
||||||
"note": f"discovered_from = {{{source_citation_key}}}",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_text(value: str) -> str:
|
def _normalize_text(value: str) -> str:
|
||||||
without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value))
|
without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value))
|
||||||
normalized = " ".join(without_tags.split())
|
normalized = " ".join(without_tags.split())
|
||||||
|
|
|
||||||
|
|
@ -1,52 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from .storage import BibliographyStore
|
|
||||||
|
|
||||||
|
|
||||||
def export_notebook_topic_bundle(
|
|
||||||
store_dir: str | Path,
|
|
||||||
topic_slug: str,
|
|
||||||
out_dir: str | Path,
|
|
||||||
*,
|
|
||||||
include_stubs: bool = False,
|
|
||||||
) -> dict[str, Any]:
|
|
||||||
store = BibliographyStore(store_dir)
|
|
||||||
try:
|
|
||||||
topic = store.get_topic(topic_slug)
|
|
||||||
if topic is None:
|
|
||||||
raise KeyError(f"Topic not found: {topic_slug}")
|
|
||||||
entries = store.list_topic_entries(topic_slug, limit=100000)
|
|
||||||
citation_keys = [row["citation_key"] for row in entries]
|
|
||||||
bibtex_report = store.export_bibtex_report(citation_keys, include_stubs=include_stubs)
|
|
||||||
finally:
|
|
||||||
store.close()
|
|
||||||
|
|
||||||
target = Path(out_dir)
|
|
||||||
target.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
bibliography_path = target / "notebook_topic_bibliography.bib"
|
|
||||||
bibliography_text = bibtex_report["bibtex"]
|
|
||||||
bibliography_path.write_text(bibliography_text + ("\n" if bibliography_text else ""), encoding="utf-8")
|
|
||||||
|
|
||||||
bundle = {
|
|
||||||
"bundle_kind": "notebook_topic_bibliography_bundle",
|
|
||||||
"topic": topic,
|
|
||||||
"entry_count": len(entries),
|
|
||||||
"exported_count": bibtex_report["exported_count"],
|
|
||||||
"include_stubs": include_stubs,
|
|
||||||
"skipped": bibtex_report["skipped"],
|
|
||||||
"citation_keys": citation_keys,
|
|
||||||
"bibliography_path": str(bibliography_path),
|
|
||||||
}
|
|
||||||
bundle_path = target / "notebook_topic_bundle.json"
|
|
||||||
bundle_path.write_text(json.dumps(bundle, indent=2), encoding="utf-8")
|
|
||||||
|
|
||||||
return {
|
|
||||||
"bundle_path": str(bundle_path),
|
|
||||||
"bibliography_path": str(bibliography_path),
|
|
||||||
"bundle": bundle,
|
|
||||||
}
|
|
||||||
|
|
@ -7,38 +7,17 @@ import re
|
||||||
import urllib.error
|
import urllib.error
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from .bibtex import BibEntry, parse_bibtex
|
from .bibtex import BibEntry, parse_bibtex
|
||||||
from .sources.europepmc import EuropePmcSource
|
|
||||||
from .sources.openlibrary import OpenLibrarySource
|
|
||||||
from .sources.semanticscholar import SemanticScholarSource
|
|
||||||
from .sources import SourceClient
|
from .sources import SourceClient
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
|
||||||
class ResolutionAttempt:
|
|
||||||
source_name: str
|
|
||||||
strategy: str
|
|
||||||
query_value: str
|
|
||||||
matched: bool
|
|
||||||
candidate_count: int | None = None
|
|
||||||
source_label: str = ""
|
|
||||||
error: str = ""
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
@dataclass(slots=True)
|
||||||
class Resolution:
|
class Resolution:
|
||||||
entry: BibEntry
|
entry: BibEntry
|
||||||
source_type: str
|
source_type: str
|
||||||
source_label: str
|
source_label: str
|
||||||
attempts: list[ResolutionAttempt] = field(default_factory=list)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
|
||||||
class ResolutionOutcome:
|
|
||||||
resolution: Resolution | None
|
|
||||||
attempts: list[ResolutionAttempt]
|
|
||||||
|
|
||||||
|
|
||||||
class MetadataResolver:
|
class MetadataResolver:
|
||||||
|
|
@ -52,109 +31,70 @@ class MetadataResolver:
|
||||||
) -> None:
|
) -> None:
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
self.source_client = source_client or SourceClient(user_agent=user_agent)
|
self.source_client = source_client or SourceClient(user_agent=user_agent)
|
||||||
self.europepmc = EuropePmcSource(config={"source_client": self.source_client, "user_agent": user_agent})
|
|
||||||
self.openlibrary = OpenLibrarySource(config={"source_client": self.source_client, "user_agent": user_agent})
|
|
||||||
self.semanticscholar = SemanticScholarSource(config={"user_agent": user_agent})
|
|
||||||
self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "")
|
self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "")
|
||||||
self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist")
|
self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist")
|
||||||
self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "")
|
self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "")
|
||||||
|
|
||||||
def resolve_entry(self, entry: BibEntry) -> Resolution | None:
|
def resolve_entry(self, entry: BibEntry) -> Resolution | None:
|
||||||
return self.resolve_entry_with_trace(entry).resolution
|
|
||||||
|
|
||||||
def resolve_entry_with_trace(self, entry: BibEntry) -> ResolutionOutcome:
|
|
||||||
attempts: list[ResolutionAttempt] = []
|
|
||||||
if doi := entry.fields.get("doi"):
|
if doi := entry.fields.get("doi"):
|
||||||
resolved = self._attempt_direct_resolution(attempts, "crossref", "doi_lookup", doi, self.resolve_doi)
|
resolved = self.resolve_doi(doi)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
resolved = self._attempt_direct_resolution(
|
resolved = self.resolve_datacite_doi(doi)
|
||||||
attempts, "datacite", "doi_lookup", doi, self.resolve_datacite_doi
|
|
||||||
)
|
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
resolved = self._attempt_direct_resolution(
|
|
||||||
attempts, "europepmc", "doi_lookup", doi, self.resolve_europepmc_doi
|
|
||||||
)
|
|
||||||
if resolved is not None:
|
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
|
||||||
resolved = self._attempt_direct_resolution(
|
|
||||||
attempts, "semanticscholar", "doi_lookup", doi, self.resolve_semanticscholar_doi
|
|
||||||
)
|
|
||||||
if resolved is not None:
|
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
|
||||||
|
|
||||||
if pmid := entry.fields.get("pmid"):
|
if pmid := entry.fields.get("pmid"):
|
||||||
resolved = self._attempt_direct_resolution(attempts, "pubmed", "pmid_lookup", pmid, self.resolve_pmid)
|
resolved = self.resolve_pmid(pmid)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
|
|
||||||
if openalex_id := entry.fields.get("openalex"):
|
if openalex_id := entry.fields.get("openalex"):
|
||||||
resolved = self._attempt_direct_resolution(
|
resolved = self.resolve_openalex(openalex_id)
|
||||||
attempts, "openalex", "work_lookup", openalex_id, self.resolve_openalex
|
|
||||||
)
|
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
|
|
||||||
if dblp_key := entry.fields.get("dblp"):
|
if dblp_key := entry.fields.get("dblp"):
|
||||||
resolved = self._attempt_direct_resolution(attempts, "dblp", "key_lookup", dblp_key, self.resolve_dblp)
|
resolved = self.resolve_dblp(dblp_key)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
|
|
||||||
if arxiv_id := entry.fields.get("arxiv"):
|
if arxiv_id := entry.fields.get("arxiv"):
|
||||||
resolved = self._attempt_direct_resolution(
|
resolved = self.resolve_arxiv(arxiv_id)
|
||||||
attempts, "arxiv", "id_lookup", arxiv_id, self.resolve_arxiv
|
|
||||||
)
|
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
|
|
||||||
if title := entry.fields.get("title"):
|
if title := entry.fields.get("title"):
|
||||||
author_text = entry.fields.get("author", "")
|
resolved = self.search_crossref_best_match(
|
||||||
year = entry.fields.get("year", "")
|
title=title,
|
||||||
resolved = self._attempt_title_search_resolution(
|
author_text=entry.fields.get("author", ""),
|
||||||
attempts, "crossref", title, author_text, year, self.search_crossref
|
year=entry.fields.get("year", ""),
|
||||||
)
|
)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
resolved = self._attempt_title_search_resolution(
|
resolved = self.search_datacite_best_match(
|
||||||
attempts, "datacite", title, author_text, year, self.search_datacite
|
title=title,
|
||||||
|
author_text=entry.fields.get("author", ""),
|
||||||
|
year=entry.fields.get("year", ""),
|
||||||
)
|
)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
resolved = self._attempt_title_search_resolution(
|
resolved = self.search_openalex_best_match(
|
||||||
attempts, "openalex", title, author_text, year, self.search_openalex
|
title=title,
|
||||||
|
author_text=entry.fields.get("author", ""),
|
||||||
|
year=entry.fields.get("year", ""),
|
||||||
)
|
)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
resolved = self._attempt_title_search_resolution(
|
resolved = self.search_pubmed_best_match(
|
||||||
attempts, "pubmed", title, author_text, year, self.search_pubmed
|
title=title,
|
||||||
|
author_text=entry.fields.get("author", ""),
|
||||||
|
year=entry.fields.get("year", ""),
|
||||||
)
|
)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
return resolved
|
||||||
resolved = self._attempt_title_search_resolution(
|
|
||||||
attempts, "europepmc", title, author_text, year, self.search_europepmc
|
|
||||||
)
|
|
||||||
if resolved is not None:
|
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
|
||||||
resolved = self._attempt_title_search_resolution(
|
|
||||||
attempts, "semanticscholar", title, author_text, year, self.search_semanticscholar
|
|
||||||
)
|
|
||||||
if resolved is not None:
|
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
|
||||||
if _entry_prefers_catalog_search(entry):
|
|
||||||
resolved = self._attempt_title_search_resolution(
|
|
||||||
attempts,
|
|
||||||
"openlibrary",
|
|
||||||
title,
|
|
||||||
author_text,
|
|
||||||
year,
|
|
||||||
self.search_openlibrary,
|
|
||||||
selector=_select_best_catalog_title_match,
|
|
||||||
)
|
|
||||||
if resolved is not None:
|
|
||||||
return ResolutionOutcome(resolution=resolved, attempts=attempts)
|
|
||||||
|
|
||||||
return ResolutionOutcome(resolution=None, attempts=attempts)
|
return None
|
||||||
|
|
||||||
def resolve_doi(self, doi: str) -> Resolution | None:
|
def resolve_doi(self, doi: str) -> Resolution | None:
|
||||||
encoded = urllib.parse.quote(doi, safe="")
|
encoded = urllib.parse.quote(doi, safe="")
|
||||||
|
|
@ -184,7 +124,19 @@ class MetadataResolver:
|
||||||
author_text: str = "",
|
author_text: str = "",
|
||||||
year: str = "",
|
year: str = "",
|
||||||
) -> Resolution | None:
|
) -> Resolution | None:
|
||||||
return self._search_best_match_resolution("crossref", title, author_text, year, self.search_crossref)
|
candidate = _select_best_title_match(
|
||||||
|
self.search_crossref(title, limit=5),
|
||||||
|
title=title,
|
||||||
|
author_text=author_text,
|
||||||
|
year=year,
|
||||||
|
)
|
||||||
|
if candidate is None:
|
||||||
|
return None
|
||||||
|
return Resolution(
|
||||||
|
entry=candidate,
|
||||||
|
source_type="resolver",
|
||||||
|
source_label=f"crossref:search:{title}",
|
||||||
|
)
|
||||||
|
|
||||||
def resolve_dblp(self, dblp_key: str) -> Resolution | None:
|
def resolve_dblp(self, dblp_key: str) -> Resolution | None:
|
||||||
encoded_key = urllib.parse.quote(dblp_key, safe="/:")
|
encoded_key = urllib.parse.quote(dblp_key, safe="/:")
|
||||||
|
|
@ -293,7 +245,19 @@ class MetadataResolver:
|
||||||
author_text: str = "",
|
author_text: str = "",
|
||||||
year: str = "",
|
year: str = "",
|
||||||
) -> Resolution | None:
|
) -> Resolution | None:
|
||||||
return self._search_best_match_resolution("datacite", title, author_text, year, self.search_datacite)
|
candidate = _select_best_title_match(
|
||||||
|
self.search_datacite(title, limit=5),
|
||||||
|
title=title,
|
||||||
|
author_text=author_text,
|
||||||
|
year=year,
|
||||||
|
)
|
||||||
|
if candidate is None:
|
||||||
|
return None
|
||||||
|
return Resolution(
|
||||||
|
entry=candidate,
|
||||||
|
source_type="resolver",
|
||||||
|
source_label=f"datacite:search:{title}",
|
||||||
|
)
|
||||||
|
|
||||||
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
|
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
|
||||||
query = urllib.parse.urlencode({"search": title, "per-page": limit})
|
query = urllib.parse.urlencode({"search": title, "per-page": limit})
|
||||||
|
|
@ -326,35 +290,6 @@ class MetadataResolver:
|
||||||
return []
|
return []
|
||||||
return self._fetch_pubmed_entries(ids[:limit])
|
return self._fetch_pubmed_entries(ids[:limit])
|
||||||
|
|
||||||
def resolve_europepmc_doi(self, doi: str) -> Resolution | None:
|
|
||||||
entry = self.europepmc.lookup_by_doi(doi)
|
|
||||||
if entry is None:
|
|
||||||
return None
|
|
||||||
return Resolution(
|
|
||||||
entry=entry,
|
|
||||||
source_type="resolver",
|
|
||||||
source_label=f"europepmc:doi:{doi}",
|
|
||||||
)
|
|
||||||
|
|
||||||
def search_europepmc(self, title: str, limit: int = 5) -> list[BibEntry]:
|
|
||||||
return self.europepmc.search(title, limit=limit)
|
|
||||||
|
|
||||||
def search_openlibrary(self, title: str, limit: int = 5) -> list[BibEntry]:
|
|
||||||
return self.openlibrary.search(title, limit=limit)
|
|
||||||
|
|
||||||
def resolve_semanticscholar_doi(self, doi: str) -> Resolution | None:
|
|
||||||
entry = self.semanticscholar.lookup_by_doi(doi)
|
|
||||||
if entry is None:
|
|
||||||
return None
|
|
||||||
return Resolution(
|
|
||||||
entry=entry,
|
|
||||||
source_type="resolver",
|
|
||||||
source_label=f"semanticscholar:doi:{doi}",
|
|
||||||
)
|
|
||||||
|
|
||||||
def search_semanticscholar(self, title: str, limit: int = 5) -> list[BibEntry]:
|
|
||||||
return self.semanticscholar.search(title, limit=limit)
|
|
||||||
|
|
||||||
def _safe_get_json(self, url: str) -> dict | None:
|
def _safe_get_json(self, url: str) -> dict | None:
|
||||||
try:
|
try:
|
||||||
return self.source_client.get_json(url)
|
return self.source_client.get_json(url)
|
||||||
|
|
@ -398,7 +333,19 @@ class MetadataResolver:
|
||||||
author_text: str = "",
|
author_text: str = "",
|
||||||
year: str = "",
|
year: str = "",
|
||||||
) -> Resolution | None:
|
) -> Resolution | None:
|
||||||
return self._search_best_match_resolution("openalex", title, author_text, year, self.search_openalex)
|
candidate = _select_best_title_match(
|
||||||
|
self.search_openalex(title, limit=5),
|
||||||
|
title=title,
|
||||||
|
author_text=author_text,
|
||||||
|
year=year,
|
||||||
|
)
|
||||||
|
if candidate is None:
|
||||||
|
return None
|
||||||
|
return Resolution(
|
||||||
|
entry=candidate,
|
||||||
|
source_type="resolver",
|
||||||
|
source_label=f"openalex:search:{title}",
|
||||||
|
)
|
||||||
|
|
||||||
def search_pubmed_best_match(
|
def search_pubmed_best_match(
|
||||||
self,
|
self,
|
||||||
|
|
@ -406,122 +353,19 @@ class MetadataResolver:
|
||||||
author_text: str = "",
|
author_text: str = "",
|
||||||
year: str = "",
|
year: str = "",
|
||||||
) -> Resolution | None:
|
) -> Resolution | None:
|
||||||
return self._search_best_match_resolution("pubmed", title, author_text, year, self.search_pubmed)
|
candidate = _select_best_title_match(
|
||||||
|
self.search_pubmed(title, limit=5),
|
||||||
def search_europepmc_best_match(
|
title=title,
|
||||||
self,
|
author_text=author_text,
|
||||||
title: str,
|
year=year,
|
||||||
author_text: str = "",
|
|
||||||
year: str = "",
|
|
||||||
) -> Resolution | None:
|
|
||||||
return self._search_best_match_resolution("europepmc", title, author_text, year, self.search_europepmc)
|
|
||||||
|
|
||||||
def search_semanticscholar_best_match(
|
|
||||||
self,
|
|
||||||
title: str,
|
|
||||||
author_text: str = "",
|
|
||||||
year: str = "",
|
|
||||||
) -> Resolution | None:
|
|
||||||
return self._search_best_match_resolution(
|
|
||||||
"semanticscholar", title, author_text, year, self.search_semanticscholar
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def search_openlibrary_best_match(
|
|
||||||
self,
|
|
||||||
title: str,
|
|
||||||
author_text: str = "",
|
|
||||||
year: str = "",
|
|
||||||
) -> Resolution | None:
|
|
||||||
return self._search_best_match_resolution("openlibrary", title, author_text, year, self.search_openlibrary)
|
|
||||||
|
|
||||||
def _search_best_match_resolution(
|
|
||||||
self, source_name: str, title: str, author_text: str, year: str, search_func
|
|
||||||
) -> Resolution | None:
|
|
||||||
candidates = search_func(title, limit=5)
|
|
||||||
candidate = _select_best_title_match(candidates, title=title, author_text=author_text, year=year)
|
|
||||||
if candidate is None:
|
if candidate is None:
|
||||||
return None
|
return None
|
||||||
return Resolution(entry=candidate, source_type="resolver", source_label=f"{source_name}:search:{title}")
|
return Resolution(
|
||||||
|
entry=candidate,
|
||||||
def _attempt_direct_resolution(
|
source_type="resolver",
|
||||||
self,
|
source_label=f"pubmed:search:{title}",
|
||||||
attempts: list[ResolutionAttempt],
|
|
||||||
source_name: str,
|
|
||||||
strategy: str,
|
|
||||||
query_value: str,
|
|
||||||
resolver_func,
|
|
||||||
) -> Resolution | None:
|
|
||||||
try:
|
|
||||||
resolution = resolver_func(query_value)
|
|
||||||
except Exception as exc:
|
|
||||||
attempts.append(
|
|
||||||
ResolutionAttempt(
|
|
||||||
source_name=source_name,
|
|
||||||
strategy=strategy,
|
|
||||||
query_value=query_value,
|
|
||||||
matched=False,
|
|
||||||
error=str(exc),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
attempts.append(
|
|
||||||
ResolutionAttempt(
|
|
||||||
source_name=source_name,
|
|
||||||
strategy=strategy,
|
|
||||||
query_value=query_value,
|
|
||||||
matched=resolution is not None,
|
|
||||||
source_label=resolution.source_label if resolution is not None else "",
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
if resolution is not None and not resolution.attempts:
|
|
||||||
resolution.attempts = list(attempts)
|
|
||||||
return resolution
|
|
||||||
|
|
||||||
def _attempt_title_search_resolution(
|
|
||||||
self,
|
|
||||||
attempts: list[ResolutionAttempt],
|
|
||||||
source_name: str,
|
|
||||||
title: str,
|
|
||||||
author_text: str,
|
|
||||||
year: str,
|
|
||||||
search_func,
|
|
||||||
selector=None,
|
|
||||||
) -> Resolution | None:
|
|
||||||
try:
|
|
||||||
candidates = search_func(title, limit=5)
|
|
||||||
except Exception as exc:
|
|
||||||
attempts.append(
|
|
||||||
ResolutionAttempt(
|
|
||||||
source_name=source_name,
|
|
||||||
strategy="title_search",
|
|
||||||
query_value=title,
|
|
||||||
matched=False,
|
|
||||||
error=str(exc),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
match_selector = selector or _select_best_title_match
|
|
||||||
candidate = match_selector(candidates, title=title, author_text=author_text, year=year)
|
|
||||||
resolution = None
|
|
||||||
if candidate is not None:
|
|
||||||
resolution = Resolution(
|
|
||||||
entry=candidate,
|
|
||||||
source_type="resolver",
|
|
||||||
source_label=f"{source_name}:search:{title}",
|
|
||||||
)
|
|
||||||
attempts.append(
|
|
||||||
ResolutionAttempt(
|
|
||||||
source_name=source_name,
|
|
||||||
strategy="title_search",
|
|
||||||
query_value=title,
|
|
||||||
matched=resolution is not None,
|
|
||||||
candidate_count=len(candidates),
|
|
||||||
source_label=resolution.source_label if resolution is not None else "",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if resolution is not None and not resolution.attempts:
|
|
||||||
resolution.attempts = list(attempts)
|
|
||||||
return resolution
|
|
||||||
|
|
||||||
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
|
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
|
||||||
ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid]
|
ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid]
|
||||||
|
|
@ -924,42 +768,6 @@ def _select_best_title_match(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _select_best_catalog_title_match(
|
|
||||||
candidates: list[BibEntry],
|
|
||||||
title: str,
|
|
||||||
author_text: str = "",
|
|
||||||
year: str = "",
|
|
||||||
) -> BibEntry | None:
|
|
||||||
if not candidates:
|
|
||||||
return None
|
|
||||||
|
|
||||||
title_tokens = _catalog_title_tokens(title)
|
|
||||||
author_tokens = _author_match_tokens(author_text)
|
|
||||||
year_text = str(year or "").strip()
|
|
||||||
scored: list[tuple[float, BibEntry]] = []
|
|
||||||
|
|
||||||
for candidate in candidates:
|
|
||||||
candidate_title_tokens = _catalog_title_tokens(candidate.fields.get("title", ""))
|
|
||||||
if not candidate_title_tokens:
|
|
||||||
continue
|
|
||||||
overlap = len(title_tokens & candidate_title_tokens)
|
|
||||||
union = len(title_tokens | candidate_title_tokens)
|
|
||||||
score = (overlap / union) if union else 0.0
|
|
||||||
if score < 0.6:
|
|
||||||
continue
|
|
||||||
candidate_year = str(candidate.fields.get("year", "") or "").strip()
|
|
||||||
if year_text and candidate_year and year_text != candidate_year:
|
|
||||||
continue
|
|
||||||
if author_tokens and not _candidate_matches_author_tokens(candidate, author_tokens):
|
|
||||||
continue
|
|
||||||
scored.append((score, candidate))
|
|
||||||
|
|
||||||
if not scored:
|
|
||||||
return None
|
|
||||||
scored.sort(key=lambda item: (-item[0], item[1].citation_key))
|
|
||||||
return scored[0][1]
|
|
||||||
|
|
||||||
|
|
||||||
def _author_match_tokens(author_text: str) -> set[str]:
|
def _author_match_tokens(author_text: str) -> set[str]:
|
||||||
normalized = _normalize_match_text(author_text)
|
normalized = _normalize_match_text(author_text)
|
||||||
if not normalized:
|
if not normalized:
|
||||||
|
|
@ -980,39 +788,6 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str
|
||||||
return bool(author_tokens & candidate_tokens)
|
return bool(author_tokens & candidate_tokens)
|
||||||
|
|
||||||
|
|
||||||
def _catalog_title_tokens(value: str) -> set[str]:
|
|
||||||
normalized = _normalize_match_text(value)
|
|
||||||
stopwords = {"the", "and", "for", "with", "from", "into", "after", "all"}
|
|
||||||
return {
|
|
||||||
f"{token[:-4]}ic" if token.endswith("ical") and len(token) > 6 else token
|
|
||||||
for token in re.findall(r"[a-z0-9]+", normalized)
|
|
||||||
if len(token) >= 4 and token not in stopwords
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _entry_prefers_catalog_search(entry: BibEntry) -> bool:
|
|
||||||
if entry.entry_type in {"book", "incollection", "phdthesis", "mastersthesis"}:
|
|
||||||
return True
|
|
||||||
title = _normalize_match_text(entry.fields.get("title", ""))
|
|
||||||
venue = _normalize_match_text(
|
|
||||||
" ".join(
|
|
||||||
filter(
|
|
||||||
None,
|
|
||||||
[
|
|
||||||
entry.fields.get("publisher", ""),
|
|
||||||
entry.fields.get("howpublished", ""),
|
|
||||||
entry.fields.get("booktitle", ""),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if entry.entry_type != "misc":
|
|
||||||
return False
|
|
||||||
if any(token in venue for token in ("press", "university", "house", "dictionary", "christendom")):
|
|
||||||
return True
|
|
||||||
return any(token in title for token in ("dictionary", "history", "world", "universe", "record"))
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_pmid(value: str) -> str:
|
def _normalize_pmid(value: str) -> str:
|
||||||
return "".join(ch for ch in str(value) if ch.isdigit())
|
return "".join(ch for ch in str(value) if ch.isdigit())
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,27 +0,0 @@
|
||||||
"""
|
|
||||||
Identifier resolution and normalization module.
|
|
||||||
|
|
||||||
Provides functions for extracting, normalizing, and resolving
|
|
||||||
bibliographic identifiers across multiple schemes.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from citegeist.resolver.identifiers import (
|
|
||||||
IdentifierExtractor,
|
|
||||||
IdentifierNormalizer,
|
|
||||||
IdentifierResolver,
|
|
||||||
extract_identifiers,
|
|
||||||
normalize_identifier,
|
|
||||||
get_primary_identifier,
|
|
||||||
resolve_identifiers,
|
|
||||||
)
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
'IdentifierExtractor',
|
|
||||||
'IdentifierNormalizer',
|
|
||||||
'IdentifierResolver',
|
|
||||||
'extract_identifiers',
|
|
||||||
'normalize_identifier',
|
|
||||||
'get_primary_identifier',
|
|
||||||
'resolve_identifiers',
|
|
||||||
]
|
|
||||||
|
|
@ -1,418 +0,0 @@
|
||||||
"""
|
|
||||||
Identifier resolution and normalization module.
|
|
||||||
|
|
||||||
This module provides functions for extracting, normalizing, and resolving
|
|
||||||
bibliographic identifiers across multiple schemes (DOI, PMID, arXiv, ORCID, etc.).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import re
|
|
||||||
from typing import Dict, List, Optional, Tuple
|
|
||||||
|
|
||||||
|
|
||||||
# Identifier scheme patterns
|
|
||||||
DOI_PATTERN = re.compile(
|
|
||||||
r'^10\.\d{4,9}/\S+$',
|
|
||||||
re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
PMID_PATTERN = re.compile(r'^\d{5,7}$')
|
|
||||||
|
|
||||||
PMCID_PATTERN = re.compile(
|
|
||||||
r'^PMC\d+$|^PMC[0-9a-f]+$',
|
|
||||||
re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
ARXIV_PATTERN = re.compile(
|
|
||||||
r'^\d{4}\.\d{4,5}(v\d+)?$',
|
|
||||||
re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
ORCID_PATTERN = re.compile(
|
|
||||||
r'^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$',
|
|
||||||
re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
ROR_PATTERN = re.compile(
|
|
||||||
r'^https?://ror\.org/[0-9A-Z]{4,10}$'
|
|
||||||
)
|
|
||||||
|
|
||||||
DBLP_PATTERN = re.compile(
|
|
||||||
r'^[a-zA-Z0-9_]+:[a-zA-Z0-9_]+$',
|
|
||||||
re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
OPENALEX_PATTERN = re.compile(
|
|
||||||
r'^W[0-9]{4}-[A-F0-9]{4}$',
|
|
||||||
re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class IdentifierExtractor:
|
|
||||||
"""Extract identifiers from BibEntry fields."""
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def extract(entry_fields: Dict[str, str]) -> Dict[str, str]:
|
|
||||||
"""Extract all identifier schemes from entry fields.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
entry_fields: Dictionary of entry fields
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary mapping scheme names to values
|
|
||||||
"""
|
|
||||||
identifiers = {}
|
|
||||||
|
|
||||||
# DOI
|
|
||||||
if doi := entry_fields.get('doi'):
|
|
||||||
identifiers['doi'] = doi
|
|
||||||
|
|
||||||
# PMID
|
|
||||||
if pmid := entry_fields.get('pmid'):
|
|
||||||
identifiers['pmid'] = pmid
|
|
||||||
|
|
||||||
# PMCID
|
|
||||||
if pmcid := entry_fields.get('pmcid'):
|
|
||||||
identifiers['pmcid'] = pmcid
|
|
||||||
|
|
||||||
# arXiv
|
|
||||||
if arxiv := entry_fields.get('arxiv'):
|
|
||||||
identifiers['arxiv'] = arxiv
|
|
||||||
|
|
||||||
# DBLP
|
|
||||||
if dblp := entry_fields.get('dblp'):
|
|
||||||
identifiers['dblp'] = dblp
|
|
||||||
|
|
||||||
# OpenAlex
|
|
||||||
if openalex := entry_fields.get('openalex'):
|
|
||||||
identifiers['openalex'] = openalex
|
|
||||||
|
|
||||||
# ISBN
|
|
||||||
if isbn := entry_fields.get('isbn'):
|
|
||||||
identifiers['isbn'] = isbn
|
|
||||||
|
|
||||||
# ISSN
|
|
||||||
if issn := entry_fields.get('issn'):
|
|
||||||
identifiers['issn'] = issn
|
|
||||||
|
|
||||||
return identifiers
|
|
||||||
|
|
||||||
|
|
||||||
class IdentifierNormalizer:
|
|
||||||
"""Normalize identifiers to canonical form."""
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_doi(doi: str) -> Optional[str]:
|
|
||||||
"""Normalize DOI to lowercase.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
doi: DOI string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Lowercase DOI, or None if invalid
|
|
||||||
"""
|
|
||||||
if not doi:
|
|
||||||
return None
|
|
||||||
normalized = doi.strip().lower()
|
|
||||||
if DOI_PATTERN.match(normalized):
|
|
||||||
return normalized
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_pmid(pmid: str) -> Optional[str]:
|
|
||||||
"""Normalize PMID to string.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pmid: PMID string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
PMID string, or None if invalid
|
|
||||||
"""
|
|
||||||
if not pmid:
|
|
||||||
return None
|
|
||||||
pmid_str = str(pmid).strip()
|
|
||||||
if PMID_PATTERN.match(pmid_str):
|
|
||||||
return pmid_str
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_pmcid(pmcid: str) -> Optional[str]:
|
|
||||||
"""Normalize PMCID to lowercase.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pmcid: PMCID string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Lowercase PMCID, or None if invalid
|
|
||||||
"""
|
|
||||||
if not pmcid:
|
|
||||||
return None
|
|
||||||
normalized = pmcid.strip().lower()
|
|
||||||
if PMCID_PATTERN.match(normalized):
|
|
||||||
return normalized
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_arxiv(arxiv: str) -> Optional[str]:
|
|
||||||
"""Normalize arXiv ID.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
arxiv: arXiv ID string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Normalized arXiv ID, or None if invalid
|
|
||||||
"""
|
|
||||||
if not arxiv:
|
|
||||||
return None
|
|
||||||
# Remove 'v' and version suffix if present
|
|
||||||
normalized = arxiv.strip().lower()
|
|
||||||
if 'v' in normalized:
|
|
||||||
normalized = normalized.split('v')[0]
|
|
||||||
if ARXIV_PATTERN.match(normalized):
|
|
||||||
return normalized
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_orcid(orcid: str) -> Optional[str]:
|
|
||||||
"""Normalize ORCID to canonical format.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
orcid: ORCID string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Normalized ORCID (XXXX-XXXX-XXXX-XXX0), or None if invalid
|
|
||||||
"""
|
|
||||||
if not orcid:
|
|
||||||
return None
|
|
||||||
orcid = orcid.strip().upper().replace(' ', '')
|
|
||||||
if ORCID_PATTERN.match(orcid):
|
|
||||||
return orcid
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_ror(ror_url: str) -> Optional[str]:
|
|
||||||
"""Normalize ROR URL to identifier.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ror_url: ROR URL string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
ROR identifier, or None if invalid
|
|
||||||
"""
|
|
||||||
if not ror_url:
|
|
||||||
return None
|
|
||||||
ror_id = ror_url.strip().lower()
|
|
||||||
if ROR_PATTERN.match(ror_id):
|
|
||||||
return ror_id
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_dblp(dblp_key: str) -> Optional[str]:
|
|
||||||
"""Normalize DBLP key.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
dblp_key: DBLP key string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
DBLP key, or None if invalid
|
|
||||||
"""
|
|
||||||
if not dblp_key:
|
|
||||||
return None
|
|
||||||
dblp = dblp_key.strip()
|
|
||||||
if DBLP_PATTERN.match(dblp):
|
|
||||||
return dblp
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_openalex(openalex_id: str) -> Optional[str]:
|
|
||||||
"""Normalize OpenAlex ID.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
openalex_id: OpenAlex ID string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
OpenAlex ID, or None if invalid
|
|
||||||
"""
|
|
||||||
if not openalex_id:
|
|
||||||
return None
|
|
||||||
openalex = openalex_id.strip().upper()
|
|
||||||
if OPENALEX_PATTERN.match(openalex):
|
|
||||||
return openalex
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
|
|
||||||
"""Normalize an identifier.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
scheme: Identifier scheme name
|
|
||||||
value: Identifier value
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (scheme, normalized_value), or None if invalid
|
|
||||||
"""
|
|
||||||
scheme = scheme.lower()
|
|
||||||
|
|
||||||
normalizers = {
|
|
||||||
'doi': IdentifierNormalizer.normalize_doi,
|
|
||||||
'pmid': IdentifierNormalizer.normalize_pmid,
|
|
||||||
'pmcid': IdentifierNormalizer.normalize_pmcid,
|
|
||||||
'arxiv': IdentifierNormalizer.normalize_arxiv,
|
|
||||||
'orcid': IdentifierNormalizer.normalize_orcid,
|
|
||||||
'ror': IdentifierNormalizer.normalize_ror,
|
|
||||||
'dblp': IdentifierNormalizer.normalize_dblp,
|
|
||||||
'openalex': IdentifierNormalizer.normalize_openalex,
|
|
||||||
}
|
|
||||||
|
|
||||||
normalizer = normalizers.get(scheme)
|
|
||||||
if normalizer:
|
|
||||||
normalized = normalizer(value)
|
|
||||||
if normalized:
|
|
||||||
return (scheme, normalized)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
class IdentifierResolver:
|
|
||||||
"""Resolve identifiers across multiple schemes."""
|
|
||||||
|
|
||||||
# Lookup priority: schemes should be checked in this order
|
|
||||||
LOOKUP_PRIORITY = [
|
|
||||||
('doi', IdentifierNormalizer.normalize_doi),
|
|
||||||
('pmid', IdentifierNormalizer.normalize_pmid),
|
|
||||||
('pmcid', IdentifierNormalizer.normalize_pmcid),
|
|
||||||
('arxiv', IdentifierNormalizer.normalize_arxiv),
|
|
||||||
('dblp', IdentifierNormalizer.normalize_dblp),
|
|
||||||
('openalex', IdentifierNormalizer.normalize_openalex),
|
|
||||||
]
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def resolve(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
|
|
||||||
"""Resolve identifiers from entry fields.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
entry_fields: Dictionary of entry fields
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of (scheme, normalized_value) tuples in priority order
|
|
||||||
"""
|
|
||||||
identifiers = IdentifierExtractor.extract(entry_fields)
|
|
||||||
resolved = []
|
|
||||||
|
|
||||||
for scheme, value in identifiers.items():
|
|
||||||
if normalized := IdentifierNormalizer.normalize_identifier(scheme, value):
|
|
||||||
resolved.append(normalized)
|
|
||||||
|
|
||||||
# Add title fingerprint as fallback
|
|
||||||
if title := entry_fields.get('title'):
|
|
||||||
fingerprint = IdentifierResolver._create_title_fingerprint(title)
|
|
||||||
if fingerprint:
|
|
||||||
resolved.append(('title', fingerprint))
|
|
||||||
|
|
||||||
return resolved
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _create_title_fingerprint(title: str) -> Optional[str]:
|
|
||||||
"""Create a fingerprint from title for fallback lookup.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
title: Work title
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Fingerprint string
|
|
||||||
"""
|
|
||||||
if not title:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Remove common words, punctuation, and normalize
|
|
||||||
words = title.lower()
|
|
||||||
words = re.sub(r'[^\w\s]', ' ', words) # Remove punctuation
|
|
||||||
words = re.sub(r'\s+', ' ', words) # Normalize whitespace
|
|
||||||
words = words.strip()
|
|
||||||
|
|
||||||
return words
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
|
|
||||||
"""Get the primary identifier (first in priority order).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
entry_fields: Dictionary of entry fields
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (scheme, value), or None if no identifier found
|
|
||||||
"""
|
|
||||||
resolved = IdentifierResolver.resolve(entry_fields)
|
|
||||||
|
|
||||||
for scheme, _ in IdentifierResolver.LOOKUP_PRIORITY:
|
|
||||||
# Find this scheme in resolved identifiers
|
|
||||||
for rscheme, rvalue in resolved:
|
|
||||||
if rscheme == scheme:
|
|
||||||
return (rscheme, rvalue)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_scheme_value(scheme: str, entry_fields: Dict[str, str]) -> Optional[str]:
|
|
||||||
"""Get a specific identifier value from entry fields.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
scheme: Identifier scheme name
|
|
||||||
entry_fields: Dictionary of entry fields
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Identifier value, or None if not found
|
|
||||||
"""
|
|
||||||
if value := entry_fields.get(scheme):
|
|
||||||
if normalized := IdentifierNormalizer.normalize_identifier(scheme, value):
|
|
||||||
return normalized[1]
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# Convenience functions
|
|
||||||
def extract_identifiers(entry_fields: Dict[str, str]) -> Dict[str, str]:
|
|
||||||
"""Extract all identifiers from entry fields.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
entry_fields: Dictionary of entry fields
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary mapping scheme names to values
|
|
||||||
"""
|
|
||||||
return IdentifierExtractor.extract(entry_fields)
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_identifier(scheme: str, value: str) -> Optional[Tuple[str, str]]:
|
|
||||||
"""Normalize an identifier.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
scheme: Identifier scheme name
|
|
||||||
value: Identifier value
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (scheme, normalized_value), or None if invalid
|
|
||||||
"""
|
|
||||||
return IdentifierNormalizer.normalize_identifier(scheme, value)
|
|
||||||
|
|
||||||
|
|
||||||
def get_primary_identifier(entry_fields: Dict[str, str]) -> Optional[Tuple[str, str]]:
|
|
||||||
"""Get the primary identifier.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
entry_fields: Dictionary of entry fields
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (scheme, value), or None if no identifier found
|
|
||||||
"""
|
|
||||||
return IdentifierResolver.get_primary_identifier(entry_fields)
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_identifiers(entry_fields: Dict[str, str]) -> List[Tuple[str, str]]:
|
|
||||||
"""Resolve identifiers from entry fields.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
entry_fields: Dictionary of entry fields
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of (scheme, value) tuples
|
|
||||||
"""
|
|
||||||
return IdentifierResolver.resolve(entry_fields)
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
||||||
"""Export all source plugins."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
|
|
||||||
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
|
|
||||||
from citegeist.sources.registry import SourceRegistry, get_registry
|
|
||||||
from citegeist.sources.crossref import CrossRefSource
|
|
||||||
from citegeist.sources.europepmc import EuropePmcSource
|
|
||||||
from citegeist.sources.opencitations import OpenCitationsSource
|
|
||||||
from citegeist.sources.openlibrary import OpenLibrarySource
|
|
||||||
from citegeist.sources.semanticscholar import SemanticScholarSource
|
|
||||||
from citegeist.sources.unpaywall import UnpaywallSource
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
'BibliographicSource',
|
|
||||||
'SourceRecord',
|
|
||||||
'CitationEdge',
|
|
||||||
'SourceCatalogEntry',
|
|
||||||
'SourceRegistry',
|
|
||||||
'get_registry',
|
|
||||||
'list_source_catalog',
|
|
||||||
'prioritized_source_keys',
|
|
||||||
'CrossRefSource',
|
|
||||||
'EuropePmcSource',
|
|
||||||
'OpenCitationsSource',
|
|
||||||
'OpenLibrarySource',
|
|
||||||
'SemanticScholarSource',
|
|
||||||
'UnpaywallSource',
|
|
||||||
]
|
|
||||||
|
|
@ -1,44 +0,0 @@
|
||||||
"""
|
|
||||||
Bibliographic source plugins.
|
|
||||||
|
|
||||||
This package provides a plugin architecture for integrating multiple
|
|
||||||
bibliographic data sources (CrossRef, PubMed, Semantic Scholar, etc.).
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Import old sources module for backward compatibility
|
|
||||||
from . import _old_sources_compat
|
|
||||||
|
|
||||||
# Import new plugin architecture
|
|
||||||
from citegeist.sources.base import BibliographicSource, SourceRecord, CitationEdge
|
|
||||||
from citegeist.sources.catalog import SourceCatalogEntry, list_source_catalog, prioritized_source_keys
|
|
||||||
from citegeist.sources.registry import SourceRegistry, get_registry
|
|
||||||
from citegeist.sources.crossref import CrossRefSource
|
|
||||||
from citegeist.sources.europepmc import EuropePmcSource
|
|
||||||
from citegeist.sources.opencitations import OpenCitationsSource
|
|
||||||
from citegeist.sources.openlibrary import OpenLibrarySource
|
|
||||||
from citegeist.sources.semanticscholar import SemanticScholarSource
|
|
||||||
from citegeist.sources.unpaywall import UnpaywallSource
|
|
||||||
|
|
||||||
# Re-export old classes for compatibility
|
|
||||||
__all__ = [
|
|
||||||
# New plugin architecture
|
|
||||||
'BibliographicSource',
|
|
||||||
'SourceRecord',
|
|
||||||
'CitationEdge',
|
|
||||||
'SourceCatalogEntry',
|
|
||||||
'SourceRegistry',
|
|
||||||
'get_registry',
|
|
||||||
'list_source_catalog',
|
|
||||||
'prioritized_source_keys',
|
|
||||||
'CrossRefSource',
|
|
||||||
'EuropePmcSource',
|
|
||||||
'OpenCitationsSource',
|
|
||||||
'OpenLibrarySource',
|
|
||||||
'SemanticScholarSource',
|
|
||||||
'UnpaywallSource',
|
|
||||||
# Old API (for backward compatibility)
|
|
||||||
'SourceClient',
|
|
||||||
]
|
|
||||||
|
|
||||||
# Backward compatibility - make SourceClient available from this module
|
|
||||||
SourceClient = _old_sources_compat.SourceClient
|
|
||||||
|
|
@ -1,25 +0,0 @@
|
||||||
"""
|
|
||||||
Backward compatibility module for old sources module.
|
|
||||||
|
|
||||||
This module re-exports the old SourceClient class for compatibility.
|
|
||||||
"""
|
|
||||||
from pathlib import Path
|
|
||||||
import importlib.util
|
|
||||||
|
|
||||||
from .base import BibliographicSource, SourceRecord, CitationEdge
|
|
||||||
from .registry import SourceRegistry, get_registry
|
|
||||||
from .crossref import CrossRefSource
|
|
||||||
|
|
||||||
# Load the old sources.py module from the citegeist package root
|
|
||||||
_OLD_SOURCES_PATH = Path(__file__).resolve().parents[1] / "sources.py"
|
|
||||||
spec = importlib.util.spec_from_file_location(
|
|
||||||
"citegeist.sources_old",
|
|
||||||
_OLD_SOURCES_PATH
|
|
||||||
)
|
|
||||||
if spec and spec.loader:
|
|
||||||
old_sources = importlib.util.module_from_spec(spec)
|
|
||||||
spec.loader.exec_module(old_sources)
|
|
||||||
SourceClient = old_sources.SourceClient
|
|
||||||
else:
|
|
||||||
# Fallback if old sources.py doesn't exist
|
|
||||||
SourceClient = None
|
|
||||||
|
|
@ -1,189 +0,0 @@
|
||||||
"""
|
|
||||||
Base interface for bibliographic sources.
|
|
||||||
|
|
||||||
This module defines the abstract base class that all source plugins must implement.
|
|
||||||
Plugins can register themselves with the SourceRegistry for dynamic loading.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
|
||||||
class SourceRecord:
|
|
||||||
"""Represents a raw record from a source API."""
|
|
||||||
raw: Dict[str, Any]
|
|
||||||
source_type: str
|
|
||||||
source_label: str
|
|
||||||
timestamp: str
|
|
||||||
confidence: float
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
|
||||||
class CitationEdge:
|
|
||||||
"""Represents a citation relationship."""
|
|
||||||
source_work_id: str
|
|
||||||
target_work_id: str
|
|
||||||
relation_type: str # "cites" or "cited_by"
|
|
||||||
source_type: str
|
|
||||||
source_label: str
|
|
||||||
confidence: float
|
|
||||||
|
|
||||||
|
|
||||||
class BibliographicSource(ABC):
|
|
||||||
"""Abstract base class for bibliographic data sources.
|
|
||||||
|
|
||||||
All source plugins must inherit from this class and implement the required methods.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
|
||||||
"""Initialize the source with optional configuration.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
config: Source-specific configuration dictionary
|
|
||||||
"""
|
|
||||||
self.config = config or {}
|
|
||||||
self.enabled = self.config.get('enabled', True)
|
|
||||||
self.source_type = self.config.get('source_type', self.__class__.__name__)
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
|
|
||||||
"""Look up a work by DOI.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
doi: Digital Object Identifier
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
BibEntry if found, None otherwise
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def lookup_by_title(self, title: str) -> Optional[BibEntry]:
|
|
||||||
"""Look up a work by title.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
title: Work title
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
BibEntry if found, None otherwise
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def search(self, query: str, limit: int = 10) -> List[BibEntry]:
|
|
||||||
"""Search for works matching the query.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
query: Search query string
|
|
||||||
limit: Maximum number of results
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of matching BibEntry objects
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
|
|
||||||
"""Normalize a raw API record to a canonical BibEntry.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
record: Raw record from source API
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
BibEntry if normalization succeeds, None otherwise
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
def get_citations(self, work_id: str, relation_type: str = 'cites', limit: int = 10) -> List[CitationEdge]:
|
|
||||||
"""Get citations for a work.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
work_id: Work identifier (DOI, PMID, etc.)
|
|
||||||
relation_type: Type of relation ('cites' or 'cited_by')
|
|
||||||
limit: Maximum number of results
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of CitationEdge objects
|
|
||||||
"""
|
|
||||||
return []
|
|
||||||
|
|
||||||
def get_related(self, work_id: str, limit: int = 10) -> List[BibEntry]:
|
|
||||||
"""Get works related to a work.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
work_id: Work identifier
|
|
||||||
limit: Maximum number of results
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of related BibEntry objects
|
|
||||||
"""
|
|
||||||
return []
|
|
||||||
|
|
||||||
def get_fulltext_url(self, doi: str) -> Optional[str]:
|
|
||||||
"""Get full-text URL for a work.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
doi: Digital Object Identifier
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Full-text URL if available, None otherwise
|
|
||||||
"""
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_embedding(self, work_id: str) -> Optional[List[float]]:
|
|
||||||
"""Get embedding vector for a work.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
work_id: Work identifier
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Embedding vector if available, None otherwise
|
|
||||||
"""
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_identifier_scheme(self) -> str:
|
|
||||||
"""Get the identifier scheme used by this source.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Identifier scheme (e.g., 'doi', 'pmid', 'openalex')
|
|
||||||
"""
|
|
||||||
return self.source_type.lower()
|
|
||||||
|
|
||||||
def record_source_metadata(self, entry: BibEntry, operation: str = 'ingest') -> SourceRecord:
|
|
||||||
"""Create a source record for provenance tracking.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
entry: The BibEntry to record
|
|
||||||
operation: Operation type (e.g., 'ingest', 'enrich')
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
SourceRecord with metadata
|
|
||||||
"""
|
|
||||||
return SourceRecord(
|
|
||||||
raw=self._entry_to_dict(entry),
|
|
||||||
source_type=self.source_type,
|
|
||||||
source_label=f"{self.source_type}:{self.config.get('name', self.__class__.__name__)}",
|
|
||||||
timestamp='',
|
|
||||||
confidence=1.0
|
|
||||||
)
|
|
||||||
|
|
||||||
def _entry_to_dict(self, entry: BibEntry) -> Dict[str, Any]:
|
|
||||||
"""Convert BibEntry to dictionary for source records."""
|
|
||||||
return {
|
|
||||||
'entry_type': entry.entry_type,
|
|
||||||
'citation_key': entry.citation_key,
|
|
||||||
'fields': entry.fields
|
|
||||||
}
|
|
||||||
|
|
||||||
def is_available(self) -> bool:
|
|
||||||
"""Check if the source is available and enabled.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if enabled and available, False otherwise
|
|
||||||
"""
|
|
||||||
return self.enabled
|
|
||||||
|
|
@ -1,173 +0,0 @@
|
||||||
"""Open bibliographic source inventory and prioritization helpers."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True, slots=True)
class SourceCatalogEntry:
    """Immutable description of one bibliographic source in the catalog."""

    # Stable machine key used for config/lookups (e.g. "crossref").
    key: str
    # Human-readable display name.
    label: str
    # Broad role, e.g. "metadata", "graph", "metadata+fulltext".
    category: str
    # Access model, e.g. "open API", "open protocol".
    access: str
    # Capability tags the source supports (lookup/search/citations/...).
    capabilities: tuple[str, ...]
    # One-sentence summary of what the source is good at.
    strengths: str
    # One-sentence summary of its limitations.
    caveats: str
    # Integration state, e.g. "integrated" or "planned".
    current_status: str
    # Rollout priority: "now", "next", "selective", or "evaluate".
    priority: str
|
|
||||||
|
|
||||||
|
|
||||||
# Ordered inventory of candidate bibliographic sources.  `priority` steers
# rollout order (see prioritized_source_keys) and `current_status` records
# integration state; the tuple itself is immutable module-level data.
_CATALOG: tuple[SourceCatalogEntry, ...] = (
    SourceCatalogEntry(
        key="crossref",
        label="Crossref",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "reference_lists"),
        strengths="Broad DOI coverage and good article-level metadata.",
        caveats="Citation coverage is incomplete and some references are unstructured blobs.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="openalex",
        label="OpenAlex",
        category="metadata+graph",
        access="open API",
        capabilities=("work_lookup", "title_search", "citations", "cited_by", "topics"),
        strengths="Best current open source for citation graph expansion and work-level discovery.",
        caveats="Occasional noisy secondary records require conservative admission rules.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="pubmed",
        label="PubMed / NCBI E-utilities",
        category="metadata",
        access="open API",
        capabilities=("pmid_lookup", "title_search", "biomedical_metadata"),
        strengths="High-value authoritative metadata for biomedical literature.",
        caveats="Domain-specific coverage outside biomedicine is limited.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="datacite",
        label="DataCite",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "datasets"),
        strengths="Important for datasets, reports, and non-traditional DOI-bearing objects.",
        caveats="Less useful than Crossref/OpenAlex for mainstream article citation graphs.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="dblp",
        label="DBLP",
        category="metadata",
        access="open API",
        capabilities=("key_lookup", "search", "computer_science"),
        strengths="Excellent computer-science coverage and clean bibliographic records.",
        caveats="Discipline-specific rather than general-purpose.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="arxiv",
        label="arXiv",
        category="metadata+fulltext",
        access="open API",
        capabilities=("id_lookup", "search", "preprints"),
        strengths="Useful for preprint-first fields and free full-text links.",
        caveats="Not a general citation graph source.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="open_citations",
        label="OpenCitations",
        category="graph",
        access="open API",
        capabilities=("doi_citations", "doi_references", "provenance"),
        strengths="Directly aligned with open citation-edge expansion.",
        caveats="Coverage is narrower than OpenAlex and needs merge discipline.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="semantic_scholar",
        label="Semantic Scholar",
        category="metadata+graph",
        access="free API with limits",
        capabilities=("work_lookup", "search", "citations", "references"),
        strengths="Strong graph and relevance signals, especially for discovery workflows.",
        caveats="Not fully open data in the same sense as Crossref/OpenAlex/OpenCitations.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="unpaywall",
        label="Unpaywall",
        category="access-links",
        access="open API",
        capabilities=("doi_fulltext_links", "oa_status"),
        strengths="Best open source for landing-page and OA-link enrichment.",
        caveats="Improves access, not bibliographic identity or graph completeness.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="europe_pmc",
        label="Europe PMC",
        category="metadata+fulltext",
        access="open API",
        capabilities=("search", "citations", "fulltext_links", "biomedical"),
        strengths="Valuable biomedical complement to PubMed with richer open-access linkage.",
        caveats="Domain-specific and partially overlapping with PubMed/OpenAlex.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="open_library",
        label="Open Library",
        category="metadata",
        access="open API",
        capabilities=("title_search", "book_metadata", "author_catalog", "isbn_hints"),
        strengths="Useful general-purpose open catalog coverage for books, monographs, and reference works.",
        caveats="Catalog metadata is not a citation graph and can be noisy for article-like titles.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="openaire",
        label="OpenAIRE",
        category="metadata+repository",
        access="open API",
        capabilities=("repository_metadata", "oa_links", "project_links"),
        strengths="Good for repository, project, and European OA discovery.",
        caveats="May be better treated as a corpus-acquisition adapter than a first-line resolver.",
        current_status="planned",
        priority="evaluate",
    ),
    SourceCatalogEntry(
        key="oai_pmh",
        label="OAI-PMH Repositories",
        category="repository",
        access="open protocol",
        capabilities=("repository_harvest", "set_discovery", "metadata_formats"),
        strengths="Already useful for theses, dissertations, and institutional repositories.",
        caveats="Heterogeneous metadata quality; not a single canonical bibliographic source.",
        current_status="integrated",
        priority="selective",
    ),
)
|
|
||||||
|
|
||||||
|
|
||||||
def list_source_catalog() -> list[SourceCatalogEntry]:
    """Return the full source catalog as a fresh, mutable list."""
    return [entry for entry in _CATALOG]
|
|
||||||
|
|
||||||
|
|
||||||
def prioritized_source_keys() -> list[str]:
    """Return catalog keys ordered by rollout priority, then label.

    Priority ranks: "now" < "next" < "selective" < "evaluate".  Unknown
    priority strings sort after all known ranks instead of raising
    KeyError (the original `order[entry.priority]` would crash if a new
    priority value were ever added to the catalog).
    """
    order = {"now": 0, "next": 1, "selective": 2, "evaluate": 3}

    def sort_key(entry: SourceCatalogEntry) -> tuple[int, str]:
        # Unknown priorities rank last; ties broken by case-insensitive label.
        return (order.get(entry.priority, len(order)), entry.label.lower())

    return [entry.key for entry in sorted(_CATALOG, key=sort_key)]
|
|
||||||
|
|
@ -1,210 +0,0 @@
|
||||||
"""
|
|
||||||
CrossRef source plugin.
|
|
||||||
|
|
||||||
CrossRef provides metadata for DOIs for scholarly works.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import urllib.request
|
|
||||||
import urllib.parse
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry
|
|
||||||
from citegeist.sources.base import BibliographicSource
|
|
||||||
|
|
||||||
|
|
||||||
class CrossRefSource(BibliographicSource):
    """CrossRef source for DOI-based metadata lookup.

    Talks to the public CrossRef REST API (api.crossref.org) and maps
    its JSON work records onto BibEntry objects.
    """

    BASE_URL = "https://api.crossref.org"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize CrossRef source.

        Args:
            config: Configuration with optional 'api_key', 'user_agent',
                and 'timeout' (seconds, default 30).
        """
        super().__init__(config)
        self.api_key = self.config.get('api_key', '')
        self.user_agent = self.config.get(
            'user_agent',
            'citegeist/0.1 (local research tool)',
        )
        # Without a timeout a stalled connection would block indefinitely.
        self.timeout = float(self.config.get('timeout', 30))

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibEntry if found, None otherwise (including on any
            network or parse failure).
        """
        if not doi:
            return None

        encoded = urllib.parse.quote(doi, safe="")
        payload = self._get_json(f"{self.BASE_URL}/works/{encoded}")
        if payload is None:
            return None
        return self._normalize_crossref(payload)

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """CrossRef doesn't support title-only lookup.

        Returns None as this is not a supported operation.
        """
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Search CrossRef for works.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching BibEntry objects (empty on failure).
        """
        if not query:
            return []

        encoded_query = urllib.parse.quote(query, safe="")
        payload = self._get_json(f"{self.BASE_URL}/works?query={encoded_query}&rows={limit}")
        if payload is None:
            return []
        items = payload.get('message', {}).get('items', [])
        return [entry for item in items if (entry := self._normalize_crossref(item)) is not None]

    def _get_json(self, url: str) -> Optional[Dict[str, Any]]:
        """Fetch *url* and decode its JSON body, or None on any failure.

        Centralizes header construction and uses the response as a
        context manager so the HTTP connection is always closed — the
        previous per-method code called urlopen() without closing the
        response, leaking the socket on every successful call.
        """
        headers = {'User-Agent': self.user_agent}
        if self.api_key:
            headers['X-Api-Key'] = self.api_key
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                return json.loads(response.read().decode('utf-8'))
        except Exception:
            # Best-effort source: any network/HTTP/JSON error means "no data".
            return None

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a raw CrossRef record to a BibEntry.

        Args:
            record: Raw record from CrossRef API

        Returns:
            BibEntry if normalization succeeds
        """
        return self._normalize_crossref(record)

    def get_identifier_scheme(self) -> str:
        """Return 'doi' as the identifier scheme."""
        return 'doi'

    def _normalize_crossref(self, payload: Dict[str, Any]) -> Optional[BibEntry]:
        """Normalize a CrossRef payload to a BibEntry.

        Accepts either a full API envelope (with a 'message' key) or a
        bare work record.

        Args:
            payload: Raw JSON payload from CrossRef

        Returns:
            BibEntry object, or None for an empty payload
        """
        message = payload.get('message', payload)
        if not message:
            return None

        # Extract basic fields; CrossRef titles arrive as a list of strings.
        doi = str(message.get('DOI', ''))
        title = ' '.join(message.get('title', [])) if message.get('title') else ''
        author_data = message.get('author', [])
        year = self._extract_year(message)

        # Format authors as "Given Family"; fall back to family name alone.
        authors = []
        for author in author_data:
            given = str(author.get('given', ''))
            family = str(author.get('family', ''))
            if given and family:
                authors.append(f"{given} {family}")
            elif family:
                authors.append(family)

        publisher = str(message.get('publisher', ''))

        # 'container-title' holds the journal/venue name(s) as a list.
        container_title = message.get('container-title', [])
        journal = container_title[0] if container_title else ''

        url = str(message.get('URL', ''))
        abstract = self._extract_abstract(message.get('abstract'))

        # Map onto BibTeX-style fields, skipping anything empty.
        fields: Dict[str, str] = {}
        if title:
            fields['title'] = title
        if authors:
            fields['author'] = ' and '.join(authors)
        if year:
            fields['year'] = year
        if doi:
            fields['doi'] = doi
        if journal:
            fields['journal'] = journal
        if publisher:
            fields['publisher'] = publisher
        if url:
            fields['url'] = url
        if abstract:
            fields['abstract'] = abstract

        citation_key = f"{authors[0] if authors else 'crossref'}_{year or 'n.d.'}_{title or doi}"

        return BibEntry(
            entry_type='article',
            citation_key=citation_key,
            fields=fields
        )

    def _extract_year(self, message: Dict[str, Any]) -> str:
        """Return the first year found across CrossRef date fields, or ''."""
        # Order matters: print date is preferred over online/issued/created.
        for field_name in ('published-print', 'published-online', 'issued', 'created'):
            year = self._extract_year_from_date_parts(message.get(field_name, {}))
            if year:
                return year
        return ''

    def _extract_year_from_date_parts(self, field: Dict[str, Any]) -> str:
        """Pull the year out of a CrossRef 'date-parts' structure, or ''."""
        date_parts = field.get('date-parts', [])
        if not date_parts:
            return ''
        first_part = date_parts[0]
        if not first_part:
            return ''
        year = first_part[0]
        return str(year) if year else ''

    def _extract_abstract(self, raw_abstract: Any) -> str:
        """Coerce CrossRef's abstract (string or list of parts) to plain text."""
        if isinstance(raw_abstract, str):
            return raw_abstract.strip()
        if isinstance(raw_abstract, list):
            for item in raw_abstract:
                if isinstance(item, dict):
                    text = str(item.get('value', '')).strip()
                    if text:
                        return text
                elif isinstance(item, str) and item.strip():
                    return item.strip()
        return ''
|
|
||||||
|
|
@ -1,157 +0,0 @@
|
||||||
"""Europe PMC source plugin."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import urllib.parse
|
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry
|
|
||||||
from citegeist.sources._old_sources_compat import SourceClient
|
|
||||||
from citegeist.sources.base import BibliographicSource
|
|
||||||
|
|
||||||
|
|
||||||
class EuropePmcSource(BibliographicSource):
    """Europe PMC source for biomedical metadata and OA/fulltext links."""

    # REST search endpoint; every lookup goes through fielded queries on it.
    BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source; reuses an injected 'source_client' when provided."""
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Look up one work via a fielded DOI query; None when not found."""
        normalized = doi.strip()
        if not normalized:
            return None
        query = f'DOI:"{normalized}"'
        row = self._search_one(query)
        return self.normalize(row) if row else None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Look up one work via a fielded TITLE query; None when not found."""
        # Collapse internal whitespace so the quoted query stays well-formed.
        query_text = " ".join(title.split())
        if not query_text:
            return None
        query = f'TITLE:"{query_text}"'
        row = self._search_one(query)
        return self.normalize(row) if row else None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Title-search Europe PMC, returning up to *limit* normalized entries."""
        query_text = " ".join(query.split())
        if not query_text:
            return []
        payload = self._search_payload(f'TITLE:"{query_text}"', max(1, limit))
        results = payload.get("resultList", {}).get("result", []) if payload else []
        return [entry for row in results if (entry := self.normalize(row)) is not None]

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map a Europe PMC result row onto a BibEntry.

        Returns:
            A BibEntry of type 'article', or None when the row has no title.
        """
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        doi = str(record.get("doi") or "").strip()
        # For MED (PubMed-sourced) records the Europe PMC `id` IS the PMID,
        # so fall back to it; for any other source only an explicit `pmid`
        # field is trusted.
        pmid = str(record.get("pmid") or record.get("id") or "").strip() if str(record.get("source") or "") == "MED" else str(record.get("pmid") or "").strip()
        pmcid = str(record.get("pmcid") or "").strip()
        year = str(record.get("pubYear") or "").strip()
        author_text = self._normalize_author_string(str(record.get("authorString") or "").strip())
        journal_title = str(record.get("journalTitle") or "").strip()
        abstract = str(record.get("abstractText") or "").strip()

        # Build BibTeX-style fields, skipping anything empty.
        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if pmid:
            fields["pmid"] = pmid
        if pmcid:
            fields["pmcid"] = pmcid
        if year:
            fields["year"] = year
        if author_text:
            fields["author"] = author_text
        if journal_title:
            fields["journal"] = journal_title
        if volume := str(record.get("journalVolume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("pageInfo") or "").strip():
            fields["pages"] = pages
        if abstract:
            fields["abstract"] = abstract
        # Prefer a direct full-text link; fall back to the article landing page.
        if fulltext_url := self._fulltext_url(record):
            fields["url"] = fulltext_url
        elif article_url := self._article_url(record):
            fields["url"] = article_url
        # Europe PMC reports open access as "Y"/"N"; only record the flag
        # when the field is present at all.
        if str(record.get("isOpenAccess") or "").strip():
            fields["is_oa"] = "true" if str(record.get("isOpenAccess")).upper() == "Y" else "false"
        if cited_by := str(record.get("citedByCount") or "").strip():
            fields["europepmc_cited_by_count"] = cited_by
        if source := str(record.get("source") or "").strip():
            fields["europepmc_source"] = source

        citation_key = self._citation_key(doi, pmid, author_text, year, title)
        return BibEntry(entry_type="article", citation_key=citation_key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Return a full-text (or landing-page) URL for *doi*, or None."""
        normalized = doi.strip()
        if not normalized:
            return None
        payload = self._search_payload(f'DOI:"{normalized}"', 1)
        results = payload.get("resultList", {}).get("result", []) if payload else []
        if not results:
            return None
        return self._fulltext_url(results[0]) or self._article_url(results[0])

    def get_identifier_scheme(self) -> str:
        """Primary identifier scheme for this source is the DOI."""
        return "doi"

    def _search_one(self, query: str) -> Dict[str, Any] | None:
        """Run *query* with page size 1 and return the single row, if any."""
        payload = self._search_payload(query, 1)
        results = payload.get("resultList", {}).get("result", []) if payload else []
        return results[0] if results else None

    def _search_payload(self, query: str, page_size: int) -> Dict[str, Any] | None:
        """Fetch the raw JSON search payload; None on request failure."""
        params = {
            "query": query,
            "format": "json",
            # 'core' result type includes abstracts and full-text link lists.
            "resultType": "core",
            "pageSize": max(1, page_size),
        }
        return self.source_client.try_get_json(f"{self.BASE_URL}?{urllib.parse.urlencode(params)}")

    def _fulltext_url(self, record: Dict[str, Any]) -> str:
        """Return the first usable full-text URL from the record, or ''.

        Defensive about shape: 'fullTextUrl' may arrive as a dict (single
        link) or a list of dicts.
        """
        candidates = record.get("fullTextUrlList", {})
        if isinstance(candidates, dict):
            urls = candidates.get("fullTextUrl", [])
            if isinstance(urls, dict):
                urls = [urls]
            if isinstance(urls, list):
                for item in urls:
                    if not isinstance(item, dict):
                        continue
                    url = str(item.get("url") or "").strip()
                    if url:
                        return url
        return ""

    def _article_url(self, record: Dict[str, Any]) -> str:
        """Build the europepmc.org landing-page URL from source + id, or ''."""
        source = str(record.get("source") or "").strip()
        identifier = str(record.get("id") or "").strip()
        if source and identifier:
            return f"https://europepmc.org/article/{source}/{identifier}"
        return ""

    def _normalize_author_string(self, value: str) -> str:
        """Convert a comma-separated authorString to BibTeX 'and'-separated form."""
        if not value:
            return ""
        authors = [part.strip().rstrip(".") for part in value.split(",") if part.strip()]
        return " and ".join(authors)

    def _citation_key(self, doi: str, pmid: str, author_text: str, year: str, title: str) -> str:
        """Derive a stable citation key: DOI-based, then PMID, then author/year/title."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if pmid:
            return f"pmid{pmid}"
        # Last word of the first author is treated as the family name.
        family = author_text.split(" and ")[0].split()[-1] if author_text else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"
|
|
||||||
|
|
@ -1,178 +0,0 @@
|
||||||
"""OpenCitations source plugin."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import urllib.parse
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry
|
|
||||||
from citegeist.sources.base import BibliographicSource, CitationEdge
|
|
||||||
from citegeist.sources._old_sources_compat import SourceClient
|
|
||||||
|
|
||||||
|
|
||||||
class OpenCitationsSource(BibliographicSource):
    """OpenCitations source for DOI metadata and citation edges."""

    # Index API serves citation edges; Meta API serves work metadata.
    INDEX_BASE_URL = "https://api.opencitations.net/index/v2"
    META_BASE_URL = "https://api.opencitations.net/meta/v1"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the source; reuses an injected 'source_client' when provided."""
        super().__init__(config)
        user_agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=user_agent)

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Fetch metadata for a DOI from the Meta API.

        Args:
            doi: DOI, with or without a leading 'doi:' prefix.

        Returns:
            BibEntry if found and normalizable, None otherwise.
        """
        normalized = self._normalize_doi_pid(doi)
        if not normalized:
            return None
        rows = self.source_client.try_get_json(f"{self.META_BASE_URL}/metadata/{normalized}")
        if not rows:
            return None
        return self.normalize(rows[0])

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """OpenCitations offers no title lookup; always None."""
        return None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """OpenCitations offers no free-text search; always empty."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map an OpenCitations Meta row onto a BibEntry.

        Returns:
            BibEntry, or None when the row lacks identifiers or a title.
        """
        ids = str(record.get("id") or "")
        title = str(record.get("title") or "").strip()
        if not ids or not title:
            return None

        doi = self._extract_id_value(ids, "doi")
        openalex = self._extract_id_value(ids, "openalex")
        year = self._extract_year(str(record.get("pub_date") or ""))
        authors = self._normalize_author_field(str(record.get("author") or ""))
        venue, venue_ids = self._parse_venue_field(str(record.get("venue") or ""))
        entry_type = self._map_entry_type(str(record.get("type") or ""))

        # Build BibTeX-style fields, skipping anything empty.
        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
            fields["url"] = f"https://doi.org/{doi}"
        if openalex:
            fields["openalex"] = openalex
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if venue:
            # Articles carry the venue as 'journal'; everything else as 'booktitle'.
            if entry_type == "article":
                fields["journal"] = venue
            else:
                fields["booktitle"] = venue
        if volume := str(record.get("volume") or "").strip():
            fields["volume"] = volume
        if issue := str(record.get("issue") or "").strip():
            fields["number"] = issue
        if pages := str(record.get("page") or "").strip():
            fields["pages"] = pages
        if publisher := self._strip_bracketed_ids(str(record.get("publisher") or "")):
            fields["publisher"] = publisher
        if venue_ids:
            fields["note"] = f"opencitations_venue_ids = {{{venue_ids}}}"

        citation_key = self._citation_key(doi, openalex, authors, year, title)
        return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)

    def get_citations(self, work_id: str, relation_type: str = "cites", limit: int = 10) -> List[CitationEdge]:
        """Fetch citation edges for a DOI from the Index API.

        Args:
            work_id: DOI of the anchor work.
            relation_type: 'cites' queries the work's references; anything
                else queries incoming citations.
            limit: Maximum number of edges to return.

        Returns:
            CitationEdge list; edges always point citing -> cited.
        """
        normalized = self._normalize_doi_pid(work_id)
        if not normalized:
            return []
        path = "references" if relation_type == "cites" else "citations"
        rows = self.source_client.try_get_json(f"{self.INDEX_BASE_URL}/{path}/{normalized}")
        if not rows:
            return []

        edges: List[CitationEdge] = []
        for row in rows[:limit]:
            citing = self._extract_id_value(str(row.get("citing") or ""), "doi")
            cited = self._extract_id_value(str(row.get("cited") or ""), "doi")
            if not citing or not cited:
                continue
            # Both Index endpoints return (citing, cited) pairs, so the edge
            # direction is always citing -> cited regardless of which endpoint
            # was queried.  (The original had an if/else here whose two
            # branches were identical — dead code, now collapsed.)
            edges.append(
                CitationEdge(
                    source_work_id=f"doi:{citing}",
                    target_work_id=f"doi:{cited}",
                    relation_type="cites",
                    source_type="opencitations",
                    source_label=f"opencitations:{path}:{normalized}",
                    confidence=0.85,
                )
            )
        return edges

    def get_identifier_scheme(self) -> str:
        """Primary identifier scheme for this source is the DOI."""
        return "doi"

    def _normalize_doi_pid(self, value: str) -> str:
        """Coerce a DOI to the 'doi:<value>' PID form the APIs expect, or ''."""
        doi = value.strip()
        if not doi:
            return ""
        if doi.lower().startswith("doi:"):
            doi = doi[4:]
        return f"doi:{doi}"

    def _extract_id_value(self, identifiers: str, scheme: str) -> str:
        """Pull the value for *scheme* out of a space-separated PID list, or ''."""
        prefix = f"{scheme}:"
        for token in identifiers.split():
            if token.startswith(prefix):
                return token[len(prefix):]
        return ""

    def _extract_year(self, pub_date: str) -> str:
        """Return the leading 4-digit year of an ISO-ish date string, or ''."""
        pub_date = pub_date.strip()
        if len(pub_date) >= 4 and pub_date[:4].isdigit():
            return pub_date[:4]
        return ""

    def _normalize_author_field(self, raw_authors: str) -> str:
        """Convert a ';'-separated author list (with bracketed IDs) to BibTeX form."""
        authors: List[str] = []
        for part in raw_authors.split(";"):
            cleaned = self._strip_bracketed_ids(part)
            cleaned = " ".join(cleaned.split())
            if cleaned:
                authors.append(cleaned)
        return " and ".join(authors)

    def _parse_venue_field(self, raw_venue: str) -> tuple[str, str]:
        """Split a venue string into (title, bracketed-id blob)."""
        raw_venue = raw_venue.strip()
        if not raw_venue:
            return "", ""
        if "[" not in raw_venue:
            return raw_venue, ""
        title, _, remainder = raw_venue.partition("[")
        return title.strip(), remainder.rstrip("] ").strip()

    def _strip_bracketed_ids(self, value: str) -> str:
        """Drop everything from the first '[' onward (embedded identifiers)."""
        return value.split("[", 1)[0].strip()

    def _map_entry_type(self, raw_type: str) -> str:
        """Map an OpenCitations type string onto a BibTeX entry type."""
        lowered = raw_type.casefold()
        if lowered == "journal article":
            return "article"
        if lowered == "book":
            return "book"
        if lowered == "book chapter":
            return "incollection"
        if lowered in {"proceedings article", "conference paper"}:
            return "inproceedings"
        if "thesis" in lowered or "dissertation" in lowered:
            return "phdthesis"
        return "misc"

    def _citation_key(self, doi: str, openalex: str, authors: str, year: str, title: str) -> str:
        """Derive a stable citation key: DOI, then OpenAlex ID, then author/year/title."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if openalex:
            return "openalex" + "".join(ch for ch in openalex.lower() if ch.isalnum())
        # Last word of the first author (before any comma) is the family name.
        family = authors.split(" and ")[0].split(",")[0].split()[-1] if authors else "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"
|
|
||||||
|
|
@ -1,100 +0,0 @@
|
||||||
"""Open Library source plugin."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import urllib.parse
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry
|
|
||||||
from citegeist.sources.base import BibliographicSource
|
|
||||||
from citegeist.sources._old_sources_compat import SourceClient
|
|
||||||
|
|
||||||
|
|
||||||
class OpenLibrarySource(BibliographicSource):
|
|
||||||
"""Open Library source for broad book and monograph metadata."""
|
|
||||||
|
|
||||||
SEARCH_URL = "https://openlibrary.org/search.json"
|
|
||||||
WORK_URL = "https://openlibrary.org"
|
|
||||||
|
|
||||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
    """Initialize the source, building a SourceClient unless one is injected."""
    super().__init__(config)
    agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")
    injected = self.config.get("source_client")
    self.source_client = injected or SourceClient(user_agent=agent)
|
|
||||||
|
|
||||||
def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
    """Open Library has no DOI index, so DOI lookups always return None."""
    return None
|
|
||||||
|
|
||||||
def lookup_by_title(self, title: str) -> Optional[BibEntry]:
    """Return the best title match, or None when the search comes up empty."""
    hits = self.search(title, limit=1)
    if not hits:
        return None
    return hits[0]
|
|
||||||
|
|
||||||
def search(self, query: str, limit: int = 10) -> List[BibEntry]:
    """Search Open Library by title and normalize each result document.

    Args:
        query: Free-text title query; surrounding/duplicate whitespace
            is collapsed before use.
        limit: Maximum number of results (clamped to at least 1).

    Returns:
        Normalized BibEntry objects for every well-formed result document;
        empty on a blank query, a failed request, or a malformed payload.
    """
    cleaned = " ".join(query.split())
    if not cleaned:
        return []
    query_string = urllib.parse.urlencode(
        {"title": cleaned, "limit": max(1, limit), "fields": "*"}
    )
    payload = self.source_client.try_get_json(f"{self.SEARCH_URL}?{query_string}")
    if not payload:
        return []
    docs = payload.get("docs", [])
    if not isinstance(docs, list):
        return []
    entries: List[BibEntry] = []
    for record in docs:
        if not isinstance(record, dict):
            continue
        entry = self.normalize(record)
        if entry is not None:
            entries.append(entry)
    return entries
|
|
||||||
|
|
||||||
def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
|
|
||||||
title = str(record.get("title") or "").strip()
|
|
||||||
if not title:
|
|
||||||
return None
|
|
||||||
|
|
||||||
authors = self._join_list(record.get("author_name"))
|
|
||||||
year = self._extract_year(record)
|
|
||||||
publishers = self._join_list(record.get("publisher"))
|
|
||||||
work_key = str(record.get("key") or "").strip()
|
|
||||||
edition_keys = record.get("edition_key") or []
|
|
||||||
isbn_values = record.get("isbn") or []
|
|
||||||
|
|
||||||
fields: Dict[str, str] = {"title": title}
|
|
||||||
if authors:
|
|
||||||
fields["author"] = authors
|
|
||||||
if year:
|
|
||||||
fields["year"] = year
|
|
||||||
if publishers:
|
|
||||||
fields["publisher"] = publishers
|
|
||||||
if work_key:
|
|
||||||
fields["openlibrary_work"] = work_key
|
|
||||||
fields["url"] = f"{self.WORK_URL}{work_key}"
|
|
||||||
if isinstance(edition_keys, list) and edition_keys:
|
|
||||||
fields["openlibrary_edition"] = str(edition_keys[0])
|
|
||||||
if isinstance(isbn_values, list) and isbn_values:
|
|
||||||
fields["isbn"] = str(isbn_values[0])
|
|
||||||
|
|
||||||
return BibEntry(
|
|
||||||
entry_type="book",
|
|
||||||
citation_key=self._citation_key(work_key, authors, year, title),
|
|
||||||
fields=fields,
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_identifier_scheme(self) -> str:
|
|
||||||
return "openlibrary"
|
|
||||||
|
|
||||||
def _extract_year(self, record: Dict[str, Any]) -> str:
|
|
||||||
first_publish_year = record.get("first_publish_year")
|
|
||||||
if first_publish_year:
|
|
||||||
return str(first_publish_year)
|
|
||||||
publish_year = record.get("publish_year")
|
|
||||||
if isinstance(publish_year, list) and publish_year:
|
|
||||||
return str(publish_year[0])
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def _join_list(self, value: Any) -> str:
|
|
||||||
if not isinstance(value, list):
|
|
||||||
return ""
|
|
||||||
items = [str(item).strip() for item in value if str(item).strip()]
|
|
||||||
return " and ".join(items)
|
|
||||||
|
|
||||||
def _citation_key(self, work_key: str, authors: str, year: str, title: str) -> str:
|
|
||||||
if work_key:
|
|
||||||
return "ol" + "".join(ch for ch in work_key.lower() if ch.isalnum())
|
|
||||||
family = authors.split(" and ")[0].split()[-1] if authors else "book"
|
|
||||||
family = "".join(ch for ch in family.lower() if ch.isalnum()) or "book"
|
|
||||||
first_word = "".join(ch for ch in (title.split()[0] if title.split() else "untitled").lower() if ch.isalnum())
|
|
||||||
return f"{family}{year or 'nd'}{first_word or 'untitled'}"
|
|
||||||
|
|
@ -1,253 +0,0 @@
|
||||||
"""
|
|
||||||
Source registry for managing bibliographic source plugins.
|
|
||||||
|
|
||||||
This module provides a registry that can discover, load, and manage
|
|
||||||
multiple bibliographic source plugins.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import importlib.util
|
|
||||||
import inspect
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Any, Dict, List, Optional, Type
|
|
||||||
|
|
||||||
from citegeist.sources.base import BibliographicSource
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
class SourceRegistration:
    """Registration information for a source plugin."""
    # Name the source is registered (and later looked up) under.
    name: str
    # Concrete BibliographicSource subclass to instantiate on demand.
    source_class: Type[BibliographicSource]
    # Configuration dictionary passed to the source constructor.
    config: Dict[str, Any]
    # Whether the registry may hand out instances of this source.
    enabled: bool
|
|
||||||
|
|
||||||
|
|
||||||
class SourceRegistry:
    """Registry for bibliographic source plugins.

    This class manages the discovery, registration, and instantiation
    of bibliographic source plugins.
    """

    def __init__(self) -> None:
        """Initialize the source registry."""
        self._registrations: Dict[str, SourceRegistration] = {}
        # Lazily constructed source instances, keyed by registration name.
        self._instances: Dict[str, BibliographicSource] = {}

    def register(
        self,
        source_class: Type[BibliographicSource],
        name: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Register a source class.

        Args:
            source_class: The source class to register (must inherit from BibliographicSource)
            name: Optional name for the source (uses class name if not provided)
            config: Optional configuration dictionary
        """
        if not inspect.isclass(source_class) or not issubclass(source_class, BibliographicSource):
            raise ValueError(f"{source_class} must be a subclass of BibliographicSource")

        source_name = name or source_class.__name__
        self._registrations[source_name] = SourceRegistration(
            name=source_name,
            source_class=source_class,
            config=config or {},
            enabled=config.get('enabled', True) if config else True,
        )
        # Fix: re-registering a name used to leave a stale cached instance in
        # place, so get() kept returning an object built from the old class,
        # config, or enabled state. Drop the cache so the next get() rebuilds.
        self._instances.pop(source_name, None)

    def get(self, name: str) -> Optional[BibliographicSource]:
        """Get a source instance by name.

        Args:
            name: Name of the source

        Returns:
            Source instance if registered and enabled, None otherwise
        """
        registration = self._registrations.get(name)
        if registration is None or not registration.enabled:
            return None

        # Return the cached instance if available; otherwise build and cache.
        if name not in self._instances:
            self._instances[name] = registration.source_class(config=registration.config)
        return self._instances[name]

    def list_sources(self, enabled_only: bool = False) -> List[str]:
        """List registered source names.

        Args:
            enabled_only: Only return enabled sources

        Returns:
            List of source names
        """
        if enabled_only:
            return [name for name, reg in self._registrations.items() if reg.enabled]
        return list(self._registrations.keys())

    def get_config(self, name: str) -> Optional[Dict[str, Any]]:
        """Get configuration for a source.

        Args:
            name: Name of the source

        Returns:
            Configuration dictionary, or None if not found
        """
        registration = self._registrations.get(name)
        return registration.config if registration else None

    def load_from_file(self, filepath: str) -> None:
        """Load source plugins from a Python file.

        Args:
            filepath: Path to Python file containing source classes

        Raises:
            ImportError: If the file cannot be loaded as a module.
        """
        spec = importlib.util.spec_from_file_location("module.sources", filepath)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load module from {filepath}")

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Register every BibliographicSource subclass the module defines or
        # re-exports; the abstract base itself is excluded.
        for _name, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, BibliographicSource) and obj is not BibliographicSource:
                self.register(obj)

    def load_from_directory(self, directory: str) -> None:
        """Load source plugins from a directory.

        Args:
            directory: Path to directory containing source plugin files
        """
        import os
        for filename in os.listdir(directory):
            # Skip private modules such as __init__.py / _compat.py.
            if filename.endswith('.py') and not filename.startswith('_'):
                self.load_from_file(os.path.join(directory, filename))

    def from_config_dict(self, config: Dict[str, Any]) -> None:
        """Load sources from a configuration dictionary.

        Example config format:
        {
            "sources": {
                "crossref": {
                    "source_type": "crossref",
                    "enabled": true
                },
                "semantic_scholar": {
                    "source_type": "semantic_scholar",
                    "enabled": true,
                    "api_key": "..."
                }
            }
        }

        Args:
            config: Configuration dictionary
        """
        if 'sources' not in config:
            return

        for name, source_config in config['sources'].items():
            source_name = str(name)
            # source_type defaults to the registration name itself.
            source_type = str(source_config.get('source_type', source_name))
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=source_config,
            )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize registry to dictionary.

        Returns:
            Dictionary representation of registry
        """
        return {
            name: {
                'enabled': reg.enabled,
                'config': reg.config,
            }
            for name, reg in self._registrations.items()
        }

    def from_dict(self, data: Dict[str, Any]) -> None:
        """Load registry from dictionary.

        Args:
            data: Dictionary representation of registry
        """
        for name, source_data in data.items():
            source_name = str(name)
            source_type = str(source_data.get('source_type', source_name))
            self.register(
                source_class=self._resolve_source_class(source_type),
                name=source_name,
                config=source_data.get('config', source_data),
            )

    def get_registered_sources(self) -> List[SourceRegistration]:
        """Get all registered source registrations.

        Returns:
            List of SourceRegistration objects
        """
        return list(self._registrations.values())

    def _resolve_source_class(self, source_type: str) -> Type[BibliographicSource]:
        """Map a config ``source_type`` string onto a concrete source class.

        Raises:
            ValueError: If the source type is not recognized.
        """
        normalized = source_type.strip().lower().replace('-', '_')
        # Imports are deferred so resolving one source type never pays the
        # import cost (or failure) of the others.
        if normalized in {'crossref', 'cross_ref'}:
            from citegeist.sources.crossref import CrossRefSource

            return CrossRefSource
        if normalized in {'opencitations', 'open_citations'}:
            from citegeist.sources.opencitations import OpenCitationsSource

            return OpenCitationsSource
        if normalized == 'unpaywall':
            from citegeist.sources.unpaywall import UnpaywallSource

            return UnpaywallSource
        if normalized in {'europepmc', 'europe_pmc'}:
            from citegeist.sources.europepmc import EuropePmcSource

            return EuropePmcSource
        if normalized in {'semanticscholar', 'semantic_scholar'}:
            from citegeist.sources.semanticscholar import SemanticScholarSource

            return SemanticScholarSource
        if normalized in {"openlibrary", "open_library"}:
            from citegeist.sources.openlibrary import OpenLibrarySource

            return OpenLibrarySource
        raise ValueError(f"Unknown source type: {source_type}")
|
|
||||||
|
|
||||||
|
|
||||||
# Global registry instance shared by every caller of get_registry();
# created eagerly at import time.
_global_registry = SourceRegistry()


def get_registry() -> SourceRegistry:
    """Get the global source registry instance.

    Returns:
        The global SourceRegistry instance
    """
    return _global_registry
|
|
||||||
|
|
@ -1,140 +0,0 @@
|
||||||
"""Semantic Scholar source plugin."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry
|
|
||||||
from citegeist.sources.base import BibliographicSource
|
|
||||||
|
|
||||||
|
|
||||||
class SemanticScholarSource(BibliographicSource):
    """Semantic Scholar source for broad scientific metadata coverage."""

    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    DEFAULT_FIELDS = (
        "paperId,title,year,abstract,authors,externalIds,journal,venue,url,"
        "openAccessPdf,citationCount,publicationTypes"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # The API key is optional; unauthenticated requests run at a lower rate limit.
        configured_key = (
            self.config.get("api_key")
            or os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
            or ""
        )
        self.api_key = str(configured_key).strip()
        self.user_agent = str(self.config.get("user_agent") or "citegeist/0.1 (local research tool)")

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Fetch a single paper by DOI; None for blanks, misses, or errors."""
        cleaned = doi.strip()
        if not cleaned:
            return None
        # The Graph API addresses papers as "DOI:<value>"; quote everything,
        # including the '/' inside the DOI.
        paper_path = urllib.parse.quote(f"DOI:{cleaned}", safe="")
        payload = self._get_json(f"{self.BASE_URL}/paper/{paper_path}?fields={self.DEFAULT_FIELDS}")
        return self.normalize(payload) if payload else None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Return the top search hit for *title*, if any."""
        hits = self.search(title, limit=1)
        return hits[0] if hits else None

    def search(self, query: str, limit: int = 10) -> List[BibEntry]:
        """Run a relevance search and normalize each returned row."""
        text = " ".join(query.split())
        if not text:
            return []
        query_string = urllib.parse.urlencode(
            {"query": text, "limit": max(1, limit), "fields": self.DEFAULT_FIELDS}
        )
        payload = self._get_json(f"{self.BASE_URL}/paper/search?{query_string}")
        if not payload:
            return []
        entries: List[BibEntry] = []
        for row in payload.get("data", []):
            normalized = self.normalize(row)
            if normalized is not None:
                entries.append(normalized)
        return entries

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map a Graph API paper record onto a BibEntry; None when untitled."""
        title = str(record.get("title") or "").strip()
        if not title:
            return None

        external_ids = record.get("externalIds") or {}
        doi = str(external_ids.get("DOI") or "").strip()
        author_names: List[str] = []
        for author in record.get("authors", []):
            name = str(author.get("name") or "").strip()
            if name:
                author_names.append(name)
        authors = " and ".join(author_names)
        year = str(record.get("year") or "").strip()
        abstract = str(record.get("abstract") or "").strip()
        journal_info = record.get("journal") or {}
        journal_name = str(journal_info.get("name") or record.get("venue") or "").strip()
        open_access_pdf = record.get("openAccessPdf") or {}
        paper_id = str(record.get("paperId") or "").strip()
        entry_type = self._entry_type(record)

        fields: Dict[str, str] = {"title": title}
        if doi:
            fields["doi"] = doi
        if paper_id:
            fields["semanticscholar_id"] = paper_id
        if year:
            fields["year"] = year
        if authors:
            fields["author"] = authors
        if abstract:
            fields["abstract"] = abstract
        if journal_name:
            # Conference venues belong in booktitle for BibTeX purposes.
            venue_field = "booktitle" if entry_type == "inproceedings" else "journal"
            fields[venue_field] = journal_name
        url = str(open_access_pdf.get("url") or record.get("url") or "").strip()
        if url:
            fields["url"] = url
        if open_access_pdf:
            fields["is_oa"] = "true"
        citation_count = record.get("citationCount")
        if citation_count:
            fields["semanticscholar_citation_count"] = str(citation_count)

        key = self._citation_key(doi, str(record.get("paperId") or ""), authors, year, title)
        return BibEntry(entry_type=entry_type, citation_key=key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Resolve the DOI and return the recorded URL, if one exists."""
        entry = self.lookup_by_doi(doi)
        return entry.fields.get("url") if entry is not None else None

    def get_identifier_scheme(self) -> str:
        return "doi"

    def _entry_type(self, record: Dict[str, Any]) -> str:
        """Choose a BibTeX entry type from publicationTypes / venue hints."""
        kinds = [str(item).lower() for item in (record.get("publicationTypes") or [])]
        if any("conference" in kind for kind in kinds):
            return "inproceedings"
        if any("review" in kind for kind in kinds):
            return "article"
        if record.get("journal") or record.get("venue"):
            return "article"
        return "misc"

    def _citation_key(self, doi: str, paper_id: str, authors: str, year: str, title: str) -> str:
        """Derive a deterministic citation key: DOI first, then paper id, then author/year/title."""
        if doi:
            return "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        if paper_id:
            return "s2" + "".join(ch for ch in paper_id.lower() if ch.isalnum())
        if authors:
            family = authors.split(" and ")[0].split()[-1]
        else:
            family = "ref"
        family = "".join(ch for ch in family.lower() if ch.isalnum()) or "ref"
        title_words = title.split()
        head = title_words[0] if title_words else "untitled"
        first_word = "".join(ch for ch in head.lower() if ch.isalnum())
        return f"{family}{year or 'nd'}{first_word or 'untitled'}"

    def _get_json(self, url: str) -> Dict[str, Any] | None:
        """Best-effort GET returning parsed JSON; None on any failure."""
        headers = {"User-Agent": self.user_agent}
        if self.api_key:
            headers["x-api-key"] = self.api_key
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read().decode("utf-8"))
        except Exception:
            # Deliberate best-effort: network/HTTP/JSON errors all map to None.
            return None
|
|
||||||
|
|
@ -1,116 +0,0 @@
|
||||||
"""Unpaywall source plugin."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import urllib.parse
|
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry
|
|
||||||
from citegeist.sources._old_sources_compat import SourceClient
|
|
||||||
from citegeist.sources.base import BibliographicSource
|
|
||||||
|
|
||||||
|
|
||||||
class UnpaywallSource(BibliographicSource):
    """Unpaywall source for DOI-based OA link enrichment."""

    BASE_URL = "https://api.unpaywall.org/v2"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        agent = self.config.get("user_agent", "citegeist/0.1 (local research tool)")
        self.source_client = self.config.get("source_client") or SourceClient(user_agent=agent)
        # Unpaywall requires a contact email on every request; without one the
        # source is effectively disabled (see is_available / lookup_oa_record).
        contact = (
            self.config.get("email")
            or os.environ.get("UNPAYWALL_EMAIL")
            or os.environ.get("NCBI_EMAIL")
            or ""
        )
        self.email = str(contact).strip()

    def lookup_by_doi(self, doi: str) -> Optional[BibEntry]:
        """Fetch and normalize the OA record for *doi*, if available."""
        record = self.lookup_oa_record(doi)
        return self.normalize(record) if record else None

    def lookup_by_title(self, title: str) -> Optional[BibEntry]:
        """Unpaywall is DOI-keyed; title lookup is unsupported."""
        return None

    def search(self, query: str, limit: int = 10) -> list[BibEntry]:
        """Unpaywall has no search endpoint."""
        return []

    def normalize(self, record: Dict[str, Any]) -> Optional[BibEntry]:
        """Map an Unpaywall payload onto a misc BibEntry keyed by DOI."""
        doi = str(record.get("doi") or "").strip()
        title = str(record.get("title") or "").strip() or (doi and f"OA record for DOI {doi}")
        if not doi or not title:
            return None

        fields: Dict[str, str] = {
            "title": title,
            "doi": doi,
        }
        year = str(record.get("year") or "").strip()
        if year:
            fields["year"] = year
        landing_url = self._best_landing_url(record)
        if landing_url:
            fields["url"] = landing_url
            fields["best_oa_url"] = landing_url
        pdf_url = self._best_pdf_url(record)
        if pdf_url:
            fields["best_oa_pdf_url"] = pdf_url
        oa_status = str(record.get("oa_status") or "").strip()
        if oa_status:
            fields["oa_status"] = oa_status
        # Remaining OA attributes all come from the best location, when present.
        for field_name, value in (
            ("oa_license", self._best_license(record)),
            ("oa_host_type", self._best_host_type(record)),
            ("oa_version", self._best_version(record)),
            ("oa_evidence", self._best_evidence(record)),
        ):
            if value:
                fields[field_name] = value
        if record.get("is_oa") is not None:
            fields["is_oa"] = "true" if bool(record.get("is_oa")) else "false"

        key = "doi" + "".join(ch for ch in doi.lower() if ch.isalnum())
        return BibEntry(entry_type="misc", citation_key=key, fields=fields)

    def get_fulltext_url(self, doi: str) -> Optional[str]:
        """Prefer the direct PDF link; fall back to the landing page."""
        record = self.lookup_oa_record(doi)
        if not record:
            return None
        return self._best_pdf_url(record) or self._best_landing_url(record)

    def get_identifier_scheme(self) -> str:
        return "doi"

    def is_available(self) -> bool:
        """Usable only when enabled and a contact email is configured."""
        return self.enabled and bool(self.email)

    def lookup_oa_record(self, doi: str) -> Dict[str, Any] | None:
        """GET the raw Unpaywall record for *doi*; None without an email."""
        cleaned = doi.strip()
        if not cleaned or not self.email:
            return None
        doi_path = urllib.parse.quote(cleaned, safe="")
        query = urllib.parse.urlencode({"email": self.email})
        return self.source_client.try_get_json(f"{self.BASE_URL}/{doi_path}?{query}")

    def _location(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Return the best_oa_location mapping (empty dict when absent)."""
        return payload.get("best_oa_location") or {}

    def _best_landing_url(self, payload: Dict[str, Any]) -> str:
        location = self._location(payload)
        return str(location.get("url") or location.get("url_for_landing_page") or "").strip()

    def _best_pdf_url(self, payload: Dict[str, Any]) -> str:
        return str(self._location(payload).get("url_for_pdf") or "").strip()

    def _best_license(self, payload: Dict[str, Any]) -> str:
        return str(self._location(payload).get("license") or "").strip()

    def _best_host_type(self, payload: Dict[str, Any]) -> str:
        return str(self._location(payload).get("host_type") or "").strip()

    def _best_version(self, payload: Dict[str, Any]) -> str:
        return str(self._location(payload).get("version") or "").strip()

    def _best_evidence(self, payload: Dict[str, Any]) -> str:
        return str(self._location(payload).get("evidence") or "").strip()
|
|
||||||
|
|
@ -138,7 +138,6 @@ class TalkOriginsEnrichmentResult:
|
||||||
applied: bool
|
applied: bool
|
||||||
source_label: str = ""
|
source_label: str = ""
|
||||||
weak_reasons_after: list[str] | None = None
|
weak_reasons_after: list[str] | None = None
|
||||||
resolution_attempts: list[dict[str, object]] | None = None
|
|
||||||
conflicts: list[dict[str, str]] | None = None
|
conflicts: list[dict[str, str]] | None = None
|
||||||
error: str = ""
|
error: str = ""
|
||||||
|
|
||||||
|
|
@ -546,29 +545,9 @@ class TalkOriginsScraper:
|
||||||
if not weak_reasons_before:
|
if not weak_reasons_before:
|
||||||
continue
|
continue
|
||||||
resolution = None
|
resolution = None
|
||||||
attempts: list[dict[str, object]] = []
|
|
||||||
error = ""
|
error = ""
|
||||||
try:
|
try:
|
||||||
resolver_with_trace = getattr(self.resolver, "resolve_entry_with_trace", None)
|
resolution = self.resolver.resolve_entry(canonical)
|
||||||
resolver_plain = getattr(self.resolver, "resolve_entry", None)
|
|
||||||
plain_func = getattr(resolver_plain, "__func__", None)
|
|
||||||
trace_func = getattr(resolver_with_trace, "__func__", None)
|
|
||||||
use_trace = (
|
|
||||||
resolver_with_trace is not None
|
|
||||||
and (
|
|
||||||
trace_func is None
|
|
||||||
or (
|
|
||||||
plain_func is MetadataResolver.resolve_entry
|
|
||||||
and trace_func is MetadataResolver.resolve_entry_with_trace
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if use_trace:
|
|
||||||
outcome = self.resolver.resolve_entry_with_trace(canonical)
|
|
||||||
resolution = outcome.resolution
|
|
||||||
attempts = [asdict(attempt) for attempt in outcome.attempts]
|
|
||||||
else:
|
|
||||||
resolution = self.resolver.resolve_entry(canonical)
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
error = str(exc)
|
error = str(exc)
|
||||||
|
|
||||||
|
|
@ -580,7 +559,6 @@ class TalkOriginsScraper:
|
||||||
applied=False,
|
applied=False,
|
||||||
source_label=resolution.source_label if resolution is not None else "",
|
source_label=resolution.source_label if resolution is not None else "",
|
||||||
error=error,
|
error=error,
|
||||||
resolution_attempts=attempts,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if resolution is not None:
|
if resolution is not None:
|
||||||
|
|
|
||||||
|
|
@ -41,12 +41,9 @@ SAMPLE_BIB = """
|
||||||
|
|
||||||
def run_cli(tmp_path: Path, *args: str) -> subprocess.CompletedProcess[str]:
|
def run_cli(tmp_path: Path, *args: str) -> subprocess.CompletedProcess[str]:
|
||||||
database = tmp_path / "library.sqlite3"
|
database = tmp_path / "library.sqlite3"
|
||||||
python = Path(__file__).resolve().parents[1] / ".venv/bin/python"
|
|
||||||
if not python.exists():
|
|
||||||
python = Path(sys.executable)
|
|
||||||
env = {"PYTHONPATH": "src"}
|
env = {"PYTHONPATH": "src"}
|
||||||
return subprocess.run(
|
return subprocess.run(
|
||||||
[str(python), "-m", "citegeist", "--db", str(database), *args],
|
[sys.executable, "-m", "citegeist", "--db", str(database), *args],
|
||||||
cwd=Path(__file__).resolve().parents[1],
|
cwd=Path(__file__).resolve().parents[1],
|
||||||
env=env,
|
env=env,
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
|
|
@ -1684,45 +1681,6 @@ def test_cli_export_topic(tmp_path: Path):
|
||||||
assert "@article{seed2024," in exported
|
assert "@article{seed2024," in exported
|
||||||
|
|
||||||
|
|
||||||
def test_cli_export_notebook_topic_bundle(tmp_path: Path):
|
|
||||||
bib_path = tmp_path / "input.bib"
|
|
||||||
bib_path.write_text(
|
|
||||||
"""
|
|
||||||
@article{seed2024,
|
|
||||||
author = {Seed, Alice},
|
|
||||||
title = {Graph Topic Result},
|
|
||||||
year = {2024}
|
|
||||||
}
|
|
||||||
""",
|
|
||||||
encoding="utf-8",
|
|
||||||
)
|
|
||||||
assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0
|
|
||||||
|
|
||||||
from citegeist.storage import BibliographyStore
|
|
||||||
|
|
||||||
database = tmp_path / "library.sqlite3"
|
|
||||||
store = BibliographyStore(database)
|
|
||||||
try:
|
|
||||||
store.add_entry_topic(
|
|
||||||
"seed2024",
|
|
||||||
topic_slug="graph-topic",
|
|
||||||
topic_name="Graph Topic",
|
|
||||||
source_label="seed",
|
|
||||||
)
|
|
||||||
store.connection.commit()
|
|
||||||
finally:
|
|
||||||
store.close()
|
|
||||||
|
|
||||||
output_dir = tmp_path / "notebook-export"
|
|
||||||
result = run_cli(tmp_path, "export-notebook-topic", "graph-topic", "--output-dir", str(output_dir))
|
|
||||||
assert result.returncode == 0
|
|
||||||
payload = json.loads(result.stdout)
|
|
||||||
assert payload["bundle"]["bundle_kind"] == "notebook_topic_bibliography_bundle"
|
|
||||||
assert (output_dir / "notebook_topic_bundle.json").exists()
|
|
||||||
assert (output_dir / "notebook_topic_bibliography.bib").exists()
|
|
||||||
assert "@article{seed2024," in (output_dir / "notebook_topic_bibliography.bib").read_text(encoding="utf-8")
|
|
||||||
|
|
||||||
|
|
||||||
def test_cli_export_topic_skips_stub_entries_by_default(tmp_path: Path):
|
def test_cli_export_topic_skips_stub_entries_by_default(tmp_path: Path):
|
||||||
bib_path = tmp_path / "input.bib"
|
bib_path = tmp_path / "input.bib"
|
||||||
bib_path.write_text(
|
bib_path.write_text(
|
||||||
|
|
|
||||||
|
|
@ -1,123 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from citegeist.resolve import MetadataResolver
|
|
||||||
from citegeist.sources import EuropePmcSource, SourceRegistry, list_source_catalog
|
|
||||||
|
|
||||||
|
|
||||||
def test_europepmc_source_normalizes_core_record() -> None:
    """A full Europe PMC result row maps onto the expected BibEntry fields."""
    source = EuropePmcSource(config={})
    entry = source.normalize(
        {
            "id": "37158217",
            "source": "MED",
            "pmid": "37158217",
            "pmcid": "PMC10000001",
            "doi": "10.1000/example",
            "title": "Biomedical Example",
            "authorString": "Doe J, Roe A",
            "journalTitle": "Biomed Journal",
            "pubYear": "2024",
            "journalVolume": "16",
            "issue": "1",
            "pageInfo": "10-20",
            "abstractText": "Abstract text.",
            "isOpenAccess": "Y",
            "citedByCount": 12,
            "fullTextUrlList": {"fullTextUrl": [{"url": "https://europepmc.org/articles/PMC10000001?pdf=render"}]},
        }
    )

    # Identifiers, journal, full-text URL, and the OA flag must all survive
    # normalization.
    assert entry is not None
    assert entry.fields["doi"] == "10.1000/example"
    assert entry.fields["pmid"] == "37158217"
    assert entry.fields["pmcid"] == "PMC10000001"
    assert entry.fields["journal"] == "Biomed Journal"
    assert entry.fields["url"] == "https://europepmc.org/articles/PMC10000001?pdf=render"
    assert entry.fields["is_oa"] == "true"
|
|
||||||
|
|
||||||
|
|
||||||
def test_europepmc_registry_and_catalog() -> None:
    """Registry config wiring resolves europepmc, and the catalog lists it."""
    registry = SourceRegistry()
    registry.from_config_dict(
        {
            "sources": {
                "europepmc": {
                    "source_type": "europepmc",
                    "enabled": True,
                }
            }
        }
    )
    # The registry must hand back a live EuropePmcSource instance.
    source = registry.get("europepmc")
    assert isinstance(source, EuropePmcSource)

    # The static source catalog should report Europe PMC as integrated.
    catalog = {entry.key: entry for entry in list_source_catalog()}
    assert catalog["europe_pmc"].current_status == "integrated"
    assert catalog["europe_pmc"].priority == "now"
|
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_resolver_uses_europepmc_doi_after_primary_lookups_fail() -> None:
    """When the primary DOI resolvers miss, resolution falls back to Europe PMC."""
    resolver = MetadataResolver()
    # Force the Crossref/DataCite DOI paths to miss so the fallback runs.
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
    # Stub the Europe PMC DOI lookup with a canned normalized record.
    resolver.europepmc.lookup_by_doi = lambda _doi: resolver.europepmc.normalize(  # type: ignore[method-assign]
        {
            "id": "37158217",
            "source": "MED",
            "pmid": "37158217",
            "doi": "10.1000/example",
            "title": "Biomedical Example",
            "authorString": "Doe J, Roe A",
            "journalTitle": "Biomed Journal",
            "pubYear": "2024",
        }
    )

    from citegeist.bibtex import BibEntry

    result = resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="seed2024",
            fields={"doi": "10.1000/example", "title": "Biomedical Example"},
        )
    )

    # The fallback must be attributed to Europe PMC and enrich the entry.
    assert result is not None
    assert result.source_label == "europepmc:doi:10.1000/example"
    assert result.entry.fields["pmid"] == "37158217"
|
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_resolver_uses_europepmc_title_search_after_pubmed() -> None:
    """Europe PMC title search is used after Crossref/DataCite/OpenAlex/PubMed searches all miss."""
    resolver = MetadataResolver()
    resolver.search_crossref_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_datacite_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_openalex_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_pubmed_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]

    record = {
        "id": "37158217",
        "source": "MED",
        "pmid": "37158217",
        "doi": "10.1000/example",
        "title": "Biomedical Example",
        "authorString": "Doe J, Roe A",
        "journalTitle": "Biomed Journal",
        "pubYear": "2024",
    }
    resolver.europepmc.search = lambda _title, limit=5: [resolver.europepmc.normalize(record)]  # type: ignore[method-assign]

    from citegeist.bibtex import BibEntry

    seed = BibEntry(
        entry_type="article",
        citation_key="seed2024",
        fields={"title": "Biomedical Example", "author": "Doe J", "year": "2024"},
    )
    result = resolver.resolve_entry(seed)

    assert result is not None
    assert result.source_label == "europepmc:search:Biomedical Example"
|
|
||||||
|
|
@ -1,137 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from citegeist.expand import OpenCitationsExpander
|
|
||||||
from citegeist.sources import OpenCitationsSource
|
|
||||||
from citegeist.storage import BibliographyStore
|
|
||||||
|
|
||||||
|
|
||||||
def test_opencitations_source_normalizes_metadata_row() -> None:
    """An OpenCitations metadata row is normalized into clean BibTeX-style fields."""
    raw_row = {
        "id": "doi:10.1000/example openalex:W1234567890 omid:br/06123",
        "title": "Example Work",
        "author": "Doe, Jane [omid:ra/1]; Roe, Alex [omid:ra/2]",
        "pub_date": "2024-05",
        "venue": "Journal of Examples [issn:1234-5678]",
        "volume": "12",
        "issue": "3",
        "page": "10-20",
        "type": "journal article",
        "publisher": "Example Press [crossref:123]",
    }
    entry = OpenCitationsSource(config={}).normalize(raw_row)

    assert entry is not None
    fields = entry.fields
    assert fields["doi"] == "10.1000/example"
    assert fields["openalex"] == "W1234567890"
    assert fields["author"] == "Doe, Jane and Roe, Alex"
    assert fields["journal"] == "Journal of Examples"
    assert fields["publisher"] == "Example Press"
    assert fields["year"] == "2024"
|
|
||||||
|
|
||||||
|
|
||||||
def test_opencitations_source_builds_edges_for_references() -> None:
    """A reference row from the API becomes one citation edge with DOI-prefixed work ids."""
    source = OpenCitationsSource(config={})
    api_rows = [
        {
            "oci": "1-2",
            "citing": "omid:br/1 doi:10.1000/source",
            "cited": "omid:br/2 doi:10.1000/target",
            "creation": "2024-01-01",
        }
    ]
    source.source_client.get_json = lambda _url: api_rows  # type: ignore[method-assign]

    edges = source.get_citations("10.1000/source", relation_type="cites", limit=10)

    assert len(edges) == 1
    edge = edges[0]
    assert edge.source_work_id == "doi:10.1000/source"
    assert edge.target_work_id == "doi:10.1000/target"
|
|
||||||
|
|
||||||
|
|
||||||
def test_opencitations_expander_creates_reference_nodes_and_relations() -> None:
    """Expanding a seed entry via OpenCitations stores the cited work and a 'cites' relation."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/source}
}
"""
        )

        reference_rows = [
            {
                "oci": "1-2",
                "citing": "omid:br/1 doi:10.1000/source",
                "cited": "omid:br/2 doi:10.1000/target",
                "creation": "2024-01-01",
            }
        ]
        metadata_rows = [
            {
                "id": "doi:10.1000/target omid:br/2",
                "title": "Target Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2023",
                "venue": "Journal of Targets [issn:1111-1111]",
                "type": "journal article",
            }
        ]

        expander = OpenCitationsExpander()

        def fake_get_json(url: str):
            # The expander first fetches /references/, then work metadata.
            return reference_rows if "/references/" in url else metadata_rows

        expander.source.source_client.get_json = fake_get_json  # type: ignore[method-assign]
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

        results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=10)

        assert [item.discovered_citation_key for item in results] == ["doi101000target"]
        discovered = store.get_entry("doi101000target")
        assert discovered is not None
        assert discovered["title"] == "Target Work"
        assert store.get_relations("seed2024") == ["doi101000target"]
    finally:
        store.close()
|
|
||||||
|
|
||||||
|
|
||||||
def test_opencitations_expander_supports_cited_by_direction() -> None:
    """Expanding with relation_type='cited_by' stores the citing work pointing at the seed."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed}
}
"""
        )

        citation_rows = [
            {
                "oci": "2-1",
                "citing": "omid:br/2 doi:10.1000/citing",
                "cited": "omid:br/1 doi:10.1000/seed",
                "creation": "2024-01-01",
            }
        ]
        metadata_rows = [
            {
                "id": "doi:10.1000/citing omid:br/2",
                "title": "Citing Work",
                "author": "Doe, Jane [omid:ra/1]",
                "pub_date": "2025",
                "venue": "Journal of Citers [issn:1111-1111]",
                "type": "journal article",
            }
        ]

        expander = OpenCitationsExpander()

        def fake_get_json(url: str):
            # 'cited_by' expansion hits /citations/ first, then work metadata.
            return citation_rows if "/citations/" in url else metadata_rows

        expander.source.source_client.get_json = fake_get_json  # type: ignore[method-assign]
        expander.resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
        expander.resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]

        results = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=10)

        assert [item.discovered_citation_key for item in results] == ["doi101000citing"]
        assert store.get_relations("doi101000citing") == ["seed2024"]
    finally:
        store.close()
|
|
||||||
|
|
@ -1,188 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry
|
|
||||||
from citegeist.resolve import MetadataResolver
|
|
||||||
from citegeist.sources import OpenLibrarySource, SourceRegistry, list_source_catalog
|
|
||||||
|
|
||||||
|
|
||||||
class FakeSourceClient:
    """Test double that answers every JSON request with one fixed payload."""

    def __init__(self, payload: dict[str, object]) -> None:
        # Keep the canned payload; a fresh copy is handed out per request.
        self.payload = payload

    def try_get_json(self, _url: str) -> dict[str, object]:
        """Ignore the URL and return a shallow copy of the canned payload."""
        return dict(self.payload)
|
|
||||||
|
|
||||||
|
|
||||||
def test_openlibrary_source_normalizes_book_record() -> None:
    """An Open Library search doc is normalized into a 'book' entry with OL identifiers."""
    source = OpenLibrarySource(config={"source_client": FakeSourceClient({})})
    raw_doc = {
        "title": "The Nature of the Stratigraphic Record",
        "author_name": ["D. V. Ager"],
        "first_publish_year": 1973,
        "publisher": ["Macmillan"],
        "key": "/works/OL82563W",
        "edition_key": ["OL12345M"],
        "isbn": ["9781234567890"],
    }
    entry = source.normalize(raw_doc)

    assert entry is not None
    assert entry.entry_type == "book"
    fields = entry.fields
    assert fields["title"] == "The Nature of the Stratigraphic Record"
    assert fields["author"] == "D. V. Ager"
    assert fields["year"] == "1973"
    assert fields["publisher"] == "Macmillan"
    assert fields["openlibrary_work"] == "/works/OL82563W"
    assert fields["openlibrary_edition"] == "OL12345M"
    assert fields["isbn"] == "9781234567890"
|
|
||||||
|
|
||||||
|
|
||||||
def test_openlibrary_registry_and_catalog() -> None:
    """The registry instantiates OpenLibrarySource from config; the catalog flags book metadata."""
    config = {"sources": {"openlibrary": {"source_type": "openlibrary", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)

    assert isinstance(registry.get("openlibrary"), OpenLibrarySource)

    catalog = {item.key: item for item in list_source_catalog()}
    open_library = catalog["open_library"]
    assert open_library.current_status == "integrated"
    assert "book_metadata" in open_library.capabilities
|
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_resolver_uses_openlibrary_after_other_searches_fail() -> None:
    """Open Library is the last search fallback once every other source returns nothing."""
    resolver = MetadataResolver()
    # Every upstream search yields no candidates.
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]

    candidate = BibEntry(
        entry_type="book",
        citation_key="olworks123",
        fields={
            "title": "The Nature of the Stratigraphic Record",
            "author": "D. V. Ager",
            "year": "1973",
            "openlibrary_work": "/works/OL82563W",
        },
    )
    resolver.search_openlibrary = lambda _title, limit=5: [candidate]  # type: ignore[method-assign]

    seed = BibEntry(
        entry_type="book",
        citation_key="seed1973",
        fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
    )
    result = resolver.resolve_entry(seed)

    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
|
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_resolver_trace_records_fallback_attempts() -> None:
    """resolve_entry_with_trace records each fallback attempt, ending on the Open Library hit."""
    resolver = MetadataResolver()
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]

    hit = BibEntry(
        entry_type="book",
        citation_key="olworks123",
        fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
    )
    resolver.search_openlibrary = lambda _title, limit=5: [hit]  # type: ignore[method-assign]

    seed = BibEntry(
        entry_type="book",
        citation_key="seed1980",
        fields={"title": "Example Book", "author": "Author, A", "year": "1980"},
    )
    outcome = resolver.resolve_entry_with_trace(seed)

    assert outcome.resolution is not None
    assert outcome.resolution.source_label == "openlibrary:search:Example Book"
    last_two = [attempt.source_name for attempt in outcome.attempts[-2:]]
    assert last_two == ["semanticscholar", "openlibrary"]
    final_attempt = outcome.attempts[-1]
    assert final_attempt.matched is True
    assert final_attempt.candidate_count == 1
|
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_resolver_uses_fuzzy_catalog_match_for_book_titles() -> None:
    """A near-miss Open Library title ('stratigraphical' vs 'stratigraphic') still matches."""
    resolver = MetadataResolver()
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]

    # Catalog title differs slightly in wording/case from the seed title.
    fuzzy_hit = BibEntry(
        entry_type="book",
        citation_key="olworks123",
        fields={
            "title": "The nature of the stratigraphical record",
            "author": "D. V. Ager",
            "year": "1973",
            "openlibrary_work": "/works/OL82563W",
        },
    )
    resolver.search_openlibrary = lambda _title, limit=5: [fuzzy_hit]  # type: ignore[method-assign]

    seed = BibEntry(
        entry_type="book",
        citation_key="seed1973",
        fields={"title": "The Nature of the Stratigraphic Record", "author": "D. V. Ager", "year": "1973"},
    )
    result = resolver.resolve_entry(seed)

    assert result is not None
    assert result.source_label == "openlibrary:search:The Nature of the Stratigraphic Record"
|
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_resolver_skips_openlibrary_for_article_like_entries() -> None:
    """Open Library is never queried for journal-article-like entries."""
    resolver = MetadataResolver()
    resolver.search_crossref = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_datacite = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_openalex = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_pubmed = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_europepmc = lambda _title, limit=5: []  # type: ignore[method-assign]
    resolver.search_semanticscholar = lambda _title, limit=5: []  # type: ignore[method-assign]

    called = {"openlibrary": False}

    def fake_openlibrary(_title: str, limit: int = 5) -> list[BibEntry]:
        called["openlibrary"] = True
        return []

    resolver.search_openlibrary = fake_openlibrary  # type: ignore[method-assign]

    seed = BibEntry(
        entry_type="article",
        citation_key="seed1977",
        fields={
            "title": "Fast locomotion of some African ungulates",
            "author": "Alexander, R. M.",
            "year": "1977",
            "journal": "Journal of Zoology",
        },
    )
    outcome = resolver.resolve_entry_with_trace(seed)

    assert outcome.resolution is None
    assert called["openlibrary"] is False
    assert all(attempt.source_name != "openlibrary" for attempt in outcome.attempts)
|
|
||||||
|
|
@ -1,201 +0,0 @@
|
||||||
"""Tests for identifier resolution and normalization."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from citegeist.resolver import (
|
|
||||||
IdentifierExtractor,
|
|
||||||
IdentifierNormalizer,
|
|
||||||
IdentifierResolver,
|
|
||||||
extract_identifiers,
|
|
||||||
normalize_identifier,
|
|
||||||
get_primary_identifier,
|
|
||||||
resolve_identifiers,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestIdentifierExtractor:
    """Tests for IdentifierExtractor.extract over entry field dicts."""

    def test_extract_from_entry(self):
        """Identifier fields are kept; descriptive fields (title, author) are not."""
        fields = {
            "doi": "10.1234/example",
            "title": "Test Title",
            "author": "John Doe",
            "pmid": "123456",
        }
        identifiers = IdentifierExtractor.extract(fields)

        assert "doi" in identifiers
        assert identifiers["doi"] == "10.1234/example"
        assert "pmid" in identifiers
        assert identifiers["pmid"] == "123456"
        # Title is metadata, not an identifier scheme.
        assert "title" not in identifiers

    def test_extract_multiple_identifiers(self):
        """All recognized identifier schemes are extracted together."""
        fields = {
            "doi": "10.1234/example",
            "pmid": "123456",
            "arxiv": "2310.12345",
            "isbn": "978-0-123456-78-9",
        }
        identifiers = IdentifierExtractor.extract(fields)

        assert len(identifiers) == 4
        assert identifiers["doi"] == "10.1234/example"
        assert identifiers["pmid"] == "123456"
        assert identifiers["arxiv"] == "2310.12345"
        assert identifiers["isbn"] == "978-0-123456-78-9"
|
|
||||||
|
|
||||||
|
|
||||||
class TestIdentifierNormalizer:
    """Tests for per-scheme normalization in IdentifierNormalizer."""

    def test_normalize_doi(self):
        """DOIs are lowercased; strings without a DOI shape are rejected."""
        assert IdentifierNormalizer.normalize_doi("10.1234/EXAMPLE") == "10.1234/example"
        assert IdentifierNormalizer.normalize_doi("10.1234/test") == "10.1234/test"
        assert IdentifierNormalizer.normalize_doi("invalid") is None

    def test_normalize_pmid(self):
        """Numeric PMIDs pass through; non-numeric input is rejected."""
        assert IdentifierNormalizer.normalize_pmid("12345") == "12345"
        assert IdentifierNormalizer.normalize_pmid("1234567") == "1234567"
        assert IdentifierNormalizer.normalize_pmid("invalid") is None

    def test_normalize_pmcid(self):
        """PMCIDs are lowercased with their 'pmc' prefix retained."""
        assert IdentifierNormalizer.normalize_pmcid("PMC12345") == "pmc12345"
        assert IdentifierNormalizer.normalize_pmcid("PMCabcdef") == "pmcabcdef"
        assert IdentifierNormalizer.normalize_pmcid("invalid") is None

    def test_normalize_arxiv(self):
        """Version suffixes are stripped from arXiv ids; malformed ids are rejected."""
        assert IdentifierNormalizer.normalize_arxiv("2310.12345") == "2310.12345"
        assert IdentifierNormalizer.normalize_arxiv("2310.12345v1") == "2310.12345"
        assert IdentifierNormalizer.normalize_arxiv("INVALID") is None

    def test_normalize_orcid(self):
        """Only the canonical dashed ORCID form is accepted."""
        assert IdentifierNormalizer.normalize_orcid("0000-0001-2345-6789") == "0000-0001-2345-6789"
        # Space-separated groups do not match the canonical format.
        assert IdentifierNormalizer.normalize_orcid("0000 0001 2345 6789") is None
        assert IdentifierNormalizer.normalize_orcid("invalid") is None

    def test_normalize_identifier(self):
        """The generic entry point dispatches by scheme and rejects unknown schemes."""
        assert IdentifierNormalizer.normalize_identifier("doi", "10.1234/test") == ("doi", "10.1234/test")
        assert IdentifierNormalizer.normalize_identifier("pmid", "12345") == ("pmid", "12345")
        assert IdentifierNormalizer.normalize_identifier("invalid", "value") is None
|
|
||||||
|
|
||||||
|
|
||||||
class TestIdentifierResolver:
    """Tests for IdentifierResolver over entry field dicts."""

    def test_resolve_with_doi(self):
        """A DOI field resolves to at least one 'doi' scheme pair."""
        fields = {"doi": "10.1234/example", "title": "Test Title"}
        resolved = IdentifierResolver.resolve(fields)

        assert len(resolved) >= 1
        assert any(scheme == "doi" for scheme, _value in resolved)

    def test_resolve_with_multiple_identifiers(self):
        """Multiple identifier fields yield multiple resolved pairs, including the DOI."""
        fields = {
            "doi": "10.1234/example",
            "pmid": "12345",
            "arxiv": "2310.12345",
        }
        resolved = IdentifierResolver.resolve(fields)

        assert len(resolved) >= 2
        assert any(scheme == "doi" for scheme, _value in resolved)

    def test_resolve_without_identifiers(self):
        """Entries with no identifiers still resolve via a title fingerprint."""
        fields = {"title": "Test Title", "author": "John Doe"}
        resolved = IdentifierResolver.resolve(fields)

        assert len(resolved) >= 1
        assert any(scheme == "title" for scheme, _value in resolved)

    def test_get_primary_identifier(self):
        """DOI outranks every other scheme as the primary identifier."""
        fields = {
            "doi": "10.1234/example",
            "pmid": "12345",
            "title": "Test Title",
        }
        primary = IdentifierResolver.get_primary_identifier(fields)

        assert primary is not None
        assert primary[0] == "doi"

    def test_get_scheme_value(self):
        """get_scheme_value returns the value for present schemes and None otherwise."""
        fields = {
            "doi": "10.1234/example",
            "pmid": "12345",
        }

        assert IdentifierResolver.get_scheme_value("doi", fields) == "10.1234/example"
        assert IdentifierResolver.get_scheme_value("pmid", fields) == "12345"
        assert IdentifierResolver.get_scheme_value("isbn", fields) is None
|
|
||||||
|
|
||||||
|
|
||||||
class TestConvenienceFunctions:
    """Tests for the module-level convenience wrappers."""

    def test_extract_identifiers(self):
        """extract_identifiers surfaces every identifier field."""
        identifiers = extract_identifiers({"doi": "10.1234/example", "pmid": "12345"})
        assert "doi" in identifiers
        assert "pmid" in identifiers

    def test_normalize_identifier(self):
        """normalize_identifier returns the (scheme, value) pair for a valid DOI."""
        assert normalize_identifier("doi", "10.1234/test") == ("doi", "10.1234/test")

    def test_get_primary_identifier(self):
        """get_primary_identifier picks the DOI when present."""
        primary = get_primary_identifier({"doi": "10.1234/example"})
        assert primary == ("doi", "10.1234/example")

    def test_resolve_identifiers(self):
        """resolve_identifiers produces at least one resolved pair for a DOI entry."""
        resolved = resolve_identifiers({"doi": "10.1234/example"})
        assert len(resolved) > 0
|
|
||||||
|
|
@ -1,117 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from citegeist.resolve import MetadataResolver
|
|
||||||
from citegeist.sources import SemanticScholarSource, SourceRegistry, list_source_catalog
|
|
||||||
|
|
||||||
|
|
||||||
def test_semanticscholar_source_normalizes_record() -> None:
    """A Semantic Scholar paper record is normalized into BibTeX-style fields."""
    raw_paper = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "abstract": "Abstract text.",
        "authors": [{"name": "Jane Doe"}, {"name": "Alex Roe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "openAccessPdf": {"url": "https://example.org/paper.pdf"},
        "citationCount": 42,
        "publicationTypes": ["JournalArticle"],
    }
    entry = SemanticScholarSource(config={}).normalize(raw_paper)

    assert entry is not None
    fields = entry.fields
    assert fields["doi"] == "10.1000/physics"
    assert fields["author"] == "Jane Doe and Alex Roe"
    assert fields["journal"] == "Physical Review Example"
    assert fields["url"] == "https://example.org/paper.pdf"
    assert fields["is_oa"] == "true"
    assert fields["semanticscholar_citation_count"] == "42"
|
|
||||||
|
|
||||||
|
|
||||||
def test_semanticscholar_registry_and_catalog() -> None:
    """The registry instantiates SemanticScholarSource; the catalog lists it as integrated."""
    config = {"sources": {"semanticscholar": {"source_type": "semanticscholar", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)

    assert isinstance(registry.get("semanticscholar"), SemanticScholarSource)

    catalog = {item.key: item for item in list_source_catalog()}
    semantic_scholar = catalog["semantic_scholar"]
    assert semantic_scholar.current_status == "integrated"
    assert semantic_scholar.priority == "now"
|
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_resolver_uses_semanticscholar_doi_after_other_lookups_fail() -> None:
    """Semantic Scholar DOI lookup runs after Crossref, DataCite and Europe PMC all miss."""
    resolver = MetadataResolver()
    resolver.resolve_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = lambda _doi: None  # type: ignore[method-assign]
    resolver.resolve_europepmc_doi = lambda _doi: None  # type: ignore[method-assign]

    record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "authors": [{"name": "Jane Doe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "publicationTypes": ["JournalArticle"],
    }
    resolver.semanticscholar.lookup_by_doi = lambda _doi: resolver.semanticscholar.normalize(record)  # type: ignore[method-assign]

    from citegeist.bibtex import BibEntry

    seed = BibEntry(
        entry_type="article",
        citation_key="seed2024",
        fields={"doi": "10.1000/physics", "title": "Physics Example"},
    )
    result = resolver.resolve_entry(seed)

    assert result is not None
    assert result.source_label == "semanticscholar:doi:10.1000/physics"
    assert result.entry.fields["journal"] == "Physical Review Example"
|
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_resolver_uses_semanticscholar_title_search_after_other_searches_fail() -> None:
    """Semantic Scholar title search is used once every earlier best-match search misses."""
    resolver = MetadataResolver()
    resolver.search_crossref_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_datacite_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_openalex_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_pubmed_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]
    resolver.search_europepmc_best_match = lambda *args, **kwargs: None  # type: ignore[method-assign]

    record = {
        "paperId": "abcdef123456",
        "title": "Physics Example",
        "year": 2024,
        "authors": [{"name": "Jane Doe"}],
        "externalIds": {"DOI": "10.1000/physics"},
        "journal": {"name": "Physical Review Example"},
        "publicationTypes": ["JournalArticle"],
    }
    resolver.semanticscholar.search = lambda _title, limit=5: [resolver.semanticscholar.normalize(record)]  # type: ignore[method-assign]

    from citegeist.bibtex import BibEntry

    seed = BibEntry(
        entry_type="article",
        citation_key="seed2024",
        fields={"title": "Physics Example", "author": "Jane Doe", "year": "2024"},
    )
    result = resolver.resolve_entry(seed)

    assert result is not None
    assert result.source_label == "semanticscholar:search:Physics Example"
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from citegeist.sources import CrossRefSource, OpenCitationsSource, SourceRegistry, list_source_catalog, prioritized_source_keys
|
|
||||||
|
|
||||||
|
|
||||||
def test_catalog_prioritizes_existing_core_sources() -> None:
    """The catalog's first six keys are the already-integrated core sources, in order."""
    expected_head = ["crossref", "datacite", "europe_pmc", "openalex", "open_citations", "pubmed"]
    assert prioritized_source_keys()[:6] == expected_head
|
|
||||||
|
|
||||||
|
|
||||||
def test_catalog_includes_open_citation_and_access_sources() -> None:
    """OpenCitations and Unpaywall are cataloged; OpenCitations supports DOI citations now."""
    catalog = {item.key: item for item in list_source_catalog()}

    assert "open_citations" in catalog
    assert "unpaywall" in catalog
    open_citations = catalog["open_citations"]
    assert open_citations.priority == "now"
    assert "doi_citations" in open_citations.capabilities
|
|
||||||
|
|
||||||
|
|
||||||
def test_registry_loads_known_source_from_config() -> None:
    """A configured 'crossref' entry yields a CrossRefSource instance."""
    config = {"sources": {"crossref": {"source_type": "crossref", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)

    assert isinstance(registry.get("crossref"), CrossRefSource)
|
|
||||||
|
|
||||||
|
|
||||||
def test_registry_rejects_unknown_source_type() -> None:
    """Configuring an unrecognized source_type raises ValueError with a clear message."""
    registry = SourceRegistry()
    bad_config = {"sources": {"mystery": {"source_type": "mystery"}}}
    try:
        registry.from_config_dict(bad_config)
    except ValueError as exc:
        # Expected path: the registry refuses the unknown type.
        assert "Unknown source type" in str(exc)
    else:
        raise AssertionError("expected ValueError for unknown source type")
|
|
||||||
|
|
||||||
|
|
||||||
def test_registry_loads_opencitations_from_config() -> None:
    """A configured 'opencitations' entry yields an OpenCitationsSource instance."""
    config = {"sources": {"opencitations": {"source_type": "opencitations", "enabled": True}}}
    registry = SourceRegistry()
    registry.from_config_dict(config)

    assert isinstance(registry.get("opencitations"), OpenCitationsSource)
|
|
||||||
|
|
@ -1,171 +0,0 @@
|
||||||
"""Tests for the source plugin architecture."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from citegeist.sources import BibliographicSource, SourceRegistry, CrossRefSource
|
|
||||||
|
|
||||||
|
|
||||||
class MockSource(BibliographicSource):
    """Minimal BibliographicSource double that finds nothing and records lookups."""

    def __init__(self, config: dict | None = None):
        super().__init__(config)
        # Every lookup is appended here as a (kind, value) tuple for assertions.
        self.lookup_calls = []

    def lookup_by_doi(self, doi: str) -> None:
        """Record the DOI lookup and report 'not found'."""
        self.lookup_calls.append(("doi", doi))
        return None

    def lookup_by_title(self, title: str) -> None:
        """Record the title lookup and report 'not found'."""
        self.lookup_calls.append(("title", title))
        return None

    def search(self, query: str, limit: int = 10) -> list:
        """Searching always yields no results."""
        return []

    def normalize(self, record: dict) -> None:
        """Normalization always yields nothing."""
        return None
|
|
||||||
|
|
||||||
|
|
||||||
def test_source_base_interface():
    """Defaults inherited from BibliographicSource behave sensibly."""
    mock = MockSource()

    assert mock.is_available()
    # Scheme appears to derive from the class name — 'mocksource' here.
    assert mock.get_identifier_scheme() == 'mocksource'
    # The base class provides no fulltext or embedding support.
    assert mock.get_fulltext_url('doi:test') is None
    assert mock.get_embedding('doi:test') is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_mock_source():
    """MockSource records every lookup in call order."""
    mock = MockSource()
    mock.lookup_by_doi('10.1234/test')
    mock.lookup_by_title('Test Title')

    expected = [('doi', '10.1234/test'), ('title', 'Test Title')]
    assert mock.lookup_calls == expected
|
|
||||||
|
|
||||||
|
|
||||||
def test_source_registry():
    """Register, list and instantiate a source through the registry."""
    registry = SourceRegistry()
    registry.register(MockSource, name='mock_source', config={'enabled': True})

    # Registration makes the source discoverable by name.
    assert 'mock_source' in registry.list_sources()

    # get() hands back a live, available instance of the registered class.
    instance = registry.get('mock_source')
    assert instance is not None
    assert isinstance(instance, MockSource)
    assert instance.is_available()
|
|
||||||
|
|
||||||
|
|
||||||
def test_source_registry_disabled():
    """Disabled sources stay listed but are never instantiated."""
    registry = SourceRegistry()
    registry.register(
        MockSource,
        name='disabled_source',
        config={'enabled': False},
    )

    # Registration is visible regardless of the enabled flag...
    assert 'disabled_source' in registry.list_sources()

    # ...but get() refuses to hand out a disabled source.
    assert registry.get('disabled_source') is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_crossref_source():
    """CrossRef plugin registers cleanly and normalizes a works payload."""
    registry = SourceRegistry()
    registry.register(CrossRefSource, name='crossref', config={})

    crossref = registry.get('crossref')
    assert crossref is not None
    assert crossref.is_available()
    assert crossref.get_identifier_scheme() == 'doi'

    # Shape mirrors the CrossRef REST API /works response ('message' wrapper).
    payload = {
        'message': {
            'DOI': '10.1234/example',
            'title': ['Test Title'],
            'author': [{'given': 'Jane', 'family': 'Doe'}],
            'published-print': {'date-parts': [[2024]]},
            'container-title': ['Journal of Tests'],
            'publisher': 'Test Publisher',
            'URL': 'https://doi.org/10.1234/example',
            'abstract': '<jats:p>Example abstract</jats:p>',
        }
    }
    entry = crossref.normalize(payload)

    assert entry is not None
    assert entry.fields['doi'] == '10.1234/example'
    assert entry.fields['title'] == 'Test Title'
    assert entry.fields['year'] == '2024'
    assert entry.fields['journal'] == 'Journal of Tests'
|
|
||||||
|
|
||||||
|
|
||||||
def test_crossref_search_item_normalization():
    """normalize() also accepts bare search items (no 'message' wrapper)."""
    item = {
        'DOI': '10.1234/example',
        'title': ['Search Result'],
        'author': [{'family': 'Doe'}],
        'issued': {'date-parts': [[2023]]},
    }
    entry = CrossRefSource().normalize(item)

    assert entry is not None
    assert entry.fields['doi'] == '10.1234/example'
    # Year falls back to the 'issued' date-parts for search items.
    assert entry.fields['year'] == '2023'
|
|
||||||
|
|
||||||
|
|
||||||
def test_source_record():
    """SourceRecord keeps the raw payload and provenance fields intact."""
    from citegeist.sources import SourceRecord

    record = SourceRecord(
        raw={'test': 'data'},
        source_type='test',
        source_label='test_source',
        timestamp='2024-01-01',
        confidence=1.0,
    )

    assert record.raw == {'test': 'data'}
    assert record.source_type == 'test'
    assert record.source_label == 'test_source'
    assert record.confidence == 1.0
|
|
||||||
|
|
||||||
|
|
||||||
def test_citation_edge():
    """CitationEdge stores both endpoints plus relation metadata."""
    from citegeist.sources import CitationEdge

    edge = CitationEdge(
        source_work_id='doi:10.1234',
        target_work_id='doi:10.5678',
        relation_type='cites',
        source_type='crossref',
        source_label='crossref:test',
        confidence=0.9,
    )

    assert edge.relation_type == 'cites'
    assert edge.confidence == 0.9
|
|
||||||
|
|
@ -530,88 +530,6 @@ def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Pat
|
||||||
assert results[0].weak_reasons_after == []
|
assert results[0].weak_reasons_after == []
|
||||||
|
|
||||||
|
|
||||||
def test_talkorigins_enrich_weak_canonicals_includes_resolution_attempts(tmp_path: Path):
    """Preview enrichment (apply=False) surfaces the resolver's attempt trace.

    Scrapes two fake pages, replaces one seed bib with a weak duplicate
    pair, stubs the resolver to succeed via a single CrossRef title search,
    and asserts the serialized resolution_attempts payload on the result.
    """
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Overwrite the first seed bib with two entries sharing author/year/title,
    # i.e. a weak-canonical duplicate pair; empty the second seed bib.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate"
}

@misc{weak2,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate",
  note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")

    from citegeist.resolve import Resolution, ResolutionAttempt, ResolutionOutcome

    # Stub resolution: every entry "resolves" via one matched CrossRef title
    # search, enriching it with a DOI and journal.
    scraper.resolver.resolve_entry_with_trace = lambda entry: ResolutionOutcome(  # type: ignore[method-assign]
        resolution=Resolution(
            entry=BibEntry(
                entry_type="article",
                citation_key="resolved",
                fields={
                    "author": entry.fields["author"],
                    "title": entry.fields["title"],
                    "year": entry.fields["year"],
                    "doi": "10.1000/weak",
                    "journal": "Journal of Better Metadata",
                },
            ),
            source_type="resolver",
            source_label="crossref:search:Weak Duplicate",
        ),
        attempts=[
            ResolutionAttempt(
                source_name="crossref",
                strategy="title_search",
                query_value="Weak Duplicate",
                matched=True,
                candidate_count=1,
                source_label="crossref:search:Weak Duplicate",
            )
        ],
    )

    store = BibliographyStore()
    try:
        # apply=False: preview mode — no write-back to the store expected.
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False)
    finally:
        store.close()

    assert len(results) == 1
    # The attempt trace is serialized field-for-field ('error' defaults to "").
    assert results[0].resolution_attempts == [
        {
            "source_name": "crossref",
            "strategy": "title_search",
            "query_value": "Weak Duplicate",
            "matched": True,
            "candidate_count": 1,
            "source_label": "crossref:search:Weak Duplicate",
            "error": "",
        }
    ]
|
|
||||||
|
|
||||||
|
|
||||||
def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
|
def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
|
||||||
base_url = "https://www.talkorigins.org/origins/biblio/"
|
base_url = "https://www.talkorigins.org/origins/biblio/"
|
||||||
scraper = TalkOriginsScraper(
|
scraper = TalkOriginsScraper(
|
||||||
|
|
@ -881,7 +799,6 @@ def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_pat
|
||||||
assert review.items[0]["canonical"]["citation_key"] == "weak2"
|
assert review.items[0]["canonical"]["citation_key"] == "weak2"
|
||||||
assert review.items[0]["enrichment"]["resolved"] is True
|
assert review.items[0]["enrichment"]["resolved"] is True
|
||||||
assert review.items[0]["enrichment"]["applied"] is False
|
assert review.items[0]["enrichment"]["applied"] is False
|
||||||
assert review.items[0]["enrichment"]["resolution_attempts"] == []
|
|
||||||
|
|
||||||
|
|
||||||
def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):
|
def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):
|
||||||
|
|
|
||||||
|
|
@ -1,117 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from citegeist.cli import _run_enrich_oa
|
|
||||||
from citegeist.sources import SourceRegistry, UnpaywallSource, list_source_catalog, prioritized_source_keys
|
|
||||||
from citegeist.storage import BibliographyStore
|
|
||||||
|
|
||||||
|
|
||||||
def test_unpaywall_source_normalizes_oa_record() -> None:
    """Unpaywall normalize() maps the OA payload onto entry fields."""
    record = {
        "doi": "10.1000/example",
        "title": "Example Article",
        "year": 2024,
        "is_oa": True,
        "oa_status": "gold",
        "best_oa_location": {
            "url": "https://example.org/article",
            "url_for_pdf": "https://example.org/article.pdf",
            "license": "cc-by",
            "host_type": "publisher",
            "version": "publishedVersion",
            "evidence": "open (via free pdf)",
        },
    }
    entry = UnpaywallSource(config={"email": "tester@example.org"}).normalize(record)

    assert entry is not None
    fields = entry.fields
    assert fields["doi"] == "10.1000/example"
    assert fields["best_oa_url"] == "https://example.org/article"
    assert fields["best_oa_pdf_url"] == "https://example.org/article.pdf"
    assert fields["oa_status"] == "gold"
    assert fields["oa_license"] == "cc-by"
    # Booleans are serialized as lowercase strings in entry fields.
    assert fields["is_oa"] == "true"
|
|
||||||
|
|
||||||
|
|
||||||
def test_unpaywall_registry_and_catalog() -> None:
    """Unpaywall is configurable via the registry and present in the catalog."""
    sources_config = {
        "sources": {
            "unpaywall": {
                "source_type": "unpaywall",
                "enabled": True,
                "email": "tester@example.org",
            }
        }
    }
    registry = SourceRegistry()
    registry.from_config_dict(sources_config)
    assert isinstance(registry.get("unpaywall"), UnpaywallSource)

    catalog = {entry.key: entry for entry in list_source_catalog()}
    unpaywall = catalog["unpaywall"]
    assert unpaywall.current_status == "integrated"
    assert unpaywall.priority == "now"
    assert "unpaywall" in prioritized_source_keys()
|
|
||||||
|
|
||||||
|
|
||||||
def test_run_enrich_oa_updates_entry() -> None:
    """_run_enrich_oa writes Unpaywall OA fields back onto a stored entry.

    Seeds the store with one DOI-bearing entry, monkeypatches
    UnpaywallSource.lookup_by_doi to return a canned green-OA record, runs
    the CLI helper, and checks the enriched fields plus provenance rows.
    """
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Seed Paper},
  year = {2024},
  doi = {10.1000/example}
}
"""
        )

        original_lookup = UnpaywallSource.lookup_by_doi

        def fake_lookup(self: UnpaywallSource, doi: str):
            # Canned Unpaywall response: green OA hosted in a repository.
            return self.normalize(
                {
                    "doi": doi,
                    "title": "Seed Paper",
                    "year": 2024,
                    "is_oa": True,
                    "oa_status": "green",
                    "best_oa_location": {
                        "url": "https://repository.example.org/seed",
                        "url_for_pdf": "https://repository.example.org/seed.pdf",
                        "license": "cc-by",
                        "host_type": "repository",
                        "version": "acceptedVersion",
                        "evidence": "oa repository",
                    },
                }
            )

        # Patch at the class level so any instance created inside
        # _run_enrich_oa picks up the stub; restore in finally.
        UnpaywallSource.lookup_by_doi = fake_lookup  # type: ignore[method-assign]
        try:
            # 0 is the success exit code of the CLI helper.
            assert _run_enrich_oa(store, ["seed2024"], "tester@example.org") == 0
        finally:
            UnpaywallSource.lookup_by_doi = original_lookup  # type: ignore[method-assign]

        entry = store.get_entry("seed2024")
        assert entry is not None
        assert entry["best_oa_url"] == "https://repository.example.org/seed"
        assert entry["best_oa_pdf_url"] == "https://repository.example.org/seed.pdf"
        assert entry["oa_status"] == "green"
        assert entry["oa_host_type"] == "repository"
        # The enrichment must be recorded in field-level provenance.
        provenance = store.get_field_provenance("seed2024")
        assert any(item["source_type"] == "oa_enrich" for item in provenance)
    finally:
        store.close()
|
|
||||||
|
|
||||||
|
|
||||||
def test_run_enrich_oa_requires_email() -> None:
    """Without a contact email the OA enrichment command fails (exit 1)."""
    store = BibliographyStore()
    try:
        exit_code = _run_enrich_oa(store, ["missing"], None)
        assert exit_code == 1
    finally:
        store.close()
|
|
||||||
Loading…
Reference in New Issue