# CiteGeist/src/citegeist/sources/catalog.py
#
# 174 lines
# 6.5 KiB
# Python

"""Open bibliographic source inventory and prioritization helpers."""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class SourceCatalogEntry:
key: str
label: str
category: str
access: str
capabilities: tuple[str, ...]
strengths: str
caveats: str
current_status: str
priority: str
# Static inventory of known sources. Declaration order is preserved by
# list_source_catalog(); prioritized_source_keys() re-sorts by priority/label.
_CATALOG: tuple[SourceCatalogEntry, ...] = (
    SourceCatalogEntry(
        key="crossref",
        label="Crossref",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "reference_lists"),
        strengths="Broad DOI coverage and good article-level metadata.",
        caveats="Citation coverage is incomplete and some references are unstructured blobs.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="openalex",
        label="OpenAlex",
        category="metadata+graph",
        access="open API",
        capabilities=("work_lookup", "title_search", "citations", "cited_by", "topics"),
        strengths="Best current open source for citation graph expansion and work-level discovery.",
        caveats="Occasional noisy secondary records require conservative admission rules.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="pubmed",
        label="PubMed / NCBI E-utilities",
        category="metadata",
        access="open API",
        capabilities=("pmid_lookup", "title_search", "biomedical_metadata"),
        strengths="High-value authoritative metadata for biomedical literature.",
        caveats="Domain-specific coverage outside biomedicine is limited.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="datacite",
        label="DataCite",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "datasets"),
        strengths="Important for datasets, reports, and non-traditional DOI-bearing objects.",
        caveats="Less useful than Crossref/OpenAlex for mainstream article citation graphs.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="dblp",
        label="DBLP",
        category="metadata",
        access="open API",
        capabilities=("key_lookup", "search", "computer_science"),
        strengths="Excellent computer-science coverage and clean bibliographic records.",
        caveats="Discipline-specific rather than general-purpose.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="arxiv",
        label="arXiv",
        category="metadata+fulltext",
        access="open API",
        capabilities=("id_lookup", "search", "preprints"),
        strengths="Useful for preprint-first fields and free full-text links.",
        caveats="Not a general citation graph source.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="open_citations",
        label="OpenCitations",
        category="graph",
        access="open API",
        capabilities=("doi_citations", "doi_references", "provenance"),
        strengths="Directly aligned with open citation-edge expansion.",
        caveats="Coverage is narrower than OpenAlex and needs merge discipline.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="semantic_scholar",
        label="Semantic Scholar",
        category="metadata+graph",
        access="free API with limits",
        capabilities=("work_lookup", "search", "citations", "references"),
        strengths="Strong graph and relevance signals, especially for discovery workflows.",
        caveats="Not fully open data in the same sense as Crossref/OpenAlex/OpenCitations.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="unpaywall",
        label="Unpaywall",
        category="access-links",
        access="open API",
        capabilities=("doi_fulltext_links", "oa_status"),
        strengths="Best open source for landing-page and OA-link enrichment.",
        caveats="Improves access, not bibliographic identity or graph completeness.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="europe_pmc",
        label="Europe PMC",
        category="metadata+fulltext",
        access="open API",
        capabilities=("search", "citations", "fulltext_links", "biomedical"),
        strengths="Valuable biomedical complement to PubMed with richer open-access linkage.",
        caveats="Domain-specific and partially overlapping with PubMed/OpenAlex.",
        current_status="integrated",
        priority="now",
    ),
    SourceCatalogEntry(
        key="open_library",
        label="Open Library",
        category="metadata",
        access="open API",
        capabilities=("title_search", "book_metadata", "author_catalog", "isbn_hints"),
        strengths="Useful general-purpose open catalog coverage for books, monographs, and reference works.",
        caveats="Catalog metadata is not a citation graph and can be noisy for article-like titles.",
        current_status="integrated",
        priority="selective",
    ),
    SourceCatalogEntry(
        key="openaire",
        label="OpenAIRE",
        category="metadata+repository",
        access="open API",
        capabilities=("repository_metadata", "oa_links", "project_links"),
        strengths="Good for repository, project, and European OA discovery.",
        caveats="May be better treated as a corpus-acquisition adapter than a first-line resolver.",
        current_status="planned",
        priority="evaluate",
    ),
    SourceCatalogEntry(
        key="oai_pmh",
        label="OAI-PMH Repositories",
        category="repository",
        access="open protocol",
        capabilities=("repository_harvest", "set_discovery", "metadata_formats"),
        strengths="Already useful for theses, dissertations, and institutional repositories.",
        caveats="Heterogeneous metadata quality; not a single canonical bibliographic source.",
        current_status="integrated",
        priority="selective",
    ),
)
def list_source_catalog() -> list[SourceCatalogEntry]:
    """Return every catalog entry, in declaration order, as a new list.

    A fresh list is built on each call, so callers may reorder or extend the
    result without touching the module-level catalog tuple.
    """
    return list(_CATALOG)
def prioritized_source_keys() -> list[str]:
    """Return catalog keys ordered by priority bucket, then by label.

    Buckets rank as "now", "next", "selective", "evaluate". Within a bucket,
    entries are ordered by their lower-cased label.

    Raises:
        KeyError: If a catalog entry has a priority outside those four buckets.
    """
    rank = {"now": 0, "next": 1, "selective": 2, "evaluate": 3}

    def sort_key(entry: SourceCatalogEntry) -> tuple[int, str]:
        # Indexing (not .get) is deliberate: an unlisted priority fails loudly.
        return rank[entry.priority], entry.label.lower()

    return [entry.key for entry in sorted(_CATALOG, key=sort_key)]