174 lines
6.5 KiB
Python
174 lines
6.5 KiB
Python
"""Open bibliographic source inventory and prioritization helpers."""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass(frozen=True, slots=True)
|
|
class SourceCatalogEntry:
|
|
key: str
|
|
label: str
|
|
category: str
|
|
access: str
|
|
capabilities: tuple[str, ...]
|
|
strengths: str
|
|
caveats: str
|
|
current_status: str
|
|
priority: str
|
|
|
|
|
|
# Static inventory of known bibliographic sources. The tuple is immutable and
# its declaration order is significant: it is the order in which entries are
# handed back when the catalog is listed.
_CATALOG: tuple[SourceCatalogEntry, ...] = (
    # --- Crossref ---------------------------------------------------------
    SourceCatalogEntry(
        key="crossref",
        label="Crossref",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "reference_lists"),
        strengths="Broad DOI coverage and good article-level metadata.",
        caveats="Citation coverage is incomplete and some references are unstructured blobs.",
        current_status="integrated",
        priority="now",
    ),
    # --- OpenAlex ---------------------------------------------------------
    SourceCatalogEntry(
        key="openalex",
        label="OpenAlex",
        category="metadata+graph",
        access="open API",
        capabilities=("work_lookup", "title_search", "citations", "cited_by", "topics"),
        strengths="Best current open source for citation graph expansion and work-level discovery.",
        caveats="Occasional noisy secondary records require conservative admission rules.",
        current_status="integrated",
        priority="now",
    ),
    # --- PubMed -----------------------------------------------------------
    SourceCatalogEntry(
        key="pubmed",
        label="PubMed / NCBI E-utilities",
        category="metadata",
        access="open API",
        capabilities=("pmid_lookup", "title_search", "biomedical_metadata"),
        strengths="High-value authoritative metadata for biomedical literature.",
        caveats="Domain-specific coverage outside biomedicine is limited.",
        current_status="integrated",
        priority="now",
    ),
    # --- DataCite ---------------------------------------------------------
    SourceCatalogEntry(
        key="datacite",
        label="DataCite",
        category="metadata",
        access="open API",
        capabilities=("doi_lookup", "title_search", "datasets"),
        strengths="Important for datasets, reports, and non-traditional DOI-bearing objects.",
        caveats="Less useful than Crossref/OpenAlex for mainstream article citation graphs.",
        current_status="integrated",
        priority="now",
    ),
    # --- DBLP -------------------------------------------------------------
    SourceCatalogEntry(
        key="dblp",
        label="DBLP",
        category="metadata",
        access="open API",
        capabilities=("key_lookup", "search", "computer_science"),
        strengths="Excellent computer-science coverage and clean bibliographic records.",
        caveats="Discipline-specific rather than general-purpose.",
        current_status="integrated",
        priority="selective",
    ),
    # --- arXiv ------------------------------------------------------------
    SourceCatalogEntry(
        key="arxiv",
        label="arXiv",
        category="metadata+fulltext",
        access="open API",
        capabilities=("id_lookup", "search", "preprints"),
        strengths="Useful for preprint-first fields and free full-text links.",
        caveats="Not a general citation graph source.",
        current_status="integrated",
        priority="selective",
    ),
    # --- OpenCitations ----------------------------------------------------
    SourceCatalogEntry(
        key="open_citations",
        label="OpenCitations",
        category="graph",
        access="open API",
        capabilities=("doi_citations", "doi_references", "provenance"),
        strengths="Directly aligned with open citation-edge expansion.",
        caveats="Coverage is narrower than OpenAlex and needs merge discipline.",
        current_status="integrated",
        priority="now",
    ),
    # --- Semantic Scholar (the only entry whose access is rate-limited) ---
    SourceCatalogEntry(
        key="semantic_scholar",
        label="Semantic Scholar",
        category="metadata+graph",
        access="free API with limits",
        capabilities=("work_lookup", "search", "citations", "references"),
        strengths="Strong graph and relevance signals, especially for discovery workflows.",
        caveats="Not fully open data in the same sense as Crossref/OpenAlex/OpenCitations.",
        current_status="integrated",
        priority="now",
    ),
    # --- Unpaywall --------------------------------------------------------
    SourceCatalogEntry(
        key="unpaywall",
        label="Unpaywall",
        category="access-links",
        access="open API",
        capabilities=("doi_fulltext_links", "oa_status"),
        strengths="Best open source for landing-page and OA-link enrichment.",
        caveats="Improves access, not bibliographic identity or graph completeness.",
        current_status="integrated",
        priority="now",
    ),
    # --- Europe PMC -------------------------------------------------------
    SourceCatalogEntry(
        key="europe_pmc",
        label="Europe PMC",
        category="metadata+fulltext",
        access="open API",
        capabilities=("search", "citations", "fulltext_links", "biomedical"),
        strengths="Valuable biomedical complement to PubMed with richer open-access linkage.",
        caveats="Domain-specific and partially overlapping with PubMed/OpenAlex.",
        current_status="integrated",
        priority="now",
    ),
    # --- Open Library -----------------------------------------------------
    SourceCatalogEntry(
        key="open_library",
        label="Open Library",
        category="metadata",
        access="open API",
        capabilities=("title_search", "book_metadata", "author_catalog", "isbn_hints"),
        strengths="Useful general-purpose open catalog coverage for books, monographs, and reference works.",
        caveats="Catalog metadata is not a citation graph and can be noisy for article-like titles.",
        current_status="integrated",
        priority="selective",
    ),
    # --- OpenAIRE (the only entry not yet integrated) ---------------------
    SourceCatalogEntry(
        key="openaire",
        label="OpenAIRE",
        category="metadata+repository",
        access="open API",
        capabilities=("repository_metadata", "oa_links", "project_links"),
        strengths="Good for repository, project, and European OA discovery.",
        caveats="May be better treated as a corpus-acquisition adapter than a first-line resolver.",
        current_status="planned",
        priority="evaluate",
    ),
    # --- OAI-PMH ----------------------------------------------------------
    SourceCatalogEntry(
        key="oai_pmh",
        label="OAI-PMH Repositories",
        category="repository",
        access="open protocol",
        capabilities=("repository_harvest", "set_discovery", "metadata_formats"),
        strengths="Already useful for theses, dissertations, and institutional repositories.",
        caveats="Heterogeneous metadata quality; not a single canonical bibliographic source.",
        current_status="integrated",
        priority="selective",
    ),
)
|
|
|
|
|
|
def list_source_catalog() -> list[SourceCatalogEntry]:
    """Return every catalog entry as a new list, in declaration order.

    A fresh list is built on each call, so callers may sort, filter, or
    otherwise mutate the result without touching the module-level tuple.

    Returns:
        A list containing the same entry objects held in ``_CATALOG``.
    """
    return [*_CATALOG]
|
|
|
|
|
|
def prioritized_source_keys() -> list[str]:
    """Return the catalog keys ordered by priority tier, then by label.

    Tiers rank as ``now`` < ``next`` < ``selective`` < ``evaluate``. Entries
    within the same tier are ordered by their lower-cased label.

    Returns:
        The ``key`` of every entry in ``_CATALOG``, in ranked order.

    Raises:
        KeyError: If an entry's ``priority`` is not one of the four tiers.
    """
    tier_rank = {"now": 0, "next": 1, "selective": 2, "evaluate": 3}

    def rank_of(entry: SourceCatalogEntry) -> tuple[int, str]:
        # Tier decides first; the case-folded label breaks ties inside a tier.
        return tier_rank[entry.priority], entry.label.lower()

    ranked_entries = sorted(_CATALOG, key=rank_of)
    return [entry.key for entry in ranked_entries]
|