Handle missing upstream metadata gracefully

This commit is contained in:
welsberr 2026-03-20 15:04:10 -04:00
parent 4eba64d352
commit 912dc59301
2 changed files with 69 additions and 9 deletions

View File

@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import re import re
import urllib.error
import urllib.parse import urllib.parse
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from dataclasses import dataclass from dataclasses import dataclass
@ -76,7 +77,9 @@ class MetadataResolver:
def resolve_doi(self, doi: str) -> Resolution | None: def resolve_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="") encoded = urllib.parse.quote(doi, safe="")
payload = self.source_client.get_json(f"https://api.crossref.org/works/{encoded}") payload = self._safe_get_json(f"https://api.crossref.org/works/{encoded}")
if payload is None:
return None
message = payload.get("message", {}) message = payload.get("message", {})
if not message: if not message:
return None return None
@ -88,7 +91,9 @@ class MetadataResolver:
def search_crossref(self, title: str, limit: int = 5) -> list[BibEntry]: def search_crossref(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"query.title": title, "rows": limit}) query = urllib.parse.urlencode({"query.title": title, "rows": limit})
payload = self.source_client.get_json(f"https://api.crossref.org/works?{query}") payload = self._safe_get_json(f"https://api.crossref.org/works?{query}")
if payload is None:
return []
items = payload.get("message", {}).get("items", []) items = payload.get("message", {}).get("items", [])
return [_crossref_message_to_entry(item) for item in items] return [_crossref_message_to_entry(item) for item in items]
@ -114,7 +119,9 @@ class MetadataResolver:
def resolve_dblp(self, dblp_key: str) -> Resolution | None: def resolve_dblp(self, dblp_key: str) -> Resolution | None:
encoded_key = urllib.parse.quote(dblp_key, safe="/:") encoded_key = urllib.parse.quote(dblp_key, safe="/:")
text = self.source_client.get_text(f"https://dblp.org/rec/{encoded_key}.bib") text = self._safe_get_text(f"https://dblp.org/rec/{encoded_key}.bib")
if text is None:
return None
entries = parse_bibtex(text) entries = parse_bibtex(text)
if not entries: if not entries:
return None return None
@ -126,7 +133,9 @@ class MetadataResolver:
def search_dblp(self, query_text: str, limit: int = 5) -> list[BibEntry]: def search_dblp(self, query_text: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"q": query_text, "format": "json", "h": limit}) query = urllib.parse.urlencode({"q": query_text, "format": "json", "h": limit})
payload = self.source_client.get_json(f"https://dblp.org/search/publ/api?{query}") payload = self._safe_get_json(f"https://dblp.org/search/publ/api?{query}")
if payload is None:
return []
hits = payload.get("result", {}).get("hits", {}).get("hit", []) hits = payload.get("result", {}).get("hits", {}).get("hit", [])
if isinstance(hits, dict): if isinstance(hits, dict):
hits = [hits] hits = [hits]
@ -143,7 +152,9 @@ class MetadataResolver:
def resolve_arxiv(self, arxiv_id: str) -> Resolution | None: def resolve_arxiv(self, arxiv_id: str) -> Resolution | None:
query = urllib.parse.urlencode({"id_list": arxiv_id}) query = urllib.parse.urlencode({"id_list": arxiv_id})
root = self.source_client.get_xml(f"https://export.arxiv.org/api/query?{query}") root = self._safe_get_xml(f"https://export.arxiv.org/api/query?{query}")
if root is None:
return None
namespace = {"atom": "http://www.w3.org/2005/Atom"} namespace = {"atom": "http://www.w3.org/2005/Atom"}
entry = root.find("atom:entry", namespace) entry = root.find("atom:entry", namespace)
if entry is None: if entry is None:
@ -156,7 +167,9 @@ class MetadataResolver:
def resolve_openalex(self, openalex_id: str) -> Resolution | None: def resolve_openalex(self, openalex_id: str) -> Resolution | None:
normalized_id = _normalize_openalex_id(openalex_id) normalized_id = _normalize_openalex_id(openalex_id)
payload = self.source_client.get_json(f"https://api.openalex.org/works/{normalized_id}") payload = self._safe_get_json(f"https://api.openalex.org/works/{normalized_id}")
if payload is None:
return None
if not payload: if not payload:
return None return None
return Resolution( return Resolution(
@ -167,7 +180,9 @@ class MetadataResolver:
def resolve_datacite_doi(self, doi: str) -> Resolution | None: def resolve_datacite_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="") encoded = urllib.parse.quote(doi, safe="")
payload = self.source_client.get_json(f"https://api.datacite.org/dois/{encoded}") payload = self._safe_get_json(f"https://api.datacite.org/dois/{encoded}")
if payload is None:
return None
data = payload.get("data", {}) data = payload.get("data", {})
if not data: if not data:
return None return None
@ -179,7 +194,9 @@ class MetadataResolver:
def search_datacite(self, title: str, limit: int = 5) -> list[BibEntry]: def search_datacite(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"query": title, "page[size]": limit}) query = urllib.parse.urlencode({"query": title, "page[size]": limit})
payload = self.source_client.get_json(f"https://api.datacite.org/dois?{query}") payload = self._safe_get_json(f"https://api.datacite.org/dois?{query}")
if payload is None:
return []
return [_datacite_work_to_entry(item) for item in payload.get("data", [])] return [_datacite_work_to_entry(item) for item in payload.get("data", [])]
def search_datacite_best_match( def search_datacite_best_match(
@ -204,9 +221,29 @@ class MetadataResolver:
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]: def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"search": title, "per-page": limit}) query = urllib.parse.urlencode({"search": title, "per-page": limit})
payload = self.source_client.get_json(f"https://api.openalex.org/works?{query}") payload = self._safe_get_json(f"https://api.openalex.org/works?{query}")
if payload is None:
return []
return [_openalex_work_to_entry(item) for item in payload.get("results", [])] return [_openalex_work_to_entry(item) for item in payload.get("results", [])]
def _safe_get_json(self, url: str) -> dict | None:
try:
return self.source_client.get_json(url)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError):
return None
def _safe_get_text(self, url: str) -> str | None:
try:
return self.source_client.get_text(url)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError):
return None
def _safe_get_xml(self, url: str) -> ET.Element | None:
try:
return self.source_client.get_xml(url)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ET.ParseError, ValueError):
return None
def search_openalex_best_match( def search_openalex_best_match(
self, self,
title: str, title: str,

View File

@ -1,4 +1,5 @@
from xml.etree import ElementTree as ET from xml.etree import ElementTree as ET
import urllib.error
from citegeist.bibtex import BibEntry, render_bibtex from citegeist.bibtex import BibEntry, render_bibtex
from citegeist.resolve import ( from citegeist.resolve import (
@ -200,6 +201,28 @@ def test_resolver_can_resolve_openalex_id():
assert resolution.entry.fields["openalex"] == "W12345" assert resolution.entry.fields["openalex"] == "W12345"
def test_resolve_doi_returns_none_on_http_404():
    """A 404 from the Crossref works endpoint yields None, not an exception."""
    resolver = MetadataResolver()

    def _not_found(url: str):
        raise urllib.error.HTTPError(url, 404, "Not Found", hdrs=None, fp=None)

    resolver.source_client.get_json = _not_found  # type: ignore[method-assign]
    assert resolver.resolve_doi("10.1000/missing") is None
def test_search_crossref_returns_empty_on_fetch_error():
    """A network-level URLError during a title search yields an empty list."""
    resolver = MetadataResolver()

    def _unreachable(url: str):
        raise urllib.error.URLError("temporary failure")

    resolver.source_client.get_json = _unreachable  # type: ignore[method-assign]
    assert resolver.search_crossref("Avida") == []
def test_resolver_falls_back_to_openalex_title_search(): def test_resolver_falls_back_to_openalex_title_search():
resolver = MetadataResolver() resolver = MetadataResolver()
resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign] resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]