From 69844e9750b019feea6acdc80aa7089a9cd4c5f7 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 18:57:08 -0400 Subject: [PATCH] Harden network fetch failures --- src/citegeist/expand.py | 20 +++++++++++++++----- src/citegeist/harvest.py | 20 +++++++++++++++----- src/citegeist/sources.py | 19 +++++++++++++++++++ src/citegeist/talkorigins.py | 9 +++++++-- tests/test_expand.py | 28 ++++++++++++++++++++++++++++ tests/test_sources.py | 12 ++++++++++++ 6 files changed, 96 insertions(+), 12 deletions(-) diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py index d169d10..5e31506 100644 --- a/src/citegeist/expand.py +++ b/src/citegeist/expand.py @@ -49,9 +49,11 @@ class CrossrefExpander: if not doi: return [] - payload = self.resolver.source_client.get_json( + payload = self.resolver.source_client.try_get_json( f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com" ) + if payload is None: + return [] references = payload.get("message", {}).get("reference", []) results: list[ExpansionResult] = [] for index, reference in enumerate(references, start=1): @@ -141,7 +143,9 @@ class OpenAlexExpander: filter_name = "cited_by" if relation_type == "cites" else "cites" query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit}) - payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}") + payload = self.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}") + if payload is None: + return [] works = payload.get("results", []) results: list[ExpansionResult] = [] @@ -190,7 +194,9 @@ class OpenAlexExpander: if not doi: return None query = urlencode({"filter": f"doi:https://doi.org/{doi}"}) - payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}") + payload = self.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}") + if payload is None: + return None results = payload.get("results", []) if not results: return None @@ -326,9 +332,11 @@ class TopicExpander: if entry is None or not entry.get("doi"): return [] doi = str(entry["doi"]) - payload = self.crossref_expander.resolver.source_client.get_json( + payload = self.crossref_expander.resolver.source_client.try_get_json( f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com" ) + if payload is None: + return [] references = payload.get("message", {}).get("reference", [])[:limit] rows: list[tuple[ExpansionResult, dict[str, object]]] = [] for index, reference in enumerate(references, start=1): @@ -362,7 +370,9 @@ class TopicExpander: return [] filter_name = "cited_by" if relation_type == "cites" else "cites" query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit}) - payload = self.openalex_expander.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}") + payload = self.openalex_expander.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}") + if payload is None: + return [] works = payload.get("results", []) rows: list[tuple[ExpansionResult, dict[str, object]]] = [] for work in works: diff --git a/src/citegeist/harvest.py b/src/citegeist/harvest.py index 1a85662..8b0348d 100644 --- a/src/citegeist/harvest.py +++ b/src/citegeist/harvest.py @@ -41,7 +41,9 @@ class OaiPmhHarvester: self.source_client = source_client or SourceClient() def identify(self, base_url: str) -> dict[str, str]: - root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}") + root = self.source_client.try_get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}") + if root is None: + return {} identify = root.find(".//oai:Identify", NS) if identify is None: return {} @@ -59,7 +61,9 @@ class OaiPmhHarvester: return payload def list_sets(self, base_url: str) -> list[OaiSet]: - root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}") + root = self.source_client.try_get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}") + if root is None: + return [] sets = root.findall(".//oai:set", NS) results: list[OaiSet] = [] for node in sets: @@ -76,7 +80,9 @@ class OaiPmhHarvester: params = {"verb": "ListMetadataFormats"} if identifier: params["identifier"] = identifier - root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}") + root = self.source_client.try_get_xml(f"{base_url}?{urlencode(params)}") + if root is None: + return [] formats = root.findall(".//oai:metadataFormat", NS) results: list[OaiMetadataFormat] = [] for node in formats: @@ -110,7 +116,9 @@ class OaiPmhHarvester: ordinal = 1 next_url = f"{base_url}?{urlencode(params)}" while next_url: - root = self.source_client.get_xml(next_url) + root = self.source_client.try_get_xml(next_url) + if root is None: + break records = root.findall(".//oai:record", NS) for record in records: parsed = self._record_to_result(base_url, record, ordinal) @@ -133,7 +141,9 @@ class OaiPmhHarvester: "metadataPrefix": metadata_prefix, "identifier": identifier, } - root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}") + root = self.source_client.try_get_xml(f"{base_url}?{urlencode(params)}") + if root is None: + return None record = root.find(".//oai:record", NS) if record is None: return None diff --git a/src/citegeist/sources.py b/src/citegeist/sources.py index 0f453e5..648f1cb 100644 --- a/src/citegeist/sources.py +++ b/src/citegeist/sources.py @@ -2,6 +2,7 @@ from __future__ import annotations import hashlib import json +import urllib.error import urllib.request import xml.etree.ElementTree as ET from pathlib import Path @@ -45,6 +46,24 @@ class SourceClient: self._write_cache(url, "xml", payload) return ET.fromstring(payload) + def try_get_json(self, url: str) -> dict | None: + try: + return self.get_json(url) + except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError): + return None + + def try_get_text(self, url: str) -> str | None: + try: + return self.get_text(url) + except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError): + return None + + def try_get_xml(self, url: str) -> ET.Element | None: + try: + return self.get_xml(url) + except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ET.ParseError, ValueError): + return None + def _fetch_bytes(self, url: str) -> bytes: with urllib.request.urlopen(self._request(url)) as response: return response.read() diff --git a/src/citegeist/talkorigins.py b/src/citegeist/talkorigins.py index e98d239..c69e79a 100644 --- a/src/citegeist/talkorigins.py +++ b/src/citegeist/talkorigins.py @@ -781,7 +781,10 @@ class TalkOriginsScraper: limit_topics: int | None = None, resume: bool = True, ) -> list[TalkOriginsTopic]: - index_html = self.source_client.get_text(base_url) + fetch_text = getattr(self.source_client, "try_get_text", self.source_client.get_text) + index_html = fetch_text(base_url) + if index_html is None: + return [] parser = _TopicIndexParser(base_url) parser.feed(index_html) @@ -793,7 +796,9 @@ class TalkOriginsScraper: if snapshot is not None: raw_entries = list(snapshot.get("raw_entries", [])) else: - page_html = self.source_client.get_text(link["url"]) + page_html = fetch_text(link["url"]) + if page_html is None: + continue topic_parser = _TopicPageParser() topic_parser.feed(page_html) raw_entries = normalize_topic_entries(topic_parser.preformatted_text()) diff --git a/tests/test_expand.py b/tests/test_expand.py index ead9db1..090eeba 100644 --- a/tests/test_expand.py +++ b/tests/test_expand.py @@ -1,3 +1,5 @@ +import urllib.error + from citegeist.bibtex import BibEntry from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry from citegeist.resolve import Resolution @@ -138,3 +140,29 @@ def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text() ) assert entry.entry_type == "inproceedings" + + +def test_crossref_expander_returns_empty_on_fetch_error(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""" + ) + + expander = CrossrefExpander() + + def raise_404(_url: str): + raise urllib.error.HTTPError(_url, 404, "Not Found", hdrs=None, fp=None) + + expander.resolver.source_client._fetch_bytes = raise_404 # type: ignore[method-assign] + + assert expander.expand_entry_references(store, "seed2024") == [] + finally: + store.close() diff --git a/tests/test_sources.py b/tests/test_sources.py index f9f0bf2..60811f3 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -1,4 +1,5 @@ from pathlib import Path +import urllib.error from citegeist.sources import SourceClient @@ -39,3 +40,14 @@ def test_source_client_falls_back_to_latin1_for_text(tmp_path: Path): payload = client.get_text(url) assert payload == "café" + + +def test_source_client_try_get_json_returns_none_on_http_error(tmp_path: Path): + client = SourceClient(cache_dir=tmp_path / "cache") + + def raise_404(_url: str): + raise urllib.error.HTTPError(_url, 404, "Not Found", hdrs=None, fp=None) + + client._fetch_bytes = raise_404 # type: ignore[method-assign] + + assert client.try_get_json("https://example.org/missing") is None