Harden network fetch failures

welsberr 2026-03-20 18:57:08 -04:00
parent 8a21044d1f
commit 69844e9750
6 changed files with 96 additions and 12 deletions
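SourceClient grows try_get_json, try_get_text, and try_get_xml wrappers that return None on HTTP, URL, timeout, and parse errors instead of raising. The Crossref and OpenAlex expanders, the OAI-PMH harvester, and the TalkOrigins scraper switch to these wrappers and short-circuit with empty results when a fetch fails, so one unreachable endpoint no longer aborts a whole expansion or harvest run.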

View File

@@ -49,9 +49,11 @@ class CrossrefExpander:
         if not doi:
             return []
-        payload = self.resolver.source_client.get_json(
+        payload = self.resolver.source_client.try_get_json(
             f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
         )
+        if payload is None:
+            return []
         references = payload.get("message", {}).get("reference", [])
         results: list[ExpansionResult] = []
         for index, reference in enumerate(references, start=1):
@@ -141,7 +143,9 @@ class OpenAlexExpander:
         filter_name = "cited_by" if relation_type == "cites" else "cites"
         query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
-        payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
+        payload = self.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}")
+        if payload is None:
+            return []
         works = payload.get("results", [])
         results: list[ExpansionResult] = []
@@ -190,7 +194,9 @@ class OpenAlexExpander:
         if not doi:
             return None
         query = urlencode({"filter": f"doi:https://doi.org/{doi}"})
-        payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
+        payload = self.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}")
+        if payload is None:
+            return None
         results = payload.get("results", [])
         if not results:
             return None
@@ -326,9 +332,11 @@ class TopicExpander:
         if entry is None or not entry.get("doi"):
             return []
         doi = str(entry["doi"])
-        payload = self.crossref_expander.resolver.source_client.get_json(
+        payload = self.crossref_expander.resolver.source_client.try_get_json(
             f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
         )
+        if payload is None:
+            return []
         references = payload.get("message", {}).get("reference", [])[:limit]
         rows: list[tuple[ExpansionResult, dict[str, object]]] = []
         for index, reference in enumerate(references, start=1):
@@ -362,7 +370,9 @@ class TopicExpander:
             return []
         filter_name = "cited_by" if relation_type == "cites" else "cites"
         query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
-        payload = self.openalex_expander.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
+        payload = self.openalex_expander.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}")
+        if payload is None:
+            return []
         works = payload.get("results", [])
         rows: list[tuple[ExpansionResult, dict[str, object]]] = []
         for work in works:

View File

@@ -41,7 +41,9 @@ class OaiPmhHarvester:
         self.source_client = source_client or SourceClient()

     def identify(self, base_url: str) -> dict[str, str]:
-        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
+        root = self.source_client.try_get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
+        if root is None:
+            return {}
         identify = root.find(".//oai:Identify", NS)
         if identify is None:
             return {}
@@ -59,7 +61,9 @@
         return payload

     def list_sets(self, base_url: str) -> list[OaiSet]:
-        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
+        root = self.source_client.try_get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
+        if root is None:
+            return []
         sets = root.findall(".//oai:set", NS)
         results: list[OaiSet] = []
         for node in sets:
@@ -76,7 +80,9 @@
         params = {"verb": "ListMetadataFormats"}
         if identifier:
             params["identifier"] = identifier
-        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
+        root = self.source_client.try_get_xml(f"{base_url}?{urlencode(params)}")
+        if root is None:
+            return []
         formats = root.findall(".//oai:metadataFormat", NS)
         results: list[OaiMetadataFormat] = []
         for node in formats:
@@ -110,7 +116,9 @@
         ordinal = 1
         next_url = f"{base_url}?{urlencode(params)}"
         while next_url:
-            root = self.source_client.get_xml(next_url)
+            root = self.source_client.try_get_xml(next_url)
+            if root is None:
+                break
             records = root.findall(".//oai:record", NS)
             for record in records:
                 parsed = self._record_to_result(base_url, record, ordinal)
@@ -133,7 +141,9 @@
             "metadataPrefix": metadata_prefix,
             "identifier": identifier,
         }
-        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
+        root = self.source_client.try_get_xml(f"{base_url}?{urlencode(params)}")
+        if root is None:
+            return None
         record = root.find(".//oai:record", NS)
         if record is None:
             return None
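With these guards, the Identify, ListSets, and ListMetadataFormats verbs report empty results for an unreachable repository, single-record fetches return None, and the paged harvest loop stops at the first page that fails to load instead of raising out of the loop.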

View File

@@ -2,6 +2,7 @@ from __future__ import annotations

 import hashlib
 import json
+import urllib.error
 import urllib.request
 import xml.etree.ElementTree as ET
 from pathlib import Path
@@ -45,6 +46,24 @@ class SourceClient:
         self._write_cache(url, "xml", payload)
         return ET.fromstring(payload)

+    def try_get_json(self, url: str) -> dict | None:
+        try:
+            return self.get_json(url)
+        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError):
+            return None
+
+    def try_get_text(self, url: str) -> str | None:
+        try:
+            return self.get_text(url)
+        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError):
+            return None
+
+    def try_get_xml(self, url: str) -> ET.Element | None:
+        try:
+            return self.get_xml(url)
+        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ET.ParseError, ValueError):
+            return None
+
     def _fetch_bytes(self, url: str) -> bytes:
         with urllib.request.urlopen(self._request(url)) as response:
             return response.read()
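For context, a minimal sketch of the caller-side pattern these wrappers enable, assuming a default-constructed SourceClient; the DOI in the URL is a placeholder, not a value from this commit:

# Guard pattern used throughout this commit: fetch, then bail out on None.
from citegeist.sources import SourceClient

client = SourceClient()
payload = client.try_get_json("https://api.crossref.org/works/10.1000/example-doi")
if payload is None:
    references = []  # network or parse failure: degrade to an empty result
else:
    references = payload.get("message", {}).get("reference", [])
print(len(references))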

View File

@@ -781,7 +781,10 @@ class TalkOriginsScraper:
         limit_topics: int | None = None,
         resume: bool = True,
     ) -> list[TalkOriginsTopic]:
-        index_html = self.source_client.get_text(base_url)
+        fetch_text = getattr(self.source_client, "try_get_text", self.source_client.get_text)
+        index_html = fetch_text(base_url)
+        if index_html is None:
+            return []
         parser = _TopicIndexParser(base_url)
         parser.feed(index_html)
@@ -793,7 +796,9 @@
             if snapshot is not None:
                 raw_entries = list(snapshot.get("raw_entries", []))
            else:
-                page_html = self.source_client.get_text(link["url"])
+                page_html = fetch_text(link["url"])
+                if page_html is None:
+                    continue
                 topic_parser = _TopicPageParser()
                 topic_parser.feed(page_html)
                 raw_entries = normalize_topic_entries(topic_parser.preformatted_text())
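Note the lookup via getattr with get_text as the fallback: the scraper keeps working against test doubles or older SourceClient stand-ins that predate try_get_text, and the is-None checks only ever fire when the wrapper is actually present, since get_text raises rather than returning None.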

View File

@@ -1,3 +1,5 @@
+import urllib.error
+
 from citegeist.bibtex import BibEntry
 from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry
 from citegeist.resolve import Resolution
@@ -138,3 +140,29 @@ def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text():
     )
     assert entry.entry_type == "inproceedings"
+
+
+def test_crossref_expander_returns_empty_on_fetch_error():
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+            @article{seed2024,
+              author = {Seed, Alice},
+              title = {Seed Paper},
+              year = {2024},
+              doi = {10.1000/seed-doi}
+            }
+            """
+        )
+        expander = CrossrefExpander()
+
+        def raise_404(_url: str):
+            raise urllib.error.HTTPError(_url, 404, "Not Found", hdrs=None, fp=None)
+
+        expander.resolver.source_client._fetch_bytes = raise_404  # type: ignore[method-assign]
+        assert expander.expand_entry_references(store, "seed2024") == []
+    finally:
+        store.close()

View File

@@ -1,4 +1,5 @@
 from pathlib import Path
+import urllib.error

 from citegeist.sources import SourceClient

@@ -39,3 +40,14 @@ def test_source_client_falls_back_to_latin1_for_text(tmp_path: Path):
     payload = client.get_text(url)
     assert payload == "café"
+
+
+def test_source_client_try_get_json_returns_none_on_http_error(tmp_path: Path):
+    client = SourceClient(cache_dir=tmp_path / "cache")
+
+    def raise_404(_url: str):
+        raise urllib.error.HTTPError(_url, 404, "Not Found", hdrs=None, fp=None)
+
+    client._fetch_bytes = raise_404  # type: ignore[method-assign]
+    assert client.try_get_json("https://example.org/missing") is None