Harden network fetch failures
commit 69844e9750 (parent 8a21044d1f)
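The expanders, the OAI-PMH harvester, and the TalkOrigins scraper previously let any HTTP, timeout, or parse failure raise out of a fetch. SourceClient gains try_get_json, try_get_text, and try_get_xml wrappers that return None on those failures, and every call site now degrades gracefully: expanders return empty results, record harvesting stops paginating, and the scraper skips the unreachable page.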
@@ -49,9 +49,11 @@ class CrossrefExpander:
         if not doi:
             return []

-        payload = self.resolver.source_client.get_json(
+        payload = self.resolver.source_client.try_get_json(
             f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
         )
+        if payload is None:
+            return []
         references = payload.get("message", {}).get("reference", [])
         results: list[ExpansionResult] = []
         for index, reference in enumerate(references, start=1):

@@ -141,7 +143,9 @@ class OpenAlexExpander:

         filter_name = "cited_by" if relation_type == "cites" else "cites"
         query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
-        payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
+        payload = self.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}")
+        if payload is None:
+            return []
         works = payload.get("results", [])

         results: list[ExpansionResult] = []

@@ -190,7 +194,9 @@ class OpenAlexExpander:
         if not doi:
             return None
         query = urlencode({"filter": f"doi:https://doi.org/{doi}"})
-        payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
+        payload = self.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}")
+        if payload is None:
+            return None
         results = payload.get("results", [])
         if not results:
             return None

@@ -326,9 +332,11 @@ class TopicExpander:
         if entry is None or not entry.get("doi"):
             return []
         doi = str(entry["doi"])
-        payload = self.crossref_expander.resolver.source_client.get_json(
+        payload = self.crossref_expander.resolver.source_client.try_get_json(
             f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
         )
+        if payload is None:
+            return []
         references = payload.get("message", {}).get("reference", [])[:limit]
         rows: list[tuple[ExpansionResult, dict[str, object]]] = []
         for index, reference in enumerate(references, start=1):

@@ -362,7 +370,9 @@ class TopicExpander:
             return []
         filter_name = "cited_by" if relation_type == "cites" else "cites"
         query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
-        payload = self.openalex_expander.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
+        payload = self.openalex_expander.resolver.source_client.try_get_json(f"https://api.openalex.org/works?{query}")
+        if payload is None:
+            return []
         works = payload.get("results", [])
         rows: list[tuple[ExpansionResult, dict[str, object]]] = []
         for work in works:

@@ -41,7 +41,9 @@ class OaiPmhHarvester:
         self.source_client = source_client or SourceClient()

     def identify(self, base_url: str) -> dict[str, str]:
-        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
+        root = self.source_client.try_get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
+        if root is None:
+            return {}
         identify = root.find(".//oai:Identify", NS)
         if identify is None:
             return {}

@@ -59,7 +61,9 @@ class OaiPmhHarvester:
         return payload

     def list_sets(self, base_url: str) -> list[OaiSet]:
-        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
+        root = self.source_client.try_get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
+        if root is None:
+            return []
         sets = root.findall(".//oai:set", NS)
         results: list[OaiSet] = []
         for node in sets:

@@ -76,7 +80,9 @@ class OaiPmhHarvester:
         params = {"verb": "ListMetadataFormats"}
         if identifier:
             params["identifier"] = identifier
-        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
+        root = self.source_client.try_get_xml(f"{base_url}?{urlencode(params)}")
+        if root is None:
+            return []
         formats = root.findall(".//oai:metadataFormat", NS)
         results: list[OaiMetadataFormat] = []
         for node in formats:

@@ -110,7 +116,9 @@ class OaiPmhHarvester:
         ordinal = 1
         next_url = f"{base_url}?{urlencode(params)}"
         while next_url:
-            root = self.source_client.get_xml(next_url)
+            root = self.source_client.try_get_xml(next_url)
+            if root is None:
+                break
             records = root.findall(".//oai:record", NS)
             for record in records:
                 parsed = self._record_to_result(base_url, record, ordinal)

@@ -133,7 +141,9 @@ class OaiPmhHarvester:
             "metadataPrefix": metadata_prefix,
             "identifier": identifier,
         }
-        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
+        root = self.source_client.try_get_xml(f"{base_url}?{urlencode(params)}")
+        if root is None:
+            return None
         record = root.find(".//oai:record", NS)
         if record is None:
             return None

@@ -2,6 +2,7 @@ from __future__ import annotations

 import hashlib
 import json
+import urllib.error
 import urllib.request
 import xml.etree.ElementTree as ET
 from pathlib import Path

@@ -45,6 +46,24 @@ class SourceClient:
         self._write_cache(url, "xml", payload)
         return ET.fromstring(payload)

+    def try_get_json(self, url: str) -> dict | None:
+        try:
+            return self.get_json(url)
+        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError):
+            return None
+
+    def try_get_text(self, url: str) -> str | None:
+        try:
+            return self.get_text(url)
+        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError):
+            return None
+
+    def try_get_xml(self, url: str) -> ET.Element | None:
+        try:
+            return self.get_xml(url)
+        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ET.ParseError, ValueError):
+            return None
+
     def _fetch_bytes(self, url: str) -> bytes:
         with urllib.request.urlopen(self._request(url)) as response:
             return response.read()

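The wrappers turn a transient fetch problem into a None sentinel instead of an exception; try_get_xml additionally swallows ET.ParseError for malformed XML. A minimal caller-side sketch of the pattern the expanders now follow (the DOI in the URL is illustrative):

    from citegeist.sources import SourceClient

    client = SourceClient()
    payload = client.try_get_json("https://api.crossref.org/works/10.1000/example")
    if payload is None:
        # HTTP error, unreachable host, timeout, or bad JSON: degrade to empty
        references = []
    else:
        references = payload.get("message", {}).get("reference", [])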
@@ -781,7 +781,10 @@ class TalkOriginsScraper:
         limit_topics: int | None = None,
         resume: bool = True,
     ) -> list[TalkOriginsTopic]:
-        index_html = self.source_client.get_text(base_url)
+        fetch_text = getattr(self.source_client, "try_get_text", self.source_client.get_text)
+        index_html = fetch_text(base_url)
+        if index_html is None:
+            return []
         parser = _TopicIndexParser(base_url)
         parser.feed(index_html)

@@ -793,7 +796,9 @@ class TalkOriginsScraper:
             if snapshot is not None:
                 raw_entries = list(snapshot.get("raw_entries", []))
             else:
-                page_html = self.source_client.get_text(link["url"])
+                page_html = fetch_text(link["url"])
+                if page_html is None:
+                    continue
                 topic_parser = _TopicPageParser()
                 topic_parser.feed(page_html)
                 raw_entries = normalize_topic_entries(topic_parser.preformatted_text())

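The getattr dispatch keeps the scraper compatible with a SourceClient that predates try_get_text: fetch_text then binds to plain get_text, errors propagate as exceptions exactly as before, and the None guards simply never trigger. A small sketch of the dispatch, using a hypothetical stripped-down client:

    class LegacyClient:
        # stand-in for an older SourceClient without the try_get_* wrappers
        def get_text(self, url: str) -> str:
            return "<html></html>"

    client = LegacyClient()
    fetch_text = getattr(client, "try_get_text", client.get_text)
    # no try_get_text attribute, so fetch_text is bound to get_text and
    # failures still raise instead of returning None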
@@ -1,3 +1,5 @@
+import urllib.error
+
 from citegeist.bibtex import BibEntry
 from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry
 from citegeist.resolve import Resolution

@@ -138,3 +140,29 @@ def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text()
     )

     assert entry.entry_type == "inproceedings"
+
+
+def test_crossref_expander_returns_empty_on_fetch_error():
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+            @article{seed2024,
+              author = {Seed, Alice},
+              title = {Seed Paper},
+              year = {2024},
+              doi = {10.1000/seed-doi}
+            }
+            """
+        )
+
+        expander = CrossrefExpander()
+
+        def raise_404(_url: str):
+            raise urllib.error.HTTPError(_url, 404, "Not Found", hdrs=None, fp=None)
+
+        expander.resolver.source_client._fetch_bytes = raise_404  # type: ignore[method-assign]
+
+        assert expander.expand_entry_references(store, "seed2024") == []
+    finally:
+        store.close()

@@ -1,4 +1,5 @@
 from pathlib import Path
+import urllib.error

 from citegeist.sources import SourceClient

@@ -39,3 +40,14 @@ def test_source_client_falls_back_to_latin1_for_text(tmp_path: Path):
     payload = client.get_text(url)

     assert payload == "café"
+
+
+def test_source_client_try_get_json_returns_none_on_http_error(tmp_path: Path):
+    client = SourceClient(cache_dir=tmp_path / "cache")
+
+    def raise_404(_url: str):
+        raise urllib.error.HTTPError(_url, 404, "Not Found", hdrs=None, fp=None)
+
+    client._fetch_bytes = raise_404  # type: ignore[method-assign]
+
+    assert client.try_get_json("https://example.org/missing") is None