Filter weak Crossref discovered references
This commit is contained in:
parent
69844e9750
commit
d6b5138660
|
|
@ -159,6 +159,8 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work
|
||||||
|
|
||||||
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
|
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
|
||||||
|
|
||||||
|
Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata infers a cleaner non-`misc` type, such as conference proceedings, and the text does not itself read like a pasted citation (URLs, long comma- or semicolon-separated lists, parenthesized years, author initials). This reduces junk `@misc` entries whose `title` field is really a pasted citation string.
|
||||||
|
|
||||||
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
|
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
|
||||||
|
|
||||||
## Example Application
|
## Example Application
|
||||||
|
|
|
||||||
|
|
@ -211,6 +211,8 @@ Re-enrich all current `@misc` entries with DOIs:
|
||||||
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
|
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
|
||||||
```
|
```
|
||||||
|
|
||||||
|
When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted.
|
||||||
|
|
||||||
## Explore Citation Graphs
|
## Explore Citation Graphs
|
||||||
|
|
||||||
Purpose: traverse citation edges, export graph data, and render quick visualizations.
|
Purpose: traverse citation edges, export graph data, and render quick visualizations.
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,8 @@ class CrossrefExpander:
|
||||||
results: list[ExpansionResult] = []
|
results: list[ExpansionResult] = []
|
||||||
for index, reference in enumerate(references, start=1):
|
for index, reference in enumerate(references, start=1):
|
||||||
discovered = self._reference_to_entry(reference, citation_key, index)
|
discovered = self._reference_to_entry(reference, citation_key, index)
|
||||||
|
if discovered is None:
|
||||||
|
continue
|
||||||
created = False
|
created = False
|
||||||
if store.get_entry(discovered.citation_key) is None:
|
if store.get_entry(discovered.citation_key) is None:
|
||||||
store.upsert_entry(
|
store.upsert_entry(
|
||||||
|
|
@ -89,17 +91,22 @@ class CrossrefExpander:
|
||||||
)
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def _reference_to_entry(self, reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
|
def _reference_to_entry(
|
||||||
|
self,
|
||||||
|
reference: dict,
|
||||||
|
source_citation_key: str,
|
||||||
|
ordinal: int,
|
||||||
|
) -> BibEntry | None:
|
||||||
fallback = _crossref_reference_to_entry(reference, source_citation_key, ordinal)
|
fallback = _crossref_reference_to_entry(reference, source_citation_key, ordinal)
|
||||||
doi = reference.get("DOI") or ""
|
doi = reference.get("DOI") or ""
|
||||||
if not doi:
|
if not doi:
|
||||||
return fallback
|
return None if _skip_crossref_reference(reference, fallback) else fallback
|
||||||
|
|
||||||
resolution = self.resolver.resolve_doi(doi)
|
resolution = self.resolver.resolve_doi(doi)
|
||||||
if resolution is None:
|
if resolution is None:
|
||||||
resolution = self.resolver.resolve_datacite_doi(doi)
|
resolution = self.resolver.resolve_datacite_doi(doi)
|
||||||
if resolution is None:
|
if resolution is None:
|
||||||
return fallback
|
return None if _skip_crossref_reference(reference, fallback) else fallback
|
||||||
|
|
||||||
merged = merge_entries(resolution.entry, fallback)
|
merged = merge_entries(resolution.entry, fallback)
|
||||||
merged.fields["note"] = fallback.fields["note"]
|
merged.fields["note"] = fallback.fields["note"]
|
||||||
|
|
@ -341,6 +348,8 @@ class TopicExpander:
|
||||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
||||||
for index, reference in enumerate(references, start=1):
|
for index, reference in enumerate(references, start=1):
|
||||||
discovered = self.crossref_expander._reference_to_entry(reference, citation_key, index)
|
discovered = self.crossref_expander._reference_to_entry(reference, citation_key, index)
|
||||||
|
if discovered is None:
|
||||||
|
continue
|
||||||
rows.append(
|
rows.append(
|
||||||
(
|
(
|
||||||
ExpansionResult(
|
ExpansionResult(
|
||||||
|
|
@ -425,6 +434,44 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi
|
||||||
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
||||||
|
|
||||||
|
|
||||||
|
def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool:
    """Return True when a Crossref reference is too weak to materialize.

    Args:
        reference: Raw Crossref reference mapping; the keys read here are
            ``DOI``, ``article-title``, ``volume-title``, ``unstructured``
            and ``journal-title``.
        entry: Fallback entry already built from ``reference``; only its
            ``title`` field and ``entry_type`` are inspected.

    Returns:
        False if the reference carries a DOI or a structured title.
        Otherwise True when the fallback title is empty after
        ``_normalize_text``, starts with the "Referenced work " placeholder
        prefix, or begins with stray punctuation. With a usable title, the
        result depends on the unstructured text: without any, the reference
        is skipped unless it names a journal; with some, ``misc`` entries are
        always skipped and other types are skipped only when the text looks
        like a pasted citation.
    """
    has_identifier = bool(reference.get("DOI"))
    has_structured_title = bool(
        reference.get("article-title") or reference.get("volume-title")
    )
    if has_identifier or has_structured_title:
        return False

    clean_title = _normalize_text(str(entry.fields.get("title") or ""))
    if (
        not clean_title
        or clean_title.casefold().startswith("referenced work ")
        or clean_title[0] in ".,;:)"
    ):
        return True

    raw_citation = _normalize_text(str(reference.get("unstructured") or ""))
    if raw_citation:
        if entry.entry_type == "misc":
            return True
        return _looks_like_citation_blob(raw_citation)

    # No unstructured text at all: keep the reference only if it names a journal.
    return not bool(reference.get("journal-title"))
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_citation_blob(text: str) -> bool:
|
||||||
|
lowered = text.casefold()
|
||||||
|
if any(token in lowered for token in ("http://", "https://", "www.", " accessed ", " url ")):
|
||||||
|
return True
|
||||||
|
if any(token in lowered for token in ("supporting material", "grant", "poll results", "prescribing information")):
|
||||||
|
return True
|
||||||
|
if text.count(",") >= 3 or text.count(";") >= 2:
|
||||||
|
return True
|
||||||
|
if re.search(r"\(\d{4}\)", text):
|
||||||
|
return True
|
||||||
|
if re.search(r"\b[A-Z]\.[ ]?[A-Z]?\.", text):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str:
|
def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str:
|
||||||
if doi := reference.get("DOI"):
|
if doi := reference.get("DOI"):
|
||||||
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
|
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
|
||||||
|
|
|
||||||
|
|
@ -59,14 +59,11 @@ def test_crossref_expander_creates_draft_nodes_and_relations():
|
||||||
|
|
||||||
results = expander.expand_entry_references(store, "seed2024")
|
results = expander.expand_entry_references(store, "seed2024")
|
||||||
|
|
||||||
assert [result.discovered_citation_key for result in results] == [
|
assert [result.discovered_citation_key for result in results] == ["doi101000exampleref"]
|
||||||
"doi101000exampleref",
|
|
||||||
"ref2021unstructured2",
|
|
||||||
]
|
|
||||||
discovered = store.get_entry("doi101000exampleref")
|
discovered = store.get_entry("doi101000exampleref")
|
||||||
assert discovered is not None
|
assert discovered is not None
|
||||||
assert discovered["review_status"] == "draft"
|
assert discovered["review_status"] == "draft"
|
||||||
assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"]
|
assert store.get_relations("seed2024") == ["doi101000exampleref"]
|
||||||
relation_provenance = store.get_relation_provenance("seed2024")
|
relation_provenance = store.get_relation_provenance("seed2024")
|
||||||
assert relation_provenance[0]["source_type"] == "graph_expand"
|
assert relation_provenance[0]["source_type"] == "graph_expand"
|
||||||
finally:
|
finally:
|
||||||
|
|
@ -142,6 +139,78 @@ def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text()
|
||||||
assert entry.entry_type == "inproceedings"
|
assert entry.entry_type == "inproceedings"
|
||||||
|
|
||||||
|
|
||||||
|
def test_crossref_expander_skips_citation_blob_without_identifier():
    """A DOI-less reference exposed only as a pasted citation yields no entry or relation."""

    def fake_get_json(_url):
        # Single reference with only an unstructured citation string and a year.
        return {
            "message": {
                "reference": [
                    {
                        "unstructured": (
                            "Abraham, J.K., Meir, E., Perry, J. (2009). "
                            "Addressing undergraduate student misconceptions. "
                            "https://example.org/article"
                        ),
                        "year": "2009",
                    }
                ]
            }
        }

    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/seed-doi}
            }
            """
        )

        expander = CrossrefExpander()
        expander.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]

        results = expander.expand_entry_references(store, "seed2024")
        assert results == []
        assert store.get_relations("seed2024") == []
    finally:
        store.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_crossref_expander_keeps_simple_unstructured_title_without_identifier():
    """A clean proceedings-like unstructured title is still materialized as ``inproceedings``."""

    def fake_get_json(_url):
        # Single DOI-less reference whose unstructured text is just a short title.
        return {
            "message": {
                "reference": [
                    {
                        "unstructured": "Proceedings of the Artificial Life Workshop",
                        "year": "2005",
                    }
                ]
            }
        }

    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/seed-doi}
            }
            """
        )

        expander = CrossrefExpander()
        expander.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]

        results = expander.expand_entry_references(store, "seed2024")

        discovered_keys = [result.discovered_citation_key for result in results]
        assert discovered_keys == ["ref2005proceedings1"]
        discovered = store.get_entry("ref2005proceedings1")
        assert discovered is not None
        assert discovered["entry_type"] == "inproceedings"
    finally:
        store.close()
|
||||||
|
|
||||||
|
|
||||||
def test_crossref_expander_returns_empty_on_fetch_error():
|
def test_crossref_expander_returns_empty_on_fetch_error():
|
||||||
store = BibliographyStore()
|
store = BibliographyStore()
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue