Filter weak Crossref discovered references

This commit is contained in:
welsberr 2026-03-20 19:41:28 -04:00
parent 69844e9750
commit d6b5138660
4 changed files with 128 additions and 8 deletions

View File

@ -159,6 +159,8 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. This reduces junk `@misc` entries whose `title` field is really a pasted citation string.
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
## Example Application

View File

@ -211,6 +211,8 @@ Re-enrich all current `@misc` entries with DOIs:
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
```
When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted.
## Explore Citation Graphs
Purpose: traverse citation edges, export graph data, and render quick visualizations.

View File

@ -58,6 +58,8 @@ class CrossrefExpander:
results: list[ExpansionResult] = []
for index, reference in enumerate(references, start=1):
discovered = self._reference_to_entry(reference, citation_key, index)
if discovered is None:
continue
created = False
if store.get_entry(discovered.citation_key) is None:
store.upsert_entry(
@ -89,17 +91,22 @@ class CrossrefExpander:
)
return results
def _reference_to_entry(self, reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
def _reference_to_entry(
self,
reference: dict,
source_citation_key: str,
ordinal: int,
) -> BibEntry | None:
fallback = _crossref_reference_to_entry(reference, source_citation_key, ordinal)
doi = reference.get("DOI") or ""
if not doi:
return fallback
return None if _skip_crossref_reference(reference, fallback) else fallback
resolution = self.resolver.resolve_doi(doi)
if resolution is None:
resolution = self.resolver.resolve_datacite_doi(doi)
if resolution is None:
return fallback
return None if _skip_crossref_reference(reference, fallback) else fallback
merged = merge_entries(resolution.entry, fallback)
merged.fields["note"] = fallback.fields["note"]
@ -341,6 +348,8 @@ class TopicExpander:
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for index, reference in enumerate(references, start=1):
discovered = self.crossref_expander._reference_to_entry(reference, citation_key, index)
if discovered is None:
continue
rows.append(
(
ExpansionResult(
@ -425,6 +434,44 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool:
    """Return True when a Crossref reference is too weak to keep.

    Args:
        reference: Raw Crossref reference mapping; the keys read here are
            ``DOI``, ``article-title``, ``volume-title``, ``unstructured``
            and ``journal-title``.
        entry: Fallback entry already built from ``reference``; its
            ``title`` field and ``entry_type`` are inspected.

    Returns:
        ``False`` when the reference carries a DOI or a structured title,
        otherwise the verdict of the title/unstructured-text checks below.
    """
    # A DOI or a structured title is enough to keep the reference outright.
    strong_keys = ("DOI", "article-title", "volume-title")
    if any(reference.get(key) for key in strong_keys):
        return False

    cleaned_title = _normalize_text(str(entry.fields.get("title") or ""))
    if not cleaned_title:
        return True
    # Placeholder titles and titles that begin with stray punctuation are
    # rejected.
    if (
        cleaned_title.casefold().startswith("referenced work ")
        or cleaned_title[0] in ".,;:)"
    ):
        return True

    raw_citation = _normalize_text(str(reference.get("unstructured") or ""))
    if not raw_citation:
        # With no unstructured text, keep the reference only when a journal
        # title is present.
        return not reference.get("journal-title")

    # With unstructured text present, ``misc`` fallbacks are always skipped;
    # other entry types are skipped only when the text looks like a pasted
    # citation.
    return entry.entry_type == "misc" or _looks_like_citation_blob(raw_citation)
def _looks_like_citation_blob(text: str) -> bool:
lowered = text.casefold()
if any(token in lowered for token in ("http://", "https://", "www.", " accessed ", " url ")):
return True
if any(token in lowered for token in ("supporting material", "grant", "poll results", "prescribing information")):
return True
if text.count(",") >= 3 or text.count(";") >= 2:
return True
if re.search(r"\(\d{4}\)", text):
return True
if re.search(r"\b[A-Z]\.[ ]?[A-Z]?\.", text):
return True
return False
def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str:
if doi := reference.get("DOI"):
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()

View File

@ -59,14 +59,11 @@ def test_crossref_expander_creates_draft_nodes_and_relations():
results = expander.expand_entry_references(store, "seed2024")
assert [result.discovered_citation_key for result in results] == [
"doi101000exampleref",
"ref2021unstructured2",
]
assert [result.discovered_citation_key for result in results] == ["doi101000exampleref"]
discovered = store.get_entry("doi101000exampleref")
assert discovered is not None
assert discovered["review_status"] == "draft"
assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"]
assert store.get_relations("seed2024") == ["doi101000exampleref"]
relation_provenance = store.get_relation_provenance("seed2024")
assert relation_provenance[0]["source_type"] == "graph_expand"
finally:
@ -142,6 +139,78 @@ def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text()
assert entry.entry_type == "inproceedings"
def test_crossref_expander_skips_citation_blob_without_identifier():
    """A DOI-less reference that is only a citation blob yields no results or relations."""

    def fake_get_json(_url):
        # Build a fresh payload on every call, as the inline stub would.
        return {
            "message": {
                "reference": [
                    {
                        "unstructured": (
                            "Abraham, J.K., Meir, E., Perry, J. (2009). "
                            "Addressing undergraduate student misconceptions. "
                            "https://example.org/article"
                        ),
                        "year": "2009",
                    }
                ]
            }
        }

    library = BibliographyStore()
    try:
        library.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
        )
        crossref = CrossrefExpander()
        crossref.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]

        expansion = crossref.expand_entry_references(library, "seed2024")
        assert expansion == []
        assert library.get_relations("seed2024") == []
    finally:
        library.close()
def test_crossref_expander_keeps_simple_unstructured_title_without_identifier():
    """A short proceedings-style unstructured title without a DOI is stored as ``inproceedings``."""

    def fake_get_json(_url):
        # Build a fresh payload on every call, as the inline stub would.
        return {
            "message": {
                "reference": [
                    {
                        "unstructured": "Proceedings of the Artificial Life Workshop",
                        "year": "2005",
                    }
                ]
            }
        }

    library = BibliographyStore()
    try:
        library.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
        )
        crossref = CrossrefExpander()
        crossref.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]

        expansion = crossref.expand_entry_references(library, "seed2024")
        found_keys = [item.discovered_citation_key for item in expansion]
        assert found_keys == ["ref2005proceedings1"]

        stored = library.get_entry("ref2005proceedings1")
        assert stored is not None
        assert stored["entry_type"] == "inproceedings"
    finally:
        library.close()
def test_crossref_expander_returns_empty_on_fetch_error():
store = BibliographyStore()
try: