From d6b5138660cf2d41d39b6a4b2575c888101fa9e0 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 19:41:28 -0400 Subject: [PATCH] Filter weak Crossref discovered references --- README.md | 2 ++ examples/cli/README.md | 2 ++ src/citegeist/expand.py | 53 +++++++++++++++++++++++++-- tests/test_expand.py | 79 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 128 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 82d73d7..c378f17 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,8 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output. +Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. This reduces junk `@misc` entries whose `title` field is really a pasted citation string. + For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. ## Example Application diff --git a/examples/cli/README.md b/examples/cli/README.md index bf029fd..3590a6e 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -211,6 +211,8 @@ Re-enrich all current `@misc` entries with DOIs: .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25 ``` +When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. + ## Explore Citation Graphs Purpose: traverse citation edges, export graph data, and render quick visualizations. diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py index 5e31506..c91acff 100644 --- a/src/citegeist/expand.py +++ b/src/citegeist/expand.py @@ -58,6 +58,8 @@ class CrossrefExpander: results: list[ExpansionResult] = [] for index, reference in enumerate(references, start=1): discovered = self._reference_to_entry(reference, citation_key, index) + if discovered is None: + continue created = False if store.get_entry(discovered.citation_key) is None: store.upsert_entry( @@ -89,17 +91,22 @@ class CrossrefExpander: ) return results - def _reference_to_entry(self, reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: + def _reference_to_entry( + self, + reference: dict, + source_citation_key: str, + ordinal: int, + ) -> BibEntry | None: fallback = _crossref_reference_to_entry(reference, source_citation_key, ordinal) doi = reference.get("DOI") or "" if not doi: - return fallback + return None if _skip_crossref_reference(reference, fallback) else fallback resolution = self.resolver.resolve_doi(doi) if resolution is None: resolution = self.resolver.resolve_datacite_doi(doi) if resolution is None: - return fallback + return None if _skip_crossref_reference(reference, fallback) else fallback merged = merge_entries(resolution.entry, fallback) merged.fields["note"] = fallback.fields["note"] @@ -341,6 +348,8 @@ class TopicExpander: rows: list[tuple[ExpansionResult, dict[str, object]]] = [] for index, reference in enumerate(references, start=1): discovered = self.crossref_expander._reference_to_entry(reference, citation_key, index) + if discovered is None: + continue rows.append( ( ExpansionResult( @@ -425,6 +434,44 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) +def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool: + if reference.get("DOI"): + return False + if reference.get("article-title") or reference.get("volume-title"): + return False + + title = str(entry.fields.get("title") or "") + normalized_title = _normalize_text(title) + if not normalized_title: + return True + if normalized_title.casefold().startswith("referenced work "): + return True + if normalized_title[0] in ".,;:)": + return True + + unstructured = _normalize_text(str(reference.get("unstructured") or "")) + if not unstructured: + return not bool(reference.get("journal-title")) + if entry.entry_type == "misc": + return True + return _looks_like_citation_blob(unstructured) + + +def _looks_like_citation_blob(text: str) -> bool: + lowered = text.casefold() + if any(token in lowered for token in ("http://", "https://", "www.", " accessed ", " url ")): + return True + if any(token in lowered for token in ("supporting material", "grant", "poll results", "prescribing information")): + return True + if text.count(",") >= 3 or text.count(";") >= 2: + return True + if re.search(r"\(\d{4}\)", text): + return True + if re.search(r"\b[A-Z]\.[ ]?[A-Z]?\.", text): + return True + return False + + def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str: if doi := reference.get("DOI"): suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() diff --git a/tests/test_expand.py b/tests/test_expand.py index 090eeba..0e16e70 100644 --- a/tests/test_expand.py +++ b/tests/test_expand.py @@ -59,14 +59,11 @@ def test_crossref_expander_creates_draft_nodes_and_relations(): results = expander.expand_entry_references(store, "seed2024") - assert [result.discovered_citation_key for result in results] == [ - "doi101000exampleref", - "ref2021unstructured2", - ] + assert [result.discovered_citation_key for result in results] == ["doi101000exampleref"] discovered = store.get_entry("doi101000exampleref") assert discovered is not None assert discovered["review_status"] == "draft" - assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"] + assert store.get_relations("seed2024") == ["doi101000exampleref"] relation_provenance = store.get_relation_provenance("seed2024") assert relation_provenance[0]["source_type"] == "graph_expand" finally: @@ -142,6 +139,78 @@ def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text() assert entry.entry_type == "inproceedings" +def test_crossref_expander_skips_citation_blob_without_identifier(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""" + ) + + expander = CrossrefExpander() + expander.resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign] + "message": { + "reference": [ + { + "unstructured": ( + "Abraham, J.K., Meir, E., Perry, J. (2009). " + "Addressing undergraduate student misconceptions. " + "https://example.org/article" + ), + "year": "2009", + } + ] + } + } + + assert expander.expand_entry_references(store, "seed2024") == [] + assert store.get_relations("seed2024") == [] + finally: + store.close() + + +def test_crossref_expander_keeps_simple_unstructured_title_without_identifier(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""" + ) + + expander = CrossrefExpander() + expander.resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign] + "message": { + "reference": [ + { + "unstructured": "Proceedings of the Artificial Life Workshop", + "year": "2005", + } + ] + } + } + + results = expander.expand_entry_references(store, "seed2024") + + assert [result.discovered_citation_key for result in results] == ["ref2005proceedings1"] + discovered = store.get_entry("ref2005proceedings1") + assert discovered is not None + assert discovered["entry_type"] == "inproceedings" + finally: + store.close() + + def test_crossref_expander_returns_empty_on_fetch_error(): store = BibliographyStore() try: