Filter weak Crossref discovered references
This commit is contained in:
parent
69844e9750
commit
d6b5138660
|
|
@ -159,6 +159,8 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work
|
|||
|
||||
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
|
||||
|
||||
Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. This reduces junk `@misc` entries whose `title` field is really a pasted citation string.
|
||||
|
||||
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
|
||||
|
||||
## Example Application
|
||||
|
|
|
|||
|
|
@ -211,6 +211,8 @@ Re-enrich all current `@misc` entries with DOIs:
|
|||
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
|
||||
```
|
||||
|
||||
When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted.
|
||||
|
||||
## Explore Citation Graphs
|
||||
|
||||
Purpose: traverse citation edges, export graph data, and render quick visualizations.
|
||||
|
|
|
|||
|
|
@ -58,6 +58,8 @@ class CrossrefExpander:
|
|||
results: list[ExpansionResult] = []
|
||||
for index, reference in enumerate(references, start=1):
|
||||
discovered = self._reference_to_entry(reference, citation_key, index)
|
||||
if discovered is None:
|
||||
continue
|
||||
created = False
|
||||
if store.get_entry(discovered.citation_key) is None:
|
||||
store.upsert_entry(
|
||||
|
|
@ -89,17 +91,22 @@ class CrossrefExpander:
|
|||
)
|
||||
return results
|
||||
|
||||
def _reference_to_entry(self, reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
|
||||
def _reference_to_entry(
|
||||
self,
|
||||
reference: dict,
|
||||
source_citation_key: str,
|
||||
ordinal: int,
|
||||
) -> BibEntry | None:
|
||||
fallback = _crossref_reference_to_entry(reference, source_citation_key, ordinal)
|
||||
doi = reference.get("DOI") or ""
|
||||
if not doi:
|
||||
return fallback
|
||||
return None if _skip_crossref_reference(reference, fallback) else fallback
|
||||
|
||||
resolution = self.resolver.resolve_doi(doi)
|
||||
if resolution is None:
|
||||
resolution = self.resolver.resolve_datacite_doi(doi)
|
||||
if resolution is None:
|
||||
return fallback
|
||||
return None if _skip_crossref_reference(reference, fallback) else fallback
|
||||
|
||||
merged = merge_entries(resolution.entry, fallback)
|
||||
merged.fields["note"] = fallback.fields["note"]
|
||||
|
|
@ -341,6 +348,8 @@ class TopicExpander:
|
|||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
||||
for index, reference in enumerate(references, start=1):
|
||||
discovered = self.crossref_expander._reference_to_entry(reference, citation_key, index)
|
||||
if discovered is None:
|
||||
continue
|
||||
rows.append(
|
||||
(
|
||||
ExpansionResult(
|
||||
|
|
@ -425,6 +434,44 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi
|
|||
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
||||
|
||||
|
||||
def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool:
|
||||
if reference.get("DOI"):
|
||||
return False
|
||||
if reference.get("article-title") or reference.get("volume-title"):
|
||||
return False
|
||||
|
||||
title = str(entry.fields.get("title") or "")
|
||||
normalized_title = _normalize_text(title)
|
||||
if not normalized_title:
|
||||
return True
|
||||
if normalized_title.casefold().startswith("referenced work "):
|
||||
return True
|
||||
if normalized_title[0] in ".,;:)":
|
||||
return True
|
||||
|
||||
unstructured = _normalize_text(str(reference.get("unstructured") or ""))
|
||||
if not unstructured:
|
||||
return not bool(reference.get("journal-title"))
|
||||
if entry.entry_type == "misc":
|
||||
return True
|
||||
return _looks_like_citation_blob(unstructured)
|
||||
|
||||
|
||||
def _looks_like_citation_blob(text: str) -> bool:
|
||||
lowered = text.casefold()
|
||||
if any(token in lowered for token in ("http://", "https://", "www.", " accessed ", " url ")):
|
||||
return True
|
||||
if any(token in lowered for token in ("supporting material", "grant", "poll results", "prescribing information")):
|
||||
return True
|
||||
if text.count(",") >= 3 or text.count(";") >= 2:
|
||||
return True
|
||||
if re.search(r"\(\d{4}\)", text):
|
||||
return True
|
||||
if re.search(r"\b[A-Z]\.[ ]?[A-Z]?\.", text):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str:
|
||||
if doi := reference.get("DOI"):
|
||||
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
|
||||
|
|
|
|||
|
|
@ -59,14 +59,11 @@ def test_crossref_expander_creates_draft_nodes_and_relations():
|
|||
|
||||
results = expander.expand_entry_references(store, "seed2024")
|
||||
|
||||
assert [result.discovered_citation_key for result in results] == [
|
||||
"doi101000exampleref",
|
||||
"ref2021unstructured2",
|
||||
]
|
||||
assert [result.discovered_citation_key for result in results] == ["doi101000exampleref"]
|
||||
discovered = store.get_entry("doi101000exampleref")
|
||||
assert discovered is not None
|
||||
assert discovered["review_status"] == "draft"
|
||||
assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"]
|
||||
assert store.get_relations("seed2024") == ["doi101000exampleref"]
|
||||
relation_provenance = store.get_relation_provenance("seed2024")
|
||||
assert relation_provenance[0]["source_type"] == "graph_expand"
|
||||
finally:
|
||||
|
|
@ -142,6 +139,78 @@ def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text()
|
|||
assert entry.entry_type == "inproceedings"
|
||||
|
||||
|
||||
def test_crossref_expander_skips_citation_blob_without_identifier():
    """A DOI-less reference that is only a citation blob creates nothing."""

    def fake_get_json(_url):
        # One reference: author list, parenthesised year and URL, no DOI.
        return {
            "message": {
                "reference": [
                    {
                        "unstructured": (
                            "Abraham, J.K., Meir, E., Perry, J. (2009). "
                            "Addressing undergraduate student misconceptions. "
                            "https://example.org/article"
                        ),
                        "year": "2009",
                    }
                ]
            }
        }

    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/seed-doi}
            }
            """
        )

        expander = CrossrefExpander()
        expander.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]

        # Neither an expansion result nor a relation should be recorded.
        assert expander.expand_entry_references(store, "seed2024") == []
        assert store.get_relations("seed2024") == []
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_crossref_expander_keeps_simple_unstructured_title_without_identifier():
    """A clean proceedings-like unstructured title is still materialized."""

    def fake_get_json(_url):
        # One reference: a short, title-like unstructured string, no DOI.
        return {
            "message": {
                "reference": [
                    {
                        "unstructured": "Proceedings of the Artificial Life Workshop",
                        "year": "2005",
                    }
                ]
            }
        }

    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/seed-doi}
            }
            """
        )

        expander = CrossrefExpander()
        expander.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]

        outcomes = expander.expand_entry_references(store, "seed2024")
        discovered_keys = [outcome.discovered_citation_key for outcome in outcomes]
        assert discovered_keys == ["ref2005proceedings1"]

        stored = store.get_entry("ref2005proceedings1")
        assert stored is not None
        assert stored["entry_type"] == "inproceedings"
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_crossref_expander_returns_empty_on_fetch_error():
|
||||
store = BibliographyStore()
|
||||
try:
|
||||
|
|
|
|||
Loading…
Reference in New Issue