From 753b8a2ccf001d36c812f0faa69a1a20cc1d996a Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 15:18:07 -0400 Subject: [PATCH] Improve discovered reference typing --- src/citegeist/expand.py | 46 +++++++++++++++++++++++--- tests/test_expand.py | 71 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 4 deletions(-) diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py index b93943d..d169d10 100644 --- a/src/citegeist/expand.py +++ b/src/citegeist/expand.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from urllib.parse import quote, urlencode from .bibtex import BibEntry, parse_bibtex -from .resolve import MetadataResolver +from .resolve import MetadataResolver, merge_entries from .storage import BibliographyStore @@ -55,7 +55,7 @@ class CrossrefExpander: references = payload.get("message", {}).get("reference", []) results: list[ExpansionResult] = [] for index, reference in enumerate(references, start=1): - discovered = _crossref_reference_to_entry(reference, citation_key, index) + discovered = self._reference_to_entry(reference, citation_key, index) created = False if store.get_entry(discovered.citation_key) is None: store.upsert_entry( @@ -87,6 +87,26 @@ class CrossrefExpander: ) return results + def _reference_to_entry(self, reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: + fallback = _crossref_reference_to_entry(reference, source_citation_key, ordinal) + doi = reference.get("DOI") or "" + if not doi: + return fallback + + resolution = self.resolver.resolve_doi(doi) + if resolution is None: + resolution = self.resolver.resolve_datacite_doi(doi) + if resolution is None: + return fallback + + merged = merge_entries(resolution.entry, fallback) + merged.fields["note"] = fallback.fields["note"] + return BibEntry( + entry_type=resolution.entry.entry_type or merged.entry_type, + citation_key=fallback.citation_key, + fields=merged.fields, + ) + class OpenAlexExpander: def __init__(self, resolver: MetadataResolver | None = None) -> None: @@ -312,7 +332,7 @@ class TopicExpander: references = payload.get("message", {}).get("reference", [])[:limit] rows: list[tuple[ExpansionResult, dict[str, object]]] = [] for index, reference in enumerate(references, start=1): - discovered = _crossref_reference_to_entry(reference, citation_key, index) + discovered = self.crossref_expander._reference_to_entry(reference, citation_key, index) rows.append( ( ExpansionResult( @@ -391,7 +411,7 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi fields["journal"] = _normalize_text(journal_title) citation_key = _reference_citation_key(reference, title, year, ordinal) - entry_type = "article" if journal_title else "misc" + entry_type = _crossref_reference_entry_type(reference, title, journal_title) return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) @@ -411,6 +431,24 @@ def _normalize_text(value: str) -> str: return " ".join(value.split()) +def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str: + if journal_title: + return "article" + combined = " ".join( + str(reference.get(field) or "") + for field in ("article-title", "volume-title", "journal-title", "series-title", "unstructured") + ).casefold() + if any(token in combined for token in ("conference", "proceedings", "symposium", "workshop")): + return "inproceedings" + if any(token in combined for token in ("thesis", "dissertation")): + return "phdthesis" + if reference.get("volume-title"): + return "incollection" + if any(token in combined for token in ("press", "publisher", "edition")): + return "book" + return "misc" + + def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float: if entry is None: return 0.0 diff --git a/tests/test_expand.py b/tests/test_expand.py index 4f6425b..ead9db1 100644 --- a/tests/test_expand.py +++ b/tests/test_expand.py @@ -1,4 +1,6 @@ +from citegeist.bibtex import BibEntry from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry +from citegeist.resolve import Resolution from citegeist.storage import BibliographyStore @@ -67,3 +69,72 @@ def test_crossref_expander_creates_draft_nodes_and_relations(): assert relation_provenance[0]["source_type"] == "graph_expand" finally: store.close() + + +def test_crossref_expander_prefers_resolved_doi_metadata_for_discovered_refs(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""" + ) + + expander = CrossrefExpander() + expander.resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign] + "message": { + "reference": [ + { + "DOI": "10.1117/12.512613", + "unstructured": "J. R. Koza ... Genetic Programming IV ... Springer ... 2005.", + "year": "2005", + } + ] + } + } + expander.resolver.resolve_doi = lambda doi: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="inproceedings", + citation_key="koza2005genetic", + fields={ + "author": "Koza, J. R. and Keane, M. A.", + "title": "Genetic Programming IV: Routine Human-Competitive Machine Intelligence", + "year": "2005", + "booktitle": "Genetic and Evolutionary Computation Conference", + "doi": doi, + "url": f"https://doi.org/{doi}", + }, + ), + source_type="resolver", + source_label=f"crossref:doi:{doi}", + ) # type: ignore[return-value] + + results = expander.expand_entry_references(store, "seed2024") + + assert [result.discovered_citation_key for result in results] == ["doi10111712512613"] + discovered = store.get_entry("doi10111712512613") + assert discovered is not None + assert discovered["entry_type"] == "inproceedings" + assert discovered["title"] == "Genetic Programming IV: Routine Human-Competitive Machine Intelligence" + assert discovered["booktitle"] == "Genetic and Evolutionary Computation Conference" + finally: + store.close() + + +def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text(): + entry = _crossref_reference_to_entry( + { + "DOI": "10.1117/12.512613", + "unstructured": "Proceedings of the Artificial Life Workshop", + "year": "2005", + }, + "seed2024", + 1, + ) + + assert entry.entry_type == "inproceedings"