Improve discovered reference typing

This commit is contained in:
welsberr 2026-03-20 15:18:07 -04:00
parent 0144bd9ef4
commit 753b8a2ccf
2 changed files with 113 additions and 4 deletions

View File

@ -5,7 +5,7 @@ from dataclasses import dataclass
from urllib.parse import quote, urlencode from urllib.parse import quote, urlencode
from .bibtex import BibEntry, parse_bibtex from .bibtex import BibEntry, parse_bibtex
from .resolve import MetadataResolver from .resolve import MetadataResolver, merge_entries
from .storage import BibliographyStore from .storage import BibliographyStore
@ -55,7 +55,7 @@ class CrossrefExpander:
references = payload.get("message", {}).get("reference", []) references = payload.get("message", {}).get("reference", [])
results: list[ExpansionResult] = [] results: list[ExpansionResult] = []
for index, reference in enumerate(references, start=1): for index, reference in enumerate(references, start=1):
discovered = _crossref_reference_to_entry(reference, citation_key, index) discovered = self._reference_to_entry(reference, citation_key, index)
created = False created = False
if store.get_entry(discovered.citation_key) is None: if store.get_entry(discovered.citation_key) is None:
store.upsert_entry( store.upsert_entry(
@ -87,6 +87,26 @@ class CrossrefExpander:
) )
return results return results
def _reference_to_entry(self, reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
    """Build the entry for one Crossref reference, enriched through its DOI when possible.

    A fallback entry is always derived from the raw reference via
    ``_crossref_reference_to_entry``. When the reference carries a DOI, the
    resolver is asked for authoritative metadata (``resolve_doi`` first, then
    ``resolve_datacite_doi``); if either succeeds, the resolved entry is
    combined with the fallback through ``merge_entries``.

    Args:
        reference: One item of the Crossref ``reference`` list.
        source_citation_key: Citation key of the entry whose references are
            being expanded; forwarded to ``_crossref_reference_to_entry``.
        ordinal: 1-based position of the reference, forwarded likewise.

    Returns:
        The fallback entry when there is no DOI or nothing resolves;
        otherwise a new ``BibEntry`` that keeps the fallback citation key,
        prefers the resolved entry type, and carries the merged fields.
    """
    fallback = _crossref_reference_to_entry(reference, source_citation_key, ordinal)
    doi = reference.get("DOI") or ""
    if not doi:
        return fallback
    resolution = self.resolver.resolve_doi(doi)
    if resolution is None:
        resolution = self.resolver.resolve_datacite_doi(doi)
    if resolution is None:
        return fallback
    merged = merge_entries(resolution.entry, fallback)
    # The fallback's note wins over whatever the merge produced. Guarded so a
    # fallback without a note cannot raise KeyError and abort the caller's loop.
    fallback_note = fallback.fields.get("note")
    if fallback_note is not None:
        merged.fields["note"] = fallback_note
    return BibEntry(
        # Prefer the resolver's typing; fall back to the merged type if it is empty.
        entry_type=resolution.entry.entry_type or merged.entry_type,
        # Keep the fallback key so the stored key stays the one derived from the reference.
        citation_key=fallback.citation_key,
        fields=merged.fields,
    )
class OpenAlexExpander: class OpenAlexExpander:
def __init__(self, resolver: MetadataResolver | None = None) -> None: def __init__(self, resolver: MetadataResolver | None = None) -> None:
@ -312,7 +332,7 @@ class TopicExpander:
references = payload.get("message", {}).get("reference", [])[:limit] references = payload.get("message", {}).get("reference", [])[:limit]
rows: list[tuple[ExpansionResult, dict[str, object]]] = [] rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for index, reference in enumerate(references, start=1): for index, reference in enumerate(references, start=1):
discovered = _crossref_reference_to_entry(reference, citation_key, index) discovered = self.crossref_expander._reference_to_entry(reference, citation_key, index)
rows.append( rows.append(
( (
ExpansionResult( ExpansionResult(
@ -391,7 +411,7 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi
fields["journal"] = _normalize_text(journal_title) fields["journal"] = _normalize_text(journal_title)
citation_key = _reference_citation_key(reference, title, year, ordinal) citation_key = _reference_citation_key(reference, title, year, ordinal)
entry_type = "article" if journal_title else "misc" entry_type = _crossref_reference_entry_type(reference, title, journal_title)
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
@ -411,6 +431,24 @@ def _normalize_text(value: str) -> str:
return " ".join(value.split()) return " ".join(value.split())
def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str:
if journal_title:
return "article"
combined = " ".join(
str(reference.get(field) or "")
for field in ("article-title", "volume-title", "journal-title", "series-title", "unstructured")
).casefold()
if any(token in combined for token in ("conference", "proceedings", "symposium", "workshop")):
return "inproceedings"
if any(token in combined for token in ("thesis", "dissertation")):
return "phdthesis"
if reference.get("volume-title"):
return "incollection"
if any(token in combined for token in ("press", "publisher", "edition")):
return "book"
return "misc"
def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float: def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float:
if entry is None: if entry is None:
return 0.0 return 0.0

View File

@ -1,4 +1,6 @@
from citegeist.bibtex import BibEntry
from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry
from citegeist.resolve import Resolution
from citegeist.storage import BibliographyStore from citegeist.storage import BibliographyStore
@ -67,3 +69,72 @@ def test_crossref_expander_creates_draft_nodes_and_relations():
assert relation_provenance[0]["source_type"] == "graph_expand" assert relation_provenance[0]["source_type"] == "graph_expand"
finally: finally:
store.close() store.close()
def test_crossref_expander_prefers_resolved_doi_metadata_for_discovered_refs():
    """A discovered reference with a resolvable DOI is stored with the resolved metadata."""

    def fake_get_json(_url):
        # Stands in for the Crossref lookup: one sparse reference that carries a DOI.
        return {
            "message": {
                "reference": [
                    {
                        "DOI": "10.1117/12.512613",
                        "unstructured": "J. R. Koza ... Genetic Programming IV ... Springer ... 2005.",
                        "year": "2005",
                    }
                ]
            }
        }

    def fake_resolve_doi(doi):
        # Stands in for DOI resolution: returns richer, typed metadata for the DOI.
        resolved_entry = BibEntry(
            entry_type="inproceedings",
            citation_key="koza2005genetic",
            fields={
                "author": "Koza, J. R. and Keane, M. A.",
                "title": "Genetic Programming IV: Routine Human-Competitive Machine Intelligence",
                "year": "2005",
                "booktitle": "Genetic and Evolutionary Computation Conference",
                "doi": doi,
                "url": f"https://doi.org/{doi}",
            },
        )
        return Resolution(
            entry=resolved_entry,
            source_type="resolver",
            source_label=f"crossref:doi:{doi}",
        )

    store = BibliographyStore()
    try:
        # The literal is kept flush-left so its text matches the source verbatim.
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
        )
        expander = CrossrefExpander()
        expander.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]
        expander.resolver.resolve_doi = fake_resolve_doi  # type: ignore[method-assign]

        results = expander.expand_entry_references(store, "seed2024")

        discovered_keys = [result.discovered_citation_key for result in results]
        assert discovered_keys == ["doi10111712512613"]
        discovered = store.get_entry("doi10111712512613")
        assert discovered is not None
        assert discovered["entry_type"] == "inproceedings"
        assert discovered["title"] == "Genetic Programming IV: Routine Human-Competitive Machine Intelligence"
        assert discovered["booktitle"] == "Genetic and Evolutionary Computation Conference"
    finally:
        store.close()
def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text():
    """Free text that names proceedings yields an ``inproceedings`` entry."""
    proceedings_reference = {
        "DOI": "10.1117/12.512613",
        "unstructured": "Proceedings of the Artificial Life Workshop",
        "year": "2005",
    }
    inferred = _crossref_reference_to_entry(proceedings_reference, "seed2024", 1)
    assert inferred.entry_type == "inproceedings"