Improve discovered reference typing
This commit is contained in:
parent
0144bd9ef4
commit
753b8a2ccf
|
|
@ -5,7 +5,7 @@ from dataclasses import dataclass
|
|||
from urllib.parse import quote, urlencode
|
||||
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .resolve import MetadataResolver
|
||||
from .resolve import MetadataResolver, merge_entries
|
||||
from .storage import BibliographyStore
|
||||
|
||||
|
||||
|
|
@ -55,7 +55,7 @@ class CrossrefExpander:
|
|||
references = payload.get("message", {}).get("reference", [])
|
||||
results: list[ExpansionResult] = []
|
||||
for index, reference in enumerate(references, start=1):
|
||||
discovered = _crossref_reference_to_entry(reference, citation_key, index)
|
||||
discovered = self._reference_to_entry(reference, citation_key, index)
|
||||
created = False
|
||||
if store.get_entry(discovered.citation_key) is None:
|
||||
store.upsert_entry(
|
||||
|
|
@ -87,6 +87,26 @@ class CrossrefExpander:
|
|||
)
|
||||
return results
|
||||
|
||||
def _reference_to_entry(self, reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
    """Build the entry for one Crossref reference, preferring resolved DOI metadata.

    A fallback entry is always derived from the raw reference via
    ``_crossref_reference_to_entry``. When the reference carries a DOI, the
    resolver is asked for richer metadata (``resolve_doi`` first, then
    ``resolve_datacite_doi``); if either succeeds, the resolved entry is
    merged with the fallback.

    Args:
        reference: One item of the Crossref ``reference`` list.
        source_citation_key: Citation key of the entry being expanded.
        ordinal: 1-based position of the reference in the list.

    Returns:
        The fallback entry when there is no DOI or nothing resolves;
        otherwise a new entry that keeps the fallback's citation key, uses
        the resolved entry type when it is non-empty, and carries the merged
        fields.
    """
    fallback = _crossref_reference_to_entry(reference, source_citation_key, ordinal)
    doi = reference.get("DOI") or ""
    if not doi:
        return fallback

    resolution = self.resolver.resolve_doi(doi)
    if resolution is None:
        resolution = self.resolver.resolve_datacite_doi(doi)
    if resolution is None:
        return fallback

    merged = merge_entries(resolution.entry, fallback)
    # Work on a copy: merge_entries is defined elsewhere and may hand back an
    # object that shares its field dict with the resolver's entry, which
    # should not be modified as a side effect of expansion.
    fields = dict(merged.fields)
    # The fallback's note wins over any resolved note. Only copy it when the
    # fallback actually has one, instead of raising KeyError.
    note = fallback.fields.get("note")
    if note is not None:
        fields["note"] = note
    return BibEntry(
        entry_type=resolution.entry.entry_type or merged.entry_type,
        # Keep the fallback key so it stays the key derived from the raw reference.
        citation_key=fallback.citation_key,
        fields=fields,
    )
|
||||
|
||||
|
||||
class OpenAlexExpander:
|
||||
def __init__(self, resolver: MetadataResolver | None = None) -> None:
|
||||
|
|
@ -312,7 +332,7 @@ class TopicExpander:
|
|||
references = payload.get("message", {}).get("reference", [])[:limit]
|
||||
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
|
||||
for index, reference in enumerate(references, start=1):
|
||||
discovered = _crossref_reference_to_entry(reference, citation_key, index)
|
||||
discovered = self.crossref_expander._reference_to_entry(reference, citation_key, index)
|
||||
rows.append(
|
||||
(
|
||||
ExpansionResult(
|
||||
|
|
@ -391,7 +411,7 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi
|
|||
fields["journal"] = _normalize_text(journal_title)
|
||||
|
||||
citation_key = _reference_citation_key(reference, title, year, ordinal)
|
||||
entry_type = "article" if journal_title else "misc"
|
||||
entry_type = _crossref_reference_entry_type(reference, title, journal_title)
|
||||
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
||||
|
||||
|
||||
|
|
@ -411,6 +431,24 @@ def _normalize_text(value: str) -> str:
|
|||
return " ".join(value.split())
|
||||
|
||||
|
||||
def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str:
|
||||
if journal_title:
|
||||
return "article"
|
||||
combined = " ".join(
|
||||
str(reference.get(field) or "")
|
||||
for field in ("article-title", "volume-title", "journal-title", "series-title", "unstructured")
|
||||
).casefold()
|
||||
if any(token in combined for token in ("conference", "proceedings", "symposium", "workshop")):
|
||||
return "inproceedings"
|
||||
if any(token in combined for token in ("thesis", "dissertation")):
|
||||
return "phdthesis"
|
||||
if reference.get("volume-title"):
|
||||
return "incollection"
|
||||
if any(token in combined for token in ("press", "publisher", "edition")):
|
||||
return "book"
|
||||
return "misc"
|
||||
|
||||
|
||||
def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float:
|
||||
if entry is None:
|
||||
return 0.0
|
||||
|
|
|
|||
|
|
@ -1,4 +1,6 @@
|
|||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry
|
||||
from citegeist.resolve import Resolution
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
|
|
@ -67,3 +69,72 @@ def test_crossref_expander_creates_draft_nodes_and_relations():
|
|||
assert relation_provenance[0]["source_type"] == "graph_expand"
|
||||
finally:
|
||||
store.close()
|
||||
|
||||
|
||||
def test_crossref_expander_prefers_resolved_doi_metadata_for_discovered_refs():
    """Resolved DOI metadata should override the type and fields guessed from the raw reference."""
    reference_doi = "10.1117/12.512613"
    resolved_title = "Genetic Programming IV: Routine Human-Competitive Machine Intelligence"
    resolved_booktitle = "Genetic and Evolutionary Computation Conference"

    def fake_get_json(_url):
        # Stand-in for the Crossref works response: one DOI-bearing reference.
        return {
            "message": {
                "reference": [
                    {
                        "DOI": reference_doi,
                        "unstructured": "J. R. Koza ... Genetic Programming IV ... Springer ... 2005.",
                        "year": "2005",
                    }
                ]
            }
        }

    def fake_resolve_doi(doi):
        # Stand-in for the resolver: always returns an inproceedings entry for the DOI.
        resolved_entry = BibEntry(
            entry_type="inproceedings",
            citation_key="koza2005genetic",
            fields={
                "author": "Koza, J. R. and Keane, M. A.",
                "title": resolved_title,
                "year": "2005",
                "booktitle": resolved_booktitle,
                "doi": doi,
                "url": f"https://doi.org/{doi}",
            },
        )
        return Resolution(
            entry=resolved_entry,
            source_type="resolver",
            source_label=f"crossref:doi:{doi}",
        )

    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/seed-doi}
            }
            """
        )

        expander = CrossrefExpander()
        expander.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]
        expander.resolver.resolve_doi = fake_resolve_doi  # type: ignore[method-assign]

        results = expander.expand_entry_references(store, "seed2024")

        discovered_keys = [result.discovered_citation_key for result in results]
        assert discovered_keys == ["doi10111712512613"]

        discovered = store.get_entry("doi10111712512613")
        assert discovered is not None
        assert discovered["entry_type"] == "inproceedings"
        assert discovered["title"] == resolved_title
        assert discovered["booktitle"] == resolved_booktitle
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_crossref_reference_to_entry_infers_non_misc_for_proceedings_like_text():
    """A journal-less reference whose free text mentions proceedings is typed ``inproceedings``."""
    raw_reference = {
        "DOI": "10.1117/12.512613",
        "unstructured": "Proceedings of the Artificial Life Workshop",
        "year": "2005",
    }

    entry = _crossref_reference_to_entry(raw_reference, "seed2024", 1)

    assert entry.entry_type == "inproceedings"
|
||||
|
|
|
|||
Loading…
Reference in New Issue