# CiteGeist/tests/test_resolve.py
#
# 653 lines
# 22 KiB
# Python

from xml.etree import ElementTree as ET
import urllib.error
from citegeist.bibtex import BibEntry, parse_bibtex, render_bibtex
from citegeist.resolve import (
MetadataResolver,
_arxiv_atom_entry_to_bib,
_crossref_message_to_entry,
_datacite_work_to_entry,
_openalex_work_to_entry,
_pubmed_article_to_entry,
merge_entries_with_conflicts,
merge_entries,
)
def test_crossref_message_to_entry_maps_basic_fields():
    """A Crossref journal-article message yields an article entry with author, journal, and year."""
    message = {
        "type": "journal-article",
        "title": ["Graph-first bibliography augmentation"],
        "DOI": "10.1000/example-doi",
        "URL": "https://doi.org/10.1000/example-doi",
        "container-title": ["Journal of Graph Studies"],
        "author": [{"family": "Smith", "given": "Jane"}],
        "issued": {"date-parts": [[2024, 5, 1]]},
    }

    converted = _crossref_message_to_entry(message)

    assert converted.entry_type == "article"
    assert converted.fields["author"] == "Smith, Jane"
    assert converted.fields["journal"] == "Journal of Graph Studies"
    assert converted.fields["year"] == "2024"
def test_crossref_message_to_entry_handles_missing_author_without_crashing():
    """An author record with empty name parts still converts; the key is "crossref2003avida"."""
    message = {
        "type": "journal-article",
        "title": ["Avida and digital evolution"],
        "container-title": ["Artificial Life"],
        "issued": {"date-parts": [[2003, 1, 1]]},
        # Both name parts are blank on purpose: this is the "missing author" case.
        "author": [{"family": "", "given": ""}],
    }

    converted = _crossref_message_to_entry(message)

    assert converted.citation_key == "crossref2003avida"
    assert converted.fields["title"] == "Avida and digital evolution"
    assert converted.fields["year"] == "2003"
def test_crossref_message_to_entry_strips_markup_from_title_and_abstract():
    """Inline tags in title, journal, and abstract are removed from the converted fields."""
    message = {
        "type": "journal-article",
        "title": [
            "The Fine Structure of the Testis of a Lancelet (=Amphioxus), <i>Branchiostoma floridae</i>"
        ],
        "container-title": ["Acta <i>Zoologica</i>"],
        "abstract": "<jats:title>Abstract</jats:title><jats:p>Tagged abstract text.</jats:p>",
        "author": [{"family": "Holland", "given": "Nicholas D."}],
        "issued": {"date-parts": [[1989]]},
    }

    converted = _crossref_message_to_entry(message)

    expected_title = (
        "The Fine Structure of the Testis of a Lancelet (=Amphioxus), Branchiostoma floridae"
    )
    assert converted.fields["title"] == expected_title
    assert converted.fields["journal"] == "Acta Zoologica"
    assert converted.fields["abstract"] == "Tagged abstract text."
    # Spell out that the space between "), " and the stripped <i> content survives.
    assert "), Branchiostoma" in converted.fields["title"]
def test_arxiv_atom_entry_to_bib_maps_basic_fields():
    """An arXiv Atom <entry> converts to an entry carrying author, arXiv id, and DOI."""
    # The payload stays flush-left so the literal's text is exactly what gets parsed.
    atom_payload = """
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
<title>Semantic search for research corpora</title>
<summary>Dense retrieval improves recall.</summary>
<published>2023-01-15T00:00:00Z</published>
<author><name>Miller, Sam</name></author>
<arxiv:doi>10.1000/arxiv-example</arxiv:doi>
</entry>
"""
    atom_entry = ET.fromstring(atom_payload)

    converted = _arxiv_atom_entry_to_bib(atom_entry, "2301.12345")

    assert converted.fields["author"] == "Miller, Sam"
    assert converted.fields["arxiv"] == "2301.12345"
    assert converted.fields["doi"] == "10.1000/arxiv-example"
def test_pubmed_article_to_entry_maps_basic_fields():
    """A PubmedArticle element converts to an entry with ids, bibliographic fields, and abstract."""
    # The payload stays flush-left so the literal's text is exactly what gets parsed.
    pubmed_payload = """
<PubmedArticle>
<MedlineCitation>
<PMID>12345678</PMID>
<Article>
<ArticleTitle>PubMed Resolved Work</ArticleTitle>
<Abstract>
<AbstractText Label="Background">Evidence summary.</AbstractText>
<AbstractText>Second paragraph.</AbstractText>
</Abstract>
<Journal>
<JournalIssue>
<PubDate><Year>2021</Year></PubDate>
</JournalIssue>
<Title>Journal of Evidence</Title>
</Journal>
<AuthorList>
<Author><LastName>Smith</LastName><ForeName>Jane</ForeName></Author>
</AuthorList>
<ELocationID EIdType="doi">10.1000/pubmed-example</ELocationID>
</Article>
</MedlineCitation>
<PubmedData>
<ArticleIdList>
<ArticleId IdType="pubmed">12345678</ArticleId>
<ArticleId IdType="pmc">PMC123456</ArticleId>
</ArticleIdList>
</PubmedData>
</PubmedArticle>
"""
    article = ET.fromstring(pubmed_payload)

    converted = _pubmed_article_to_entry(article)

    assert converted.citation_key == "doi101000pubmedexample"
    assert converted.fields["title"] == "PubMed Resolved Work"
    assert converted.fields["author"] == "Smith, Jane"
    assert converted.fields["journal"] == "Journal of Evidence"
    assert converted.fields["year"] == "2021"
    assert converted.fields["pmid"] == "12345678"
    assert converted.fields["pmcid"] == "PMC123456"
    # The labelled section is expected as "Label: text", joined to the next by a space.
    assert converted.fields["abstract"] == "Background: Evidence summary. Second paragraph."
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
    """merge_entries keeps the base title and adds the journal only the resolved entry has."""
    existing = BibEntry(
        entry_type="article",
        citation_key="smith2024graphs",
        fields={"title": "Graph-first bibliography augmentation", "doi": "10.1000/example-doi"},
    )
    incoming = BibEntry(
        entry_type="article",
        citation_key="otherkey",
        fields={"title": "Different title", "journal": "Journal of Graph Studies"},
    )

    combined = merge_entries(existing, incoming)

    assert combined.fields["title"] == "Graph-first bibliography augmentation"
    assert combined.fields["journal"] == "Journal of Graph Studies"
def test_merge_entries_with_conflicts_records_disagreements():
    """A differing title is reported as a conflict; an equal journal is not; year is added."""
    existing = BibEntry(
        entry_type="article",
        citation_key="smith2024graphs",
        fields={"title": "Existing Title", "journal": "Current Journal"},
    )
    incoming = BibEntry(
        entry_type="article",
        citation_key="resolved",
        fields={"title": "Resolved Title", "journal": "Current Journal", "year": "2024"},
    )

    combined, reported = merge_entries_with_conflicts(existing, incoming)

    assert combined.fields["title"] == "Existing Title"
    assert combined.fields["year"] == "2024"
    expected_conflict = {
        "field_name": "title",
        "current_value": "Existing Title",
        "proposed_value": "Resolved Title",
    }
    assert reported == [expected_conflict]
def test_merge_entries_replaces_placeholder_titles_without_conflict():
    """The title "Referenced work 6" is replaced by the resolved title with no conflict reported."""
    stub = BibEntry(
        entry_type="misc",
        citation_key="stubdoi",
        fields={"title": "Referenced work 6", "doi": "10.1200/JCO.2002.04.117"},
    )
    incoming = BibEntry(
        entry_type="article",
        citation_key="resolved",
        fields={"title": "Resolved Work", "journal": "Journal of Clinical Oncology"},
    )

    combined, reported = merge_entries_with_conflicts(stub, incoming)

    assert combined.fields["title"] == "Resolved Work"
    assert combined.fields["journal"] == "Journal of Clinical Oncology"
    assert reported == []
def test_merge_entries_upgrades_misc_type_when_resolver_has_better_type():
    """A misc base entry takes the resolved "inproceedings" type; the title clash is still reported."""
    existing = BibEntry(
        entry_type="misc",
        citation_key="miscwithtitle",
        fields={"title": "Avida Conference Record", "doi": "10.1117/12.512613"},
    )
    incoming = BibEntry(
        entry_type="inproceedings",
        citation_key="resolved",
        fields={"title": "Genetic Programming IV", "booktitle": "GECCO"},
    )

    combined, reported = merge_entries_with_conflicts(existing, incoming)

    assert combined.entry_type == "inproceedings"
    expected_conflict = {
        "field_name": "title",
        "current_value": "Avida Conference Record",
        "proposed_value": "Genetic Programming IV",
    }
    assert reported == [expected_conflict]
def test_resolver_tries_doi_before_dblp():
    """With DOI and DBLP ids present, lookups run in the order doi, datacite, dblp."""
    resolver = MetadataResolver()
    calls: list[tuple[str, str]] = []

    def recording_stub(label: str):
        """Build a lookup stub that logs (label, value) and reports no match."""

        def stub(value: str):
            calls.append((label, value))
            return None

        return stub

    resolver.resolve_doi = recording_stub("doi")  # type: ignore[method-assign]
    resolver.resolve_datacite_doi = recording_stub("datacite")  # type: ignore[method-assign]
    resolver.resolve_dblp = recording_stub("dblp")  # type: ignore[method-assign]

    target = BibEntry(
        entry_type="article",
        citation_key="smith2024graphs",
        fields={"doi": "10.1000/example-doi", "dblp": "conf/test/Smith24"},
    )
    resolver.resolve_entry(target)

    assert calls == [
        ("doi", "10.1000/example-doi"),
        ("datacite", "10.1000/example-doi"),
        ("dblp", "conf/test/Smith24"),
    ]
def test_resolver_tries_pmid_before_dblp():
    """With PMID and DBLP ids present, the PMID lookup is attempted before DBLP."""
    resolver = MetadataResolver()
    calls: list[tuple[str, str]] = []

    def recording_stub(label: str):
        """Build a lookup stub that logs (label, value) and reports no match."""

        def stub(value: str):
            calls.append((label, value))
            return None

        return stub

    resolver.resolve_pmid = recording_stub("pmid")  # type: ignore[method-assign]
    resolver.resolve_dblp = recording_stub("dblp")  # type: ignore[method-assign]

    target = BibEntry(
        entry_type="article",
        citation_key="smith2024graphs",
        fields={"pmid": "12345678", "dblp": "conf/test/Smith24"},
    )
    resolver.resolve_entry(target)

    assert calls == [
        ("pmid", "12345678"),
        ("dblp", "conf/test/Smith24"),
    ]
def test_resolver_pubmed_requests_include_ncbi_params():
    """The first PubMed search URL carries the configured api_key, tool, and email parameters."""
    resolver = MetadataResolver(
        ncbi_api_key="key123",
        ncbi_tool="citegeist",
        ncbi_email="dev@example.com",
    )
    requested_urls: list[str] = []

    def capture_url(url: str):
        # Record the URL and answer with an empty id list.
        requested_urls.append(url)
        return {"esearchresult": {"idlist": []}}

    resolver.source_client.get_json = capture_url  # type: ignore[method-assign]

    resolver.search_pubmed("abiogenesis", limit=2)

    assert requested_urls
    first_url = requested_urls[0]
    assert "api_key=key123" in first_url
    assert "tool=citegeist" in first_url
    # The "@" is expected in percent-encoded form.
    assert "email=dev%40example.com" in first_url
def test_openalex_work_to_entry_maps_basic_fields():
    """An OpenAlex work maps to an entry with a DOI-based key, short ids, journal, and abstract."""
    work = {
        "id": "https://openalex.org/W12345",
        "doi": "https://doi.org/10.1000/example-openalex",
        "display_name": "OpenAlex Resolved Work",
        "publication_year": 2022,
        "type": "article",
        "authorships": [{"author": {"display_name": "Jane Smith"}}],
        "primary_location": {"source": {"display_name": "Journal of Open Graphs"}},
        "abstract_inverted_index": {"OpenAlex": [0], "resolved": [1]},
    }

    converted = _openalex_work_to_entry(work)

    assert converted.citation_key == "doi101000exampleopenalex"
    # The URL prefixes on the OpenAlex id and DOI are expected to be dropped.
    assert converted.fields["openalex"] == "W12345"
    assert converted.fields["doi"] == "10.1000/example-openalex"
    assert converted.fields["journal"] == "Journal of Open Graphs"
    assert converted.fields["abstract"] == "OpenAlex resolved"
def test_openalex_work_to_entry_uses_journal_metadata_for_non_article_work_type():
    """A "reference-entry" work whose source type is "journal" becomes an article with a journal."""
    work = {
        "id": "https://openalex.org/W12345",
        "display_name": "OpenAlex Resolved Work",
        "publication_year": 2022,
        "type": "reference-entry",
        "authorships": [{"author": {"display_name": "Jane Smith"}}],
        "primary_location": {"source": {"display_name": "Journal of Open Graphs", "type": "journal"}},
    }

    converted = _openalex_work_to_entry(work)

    assert converted.entry_type == "article"
    assert converted.fields["journal"] == "Journal of Open Graphs"
    assert "booktitle" not in converted.fields
def test_openalex_work_to_entry_normalizes_reversed_initial_author_name():
    """The display name "J., Fogel L." is rewritten as "Fogel, L. J."."""
    work = {
        "id": "https://openalex.org/W12345",
        "display_name": "Evolutionary Programming",
        "publication_year": 1995,
        "type": "book-chapter",
        "authorships": [{"author": {"display_name": "J., Fogel L."}}],
    }

    converted = _openalex_work_to_entry(work)

    assert converted.fields["author"] == "Fogel, L. J."
def test_resolver_can_resolve_openalex_id():
    """resolve_openalex returns a resolution labelled "openalex:id:<id>" from the stubbed JSON."""
    resolver = MetadataResolver()

    def canned_work(_url):
        # A fresh payload per call, whatever URL is requested.
        return {
            "id": "https://openalex.org/W12345",
            "display_name": "OpenAlex Resolved Work",
            "publication_year": 2022,
            "type": "article",
            "authorships": [{"author": {"display_name": "Jane Smith"}}],
        }

    resolver.source_client.get_json = canned_work  # type: ignore[method-assign]

    outcome = resolver.resolve_openalex("W12345")

    assert outcome is not None
    assert outcome.source_label == "openalex:id:W12345"
    assert outcome.entry.fields["openalex"] == "W12345"
def test_resolve_doi_returns_none_on_http_404():
    """resolve_doi returns None when the JSON fetch raises an HTTP 404 error."""
    resolver = MetadataResolver()

    def always_not_found(_url: str):
        raise urllib.error.HTTPError(_url, 404, "Not Found", hdrs=None, fp=None)

    resolver.source_client.get_json = always_not_found  # type: ignore[method-assign]

    outcome = resolver.resolve_doi("10.1000/missing")
    assert outcome is None
def test_search_crossref_returns_empty_on_fetch_error():
    """search_crossref returns an empty list when the JSON fetch raises URLError."""
    resolver = MetadataResolver()

    def always_unreachable(_url: str):
        raise urllib.error.URLError("temporary failure")

    resolver.source_client.get_json = always_unreachable  # type: ignore[method-assign]

    found = resolver.search_crossref("Avida")
    assert found == []
def test_resolver_falls_back_to_openalex_title_search():
    """With DataCite and Crossref searches empty, the OpenAlex title search supplies the match."""
    resolver = MetadataResolver()

    def no_results(title, limit=5):
        return []

    def openalex_hit(title, limit=5):
        # Echo the queried title back so the candidate matches it exactly.
        work = {
            "id": "https://openalex.org/W12345",
            "display_name": title,
            "publication_year": 2022,
            "type": "article",
            "authorships": [{"author": {"display_name": "Jane Smith"}}],
        }
        return [_openalex_work_to_entry(work)]

    resolver.search_datacite = no_results  # type: ignore[method-assign]
    resolver.search_crossref = no_results  # type: ignore[method-assign]
    resolver.search_openalex = openalex_hit  # type: ignore[method-assign]

    query = BibEntry(
        entry_type="article",
        citation_key="smith2022openalex",
        fields={"title": "OpenAlex Resolved Work", "author": "Jane Smith", "year": "2022"},
    )
    outcome = resolver.resolve_entry(query)

    assert outcome is not None
    assert outcome.source_label == "openalex:search:OpenAlex Resolved Work"
    assert outcome.entry.fields["openalex"] == "W12345"
def test_resolver_prefers_exact_crossref_title_match_before_datacite():
    """A Crossref hit echoing the queried title is chosen over an unrelated DataCite dataset."""
    resolver = MetadataResolver()

    def crossref_exact_hit(title, limit=5):
        # Echo the queried title back so this candidate matches it exactly.
        message = {
            "type": "journal-article",
            "title": [title],
            "DOI": "10.1126/science.1090005",
            "container-title": ["Science"],
            "author": [
                {"family": "King", "given": "Mary-Claire"},
                {"family": "Wilson", "given": "A. C."},
            ],
            "issued": {"date-parts": [[1975, 4, 11]]},
        }
        return [_crossref_message_to_entry(message)]

    def datacite_unrelated_hit(title, limit=5):
        # A dataset record with a different title, authors, and year.
        work = {
            "attributes": {
                "doi": "10.5061/dryad.v6wwpzh17",
                "titles": [
                    {
                        "title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column"
                    }
                ],
                "creators": [
                    {"familyName": "Villamil", "givenName": "Catalina I."},
                    {"familyName": "Middleton", "givenName": "Emily R."},
                ],
                "publicationYear": 2024,
                "types": {"resourceTypeGeneral": "Dataset"},
            }
        }
        return [_datacite_work_to_entry(work)]

    resolver.search_crossref = crossref_exact_hit  # type: ignore[method-assign]
    resolver.search_datacite = datacite_unrelated_hit  # type: ignore[method-assign]

    query = BibEntry(
        entry_type="article",
        citation_key="king1975evolution2",
        fields={
            "title": "Evolution at two levels in humans and chimpanzees",
            "author": "King, M. C. and Wilson, A. C.",
            "year": "1975",
        },
    )
    outcome = resolver.resolve_entry(query)

    assert outcome is not None
    assert outcome.source_label == "crossref:search:Evolution at two levels in humans and chimpanzees"
    assert outcome.entry.fields["doi"] == "10.1126/science.1090005"
def test_resolver_rejects_mismatched_title_search_candidates():
    """resolve_entry returns None when the only search candidates carry different titles."""
    resolver = MetadataResolver()

    def no_results(title, limit=5):
        return []

    def datacite_unrelated_hit(title, limit=5):
        work = {
            "attributes": {
                "doi": "10.5061/dryad.v6wwpzh17",
                "titles": [
                    {
                        "title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column"
                    }
                ],
                "creators": [
                    {"familyName": "Villamil", "givenName": "Catalina I."},
                ],
                "publicationYear": 2024,
                "types": {"resourceTypeGeneral": "Dataset"},
            }
        }
        return [_datacite_work_to_entry(work)]

    def openalex_unrelated_hit(title, limit=5):
        # Starts with the same word as the query title but is a different work.
        work = {
            "id": "https://openalex.org/W2033360601",
            "display_name": "Immunological relatedness of glucose 6-phosphate dehydrogenases from vertebrate and invertebrate species.",
            "publication_year": 1978,
            "type": "article",
            "authorships": [
                {"author": {"display_name": "Yoshikazu Sado"}},
                {"author": {"display_name": "Samuel H. Hori"}},
            ],
            "doi": "https://doi.org/10.1266/jjg.53.91",
        }
        return [_openalex_work_to_entry(work)]

    resolver.search_crossref = no_results  # type: ignore[method-assign]
    resolver.search_datacite = datacite_unrelated_hit  # type: ignore[method-assign]
    resolver.search_openalex = openalex_unrelated_hit  # type: ignore[method-assign]

    query = BibEntry(
        entry_type="article",
        citation_key="sarich1967immunological1",
        fields={
            "title": "Immunological Time Scale for Homonid Evolution",
            "author": "Sarich, V. and Wilson, A.",
            "year": "1967",
        },
    )
    outcome = resolver.resolve_entry(query)

    assert outcome is None
def test_datacite_work_to_entry_maps_basic_fields():
    """A DataCite "Dissertation" record maps to a phdthesis entry with its core fields."""
    attributes = {
        "doi": "10.1000/datacite-example",
        "titles": [{"title": "Repository Dissertation Record"}],
        "creators": [{"familyName": "Doe", "givenName": "Jane"}],
        "publicationYear": 2021,
        "publisher": "Example University",
        "url": "https://example.edu/record/123",
        "types": {"resourceTypeGeneral": "Dissertation"},
        "descriptions": [
            {
                "descriptionType": "Abstract",
                "description": "An abstract from DataCite.",
            }
        ],
    }

    converted = _datacite_work_to_entry({"attributes": attributes})

    assert converted.entry_type == "phdthesis"
    assert converted.fields["doi"] == "10.1000/datacite-example"
    assert converted.fields["author"] == "Doe, Jane"
    assert converted.fields["publisher"] == "Example University"
    assert converted.fields["abstract"] == "An abstract from DataCite."
def test_resolver_can_resolve_datacite_doi():
    """resolve_datacite_doi returns a resolution labelled "datacite:doi:<doi>" from stubbed JSON."""
    resolver = MetadataResolver()

    def canned_response(_url):
        # A fresh payload per call, whatever URL is requested.
        return {
            "data": {
                "attributes": {
                    "doi": "10.1000/datacite-example",
                    "titles": [{"title": "Repository Dissertation Record"}],
                    "creators": [{"familyName": "Doe", "givenName": "Jane"}],
                    "publicationYear": 2021,
                    "types": {"resourceTypeGeneral": "Dissertation"},
                }
            }
        }

    resolver.source_client.get_json = canned_response  # type: ignore[method-assign]

    outcome = resolver.resolve_datacite_doi("10.1000/datacite-example")

    assert outcome is not None
    assert outcome.source_label == "datacite:doi:10.1000/datacite-example"
    assert outcome.entry.entry_type == "phdthesis"
def test_resolver_can_fall_back_to_datacite_title_search():
    """With Crossref and OpenAlex searches empty, the DataCite title search supplies the match."""
    resolver = MetadataResolver()

    def no_results(title, limit=5):
        return []

    def datacite_hit(title, limit=5):
        # Echo the queried title back so the candidate matches it exactly.
        work = {
            "attributes": {
                "doi": "10.1000/datacite-example",
                "titles": [{"title": title}],
                "creators": [{"familyName": "Doe", "givenName": "Jane"}],
                "publicationYear": 2021,
                "types": {"resourceTypeGeneral": "Dissertation"},
            }
        }
        return [_datacite_work_to_entry(work)]

    resolver.search_crossref = no_results  # type: ignore[method-assign]
    resolver.search_datacite = datacite_hit  # type: ignore[method-assign]
    resolver.search_openalex = no_results  # type: ignore[method-assign]

    query = BibEntry(
        entry_type="misc",
        citation_key="draft1",
        fields={"title": "Repository Dissertation Record", "author": "Doe, Jane", "year": "2021"},
    )
    outcome = resolver.resolve_entry(query)

    assert outcome is not None
    assert outcome.source_label == "datacite:search:Repository Dissertation Record"
    assert outcome.entry.fields["doi"] == "10.1000/datacite-example"
def test_render_bibtex_tolerates_unmatched_braces_in_field_values():
    """Unbalanced braces in field values render as parentheses; balanced pairs are kept."""
    broken = BibEntry(
        entry_type="misc",
        citation_key="broken2026",
        fields={
            "author": "Broken, Example",
            "title": "Unmatched { braces } example } tail",
            "year": "2026",
            "note": "Open { brace only",
        },
    )

    output = render_bibtex([broken])

    assert "@misc{broken2026," in output
    # The stray closing brace becomes ")" while the balanced pair is untouched.
    assert "Unmatched { braces } example ) tail" in output
    # The stray opening brace becomes "(".
    assert "Open ( brace only" in output
def test_parse_and_render_do_not_double_escape_simple_bibtex_specials():
    """Escaped &, _, and % are unescaped on parse and escaped exactly once on render."""
    # The BibTeX text stays flush-left so the literal's content is exactly what gets parsed.
    source = """
@misc{escaped2026,
title = "A \\& B",
note = "discovered\\_from = {doi10100718462821441}; confidence = 100\\%"
}
"""
    entries = parse_bibtex(source)
    first = entries[0]

    assert first.fields["title"] == "A & B"
    assert first.fields["note"] == "discovered_from = {doi10100718462821441}; confidence = 100%"

    output = render_bibtex([first])

    assert 'title = "A \\& B"' in output
    assert 'note = "discovered\\_from = {doi10100718462821441}; confidence = 100\\%"' in output
    # A doubled backslash would mean the escape was applied twice.
    assert 'discovered\\\\_from' not in output