# EcoSpecies-Atlas/apps/api/tests/test_citation_enrichment.py
#
# 528 lines
# 23 KiB
# Python

from __future__ import annotations
import unittest
from unittest.mock import patch
from ecospecies_api.citation_enrichment import (
_crossref_message_to_entry,
_datacite_item_to_entry,
_openalex_work_to_entry,
_render_normalized_text,
apply_citation_candidate_selection,
discover_citation_candidates,
enrich_citation_payload,
LocalBibEntry,
LocalMetadataResolver,
LocalResolution,
)
from ecospecies_api.citegeist_bridge import extract_draft_citation, render_single_bibtex
class CitationEnrichmentTests(unittest.TestCase):
    """Tests for citation field mapping, draft extraction, enrichment and candidate discovery.

    Tests that need a resolver or one of the citegeist loaders substitute
    small in-test stubs for them.
    """

    def test_render_normalized_text_includes_volume_number_and_pages(self) -> None:
        # Volume, issue number, page range and DOI must all appear in the rendered line.
        article_fields = {
            "author": "Daniell, W.C.",
            "year": "1872",
            "title": "Letters referring to experiments of W.C. Daniell",
            "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
            "volume": "2",
            "number": "4",
            "pages": "387-390",
            "doi": "10.1000/example",
        }
        expected_text = "Daniell, W.C. (1872). Letters referring to experiments of W.C. Daniell. Comm. Rept. U.S. Comm. Fish & Fish., 2(4): 387-390. DOI:10.1000/example"

        self.assertEqual(_render_normalized_text("article", article_fields), expected_text)

    def test_crossref_mapping_keeps_volume_issue_and_pages(self) -> None:
        # Crossref "issue" and "page" are expected under the keys "number" and "pages".
        crossref_message = {
            "type": "journal-article",
            "title": ["Example Work"],
            "issued": {"date-parts": [[1872]]},
            "author": [{"family": "Daniell", "given": "W.C."}],
            "container-title": ["Comm. Rept. U.S. Comm. Fish & Fish."],
            "DOI": "10.1000/example",
            "URL": "https://doi.org/10.1000/example",
            "volume": "2",
            "issue": "4",
            "page": "387-390",
        }

        mapped = _crossref_message_to_entry(crossref_message)

        for field_name, expected in (("volume", "2"), ("number", "4"), ("pages", "387-390")):
            self.assertEqual(mapped.fields[field_name], expected)

    def test_openalex_mapping_keeps_biblio_fields(self) -> None:
        # The "biblio" section supplies volume/number/pages; the inverted
        # abstract index is expected back as plain running text.
        openalex_work = {
            "id": "https://openalex.org/W12345",
            "display_name": "OpenAlex Discovered Work",
            "publication_year": 2022,
            "type": "article",
            "doi": "https://doi.org/10.1000/example-openalex",
            "authorships": [{"author": {"display_name": "J S, Smith"}}],
            "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
            "biblio": {"volume": "12", "issue": "3", "first_page": "101", "last_page": "118"},
            "abstract_inverted_index": {"Graphs": [0], "support": [1], "learning": [2]},
        }

        mapped = _openalex_work_to_entry(openalex_work)

        expected_fields = (
            ("author", "Smith, J. S."),
            ("volume", "12"),
            ("number", "3"),
            ("pages", "101-118"),
            ("abstract", "Graphs support learning"),
        )
        for field_name, expected in expected_fields:
            self.assertEqual(mapped.fields[field_name], expected)

    def test_openalex_mapping_handles_null_source(self) -> None:
        # A null "source" must not produce a journal field, and the
        # remaining bibliographic fields must still be mapped.
        openalex_work = {
            "id": "https://openalex.org/W54321",
            "display_name": "OpenAlex Work Without Source",
            "publication_year": 2021,
            "type": "article",
            "doi": "https://doi.org/10.1000/example-null-source",
            "authorships": [{"author": {"display_name": "Jane Smith"}}],
            "primary_location": {"source": None},
            "biblio": {"volume": "5", "issue": "1", "first_page": "10", "last_page": "20"},
        }

        mapped = _openalex_work_to_entry(openalex_work)

        self.assertEqual(mapped.fields["title"], "OpenAlex Work Without Source")
        self.assertNotIn("journal", mapped.fields)
        for field_name, expected in (("volume", "5"), ("number", "1"), ("pages", "10-20")):
            self.assertEqual(mapped.fields[field_name], expected)

    def test_datacite_mapping_keeps_container_and_pages(self) -> None:
        # "container" is expected as the journal, and the leading
        # "Abstract:" label is expected to be absent from the abstract.
        datacite_item = {
            "attributes": {
                "titles": [{"title": "DataCite Work"}],
                "creators": [{"name": "J R, Rivera"}],
                "publicationYear": "2021",
                "doi": "10.1000/datacite-work",
                "url": "https://doi.org/10.1000/datacite-work",
                "container": "Journal of Metadata",
                "volume": "7",
                "issue": "2",
                "firstPage": "44",
                "lastPage": "59",
                "descriptions": [
                    {"descriptionType": "Abstract", "description": "Abstract: Metadata makes reuse easier."}
                ],
            }
        }

        mapped = _datacite_item_to_entry(datacite_item)

        expected_fields = (
            ("author", "Rivera, J. R."),
            ("journal", "Journal of Metadata"),
            ("volume", "7"),
            ("number", "2"),
            ("pages", "44-59"),
            ("abstract", "Metadata makes reuse easier."),
        )
        for field_name, expected in expected_fields:
            self.assertEqual(mapped.fields[field_name], expected)

    def test_render_single_bibtex_preserves_balanced_braces_and_repairs_unmatched_ones(self) -> None:
        # The balanced "{Gamma}" must survive untouched; the stray closing
        # brace in the note is expected to come back as ")".
        bibtex_fields = {
            "title": "Alpha_beta {Gamma}",
            "note": "raw_reference = {Alpha } beta}",
        }

        bibtex = render_single_bibtex("misc", "example", bibtex_fields)

        expected_snippets = (
            "title = {Alpha_beta {Gamma}}",
            "note = {raw_reference = {Alpha } beta)}",
        )
        for snippet in expected_snippets:
            self.assertIn(snippet, bibtex)

    def test_extract_draft_citation_repairs_report_style_reference_fields(self) -> None:
        # The stubbed extractor returns the whole reference as the title;
        # the draft is expected to carry separate author/title/journal/
        # volume/pages fields and a regenerated citation key.
        raw_reference = "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390."

        class StubEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": raw_reference,
                "year": "1872",
                "note": "extracted_reference = {true}",
            }

        def fake_extract(text):
            return [StubEntry()]

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=fake_extract,
        ):
            citation = extract_draft_citation(raw_reference, legacy_reference_number="160")

        self.assertIsNotNone(citation)
        assert citation is not None
        expected_fields = (
            ("author", "Daniell, W.C"),
            (
                "title",
                "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
            ),
            ("journal", "Comm. Rept. U.S. Comm. Fish & Fish"),
            ("volume", "2"),
            ("pages", "387-390"),
        )
        for field_name, expected in expected_fields:
            self.assertEqual(citation.fields[field_name], expected)
        self.assertEqual(citation.citation_key, "daniell1872lettersreferringexperiments")

    def test_extract_draft_citation_does_not_split_title_on_report_word(self) -> None:
        # "report" inside the title must stay in the title; the technical
        # memo is expected under "howpublished", with no "journal" field.
        raw_reference = "Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200."

        class StubEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": raw_reference,
                "year": "1999",
                "note": "extracted_reference = {true}",
            }

        def fake_extract(text):
            return [StubEntry()]

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=fake_extract,
        ):
            citation = extract_draft_citation(raw_reference, legacy_reference_number="42")

        self.assertIsNotNone(citation)
        assert citation is not None
        expected_fields = (
            ("author", "Smith, J"),
            ("title", "Habitat report synthesis for Alabama shad"),
            ("howpublished", "NOAA Tech. Memo. NMFS-SEFSC-200"),
        )
        for field_name, expected in expected_fields:
            self.assertEqual(citation.fields[field_name], expected)
        self.assertNotIn("journal", citation.fields)

    def test_extract_draft_citation_repairs_proc_abbreviation_venue(self) -> None:
        # A "Proc. ..." abbreviated venue is expected as the journal, with
        # volume and pages split off.
        raw_reference = "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134."

        class StubEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": raw_reference,
                "year": "1954",
                "note": "extracted_reference = {true}",
            }

        def fake_extract(text):
            return [StubEntry()]

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=fake_extract,
        ):
            citation = extract_draft_citation(raw_reference, legacy_reference_number="26")

        self.assertIsNotNone(citation)
        assert citation is not None
        expected_fields = (
            (
                "title",
                "Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes",
            ),
            ("journal", "Proc. Acad. Sci. Philad"),
            ("volume", "106"),
            ("pages", "109-134"),
        )
        for field_name, expected in expected_fields:
            self.assertEqual(citation.fields[field_name], expected)

    def test_extract_draft_citation_repairs_occasional_paper_venue(self) -> None:
        # An "Occas. Pap. ..." venue with "6(6): 1-66" is expected to give
        # journal, volume, number and pages as separate fields.
        raw_reference = "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66."

        class StubEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": raw_reference,
                "year": "1950",
                "note": "extracted_reference = {true}",
            }

        def fake_extract(text):
            return [StubEntry()]

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=fake_extract,
        ):
            citation = extract_draft_citation(raw_reference, legacy_reference_number="41")

        self.assertIsNotNone(citation)
        assert citation is not None
        expected_fields = (
            ("title", "Annotated list of the fauna of the Grand Isle region, 1928-1946"),
            ("journal", "Occas. Pap. Mar. Lab., LSU"),
            ("volume", "6"),
            ("number", "6"),
            ("pages", "1-66"),
        )
        for field_name, expected in expected_fields:
            self.assertEqual(citation.fields[field_name], expected)

    def test_extract_draft_citation_repairs_partial_existing_venue_stub(self) -> None:
        # The stubbed extractor already split the title but left only
        # "Occas" as the venue; the full venue is expected in the draft.
        raw_reference = "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66."

        class StubEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Annotated list of the fauna of the Grand Isle region, 1928-1946",
                "year": "1950",
                "howpublished": "Occas",
                "note": "extracted_reference = {true}",
            }

        def fake_extract(text):
            return [StubEntry()]

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=fake_extract,
        ):
            citation = extract_draft_citation(raw_reference, legacy_reference_number="41")

        self.assertIsNotNone(citation)
        assert citation is not None
        expected_fields = (
            ("journal", "Occas. Pap. Mar. Lab., LSU"),
            ("volume", "6"),
            ("number", "6"),
            ("pages", "1-66"),
        )
        for field_name, expected in expected_fields:
            self.assertEqual(citation.fields[field_name], expected)

    def test_falls_back_to_internal_resolver_when_citegeist_repo_is_unavailable(self) -> None:
        # The citegeist component loader is patched to return four Nones;
        # enrichment must still resolve through the resolver passed in.
        class StubResolver:
            def resolve_entry(self, entry):
                # Classes are built per call so every resolution gets its own objects.
                class ResolvedEntry:
                    entry_type = "article"
                    citation_key = "doi101000example"
                    fields = {
                        "author": "Smith, Jane",
                        "year": "2024",
                        "title": "Example Work",
                        "journal": "Journal of Examples",
                        "doi": "10.1000/example",
                        "url": "https://doi.org/10.1000/example",
                    }

                class StubResolution:
                    source_label = "crossref:doi:10.1000/example"
                    entry = ResolvedEntry()

                return StubResolution()

        payload = {
            "raw_text": "Smith, Jane. 2024. Example Work.",
            "legacy_reference_number": "7",
        }

        with patch(
            "ecospecies_api.citation_enrichment._load_citegeist_resolution_components",
            return_value=(None, None, None, None),
        ):
            outcome = enrich_citation_payload(payload, resolver=StubResolver())

        expected_values = (
            ("enrichment_status", "resolved"),
            ("doi", "10.1000/example"),
            ("source_url", "https://doi.org/10.1000/example"),
            ("resolver_source_label", "crossref:doi:10.1000/example"),
        )
        for key, expected in expected_values:
            self.assertEqual(outcome[key], expected)
        self.assertIn("ecospecies_reference_number = {7}", outcome["draft_bibtex"])

    def test_enrichment_replaces_raw_reference_title_and_dedupes_legacy_note(self) -> None:
        # The resolved title replaces the raw reference text, the citation
        # key is regenerated, and the legacy reference number appears once.
        class StubResolver:
            def resolve_entry(self, entry):
                class ResolvedEntry:
                    entry_type = "article"
                    citation_key = "daniell1872lettersshadalabama"
                    fields = {
                        "author": "Daniell, W.C.",
                        "year": "1872",
                        "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
                        "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                        "url": "",
                    }

                class StubResolution:
                    source_label = "crossref:search:Letters referring to experiments"
                    entry = ResolvedEntry()

                return StubResolution()

        payload = {
            "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
            "legacy_reference_number": "160",
            "citation_key": "daniell1948daniellwc",
        }

        outcome = enrich_citation_payload(payload, resolver=StubResolver())

        self.assertEqual(outcome["enrichment_status"], "resolved")
        self.assertEqual(outcome["citation_key"], "daniell1872lettersreferringexperiments")
        bibtex = outcome["draft_bibtex"]
        self.assertIn(
            "title = {Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River}",
            bibtex,
        )
        self.assertIn("year = {1872}", bibtex)
        self.assertEqual(bibtex.count("ecospecies_reference_number = {160}"), 1)

    def test_enrichment_rejects_conflicting_resolved_metadata(self) -> None:
        # The resolver returns a 2009 paper for an 1872 reference; the
        # result must be "unresolved" with a conflict message.
        class StubResolver:
            def resolve_entry(self, entry):
                class ResolvedEntry:
                    entry_type = "article"
                    citation_key = "daniell2009habitatuseage"
                    fields = {
                        "author": "Daniell, W.C.",
                        "year": "2009",
                        "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
                        "journal": "Transactions of the American Fisheries Society",
                        "doi": "10.1111/j.1600-0633.2009.00395.x",
                        "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
                        "volume": "19",
                        "number": "1",
                        "pages": "107-115",
                    }

                class StubResolution:
                    source_label = "crossref:search:alabama-shad-false-positive"
                    entry = ResolvedEntry()

                return StubResolution()

        payload = {
            "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
            "legacy_reference_number": "160",
        }

        outcome = enrich_citation_payload(payload, resolver=StubResolver())

        self.assertEqual(outcome["enrichment_status"], "unresolved")
        self.assertIn("conflicts with citation seed fields", outcome["enrichment_error"])

    def test_unresolved_enrichment_still_returns_refreshed_seed_fields(self) -> None:
        # With a resolver that returns None, the payload must still come
        # back with a regenerated key and a BibTeX draft from the raw text.
        class StubResolver:
            def resolve_entry(self, entry):
                return None

        payload = {
            "raw_text": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
            "legacy_reference_number": "41",
            "citation_key": "oldbadkey",
            "entry_type": "misc",
        }

        outcome = enrich_citation_payload(payload, resolver=StubResolver())

        self.assertEqual(outcome["enrichment_status"], "unresolved")
        self.assertEqual(outcome["citation_key"], "behre1950annotatedlistfauna")
        bibtex = outcome["draft_bibtex"]
        self.assertIn("title = {Annotated list of the fauna of the Grand Isle region, 1928-1946}", bibtex)
        self.assertIn("Occas. Pap. Mar. Lab., LSU", bibtex)

    def test_discover_citation_candidates_returns_scored_candidates(self) -> None:
        # Two Crossref candidates are offered: the 1872 match must come
        # first with an exact year, the 2009 one second with a year conflict.
        class StubResolver:
            def search_crossref_candidates(self, title):
                matching = LocalResolution(
                    LocalBibEntry(
                        "article",
                        "daniell1872lettersreferringexperiments",
                        {
                            "author": "Daniell, W.C.",
                            "year": "1872",
                            "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
                            "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                            "volume": "2",
                            "pages": "387-390",
                        },
                    ),
                    "crossref:search:1:daniell-good",
                )
                mismatching = LocalResolution(
                    LocalBibEntry(
                        "article",
                        "daniell2009habitatuseage",
                        {
                            "author": "Daniell, W.C.",
                            "year": "2009",
                            "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
                            "journal": "Transactions of the American Fisheries Society",
                            "volume": "19",
                            "number": "1",
                            "pages": "107-115",
                        },
                    ),
                    "crossref:search:2:daniell-bad",
                )
                return [matching, mismatching]

            def search_datacite_candidates(self, title):
                return []

            def search_openalex_candidates(self, title):
                return []

        payload = {
            "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
            "legacy_reference_number": "160",
        }

        outcome = discover_citation_candidates(payload, resolver=StubResolver())

        self.assertEqual(outcome["candidate_count"], 2)
        best, runner_up = outcome["candidates"][0], outcome["candidates"][1]
        self.assertGreater(best["score"], runner_up["score"])
        self.assertEqual(best["field_matches"]["year"]["status"], "exact")
        self.assertEqual(runner_up["field_matches"]["year"]["status"], "conflict")

    def test_local_crossref_candidate_search_filters_placeholder_stub_entries(self) -> None:
        # Of the two items served, only "Useful Paper" must come back;
        # the "Referenced work 1" item carries no author, venue or DOI.
        def fake_get_json(url):
            # Built on every call so each lookup receives a fresh payload.
            return {
                "message": {
                    "items": [
                        {
                            "type": "journal-article",
                            "title": ["Referenced work 1"],
                            "issued": {"date-parts": [[2020]]},
                        },
                        {
                            "type": "journal-article",
                            "title": ["Useful Paper"],
                            "issued": {"date-parts": [[2020]]},
                            "author": [{"family": "Smith", "given": "J S"}],
                            "container-title": ["Journal of Examples"],
                            "DOI": "10.1000/useful",
                        },
                    ]
                }
            }

        resolver = LocalMetadataResolver()
        resolver._safe_get_json = fake_get_json

        found = resolver.search_crossref_candidates("Useful Paper")

        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].entry.fields["title"], "Useful Paper")

    def test_apply_citation_candidate_selection_uses_selected_candidate_fields(self) -> None:
        # Applying a selected candidate must mark the citation resolved,
        # prefix the source label with "editor:selected:", and render the
        # candidate's venue, volume and pages in the normalized text.
        payload = {
            "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
            "legacy_reference_number": "160",
        }
        selected_candidate = {
            "source_label": "crossref:search:1:daniell-good",
            "entry_type": "article",
            "fields": {
                "author": "Daniell, W.C.",
                "year": "1872",
                "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
                "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                "volume": "2",
                "pages": "387-390",
            },
        }

        outcome = apply_citation_candidate_selection(payload, selected_candidate)

        self.assertEqual(outcome["enrichment_status"], "resolved")
        self.assertEqual(outcome["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good")
        self.assertIn("Comm. Rept. U.S. Comm. Fish & Fish., 2: 387-390", outcome["normalized_text"])