528 lines
23 KiB
Python
528 lines
23 KiB
Python
from __future__ import annotations
|
|
|
|
import unittest
|
|
from unittest.mock import patch
|
|
|
|
from ecospecies_api.citation_enrichment import (
|
|
_crossref_message_to_entry,
|
|
_datacite_item_to_entry,
|
|
_openalex_work_to_entry,
|
|
_render_normalized_text,
|
|
apply_citation_candidate_selection,
|
|
discover_citation_candidates,
|
|
enrich_citation_payload,
|
|
LocalBibEntry,
|
|
LocalMetadataResolver,
|
|
LocalResolution,
|
|
)
|
|
from ecospecies_api.citegeist_bridge import extract_draft_citation, render_single_bibtex
|
|
|
|
|
|
class CitationEnrichmentTests(unittest.TestCase):
    """Tests for the citation helpers imported from ``ecospecies_api``.

    Covers text rendering, the Crossref / OpenAlex / DataCite field mappers,
    BibTeX rendering, draft extraction from raw reference strings, payload
    enrichment through a resolver, candidate discovery and candidate selection.
    """

    # Legacy reference strings shared by several tests. They are kept as class
    # constants so every test feeds exactly the same text to the code under test.
    _DANIELL_TITLE = (
        "Letters referring to experiments of W.C. Daniell, M.D., "
        "in introducing shad into the Alabama River"
    )
    _DANIELL_RAW = (
        "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., "
        "in introducing shad into the Alabama River. "
        "Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390."
    )
    _BEHRE_RAW = (
        "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. "
        "Occas. Pap. Mar. Lab., LSU 6(6): 1-66."
    )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _extract_draft(
        self,
        raw_text: str,
        legacy_reference_number: str,
        stub_fields: dict[str, str],
    ):
        """Call ``extract_draft_citation`` with the citegeist extractor stubbed.

        ``_load_citegeist_extract`` is patched to return a callable that yields
        a single ``misc`` entry with citation key ``"badkey"`` and the given
        ``stub_fields``. The helper fails the test if no draft is produced and
        otherwise returns the draft for further assertions.
        """

        class StubEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = stub_fields

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [StubEntry()],
        ):
            draft = extract_draft_citation(
                raw_text,
                legacy_reference_number=legacy_reference_number,
            )

        self.assertIsNotNone(draft)
        return draft

    @staticmethod
    def _resolver_returning(
        label: str,
        kind: str,
        key: str,
        resolved_fields: dict[str, str],
    ):
        """Build a stub resolver whose ``resolve_entry`` always succeeds.

        The returned resolution exposes ``source_label`` (``label``) and an
        ``entry`` carrying ``entry_type`` (``kind``), ``citation_key`` (``key``)
        and ``fields``. Parameter names deliberately differ from the attribute
        names: a class body cannot read an enclosing-function variable that
        shares the name of the attribute being assigned.
        """

        class StubResolver:
            def resolve_entry(self, entry):
                # Fresh objects (and a fresh fields dict) on every call, matching
                # a resolver that builds a new result per lookup.
                class StubEntry:
                    entry_type = kind
                    citation_key = key
                    fields = dict(resolved_fields)

                class StubResolution:
                    source_label = label
                    entry = StubEntry()

                return StubResolution()

        return StubResolver()

    # ------------------------------------------------------------------
    # Rendering and provider mapping
    # ------------------------------------------------------------------

    def test_render_normalized_text_includes_volume_number_and_pages(self) -> None:
        """Article rendering includes ``volume(number): pages`` and the DOI."""
        rendered = _render_normalized_text(
            "article",
            {
                "author": "Daniell, W.C.",
                "year": "1872",
                "title": "Letters referring to experiments of W.C. Daniell",
                "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                "volume": "2",
                "number": "4",
                "pages": "387-390",
                "doi": "10.1000/example",
            },
        )

        self.assertEqual(
            rendered,
            "Daniell, W.C. (1872). Letters referring to experiments of W.C. Daniell. "
            "Comm. Rept. U.S. Comm. Fish & Fish., 2(4): 387-390. DOI:10.1000/example",
        )

    def test_crossref_mapping_keeps_volume_issue_and_pages(self) -> None:
        """Crossref ``volume`` / ``issue`` / ``page`` map to volume / number / pages."""
        entry = _crossref_message_to_entry(
            {
                "type": "journal-article",
                "title": ["Example Work"],
                "issued": {"date-parts": [[1872]]},
                "author": [{"family": "Daniell", "given": "W.C."}],
                "container-title": ["Comm. Rept. U.S. Comm. Fish & Fish."],
                "DOI": "10.1000/example",
                "URL": "https://doi.org/10.1000/example",
                "volume": "2",
                "issue": "4",
                "page": "387-390",
            }
        )

        self.assertEqual(entry.fields["volume"], "2")
        self.assertEqual(entry.fields["number"], "4")
        self.assertEqual(entry.fields["pages"], "387-390")

    def test_openalex_mapping_keeps_biblio_fields(self) -> None:
        """OpenAlex ``biblio`` data, author and inverted-index abstract are mapped."""
        entry = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W12345",
                "display_name": "OpenAlex Discovered Work",
                "publication_year": 2022,
                "type": "article",
                "doi": "https://doi.org/10.1000/example-openalex",
                "authorships": [{"author": {"display_name": "J S, Smith"}}],
                "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
                "biblio": {"volume": "12", "issue": "3", "first_page": "101", "last_page": "118"},
                "abstract_inverted_index": {"Graphs": [0], "support": [1], "learning": [2]},
            }
        )

        self.assertEqual(entry.fields["author"], "Smith, J. S.")
        self.assertEqual(entry.fields["volume"], "12")
        self.assertEqual(entry.fields["number"], "3")
        self.assertEqual(entry.fields["pages"], "101-118")
        self.assertEqual(entry.fields["abstract"], "Graphs support learning")

    def test_openalex_mapping_handles_null_source(self) -> None:
        """A ``None`` primary-location source yields no journal but keeps biblio data."""
        entry = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W54321",
                "display_name": "OpenAlex Work Without Source",
                "publication_year": 2021,
                "type": "article",
                "doi": "https://doi.org/10.1000/example-null-source",
                "authorships": [{"author": {"display_name": "Jane Smith"}}],
                "primary_location": {"source": None},
                "biblio": {"volume": "5", "issue": "1", "first_page": "10", "last_page": "20"},
            }
        )

        self.assertEqual(entry.fields["title"], "OpenAlex Work Without Source")
        self.assertNotIn("journal", entry.fields)
        self.assertEqual(entry.fields["volume"], "5")
        self.assertEqual(entry.fields["number"], "1")
        self.assertEqual(entry.fields["pages"], "10-20")

    def test_datacite_mapping_keeps_container_and_pages(self) -> None:
        """DataCite container, volume/issue, page range and abstract are mapped."""
        entry = _datacite_item_to_entry(
            {
                "attributes": {
                    "titles": [{"title": "DataCite Work"}],
                    "creators": [{"name": "J R, Rivera"}],
                    "publicationYear": "2021",
                    "doi": "10.1000/datacite-work",
                    "url": "https://doi.org/10.1000/datacite-work",
                    "container": "Journal of Metadata",
                    "volume": "7",
                    "issue": "2",
                    "firstPage": "44",
                    "lastPage": "59",
                    "descriptions": [
                        {
                            "descriptionType": "Abstract",
                            "description": "Abstract: Metadata makes reuse easier.",
                        }
                    ],
                }
            }
        )

        self.assertEqual(entry.fields["author"], "Rivera, J. R.")
        self.assertEqual(entry.fields["journal"], "Journal of Metadata")
        self.assertEqual(entry.fields["volume"], "7")
        self.assertEqual(entry.fields["number"], "2")
        self.assertEqual(entry.fields["pages"], "44-59")
        self.assertEqual(entry.fields["abstract"], "Metadata makes reuse easier.")

    def test_render_single_bibtex_preserves_balanced_braces_and_repairs_unmatched_ones(self) -> None:
        """Balanced braces survive rendering; a stray closing brace is rewritten."""
        rendered = render_single_bibtex(
            "misc",
            "example",
            {
                "title": "Alpha_beta {Gamma}",
                "note": "raw_reference = {Alpha } beta}",
            },
        )

        self.assertIn("title = {Alpha_beta {Gamma}}", rendered)
        self.assertIn("note = {raw_reference = {Alpha } beta)}", rendered)

    # ------------------------------------------------------------------
    # Draft extraction
    # ------------------------------------------------------------------

    def test_extract_draft_citation_repairs_report_style_reference_fields(self) -> None:
        """A whole reference stuffed into ``title`` is split into proper fields."""
        draft = self._extract_draft(
            self._DANIELL_RAW,
            "160",
            {
                "title": self._DANIELL_RAW,
                "year": "1872",
                "note": "extracted_reference = {true}",
            },
        )

        self.assertEqual(draft.fields["author"], "Daniell, W.C")
        self.assertEqual(draft.fields["title"], self._DANIELL_TITLE)
        self.assertEqual(draft.fields["journal"], "Comm. Rept. U.S. Comm. Fish & Fish")
        self.assertEqual(draft.fields["volume"], "2")
        self.assertEqual(draft.fields["pages"], "387-390")
        self.assertEqual(draft.citation_key, "daniell1872lettersreferringexperiments")

    def test_extract_draft_citation_does_not_split_title_on_report_word(self) -> None:
        """The word "report" inside a title must not be taken as a venue boundary."""
        raw_text = (
            "Smith, J. 1999. Habitat report synthesis for Alabama shad. "
            "NOAA Tech. Memo. NMFS-SEFSC-200."
        )
        draft = self._extract_draft(
            raw_text,
            "42",
            {
                "title": raw_text,
                "year": "1999",
                "note": "extracted_reference = {true}",
            },
        )

        self.assertEqual(draft.fields["author"], "Smith, J")
        self.assertEqual(draft.fields["title"], "Habitat report synthesis for Alabama shad")
        self.assertEqual(draft.fields["howpublished"], "NOAA Tech. Memo. NMFS-SEFSC-200")
        self.assertNotIn("journal", draft.fields)

    def test_extract_draft_citation_repairs_proc_abbreviation_venue(self) -> None:
        """A venue starting with the abbreviation ``Proc.`` is recognised as the journal."""
        raw_text = (
            "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. "
            "Fishes from the Escambia River, Alabama and Florida, "
            "with ecologic and taxonomic notes. "
            "Proc. Acad. Sci. Philad. 106: 109-134."
        )
        draft = self._extract_draft(
            raw_text,
            "26",
            {
                "title": raw_text,
                "year": "1954",
                "note": "extracted_reference = {true}",
            },
        )

        self.assertEqual(
            draft.fields["title"],
            "Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes",
        )
        self.assertEqual(draft.fields["journal"], "Proc. Acad. Sci. Philad")
        self.assertEqual(draft.fields["volume"], "106")
        self.assertEqual(draft.fields["pages"], "109-134")

    def test_extract_draft_citation_repairs_occasional_paper_venue(self) -> None:
        """An ``Occas. Pap.`` venue with ``vol(no): pages`` is fully parsed."""
        draft = self._extract_draft(
            self._BEHRE_RAW,
            "41",
            {
                "title": self._BEHRE_RAW,
                "year": "1950",
                "note": "extracted_reference = {true}",
            },
        )

        self.assertEqual(
            draft.fields["title"],
            "Annotated list of the fauna of the Grand Isle region, 1928-1946",
        )
        self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
        self.assertEqual(draft.fields["volume"], "6")
        self.assertEqual(draft.fields["number"], "6")
        self.assertEqual(draft.fields["pages"], "1-66")

    def test_extract_draft_citation_repairs_partial_existing_venue_stub(self) -> None:
        """A truncated ``howpublished`` stub is replaced by the full venue fields."""
        draft = self._extract_draft(
            self._BEHRE_RAW,
            "41",
            {
                "title": "Annotated list of the fauna of the Grand Isle region, 1928-1946",
                "year": "1950",
                "howpublished": "Occas",
                "note": "extracted_reference = {true}",
            },
        )

        self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
        self.assertEqual(draft.fields["volume"], "6")
        self.assertEqual(draft.fields["number"], "6")
        self.assertEqual(draft.fields["pages"], "1-66")

    # ------------------------------------------------------------------
    # Enrichment
    # ------------------------------------------------------------------

    def test_falls_back_to_internal_resolver_when_citegeist_repo_is_unavailable(self) -> None:
        """With no citegeist components loaded, the supplied resolver is still used."""
        resolver = self._resolver_returning(
            "crossref:doi:10.1000/example",
            "article",
            "doi101000example",
            {
                "author": "Smith, Jane",
                "year": "2024",
                "title": "Example Work",
                "journal": "Journal of Examples",
                "doi": "10.1000/example",
                "url": "https://doi.org/10.1000/example",
            },
        )

        with patch(
            "ecospecies_api.citation_enrichment._load_citegeist_resolution_components",
            return_value=(None, None, None, None),
        ):
            result = enrich_citation_payload(
                {
                    "raw_text": "Smith, Jane. 2024. Example Work.",
                    "legacy_reference_number": "7",
                },
                resolver=resolver,
            )

        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(result["doi"], "10.1000/example")
        self.assertEqual(result["source_url"], "https://doi.org/10.1000/example")
        self.assertEqual(result["resolver_source_label"], "crossref:doi:10.1000/example")
        self.assertIn("ecospecies_reference_number = {7}", result["draft_bibtex"])

    def test_enrichment_replaces_raw_reference_title_and_dedupes_legacy_note(self) -> None:
        """Resolved output has a clean title, a fresh key and one legacy-number note."""
        resolver = self._resolver_returning(
            "crossref:search:Letters referring to experiments",
            "article",
            "daniell1872lettersshadalabama",
            {
                "author": "Daniell, W.C.",
                "year": "1872",
                "title": self._DANIELL_TITLE,
                "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                "url": "",
            },
        )

        result = enrich_citation_payload(
            {
                "raw_text": self._DANIELL_RAW,
                "legacy_reference_number": "160",
                "citation_key": "daniell1948daniellwc",
            },
            resolver=resolver,
        )

        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(result["citation_key"], "daniell1872lettersreferringexperiments")
        self.assertIn(
            "title = {" + self._DANIELL_TITLE + "}",
            result["draft_bibtex"],
        )
        self.assertIn("year = {1872}", result["draft_bibtex"])
        self.assertEqual(result["draft_bibtex"].count("ecospecies_reference_number = {160}"), 1)

    def test_enrichment_rejects_conflicting_resolved_metadata(self) -> None:
        """A resolution whose fields contradict the seed is reported as unresolved."""
        resolver = self._resolver_returning(
            "crossref:search:alabama-shad-false-positive",
            "article",
            "daniell2009habitatuseage",
            {
                "author": "Daniell, W.C.",
                "year": "2009",
                "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
                "journal": "Transactions of the American Fisheries Society",
                "doi": "10.1111/j.1600-0633.2009.00395.x",
                "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
                "volume": "19",
                "number": "1",
                "pages": "107-115",
            },
        )

        result = enrich_citation_payload(
            {
                "raw_text": self._DANIELL_RAW,
                "legacy_reference_number": "160",
            },
            resolver=resolver,
        )

        self.assertEqual(result["enrichment_status"], "unresolved")
        self.assertIn("conflicts with citation seed fields", result["enrichment_error"])

    def test_unresolved_enrichment_still_returns_refreshed_seed_fields(self) -> None:
        """When the resolver finds nothing, the re-extracted seed is still returned."""

        class NullResolver:
            def resolve_entry(self, entry):
                return None

        result = enrich_citation_payload(
            {
                "raw_text": self._BEHRE_RAW,
                "legacy_reference_number": "41",
                "citation_key": "oldbadkey",
                "entry_type": "misc",
            },
            resolver=NullResolver(),
        )

        self.assertEqual(result["enrichment_status"], "unresolved")
        self.assertEqual(result["citation_key"], "behre1950annotatedlistfauna")
        self.assertIn(
            "title = {Annotated list of the fauna of the Grand Isle region, 1928-1946}",
            result["draft_bibtex"],
        )
        self.assertIn("Occas. Pap. Mar. Lab., LSU", result["draft_bibtex"])

    # ------------------------------------------------------------------
    # Candidate discovery and selection
    # ------------------------------------------------------------------

    def test_discover_citation_candidates_returns_scored_candidates(self) -> None:
        """The matching candidate outranks the one with a conflicting year."""
        matching_title = self._DANIELL_TITLE

        class CandidateResolver:
            def search_crossref_candidates(self, title):
                matching = LocalResolution(
                    LocalBibEntry(
                        "article",
                        "daniell1872lettersreferringexperiments",
                        {
                            "author": "Daniell, W.C.",
                            "year": "1872",
                            "title": matching_title,
                            "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                            "volume": "2",
                            "pages": "387-390",
                        },
                    ),
                    "crossref:search:1:daniell-good",
                )
                conflicting = LocalResolution(
                    LocalBibEntry(
                        "article",
                        "daniell2009habitatuseage",
                        {
                            "author": "Daniell, W.C.",
                            "year": "2009",
                            "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
                            "journal": "Transactions of the American Fisheries Society",
                            "volume": "19",
                            "number": "1",
                            "pages": "107-115",
                        },
                    ),
                    "crossref:search:2:daniell-bad",
                )
                return [matching, conflicting]

            def search_datacite_candidates(self, title):
                return []

            def search_openalex_candidates(self, title):
                return []

        result = discover_citation_candidates(
            {
                "raw_text": self._DANIELL_RAW,
                "legacy_reference_number": "160",
            },
            resolver=CandidateResolver(),
        )

        best, runner_up = result["candidates"][0], result["candidates"][1]
        self.assertEqual(result["candidate_count"], 2)
        self.assertGreater(best["score"], runner_up["score"])
        self.assertEqual(best["field_matches"]["year"]["status"], "exact")
        self.assertEqual(runner_up["field_matches"]["year"]["status"], "conflict")

    def test_local_crossref_candidate_search_filters_placeholder_stub_entries(self) -> None:
        """Of the two canned Crossref items, only "Useful Paper" is returned."""
        canned_response = {
            "message": {
                "items": [
                    {
                        "type": "journal-article",
                        "title": ["Referenced work 1"],
                        "issued": {"date-parts": [[2020]]},
                    },
                    {
                        "type": "journal-article",
                        "title": ["Useful Paper"],
                        "issued": {"date-parts": [[2020]]},
                        "author": [{"family": "Smith", "given": "J S"}],
                        "container-title": ["Journal of Examples"],
                        "DOI": "10.1000/useful",
                    },
                ]
            }
        }
        resolver = LocalMetadataResolver()
        # Replace the JSON fetcher on this instance with one that returns the canned payload.
        resolver._safe_get_json = lambda url: canned_response

        results = resolver.search_crossref_candidates("Useful Paper")

        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entry.fields["title"], "Useful Paper")

    def test_apply_citation_candidate_selection_uses_selected_candidate_fields(self) -> None:
        """An editor-selected candidate resolves the payload using its own fields."""
        result = apply_citation_candidate_selection(
            {
                "raw_text": self._DANIELL_RAW,
                "legacy_reference_number": "160",
            },
            {
                "source_label": "crossref:search:1:daniell-good",
                "entry_type": "article",
                "fields": {
                    "author": "Daniell, W.C.",
                    "year": "1872",
                    "title": self._DANIELL_TITLE,
                    "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                    "volume": "2",
                    "pages": "387-390",
                },
            },
        )

        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(
            result["resolver_source_label"],
            "editor:selected:crossref:search:1:daniell-good",
        )
        self.assertIn("Comm. Rept. U.S. Comm. Fish & Fish., 2: 387-390", result["normalized_text"])
|