from __future__ import annotations

import unittest
from unittest.mock import patch

from ecospecies_api.citation_enrichment import (
    _crossref_message_to_entry,
    _datacite_item_to_entry,
    _openalex_work_to_entry,
    _render_normalized_text,
    apply_citation_candidate_selection,
    discover_citation_candidates,
    enrich_citation_payload,
    LocalBibEntry,
    LocalMetadataResolver,
    LocalResolution,
)
from ecospecies_api.citegeist_bridge import extract_draft_citation, render_single_bibtex

# Reference strings shared by several tests. They are kept as module constants
# so that the raw citation text and the title expected back from it cannot
# drift apart between tests.
_DANIELL_TITLE = (
    "Letters referring to experiments of W.C. Daniell, M.D., "
    "in introducing shad into the Alabama River"
)
_DANIELL_RAW_TEXT = (
    "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., "
    "in introducing shad into the Alabama River. "
    "Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390."
)
_BEHRE_TITLE = "Annotated list of the fauna of the Grand Isle region, 1928-1946"
_BEHRE_RAW_TEXT = (
    "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. "
    "Occas. Pap. Mar. Lab., LSU 6(6): 1-66."
)


def _resolver_returning(label: str, key: str, entry_fields: dict[str, str]):
    """Build a stub resolver whose ``resolve_entry`` always yields one resolution.

    The returned resolution exposes ``source_label`` and ``entry``; the entry
    exposes ``entry_type`` (always ``"article"``), ``citation_key`` and
    ``fields``.

    The parameter names deliberately differ from the attribute names: a class
    body that assigns ``fields`` cannot read an enclosing function variable
    that is also called ``fields``.
    """

    class Entry:
        entry_type = "article"
        citation_key = key
        fields = entry_fields

    class Resolution:
        source_label = label
        entry = Entry()

    class MockResolver:
        def resolve_entry(self, entry):
            return Resolution()

    return MockResolver()


class CitationEnrichmentTests(unittest.TestCase):
    """Tests for citation field mapping, draft extraction, enrichment and candidate discovery."""

    def _extract_draft(
        self,
        raw_text: str,
        legacy_reference_number: str,
        seed_fields: dict[str, str],
    ):
        """Run ``extract_draft_citation`` against a single mocked extracted entry.

        The citegeist extract loader is patched to return a callable producing
        one ``misc`` entry with citation key ``"badkey"`` and ``seed_fields``
        as its fields. The draft is asserted to be non-``None`` and returned.
        """

        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = seed_fields

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                raw_text,
                legacy_reference_number=legacy_reference_number,
            )
        self.assertIsNotNone(draft)
        return draft

    def test_render_normalized_text_includes_volume_number_and_pages(self) -> None:
        """Rendered text carries ``volume(number): pages`` followed by the DOI."""
        rendered = _render_normalized_text(
            "article",
            {
                "author": "Daniell, W.C.",
                "year": "1872",
                "title": "Letters referring to experiments of W.C. Daniell",
                "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                "volume": "2",
                "number": "4",
                "pages": "387-390",
                "doi": "10.1000/example",
            },
        )
        self.assertEqual(
            rendered,
            "Daniell, W.C. (1872). Letters referring to experiments of W.C. Daniell. "
            "Comm. Rept. U.S. Comm. Fish & Fish., 2(4): 387-390. DOI:10.1000/example",
        )

    def test_crossref_mapping_keeps_volume_issue_and_pages(self) -> None:
        """Crossref ``volume``/``issue``/``page`` map to ``volume``/``number``/``pages``."""
        entry = _crossref_message_to_entry(
            {
                "type": "journal-article",
                "title": ["Example Work"],
                "issued": {"date-parts": [[1872]]},
                "author": [{"family": "Daniell", "given": "W.C."}],
                "container-title": ["Comm. Rept. U.S. Comm. Fish & Fish."],
                "DOI": "10.1000/example",
                "URL": "https://doi.org/10.1000/example",
                "volume": "2",
                "issue": "4",
                "page": "387-390",
            }
        )
        self.assertEqual(entry.fields["volume"], "2")
        self.assertEqual(entry.fields["number"], "4")
        self.assertEqual(entry.fields["pages"], "387-390")

    def test_openalex_mapping_keeps_biblio_fields(self) -> None:
        """OpenAlex biblio data, author name and inverted-index abstract are mapped."""
        entry = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W12345",
                "display_name": "OpenAlex Discovered Work",
                "publication_year": 2022,
                "type": "article",
                "doi": "https://doi.org/10.1000/example-openalex",
                "authorships": [{"author": {"display_name": "J S, Smith"}}],
                "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
                "biblio": {
                    "volume": "12",
                    "issue": "3",
                    "first_page": "101",
                    "last_page": "118",
                },
                "abstract_inverted_index": {"Graphs": [0], "support": [1], "learning": [2]},
            }
        )
        self.assertEqual(entry.fields["author"], "Smith, J. S.")
        self.assertEqual(entry.fields["volume"], "12")
        self.assertEqual(entry.fields["number"], "3")
        self.assertEqual(entry.fields["pages"], "101-118")
        self.assertEqual(entry.fields["abstract"], "Graphs support learning")

    def test_openalex_mapping_handles_null_source(self) -> None:
        """A ``None`` primary-location source yields no ``journal`` field."""
        entry = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W54321",
                "display_name": "OpenAlex Work Without Source",
                "publication_year": 2021,
                "type": "article",
                "doi": "https://doi.org/10.1000/example-null-source",
                "authorships": [{"author": {"display_name": "Jane Smith"}}],
                "primary_location": {"source": None},
                "biblio": {
                    "volume": "5",
                    "issue": "1",
                    "first_page": "10",
                    "last_page": "20",
                },
            }
        )
        self.assertEqual(entry.fields["title"], "OpenAlex Work Without Source")
        self.assertNotIn("journal", entry.fields)
        self.assertEqual(entry.fields["volume"], "5")
        self.assertEqual(entry.fields["number"], "1")
        self.assertEqual(entry.fields["pages"], "10-20")

    def test_datacite_mapping_keeps_container_and_pages(self) -> None:
        """DataCite container, page range, author and abstract are mapped."""
        entry = _datacite_item_to_entry(
            {
                "attributes": {
                    "titles": [{"title": "DataCite Work"}],
                    "creators": [{"name": "J R, Rivera"}],
                    "publicationYear": "2021",
                    "doi": "10.1000/datacite-work",
                    "url": "https://doi.org/10.1000/datacite-work",
                    "container": "Journal of Metadata",
                    "volume": "7",
                    "issue": "2",
                    "firstPage": "44",
                    "lastPage": "59",
                    "descriptions": [
                        {
                            "descriptionType": "Abstract",
                            "description": "Abstract: Metadata makes reuse easier.",
                        }
                    ],
                }
            }
        )
        self.assertEqual(entry.fields["author"], "Rivera, J. R.")
        self.assertEqual(entry.fields["journal"], "Journal of Metadata")
        self.assertEqual(entry.fields["volume"], "7")
        self.assertEqual(entry.fields["number"], "2")
        self.assertEqual(entry.fields["pages"], "44-59")
        self.assertEqual(entry.fields["abstract"], "Metadata makes reuse easier.")

    def test_render_single_bibtex_preserves_balanced_braces_and_repairs_unmatched_ones(
        self,
    ) -> None:
        """Balanced braces are kept verbatim; the stray closing brace is rewritten."""
        rendered = render_single_bibtex(
            "misc",
            "example",
            {
                "title": "Alpha_beta {Gamma}",
                "note": "raw_reference = {Alpha } beta}",
            },
        )
        self.assertIn("title = {Alpha_beta {Gamma}}", rendered)
        self.assertIn("note = {raw_reference = {Alpha } beta)}", rendered)

    def test_extract_draft_citation_repairs_report_style_reference_fields(self) -> None:
        """A whole reference stored as the title is split into separate fields."""
        draft = self._extract_draft(
            _DANIELL_RAW_TEXT,
            "160",
            {
                "title": _DANIELL_RAW_TEXT,
                "year": "1872",
                "note": "extracted_reference = {true}",
            },
        )
        self.assertEqual(draft.fields["author"], "Daniell, W.C")
        self.assertEqual(draft.fields["title"], _DANIELL_TITLE)
        self.assertEqual(draft.fields["journal"], "Comm. Rept. U.S. Comm. Fish & Fish")
        self.assertEqual(draft.fields["volume"], "2")
        self.assertEqual(draft.fields["pages"], "387-390")
        self.assertEqual(draft.citation_key, "daniell1872lettersreferringexperiments")

    def test_extract_draft_citation_does_not_split_title_on_report_word(self) -> None:
        """The word "report" inside a title must not be taken as a venue boundary."""
        raw_text = (
            "Smith, J. 1999. Habitat report synthesis for Alabama shad. "
            "NOAA Tech. Memo. NMFS-SEFSC-200."
        )
        draft = self._extract_draft(
            raw_text,
            "42",
            {
                "title": raw_text,
                "year": "1999",
                "note": "extracted_reference = {true}",
            },
        )
        self.assertEqual(draft.fields["author"], "Smith, J")
        self.assertEqual(draft.fields["title"], "Habitat report synthesis for Alabama shad")
        self.assertEqual(draft.fields["howpublished"], "NOAA Tech. Memo. NMFS-SEFSC-200")
        self.assertNotIn("journal", draft.fields)

    def test_extract_draft_citation_repairs_proc_abbreviation_venue(self) -> None:
        """A venue abbreviated as "Proc. ..." is recovered as the journal."""
        raw_text = (
            "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. "
            "Fishes from the Escambia River, Alabama and Florida, "
            "with ecologic and taxonomic notes. "
            "Proc. Acad. Sci. Philad. 106: 109-134."
        )
        draft = self._extract_draft(
            raw_text,
            "26",
            {
                "title": raw_text,
                "year": "1954",
                "note": "extracted_reference = {true}",
            },
        )
        self.assertEqual(
            draft.fields["title"],
            "Fishes from the Escambia River, Alabama and Florida, "
            "with ecologic and taxonomic notes",
        )
        self.assertEqual(draft.fields["journal"], "Proc. Acad. Sci. Philad")
        self.assertEqual(draft.fields["volume"], "106")
        self.assertEqual(draft.fields["pages"], "109-134")

    def test_extract_draft_citation_repairs_occasional_paper_venue(self) -> None:
        """An "Occas. Pap." venue with ``volume(number): pages`` is recovered."""
        draft = self._extract_draft(
            _BEHRE_RAW_TEXT,
            "41",
            {
                "title": _BEHRE_RAW_TEXT,
                "year": "1950",
                "note": "extracted_reference = {true}",
            },
        )
        self.assertEqual(draft.fields["title"], _BEHRE_TITLE)
        self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
        self.assertEqual(draft.fields["volume"], "6")
        self.assertEqual(draft.fields["number"], "6")
        self.assertEqual(draft.fields["pages"], "1-66")

    def test_extract_draft_citation_repairs_partial_existing_venue_stub(self) -> None:
        """A truncated ``howpublished`` stub is replaced by the full venue."""
        draft = self._extract_draft(
            _BEHRE_RAW_TEXT,
            "41",
            {
                "title": _BEHRE_TITLE,
                "year": "1950",
                "howpublished": "Occas",
                "note": "extracted_reference = {true}",
            },
        )
        self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
        self.assertEqual(draft.fields["volume"], "6")
        self.assertEqual(draft.fields["number"], "6")
        self.assertEqual(draft.fields["pages"], "1-66")

    def test_falls_back_to_internal_resolver_when_citegeist_repo_is_unavailable(self) -> None:
        """With no citegeist components loaded, the supplied resolver is used."""
        resolver = _resolver_returning(
            "crossref:doi:10.1000/example",
            "doi101000example",
            {
                "author": "Smith, Jane",
                "year": "2024",
                "title": "Example Work",
                "journal": "Journal of Examples",
                "doi": "10.1000/example",
                "url": "https://doi.org/10.1000/example",
            },
        )
        with patch(
            "ecospecies_api.citation_enrichment._load_citegeist_resolution_components",
            return_value=(None, None, None, None),
        ):
            result = enrich_citation_payload(
                {
                    "raw_text": "Smith, Jane. 2024. Example Work.",
                    "legacy_reference_number": "7",
                },
                resolver=resolver,
            )
        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(result["doi"], "10.1000/example")
        self.assertEqual(result["source_url"], "https://doi.org/10.1000/example")
        self.assertEqual(result["resolver_source_label"], "crossref:doi:10.1000/example")
        self.assertIn("ecospecies_reference_number = {7}", result["draft_bibtex"])

    def test_enrichment_replaces_raw_reference_title_and_dedupes_legacy_note(self) -> None:
        """The resolved title is used, and the legacy number appears exactly once."""
        resolver = _resolver_returning(
            "crossref:search:Letters referring to experiments",
            "daniell1872lettersshadalabama",
            {
                "author": "Daniell, W.C.",
                "year": "1872",
                "title": _DANIELL_TITLE,
                "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                "url": "",
            },
        )
        result = enrich_citation_payload(
            {
                "raw_text": _DANIELL_RAW_TEXT,
                "legacy_reference_number": "160",
                "citation_key": "daniell1948daniellwc",
            },
            resolver=resolver,
        )
        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(result["citation_key"], "daniell1872lettersreferringexperiments")
        self.assertIn("title = {" + _DANIELL_TITLE + "}", result["draft_bibtex"])
        self.assertIn("year = {1872}", result["draft_bibtex"])
        self.assertEqual(
            result["draft_bibtex"].count("ecospecies_reference_number = {160}"),
            1,
        )

    def test_enrichment_rejects_conflicting_resolved_metadata(self) -> None:
        """A resolution that contradicts the seed fields is reported as unresolved."""
        resolver = _resolver_returning(
            "crossref:search:alabama-shad-false-positive",
            "daniell2009habitatuseage",
            {
                "author": "Daniell, W.C.",
                "year": "2009",
                "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
                "journal": "Transactions of the American Fisheries Society",
                "doi": "10.1111/j.1600-0633.2009.00395.x",
                "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
                "volume": "19",
                "number": "1",
                "pages": "107-115",
            },
        )
        result = enrich_citation_payload(
            {
                "raw_text": _DANIELL_RAW_TEXT,
                "legacy_reference_number": "160",
            },
            resolver=resolver,
        )
        self.assertEqual(result["enrichment_status"], "unresolved")
        self.assertIn("conflicts with citation seed fields", result["enrichment_error"])

    def test_unresolved_enrichment_still_returns_refreshed_seed_fields(self) -> None:
        """When nothing resolves, the key and BibTeX are still rebuilt from the raw text."""

        class MockResolver:
            def resolve_entry(self, entry):
                return None

        result = enrich_citation_payload(
            {
                "raw_text": _BEHRE_RAW_TEXT,
                "legacy_reference_number": "41",
                "citation_key": "oldbadkey",
                "entry_type": "misc",
            },
            resolver=MockResolver(),
        )
        self.assertEqual(result["enrichment_status"], "unresolved")
        self.assertEqual(result["citation_key"], "behre1950annotatedlistfauna")
        self.assertIn("title = {" + _BEHRE_TITLE + "}", result["draft_bibtex"])
        self.assertIn("Occas. Pap. Mar. Lab., LSU", result["draft_bibtex"])

    def test_discover_citation_candidates_returns_scored_candidates(self) -> None:
        """The matching candidate outranks the one whose year conflicts."""

        class MockResolver:
            def search_crossref_candidates(self, title):
                return [
                    LocalResolution(
                        LocalBibEntry(
                            "article",
                            "daniell1872lettersreferringexperiments",
                            {
                                "author": "Daniell, W.C.",
                                "year": "1872",
                                "title": _DANIELL_TITLE,
                                "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                                "volume": "2",
                                "pages": "387-390",
                            },
                        ),
                        "crossref:search:1:daniell-good",
                    ),
                    LocalResolution(
                        LocalBibEntry(
                            "article",
                            "daniell2009habitatuseage",
                            {
                                "author": "Daniell, W.C.",
                                "year": "2009",
                                "title": (
                                    "Habitat use of age 0 Alabama shad in the "
                                    "Pascagoula River drainage, USA"
                                ),
                                "journal": "Transactions of the American Fisheries Society",
                                "volume": "19",
                                "number": "1",
                                "pages": "107-115",
                            },
                        ),
                        "crossref:search:2:daniell-bad",
                    ),
                ]

            def search_datacite_candidates(self, title):
                return []

            def search_openalex_candidates(self, title):
                return []

        result = discover_citation_candidates(
            {
                "raw_text": _DANIELL_RAW_TEXT,
                "legacy_reference_number": "160",
            },
            resolver=MockResolver(),
        )
        candidates = result["candidates"]
        self.assertEqual(result["candidate_count"], 2)
        self.assertGreater(candidates[0]["score"], candidates[1]["score"])
        self.assertEqual(candidates[0]["field_matches"]["year"]["status"], "exact")
        self.assertEqual(candidates[1]["field_matches"]["year"]["status"], "conflict")

    def test_local_crossref_candidate_search_filters_placeholder_stub_entries(self) -> None:
        """Of the two canned Crossref items, only "Useful Paper" is returned."""
        canned_response = {
            "message": {
                "items": [
                    {
                        "type": "journal-article",
                        "title": ["Referenced work 1"],
                        "issued": {"date-parts": [[2020]]},
                    },
                    {
                        "type": "journal-article",
                        "title": ["Useful Paper"],
                        "issued": {"date-parts": [[2020]]},
                        "author": [{"family": "Smith", "given": "J S"}],
                        "container-title": ["Journal of Examples"],
                        "DOI": "10.1000/useful",
                    },
                ]
            }
        }
        resolver = LocalMetadataResolver()
        # Replace the JSON fetch on this instance with the canned response.
        resolver._safe_get_json = lambda url: canned_response

        results = resolver.search_crossref_candidates("Useful Paper")

        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entry.fields["title"], "Useful Paper")

    def test_apply_citation_candidate_selection_uses_selected_candidate_fields(self) -> None:
        """Selecting a candidate resolves the citation with an ``editor:selected:`` label."""
        result = apply_citation_candidate_selection(
            {
                "raw_text": _DANIELL_RAW_TEXT,
                "legacy_reference_number": "160",
            },
            {
                "source_label": "crossref:search:1:daniell-good",
                "entry_type": "article",
                "fields": {
                    "author": "Daniell, W.C.",
                    "year": "1872",
                    "title": _DANIELL_TITLE,
                    "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                    "volume": "2",
                    "pages": "387-390",
                },
            },
        )
        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(
            result["resolver_source_label"],
            "editor:selected:crossref:search:1:daniell-good",
        )
        self.assertIn(
            "Comm. Rept. U.S. Comm. Fish & Fish., 2: 387-390",
            result["normalized_text"],
        )