from __future__ import annotations

import json
import unittest

from ecospecies_api.document_format import (
    DocumentNode,
    StructuredDocument,
    build_document_from_species_payload,
    extract_citation_entries,
    extract_species_projection,
    export_markdown_document,
    parse_markdown_document,
    validate_markdown_document,
)


class StructuredMarkdownTests(unittest.TestCase):
    """Tests for the structured Markdown helpers in ``document_format``.

    Covers parsing and exporting Markdown with front matter, building a
    document from a species payload, projecting a document back to a species
    payload, validation errors, and citation extraction.

    NOTE(review): the line breaks and list indentation inside the Markdown
    fixtures below were reconstructed from a whitespace-collapsed copy of
    this file; confirm them against the canonical fixtures.
    """

    def test_round_trip_markdown_preserves_metadata_and_hierarchy(self) -> None:
        """Parsed metadata and heading nesting are exposed and survive export."""
        source = """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
taxon_identifiers:
  - authority: worms
    identifier: 159059
    label: AphiaID
    primary: true
primary_taxon_authority: worms
---

## Summary

Short abstract.

## Habitat

### Type

Estuarine.
"""

        document = parse_markdown_document(source)

        self.assertEqual(document.metadata["title"], "American Oyster")
        self.assertEqual(document.metadata["primary_taxon_authority"], "worms")
        # The identifier is expected back as a string, not an integer.
        self.assertEqual(
            document.metadata["legacy_identifiers"][0]["identifier"], "5192"
        )
        self.assertEqual(
            document.metadata["taxon_identifiers"][0]["authority"], "worms"
        )
        self.assertEqual(document.nodes[0].title, "Summary")
        self.assertEqual(document.nodes[1].children[0].title, "Type")
        self.assertIn("## Habitat", export_markdown_document(document))

    def test_build_document_from_species_payload_creates_markdown_sections(
        self,
    ) -> None:
        """A species payload becomes Summary plus one node per kept section."""
        document = build_document_from_species_payload(
            {
                "title": "American Oyster",
                "common_name": "American Oyster",
                "scientific_name": "Crassostrea virginica",
                "flelmr_code": "5192",
                "source_file": "American Oyster.txt",
                "summary": "Short abstract.",
                "sections": [
                    {"heading": "HEADER", "content": "Ignored header"},
                    {"heading": "Habitat", "content": "Estuarine."},
                    {"heading": "Reproduction", "content": "Broadcast spawner."},
                ],
            }
        )

        self.assertEqual(
            document.metadata["legacy_identifiers"][0]["identifier"], "5192"
        )
        self.assertEqual(
            document.metadata["legacy_identifiers"][0]["authority"],
            "legacy-ecospecies",
        )
        # The "HEADER" section must not appear among the document nodes.
        self.assertEqual(
            [node.title for node in document.nodes],
            ["Summary", "Habitat", "Reproduction"],
        )
        self.assertEqual(document.nodes[1].body, "Estuarine.")

    def test_extract_species_projection_flattens_nested_headings(self) -> None:
        """Nested headings are projected as ``Parent / Child`` section names."""
        document = parse_markdown_document(
            """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
---

## Summary

Short abstract.

## Habitat

General habitat.

### Type

Estuarine.
"""
        )

        projection = extract_species_projection(document)

        self.assertEqual(projection["summary"], "Short abstract.")
        self.assertEqual(projection["flelmr_code"], "5192")
        self.assertEqual(
            [section["heading"] for section in projection["sections"]],
            ["Habitat", "Habitat / Type"],
        )

    def test_extract_species_projection_accepts_legacy_species_code_front_matter(
        self,
    ) -> None:
        """A ``species_code`` front-matter key is projected as ``flelmr_code``."""
        document = parse_markdown_document(
            """---
title: Legacy Fish
common_name: Legacy Fish
scientific_name: Pisces historicus
species_code: 4242
---

## Habitat

Estuarine.
"""
        )

        projection = extract_species_projection(document)

        self.assertEqual(projection["flelmr_code"], "4242")

    def test_validate_markdown_document_rejects_missing_front_matter_and_depth_jump(
        self,
    ) -> None:
        """Validation reports both missing front matter and a heading depth jump."""
        # No front matter, and the headings jump from depth 2 straight to 4.
        errors = validate_markdown_document(
            """## Habitat

Text

#### Type

Nested too deeply.
"""
        )

        self.assertTrue(any("front matter" in error for error in errors))
        self.assertTrue(any("Heading depth jumps" in error for error in errors))

    def test_extract_citation_entries_strips_legacy_comma_number_prefix(
        self,
    ) -> None:
        """A leading ``160,`` prefix is split off into the reference number."""
        document = parse_markdown_document(
            """---
title: Alabama Shad
common_name: Alabama Shad
scientific_name: Alosa alabamae
---

## References

160, Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
"""
        )

        citations = extract_citation_entries(document)

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "160")
        self.assertTrue(citations[0]["raw_text"].startswith("Daniell, W.C. 1872."))
        self.assertFalse(citations[0]["raw_text"].startswith("160,"))

    def test_extract_citation_entries_accepts_colon_terminated_citation_heading(
        self,
    ) -> None:
        """A section titled ``Citations:`` (trailing colon) still yields entries."""
        citations = extract_citation_entries(
            StructuredDocument(
                metadata={},
                nodes=[
                    DocumentNode(
                        node_type="section",
                        title="Citations:",
                        body="7, Ahmed, M. 1975. Speciation in living oysters.",
                        depth=2,
                    )
                ],
            )
        )

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "7")

    def test_extract_citation_entries_accepts_bare_number_prefix(self) -> None:
        """A number prefix without a comma (``848 Gilmore``) is also split off."""
        document = parse_markdown_document(
            """---
title: Eastern Mosquitofish
common_name: Eastern Mosquitofish
scientific_name: Gambusia holbrooki
---

## Citations

848 Gilmore, R.G. 1977. Fishes of the Indian River Lagoon and adjacent waters, Florida.
"""
        )

        citations = extract_citation_entries(document)

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "848")
        self.assertTrue(citations[0]["raw_text"].startswith("Gilmore, R.G. 1977."))