# EcoSpecies-Atlas/apps/api/tests/test_document_format.py
#
# 196 lines
# 6.0 KiB
# Python

from __future__ import annotations
import json
import unittest
from ecospecies_api.document_format import (
DocumentNode,
StructuredDocument,
build_document_from_species_payload,
extract_citation_entries,
extract_species_projection,
export_markdown_document,
parse_markdown_document,
validate_markdown_document,
)
class StructuredMarkdownTests(unittest.TestCase):
def test_round_trip_markdown_preserves_metadata_and_hierarchy(self) -> None:
    # Parse a document with front matter and nested headings, check selected
    # metadata values and heading titles, then confirm the exported Markdown
    # still contains the "## Habitat" heading.
    # NOTE(review): the list items under legacy_identifiers/taxon_identifiers
    # carry no nested indentation in this literal -- confirm that matches the
    # canonical fixture.
    parsed = parse_markdown_document(
        """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 5192
label: FLELMR
taxon_identifiers:
- authority: worms
identifier: 159059
label: AphiaID
primary: true
primary_taxon_authority: worms
---
## Summary
Short abstract.
## Habitat
### Type
Estuarine.
"""
    )

    # Front-matter values.
    front_matter = parsed.metadata
    self.assertEqual(front_matter["title"], "American Oyster")
    self.assertEqual(front_matter["primary_taxon_authority"], "worms")
    self.assertEqual(front_matter["legacy_identifiers"][0]["identifier"], "5192")
    self.assertEqual(front_matter["taxon_identifiers"][0]["authority"], "worms")

    # Heading hierarchy: "Type" is expected as the first child of the second node.
    top_level = parsed.nodes
    self.assertEqual(top_level[0].title, "Summary")
    self.assertEqual(top_level[1].children[0].title, "Type")

    # Export side of the round trip.
    exported = export_markdown_document(parsed)
    self.assertIn("## Habitat", exported)
def test_build_document_from_species_payload_creates_markdown_sections(self) -> None:
    # Build a document from a species payload dict and check the legacy
    # identifier metadata plus the resulting top-level node titles and body.
    section_rows = [
        {"heading": "HEADER", "content": "Ignored header"},
        {"heading": "Habitat", "content": "Estuarine."},
        {"heading": "Reproduction", "content": "Broadcast spawner."},
    ]
    payload = {
        "title": "American Oyster",
        "common_name": "American Oyster",
        "scientific_name": "Crassostrea virginica",
        "flelmr_code": "5192",
        "source_file": "American Oyster.txt",
        "summary": "Short abstract.",
        "sections": section_rows,
    }

    built = build_document_from_species_payload(payload)

    # The flelmr_code is expected to surface as the first legacy identifier.
    legacy_entry = built.metadata["legacy_identifiers"][0]
    self.assertEqual(legacy_entry["identifier"], "5192")
    self.assertEqual(legacy_entry["authority"], "legacy-ecospecies")

    # "HEADER" is absent from the expected titles; "Summary" comes first.
    node_titles = [entry.title for entry in built.nodes]
    self.assertEqual(node_titles, ["Summary", "Habitat", "Reproduction"])
    self.assertEqual(built.nodes[1].body, "Estuarine.")
def test_extract_species_projection_flattens_nested_headings(self) -> None:
    # Project a parsed document back to the species payload shape and check
    # that the nested "Type" heading is reported as "Habitat / Type".
    # NOTE(review): the legacy_identifiers list item has no nested indentation
    # in this literal -- confirm that matches the canonical fixture.
    markdown_source = """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 5192
label: FLELMR
---
## Summary
Short abstract.
## Habitat
General habitat.
### Type
Estuarine.
"""
    parsed = parse_markdown_document(markdown_source)

    flattened = extract_species_projection(parsed)

    self.assertEqual(flattened["summary"], "Short abstract.")
    self.assertEqual(flattened["flelmr_code"], "5192")
    section_headings = [entry["heading"] for entry in flattened["sections"]]
    self.assertEqual(section_headings, ["Habitat", "Habitat / Type"])
def test_extract_species_projection_accepts_legacy_species_code_front_matter(self) -> None:
    # Front matter that only carries `species_code` is expected to yield that
    # value as the projection's "flelmr_code".
    markdown_source = """---
title: Legacy Fish
common_name: Legacy Fish
scientific_name: Pisces historicus
species_code: 4242
---
## Habitat
Estuarine.
"""
    flattened = extract_species_projection(parse_markdown_document(markdown_source))

    self.assertEqual(flattened["flelmr_code"], "4242")
def test_validate_markdown_document_rejects_missing_front_matter_and_depth_jump(self) -> None:
    # The source has no front matter and jumps from a level-2 heading straight
    # to a level-4 heading; both problems are expected among the errors.
    markdown_source = """## Habitat
Text
#### Type
Nested too deeply.
"""
    problems = validate_markdown_document(markdown_source)

    mentions_front_matter = any("front matter" in message for message in problems)
    self.assertTrue(mentions_front_matter)

    mentions_depth_jump = any("Heading depth jumps" in message for message in problems)
    self.assertTrue(mentions_depth_jump)
def test_extract_citation_entries_strips_legacy_comma_number_prefix(self) -> None:
    # A reference line of the form "160, Author ..." is expected to produce one
    # citation whose number is captured separately and removed from raw_text.
    markdown_source = """---
title: Alabama Shad
common_name: Alabama Shad
scientific_name: Alosa alabamae
---
## References
160, Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
"""
    parsed = parse_markdown_document(markdown_source)

    entries = extract_citation_entries(parsed)

    self.assertEqual(len(entries), 1)
    only_entry = entries[0]
    self.assertEqual(only_entry["legacy_reference_number"], "160")
    citation_text = only_entry["raw_text"]
    self.assertTrue(citation_text.startswith("Daniell, W.C. 1872."))
    self.assertFalse(citation_text.startswith("160,"))
def test_extract_citation_entries_accepts_colon_terminated_citation_heading(self) -> None:
    # Build the document directly (no Markdown parsing) with a section titled
    # "Citations:" and check that its body is still read as a citation list.
    citation_section = DocumentNode(
        node_type="section",
        title="Citations:",
        body="7, Ahmed, M. 1975. Speciation in living oysters.",
        depth=2,
    )
    handmade = StructuredDocument(metadata={}, nodes=[citation_section])

    entries = extract_citation_entries(handmade)

    self.assertEqual(len(entries), 1)
    self.assertEqual(entries[0]["legacy_reference_number"], "7")
def test_extract_citation_entries_accepts_bare_number_prefix(self) -> None:
    # A reference line of the form "848 Author ..." (number, no comma) is
    # expected to produce one citation with the number split off raw_text.
    markdown_source = """---
title: Eastern Mosquitofish
common_name: Eastern Mosquitofish
scientific_name: Gambusia holbrooki
---
## Citations
848 Gilmore, R.G. 1977. Fishes of the Indian River Lagoon and adjacent waters, Florida.
"""
    parsed = parse_markdown_document(markdown_source)

    entries = extract_citation_entries(parsed)

    self.assertEqual(len(entries), 1)
    only_entry = entries[0]
    self.assertEqual(only_entry["legacy_reference_number"], "848")
    self.assertTrue(only_entry["raw_text"].startswith("Gilmore, R.G. 1977."))