Prepare public-safe repo update

parent a6b04a995a
commit 1143f9bfcc
@@ -7,3 +7,12 @@ __pycache__/
 node_modules/
 test-results/
 playwright-report/
+*~
+*.env
+secrets*
+codex*
+restart.sh
+*lock.json
+input-data/
+legacy-data
+var/logs/
README.md (19 changed lines)
@@ -21,7 +21,7 @@ Docker Compose owns all runtime dependencies:
 - Python services run in `python:3.12-slim`
 - the Python virtual environment is created in a Docker-managed volume mounted at `/workspace/.docker/venv`
 - dependencies are installed from `apps/api/requirements.txt` inside that virtual environment
-- the legacy corpus is mounted read-only from `../01-legacy-code-and-data`
+- the legacy corpus is mounted read-only from a sibling directory, defaulting to `../legacy-corpus`
 
 No host Python packages are required for the Compose workflow.
 
@@ -48,6 +48,13 @@ Endpoints:
 - editor section detail/update: `/api/editor/species/<slug>/sections/<position>` (requires `editor` or `admin`)
 - editor audit history: `/api/editor/species/<slug>/audit` (requires `editor` or `admin`)
 
+The app can also be published under a URL prefix. A reverse-proxy deployment can publish the app at a host and path such as:
+
+- `ECOSPECIES_HOSTNAME=example.org`
+- `ECOSPECIES_BASE_PATH=/apps/ecospecies`
+
+When the site is served below a path prefix, the frontend derives its API base from the current page URL and nginx serves both the UI and proxied API under that same prefix.
+
 If those host ports are already in use, override them when starting Compose, for example:
 
 ```bash
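Note (illustrative, not part of this commit): the README hunk above says the frontend derives its API base from the current page URL when served under `ECOSPECIES_BASE_PATH`. A minimal sketch of that derivation, with a hypothetical helper name and made-up URLs:

```python
# Illustrative sketch only: deriving an API base from the page URL when the
# site sits under a path prefix. derive_api_base is a hypothetical name.
from urllib.parse import urlparse


def derive_api_base(page_url: str, known_prefix: str = "/apps/ecospecies") -> str:
    """Return the API base for a page served at or below a known prefix."""
    path = urlparse(page_url).path
    # If the page lives under the prefix, the API is proxied under it too.
    if path == known_prefix or path.startswith(known_prefix + "/"):
        return known_prefix + "/api"
    return "/api"


assert derive_api_base("https://example.org/apps/ecospecies/species/manatee") == "/apps/ecospecies/api"
assert derive_api_base("https://example.org/species/manatee") == "/api"
```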
@@ -87,6 +94,14 @@ Run the browser-level smoke test against the real Compose stack with:
 ./scripts/check-ui-stack-smoke.sh
 ```
 
+Run a bounded citation backfill pass with:
+
+```bash
+./scripts/run-citation-backfill.sh
+```
+
+The wrapper runs inside `ecospecies-api`, keeps a rotating cursor in `var/citation-backfill.cursor`, and skips a run if another backfill is already active.
+
 ## Notes
 
 - The importer seeds PostgreSQL from the legacy text corpus before the API starts and now synchronizes by slug instead of truncating the full dataset.
@@ -98,6 +113,8 @@ Run the browser-level smoke test against the real Compose stack with:
 - Initial editor auth uses `ECOSPECIES_AUTH_TOKENS` in the format `token:username:role[,token2:username2:role2]`, where `role` is `viewer`, `editor`, or `admin`.
 - Editorial workflow state is persisted per species with `draft`, `review`, and `published` statuses. Public endpoints return only `published` records; editor endpoints can inspect and update all records.
 - Editors can curate top-level metadata and section content from the web UI, and every editorial or section change is recorded in per-species audit history.
+- Citation backfill can be scheduled externally, such as with a nightly cron job that runs `./scripts/run-citation-backfill.sh`. Use `ECOSPECIES_BACKFILL_LOG_DIR` if logs should go somewhere other than `var/logs`.
+- Unresolved citation enrichment now still refreshes the locally parsed BibTeX and normalized citation text, so parser improvements propagate even without a remote metadata match.
 - Summary authoring guidance for future FLELMR-compatible records is in `docs/flelmr-authoring.md`.
 - Legacy survey and roadmap artifacts are in `docs/`.
 
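Note (illustrative, not part of this commit): the Notes hunk documents bearer tokens from `ECOSPECIES_AUTH_TOKENS` and role-gated editor endpoints. A minimal sketch of exercising one such endpoint; the token value, host, and port are invented assumptions:

```python
# Illustrative sketch only: calling a role-gated endpoint with a bearer token.
# "s3cret-editor-token" would come from an ECOSPECIES_AUTH_TOKENS entry such as
# "s3cret-editor-token:jane:editor"; localhost:8000 is an assumed bind address.
import json
import urllib.request

token = "s3cret-editor-token"
request = urllib.request.Request(
    "http://localhost:8000/api/editor/status",
    headers={"Authorization": f"Bearer {token}"},
)
with urllib.request.urlopen(request) as response:
    print(json.loads(response.read()))
```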
@@ -15,17 +15,36 @@ from ecospecies_api.auth import (
 )
 from ecospecies_api.parser import get_default_data_dir, load_species_records
 from ecospecies_api.repository import (
+    add_species_citation_from_candidate,
+    apply_species_citation_candidate_selection,
+    create_contributor_species,
+    get_contributor_species_citations,
+    get_contributor_species_detail,
+    get_contributor_species_document,
+    get_contributor_species_list,
+    get_species_citation_candidates,
+    get_editor_species_citations,
     get_editor_species_detail,
+    get_species_document,
     get_editor_species_list,
     get_editor_species_workflow,
+    get_minimum_contributor_age,
     get_species_by_slug,
     list_species_audit,
+    list_public_bibliography,
     get_readiness_status,
     get_summary_metrics,
     has_species_data,
     import_species_payload,
     list_diagnostics,
     list_species,
+    register_contributor,
+    update_species_citation_enrichment,
+    backfill_species_citations,
+    update_species_citations_enrichment_batch,
+    update_species_citation_review,
+    update_contributor_species_document_markdown,
+    update_species_document_markdown,
     update_species_section,
     update_species_editorial,
 )
@@ -99,6 +118,7 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
                 {
                     "authenticated": session is not None,
                     "auth_configured": auth_is_configured(),
+                    "minimum_contributor_age": get_minimum_contributor_age(),
                     "user": (
                         {"username": session.username, "role": session.role}
                         if session is not None
@@ -108,6 +128,23 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             )
             return
 
+        if path == "/api/contributor/status":
+            if not self.require_role(session, "contributor"):
+                return
+            self.write_json(
+                {
+                    "status": "ok",
+                    "contributor_access": True,
+                    "user": {"username": session.username, "role": session.role},
+                    "minimum_age": get_minimum_contributor_age(),
+                    "capabilities": [
+                        "create_species_draft",
+                        "edit_owned_drafts",
+                    ],
+                }
+            )
+            return
+
         if path == "/api/editor/status":
             if not self.require_role(session, "editor"):
                 return
@@ -135,10 +172,42 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
                     "slug": item["slug"],
                     "title": item["title"],
                     "common_name": item["common_name"],
+                    "scientific_name": item["scientific_name"],
+                    "legacy_identifiers": item["legacy_identifiers"],
+                    "taxon_identifiers": item["taxon_identifiers"],
+                    "primary_taxon_authority": item["primary_taxon_authority"],
+                    "primary_taxon_identifier": item["primary_taxon_identifier"],
                     "publication_status": item["publication_status"],
                     "is_archived": item["is_archived"],
                     "last_modified_by": item["last_modified_by"],
                     "diagnostic_count": len(item["diagnostics"]),
+                    "summary": item["summary"],
+                }
+                for item in items
+            ]
+            self.write_json({"items": compact, "count": len(compact)})
+            return
+
+        if path == "/api/contributor/species":
+            if not self.require_role(session, "contributor"):
+                return
+            search = query.get("search", [""])[0].strip().lower()
+            items = get_contributor_species_list(session.username, search)
+            compact = [
+                {
+                    "slug": item["slug"],
+                    "title": item["title"],
+                    "common_name": item["common_name"],
+                    "scientific_name": item["scientific_name"],
+                    "legacy_identifiers": item["legacy_identifiers"],
+                    "taxon_identifiers": item["taxon_identifiers"],
+                    "primary_taxon_authority": item["primary_taxon_authority"],
+                    "primary_taxon_identifier": item["primary_taxon_identifier"],
+                    "publication_status": item["publication_status"],
+                    "is_archived": item["is_archived"],
+                    "last_modified_by": item["last_modified_by"],
+                    "diagnostic_count": len(item["diagnostics"]),
+                    "summary": item["summary"],
                 }
                 for item in items
             ]
@@ -176,7 +245,68 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
             return
 
-        if path.startswith("/api/editor/species/") and not path.endswith("/workflow") and not path.endswith("/editorial") and not path.endswith("/audit"):
+        if path.startswith("/api/editor/species/") and path.endswith("/document"):
+            if not self.require_role(session, "editor"):
+                return
+            slug = path[len("/api/editor/species/") : -len("/document")].strip("/")
+            item = get_species_document(slug)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/editor/species/") and path.endswith("/citations"):
+            if not self.require_role(session, "editor"):
+                return
+            slug = path[len("/api/editor/species/") : -len("/citations")].strip("/")
+            item = get_editor_species_citations(slug)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/editor/species/") and "/citations/" in path and path.endswith("/candidates"):
+            if not self.require_role(session, "editor"):
+                return
+            slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/")
+            citation_tail = tail[: -len("/candidates")].strip("/")
+            try:
+                citation_id = int(citation_tail)
+            except ValueError:
+                self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                return
+            item = get_species_citation_candidates(slug.strip("/"), citation_id)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/contributor/species/") and path.endswith("/document"):
+            if not self.require_role(session, "contributor"):
+                return
+            slug = path[len("/api/contributor/species/") : -len("/document")].strip("/")
+            item = get_contributor_species_document(slug, session.username)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/contributor/species/") and path.endswith("/citations"):
+            if not self.require_role(session, "contributor"):
+                return
+            slug = path[len("/api/contributor/species/") : -len("/citations")].strip("/")
+            item = get_contributor_species_citations(slug, session.username)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/editor/species/") and not path.endswith("/workflow") and not path.endswith("/editorial") and not path.endswith("/audit") and not path.endswith("/document"):
             if not self.require_role(session, "editor"):
                 return
             slug = path[len("/api/editor/species/") :].strip("/")
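Note (illustrative, not part of this commit): the candidate-lookup branch above splits the slug and numeric citation id by partitioning the path on `/citations/`. A minimal sketch of that parsing, with an invented slug:

```python
# Illustrative sketch only: the partition-based routing used by the handler.
# "west-indian-manatee" and 42 are made-up example values.
path = "/api/editor/species/west-indian-manatee/citations/42/candidates"
slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/")
citation_id = int(tail[: -len("/candidates")].strip("/"))
assert (slug.strip("/"), citation_id) == ("west-indian-manatee", 42)
```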
@@ -187,6 +317,17 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             self.write_json(item)
             return
 
+        if path.startswith("/api/contributor/species/") and not path.endswith("/document"):
+            if not self.require_role(session, "contributor"):
+                return
+            slug = path[len("/api/contributor/species/") :].strip("/")
+            item = get_contributor_species_detail(slug, session.username)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
         if path.startswith("/api/editor/species/") and path.endswith("/workflow"):
             if not self.require_role(session, "editor"):
                 return
@@ -215,6 +356,12 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             self.write_json({"items": flagged, "count": len(flagged)})
             return
 
+        if path == "/api/bibliography":
+            search = query.get("search", [""])[0].strip()
+            items = list_public_bibliography(search=search)
+            self.write_json({"items": items, "count": len(items)})
+            return
+
         if path == "/api/species":
             search = query.get("search", [""])[0].strip().lower()
             species = list_species(search)
@@ -225,6 +372,10 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
                     "common_name": item["common_name"],
                     "scientific_name": item["scientific_name"],
                     "flelmr_code": item["flelmr_code"],
+                    "legacy_identifiers": item["legacy_identifiers"],
+                    "taxon_identifiers": item["taxon_identifiers"],
+                    "primary_taxon_authority": item["primary_taxon_authority"],
+                    "primary_taxon_identifier": item["primary_taxon_identifier"],
                     "summary": item["summary"],
                     "section_count": item["section_count"],
                     "diagnostic_count": len(item["diagnostics"]),
@@ -250,6 +401,47 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
         path = parsed.path
         session = resolve_auth_session(self.headers)
 
+        if path == "/api/contributor/register":
+            payload = self.read_json_body()
+            if payload is None:
+                return
+            email = payload.get("email")
+            age_gate_confirmed = payload.get("age_gate_confirmed")
+            if not isinstance(email, str):
+                self.write_json({"error": "email must be a string"}, status=HTTPStatus.BAD_REQUEST)
+                return
+            if not isinstance(age_gate_confirmed, bool):
+                self.write_json(
+                    {"error": "age_gate_confirmed must be a boolean"},
+                    status=HTTPStatus.BAD_REQUEST,
+                )
+                return
+            try:
+                result = register_contributor(email=email, age_gate_confirmed=age_gate_confirmed)
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+            self.write_json({"status": "ok", **result}, status=HTTPStatus.CREATED)
+            return
+
+        if path == "/api/contributor/species":
+            if not self.require_role(session, "contributor"):
+                return
+            payload = self.read_json_body()
+            if payload is None:
+                return
+            markdown = payload.get("markdown")
+            if markdown is not None and not isinstance(markdown, str):
+                self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
+                return
+            try:
+                result = create_contributor_species(session.username, markdown)
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+            self.write_json({"status": "ok", **result}, status=HTTPStatus.CREATED)
+            return
+
         if path.startswith("/api/editor/species/") and path.endswith("/workflow"):
             if not self.require_role(session, "editor"):
                 return
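Note (illustrative, not part of this commit): the registration branch above validates that `email` is a string and `age_gate_confirmed` is a boolean, returning 201 on success. A minimal client sketch; the email value, host, and port are invented:

```python
# Illustrative sketch only: posting a registration payload to the new
# contributor endpoint. localhost:8000 is an assumed bind address.
import json
import urllib.request

body = json.dumps({"email": "new.contributor@example.org", "age_gate_confirmed": True}).encode("utf-8")
request = urllib.request.Request(
    "http://localhost:8000/api/contributor/register",
    data=body,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(request) as response:
    print(response.status)  # 201 Created on success
    print(json.loads(response.read()))
```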
@@ -341,6 +533,229 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             self.write_json({"status": "ok", **result})
             return
 
+        if path.startswith("/api/editor/species/") and path.endswith("/document"):
+            if not self.require_role(session, "editor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            markdown = payload.get("markdown")
+            if not isinstance(markdown, str):
+                self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            slug = path[len("/api/editor/species/") : -len("/document")].strip("/")
+            try:
+                result = update_species_document_markdown(
+                    slug=slug,
+                    markdown=markdown,
+                    username=session.username,
+                )
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
+        if (
+            path.startswith("/api/editor/species/")
+            and "/citations/" in path
+            and not path.endswith("/citations/enrich")
+            and not path.endswith("/citations/backfill")
+        ):
+            if not self.require_role(session, "editor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/")
+            if tail.endswith("/enrich"):
+                citation_tail = tail[: -len("/enrich")].strip("/")
+                try:
+                    citation_id = int(citation_tail)
+                except ValueError:
+                    self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+
+                result = update_species_citation_enrichment(
+                    slug=slug.strip("/"),
+                    citation_id=citation_id,
+                    username=session.username,
+                )
+                if result is None:
+                    self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                    return
+
+                self.write_json({"status": "ok", **result})
+                return
+
+            if tail.endswith("/apply-match"):
+                citation_tail = tail[: -len("/apply-match")].strip("/")
+                try:
+                    citation_id = int(citation_tail)
+                except ValueError:
+                    self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+                candidate = payload.get("candidate")
+                if not isinstance(candidate, dict):
+                    self.write_json({"error": "candidate must be an object"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+                result = apply_species_citation_candidate_selection(
+                    slug=slug.strip("/"),
+                    citation_id=citation_id,
+                    candidate=candidate,
+                    username=session.username,
+                )
+                if result is None:
+                    self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                    return
+                self.write_json({"status": "ok", **result})
+                return
+
+            if tail.endswith("/add-match"):
+                citation_tail = tail[: -len("/add-match")].strip("/")
+                try:
+                    citation_id = int(citation_tail)
+                except ValueError:
+                    self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+                candidate = payload.get("candidate")
+                if not isinstance(candidate, dict):
+                    self.write_json({"error": "candidate must be an object"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+                result = add_species_citation_from_candidate(
+                    slug=slug.strip("/"),
+                    citation_id=citation_id,
+                    candidate=candidate,
+                    username=session.username,
+                )
+                if result is None:
+                    self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                    return
+                self.write_json({"status": "ok", **result})
+                return
+
+            try:
+                citation_id = int(tail.strip("/"))
+            except ValueError:
+                self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            for field in ("review_status", "normalized_text", "abstract_text", "doi", "citation_key", "entry_type", "draft_bibtex"):
+                value = payload.get(field)
+                if value is not None and not isinstance(value, str):
+                    self.write_json(
+                        {"error": f"{field} must be a string"},
+                        status=HTTPStatus.BAD_REQUEST,
+                    )
+                    return
+
+            try:
+                result = update_species_citation_review(
+                    slug=slug.strip("/"),
+                    citation_id=citation_id,
+                    review_status=payload.get("review_status"),
+                    normalized_text=payload.get("normalized_text"),
+                    doi=payload.get("doi"),
+                    citation_key=payload.get("citation_key"),
+                    entry_type=payload.get("entry_type"),
+                    draft_bibtex=payload.get("draft_bibtex"),
+                    abstract_text=payload.get("abstract_text"),
+                    username=session.username,
+                )
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
+        if path.startswith("/api/editor/species/") and path.endswith("/citations/enrich"):
+            if not self.require_role(session, "editor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            slug = path[len("/api/editor/species/") : -len("/citations/enrich")].strip("/")
+            result = update_species_citations_enrichment_batch(
+                slug=slug,
+                username=session.username,
+            )
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
+        if path.startswith("/api/editor/species/") and path.endswith("/citations/backfill"):
+            if not self.require_role(session, "editor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            slug = path[len("/api/editor/species/") : -len("/citations/backfill")].strip("/")
+            include_accepted = bool(payload.get("include_accepted", False))
+            result = backfill_species_citations(
+                slug=slug,
+                username=session.username,
+                include_accepted=include_accepted,
+            )
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
+        if path.startswith("/api/contributor/species/") and path.endswith("/document"):
+            if not self.require_role(session, "contributor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            markdown = payload.get("markdown")
+            if not isinstance(markdown, str):
+                self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            slug = path[len("/api/contributor/species/") : -len("/document")].strip("/")
+            try:
+                result = update_contributor_species_document_markdown(
+                    slug=slug,
+                    markdown=markdown,
+                    username=session.username,
+                )
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
         self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
 
     def log_message(self, format: str, *args: object) -> None:
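Note (illustrative, not part of this commit): the citation-review branch above accepts optional string fields and rejects anything else with 400. A sketch of one such payload; every value is invented for the example:

```python
# Illustrative sketch only: a review payload POSTed to
# /api/editor/species/<slug>/citations/<id>. All fields are optional strings.
review_payload = {
    "review_status": "accepted",
    "normalized_text": "Smith, J. 1994. Seagrass recovery in estuaries. Journal of Coastal Research 10(2): 301-312.",
    "doi": "10.0000/example-doi",
    "citation_key": "smith1994seagrassrecoveryestuaries",
    "entry_type": "article",
}
```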
@@ -1,14 +1,21 @@
 from __future__ import annotations
 
+import hashlib
 import os
 from dataclasses import dataclass
 from typing import Mapping
 
+from sqlalchemy import select
+
+from ecospecies_api.db import SessionLocal, create_db_engine
+from ecospecies_api.models import Base, ContributorAccount
+
 
 ROLE_ORDER = {
     "viewer": 1,
-    "editor": 2,
-    "admin": 3,
+    "contributor": 2,
+    "editor": 3,
+    "admin": 4,
 }
 
 
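Note (illustrative, not part of this commit): `ROLE_ORDER` now ranks viewer < contributor < editor < admin. `require_role` itself is not shown in this diff, so the sketch below is an assumption about how such a rank check is commonly written, not the project's actual code:

```python
# Illustrative sketch only: a rank comparison against ROLE_ORDER.
# role_satisfies is a hypothetical helper name.
ROLE_ORDER = {"viewer": 1, "contributor": 2, "editor": 3, "admin": 4}


def role_satisfies(session_role: str, required_role: str) -> bool:
    """True when the session's role ranks at or above the required role."""
    return ROLE_ORDER.get(session_role, 0) >= ROLE_ORDER.get(required_role, 0)


assert role_satisfies("admin", "editor")        # admins can use editor endpoints
assert role_satisfies("editor", "contributor")  # editors outrank contributors
assert not role_satisfies("contributor", "editor")
```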
@@ -41,17 +48,27 @@ def _parse_token_entry(entry: str) -> tuple[str, AuthSession]:
 
 
 def get_token_registry() -> dict[str, AuthSession]:
-    configured = os.environ.get("ECOSPECIES_AUTH_TOKENS", "").strip()
-    if not configured:
-        return {}
-
     registry: dict[str, AuthSession] = {}
+    configured = os.environ.get("ECOSPECIES_AUTH_TOKENS", "").strip()
+    if configured:
         for raw_entry in configured.split(","):
             entry = raw_entry.strip()
             if not entry:
                 continue
             token, session = _parse_token_entry(entry)
             registry[token] = session
+
+    engine = create_db_engine()
+    Base.metadata.create_all(engine)
+    with SessionLocal() as session:
+        for account in session.scalars(
+            select(ContributorAccount).where(ContributorAccount.is_active.is_(True))
+        ):
+            registry[account.token_hash] = AuthSession(
+                token=account.token_hash,
+                username=account.email,
+                role="contributor",
+            )
     return registry
 
 
@@ -70,7 +87,11 @@ def resolve_auth_session(headers: Mapping[str, str]) -> AuthSession | None:
     token = get_bearer_token(headers)
     if not token:
         return None
-    return registry.get(token)
+    direct = registry.get(token)
+    if direct is not None:
+        return direct
+    token_hash = hashlib.sha256(token.encode("utf-8")).hexdigest()
+    return registry.get(token_hash)
 
 
 def auth_is_configured() -> bool:
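Note (illustrative, not part of this commit): contributor tokens are registered under their SHA-256 hex digest (`account.token_hash`), so `resolve_auth_session` now falls back to a hashed lookup when the raw bearer token is not a direct key. A minimal sketch of that round trip with an invented token:

```python
# Illustrative sketch only: the hashed-token fallback. A contributor row
# stores sha256(raw_token) as token_hash; presenting the raw token still
# resolves once it is hashed. The token value is made up.
import hashlib

raw_token = "contributor-raw-token"
stored_token_hash = hashlib.sha256(raw_token.encode("utf-8")).hexdigest()

registry = {stored_token_hash: ("new.contributor@example.org", "contributor")}

# Direct lookup misses (the raw token is not a key); the hashed lookup hits.
assert registry.get(raw_token) is None
assert registry.get(hashlib.sha256(raw_token.encode("utf-8")).hexdigest()) is not None
```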
(File diff suppressed because it is too large.)
@@ -0,0 +1,387 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+import re
+import sys
+
+
+def _load_citegeist_extract():
+    citegeist_src = Path(__file__).resolve().parents[5] / "CiteGeist" / "src"
+    if citegeist_src.exists() and str(citegeist_src) not in sys.path:
+        sys.path.insert(0, str(citegeist_src))
+    try:
+        from citegeist.extract import extract_references  # type: ignore
+    except ImportError:
+        return None
+    return extract_references
+
+
+@dataclass
+class DraftCitation:
+    citation_key: str
+    entry_type: str
+    fields: dict[str, str]
+    draft_bibtex: str
+
+
+STOPWORD_TOKENS = {
+    "a",
+    "an",
+    "and",
+    "for",
+    "from",
+    "in",
+    "of",
+    "on",
+    "the",
+    "to",
+    "with",
+}
+HISTORICAL_YEAR_PATTERN = r"(1\d{3}|20\d{2})"
+
+
+def build_standard_citation_key(
+    authors: str = "",
+    year: str = "",
+    title: str = "",
+    fallback_text: str = "",
+) -> str:
+    family_name = _family_name_stem(authors or fallback_text)
+    year_stem = re.sub(r"[^0-9]+", "", year)[:4]
+    topic_stem = _topic_stem(title or fallback_text)
+    key = f"{family_name}{year_stem}{topic_stem}"
+    return key or "reference"
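Note (illustrative, not part of this commit): `build_standard_citation_key` concatenates a lowercased family-name stem, a four-digit year stem, and up to three non-stopword title tokens. A quick sketch with an invented reference:

```python
# Illustrative sketch only: what build_standard_citation_key produces for a
# made-up reference, assuming the module's functions are importable.
key = build_standard_citation_key(
    authors="Smith, J.",
    year="1994",
    title="Seagrass recovery in estuaries",
)
assert key == "smith1994seagrassrecoveryestuaries"  # "in" is a stopword and is dropped
```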
+
+
+def extract_draft_citation(raw_text: str, legacy_reference_number: str = "") -> DraftCitation | None:
+    extractor = _load_citegeist_extract()
+    if extractor is None:
+        return _fallback_citation(raw_text, legacy_reference_number)
+
+    entries = extractor(raw_text)
+    if not entries:
+        return _fallback_citation(raw_text, legacy_reference_number)
+
+    entry = entries[0]
+    fields = dict(entry.fields)
+    fields = _repair_reference_fields(raw_text, fields)
+    citation_key = build_standard_citation_key(
+        authors=str(fields.get("author", "")),
+        year=str(fields.get("year", "")),
+        title=str(fields.get("title", "")),
+        fallback_text=raw_text,
+    )
+    note_parts = [fields.get("note", "").strip()] if fields.get("note") else []
+    if legacy_reference_number:
+        note_parts.append(f"ecospecies_reference_number = {{{legacy_reference_number}}}")
+    fields["note"] = "; ".join(part for part in note_parts if part)
+    draft_bibtex = render_single_bibtex(entry.entry_type, citation_key, fields)
+    return DraftCitation(
+        citation_key=citation_key,
+        entry_type=entry.entry_type,
+        fields=fields,
+        draft_bibtex=draft_bibtex,
+    )
+
+
+def _fallback_citation(raw_text: str, legacy_reference_number: str) -> DraftCitation:
+    year_match = re.search(rf"\b{HISTORICAL_YEAR_PATTERN}\b", raw_text)
+    year = year_match.group(0) if year_match else ""
+    fields = _repair_reference_fields(
+        raw_text,
+        {
+            "title": raw_text.strip(),
+            "year": year,
+        },
+    )
+    title = str(fields.get("title", "")).strip() or raw_text.strip()
+    citation_key = build_standard_citation_key(year=year, title=title, fallback_text=raw_text)
+    fields["note"] = f"raw_reference = {{{raw_text}}}"
+    if legacy_reference_number:
+        fields["note"] += f"; ecospecies_reference_number = {{{legacy_reference_number}}}"
+    draft_bibtex = render_single_bibtex("misc", citation_key, fields)
+    return DraftCitation(
+        citation_key=citation_key,
+        entry_type="misc",
+        fields=fields,
+        draft_bibtex=draft_bibtex,
+    )
+
+
+def _family_name_stem(raw_text: str) -> str:
+    compact = raw_text.strip()
+    if not compact:
+        return "ref"
+    if "," in compact:
+        compact = compact.split(",", 1)[0]
+    else:
+        compact = compact.split()[0]
+    compact = re.sub(r"[^A-Za-z0-9]+", "", compact).lower()
+    return compact or "ref"
+
+
+def _topic_stem(raw_text: str) -> str:
+    tokens = [
+        token
+        for token in re.findall(r"[A-Za-z0-9]+", raw_text.lower())
+        if token not in STOPWORD_TOKENS and not token.isdigit()
+    ]
+    topic_tokens = tokens[:3] or ["topic"]
+    return "".join(topic_tokens)
+
+
+def _repair_reference_fields(raw_text: str, fields: dict[str, str]) -> dict[str, str]:
+    repaired = dict(fields)
+    title = str(repaired.get("title", "")).strip()
+    raw = raw_text.strip()
+    if not raw:
+        return repaired
+
+    parsed = _parse_report_style_reference(raw)
+    if parsed is None:
+        return repaired
+
+    current_venue = (
+        str(repaired.get("journal", "")).strip()
+        or str(repaired.get("howpublished", "")).strip()
+        or str(repaired.get("booktitle", "")).strip()
+        or str(repaired.get("publisher", "")).strip()
+    )
+    parsed_venue = str(parsed.get("venue", "")).strip()
+    needs_structural_repair = bool(
+        parsed_venue
+        and (
+            not current_venue
+            or len(current_venue) < max(8, len(parsed_venue) // 2)
+            or current_venue.lower() not in parsed_venue.lower()
+            or (parsed.get("volume") and not str(repaired.get("volume", "")).strip())
+            or (parsed.get("number") and not str(repaired.get("number", "")).strip())
+            or (parsed.get("pages") and not str(repaired.get("pages", "")).strip())
+        )
+    )
+    if title and not _title_looks_like_raw_reference(title) and not needs_structural_repair:
+        return repaired
+
+    if parsed.get("author"):
+        repaired["author"] = parsed["author"]
+    if parsed.get("year"):
+        repaired["year"] = parsed["year"]
+    if parsed.get("title"):
+        repaired["title"] = parsed["title"]
+    venue = parsed.get("venue", "")
+    if venue:
+        repaired.pop("howpublished", None)
+        if _venue_looks_journal_like(venue):
+            repaired["journal"] = venue
+        else:
+            repaired["howpublished"] = venue
+    if parsed.get("volume"):
+        repaired["volume"] = parsed["volume"]
+    if parsed.get("number"):
+        repaired["number"] = parsed["number"]
+    if parsed.get("pages"):
+        repaired["pages"] = parsed["pages"]
+    return repaired
+
+
+def _title_looks_like_raw_reference(title: str) -> bool:
+    compact = " ".join(title.split()).strip()
+    if not compact:
+        return True
+    if len(compact) > 120:
+        return True
+    return bool(re.match(rf"^[^,]+,\s+.+\b{HISTORICAL_YEAR_PATTERN}\.\s+", compact))
+
+
+def _parse_report_style_reference(raw_text: str) -> dict[str, str] | None:
+    match = re.match(
+        rf"^(?P<author>.+?)\s+(?P<year>{HISTORICAL_YEAR_PATTERN})\.\s+(?P<remainder>.+)$",
+        raw_text.strip(),
+    )
+    if match is None:
+        return None
+
+    author = match.group("author").strip(" .")
+    year = match.group("year").strip()
+    remainder = match.group("remainder").strip()
+    if not author or not remainder:
+        return None
+
+    venue_start = _find_venue_start(remainder)
+    if venue_start is None:
+        return {
+            "author": author,
+            "year": year,
+            "title": remainder.strip(" ."),
+            "venue": "",
+        }
+
+    title = remainder[:venue_start].strip(" .")
+    venue_part = remainder[venue_start:].strip(" .")
+    venue, volume, number, pages = _split_venue_and_locator(venue_part)
+    return {
+        "author": author,
+        "year": year,
+        "title": title,
+        "venue": venue,
+        "volume": volume,
+        "number": number,
+        "pages": pages,
+    }
+
+
+def _split_venue_and_locator(venue_part: str) -> tuple[str, str, str, str]:
+    compact = venue_part.strip(" .")
+    if not compact:
+        return "", "", "", ""
+
+    match = re.search(
+        r"(?P<venue>.+?)\.\s+(?P<volume>\d+)(?:\((?P<number>[^)]+)\))?\s*:\s*(?P<pages>\d+(?:-\d+)?)\.?$",
+        compact,
+    )
+    if match is None:
+        match = re.search(
+            r"(?P<venue>.+?)\s+(?P<volume>\d+)(?:\((?P<number>[^)]+)\))?\s*:\s*(?P<pages>\d+(?:-\d+)?)\.?$",
+            compact,
+        )
+    if match is None:
+        return compact, "", "", ""
+
+    return (
+        match.group("venue").strip(" ."),
+        (match.group("volume") or "").strip(),
+        (match.group("number") or "").strip(),
+        (match.group("pages") or "").strip(),
+    )
+
+
+def _find_venue_start(remainder: str) -> int | None:
+    for match in re.finditer(r"\.\s+", remainder):
+        candidate_start = match.end()
+        candidate = remainder[candidate_start:].strip()
+        if _looks_like_publication_segment(candidate):
+            return candidate_start
+
+    lowered = remainder.lower()
+    markers = (
+        "comm. rept.",
+        "rept.",
+        "proc.",
+        "procs.",
+        "journal",
+        "transactions",
+        "proceedings",
+        "bulletin",
+        "bull.",
+        "occas. pap.",
+        "pap.",
+        "memoir",
+        "memorandum",
+        "memo.",
+        "tech. memo.",
+        "tech memo",
+        "technical memorandum",
+        "technical report",
+        "noaa",
+    )
+    positions = [lowered.find(marker) for marker in markers if lowered.find(marker) > 0]
+    if positions:
+        return min(positions)
+    return None
+
+
+def _looks_like_publication_segment(candidate: str) -> bool:
+    compact = candidate.strip(" .")
+    if not compact:
+        return False
+
+    venue, volume, number, pages = _split_venue_and_locator(compact)
+    if venue and (volume or number or pages) and _starts_with_publication_marker(compact):
+        return True
+
+    return _starts_with_publication_marker(compact)
+
+
+def _starts_with_publication_marker(text: str) -> bool:
+    lowered = text.lower()
+    publication_starts = (
+        "comm. rept.",
+        "rept.",
+        "proc.",
+        "procs.",
+        "journal",
+        "transactions",
+        "proceedings",
+        "bulletin",
+        "bull.",
+        "occas. pap.",
+        "pap.",
+        "memoir",
+        "memorandum",
+        "memo.",
+        "tech. memo.",
+        "tech memo",
+        "technical memorandum",
+        "technical report",
+        "noaa",
+        "u.s.",
+    )
+    return lowered.startswith(publication_starts)
+
+
+def _venue_looks_journal_like(venue: str) -> bool:
+    lowered = venue.lower()
+    return any(
+        token in lowered
+        for token in (
+            "journal",
+            "transactions",
+            "review",
+            "letters",
+            "comm. rept.",
+            "rept.",
+            "proc.",
+            "proceedings",
+            "occas. pap.",
+            "pap.",
+        )
+    )
+
+
+def render_single_bibtex(entry_type: str, citation_key: str, fields: dict[str, str]) -> str:
+    lines = [f"@{entry_type}{{{citation_key},"]
+    for key in sorted(fields):
+        value = _sanitize_bibtex_value(fields[key])
+        lines.append(f" {key} = {{{value}}},")
+    lines.append("}")
+    return "\n".join(lines)
+
+
+def _sanitize_bibtex_value(value: str) -> str:
+    depth = 0
+    parts: list[str] = []
+    for char in value:
+        if char == "{":
+            depth += 1
+            parts.append(char)
+            continue
+        if char == "}":
+            if depth == 0:
+                parts.append(")")
+            else:
+                depth -= 1
+                parts.append(char)
+            continue
+        parts.append(char)
+    if depth > 0:
+        open_count = depth
+        normalized: list[str] = []
+        for char in parts:
+            if char == "{" and open_count > 0:
+                normalized.append("(")
+                open_count -= 1
+            else:
+                normalized.append(char)
+        return "".join(normalized)
+    return "".join(parts)
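Note (illustrative, not part of this commit): `render_single_bibtex` emits fields in sorted order, sanitizing unbalanced braces so the entry stays parseable. A quick sketch with invented values:

```python
# Illustrative sketch only: rendering a small entry, assuming the module's
# functions are importable. The key and field values are made up.
draft = render_single_bibtex("misc", "smith1994seagrass", {"year": "1994", "title": "Seagrass recovery"})
print(draft)
# @misc{smith1994seagrass,
#  title = {Seagrass recovery},
#  year = {1994},
# }
```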
@@ -0,0 +1,480 @@
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import asdict, dataclass, field
+
+
+HEADING_PATTERN = re.compile(r"^(#{2,6})\s+(?P<title>.+?)\s*$")
+INDENTED_ITEM_PATTERN = re.compile(r"^\s*-\s*(?P<body>.+?)\s*$")
+DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Za-z0-9]+\b")
+
+
+@dataclass
+class DocumentNode:
+    node_type: str
+    title: str
+    body: str
+    depth: int
+    children: list["DocumentNode"] = field(default_factory=list)
+
+
+@dataclass
+class StructuredDocument:
+    metadata: dict[str, object]
+    nodes: list[DocumentNode]
+
+
+def _parse_scalar_value(value: str) -> object:
+    stripped = value.strip()
+    if not stripped:
+        return ""
+    if stripped.lower() == "true":
+        return True
+    if stripped.lower() == "false":
+        return False
+    if stripped.startswith("{") or stripped.startswith("["):
+        try:
+            return json.loads(stripped)
+        except json.JSONDecodeError:
+            return stripped
+    return stripped
+
+
+def _normalize_whitespace(value: str) -> str:
+    return re.sub(r"\s+", " ", value).strip()
+
+
+def _parse_front_matter(front_matter: str) -> dict[str, object]:
+    metadata: dict[str, object] = {}
+    lines = front_matter.splitlines()
+    index = 0
+
+    while index < len(lines):
+        raw_line = lines[index]
+        if not raw_line.strip() or raw_line.lstrip().startswith("#"):
+            index += 1
+            continue
+        if ":" not in raw_line:
+            index += 1
+            continue
+
+        key, value = raw_line.split(":", 1)
+        normalized_key = key.strip()
+        stripped_value = value.strip()
+        if stripped_value:
+            metadata[normalized_key] = _parse_scalar_value(stripped_value)
+            index += 1
+            continue
+
+        items: list[dict[str, object]] = []
+        index += 1
+        while index < len(lines):
+            item_line = lines[index]
+            if not item_line.strip():
+                index += 1
+                continue
+            if not item_line.startswith("  - "):
+                break
+
+            match = INDENTED_ITEM_PATTERN.match(item_line)
+            if not match:
+                break
+            item: dict[str, object] = {}
+            first_body = match.group("body")
+            if ":" in first_body:
+                item_key, item_value = first_body.split(":", 1)
+                item[item_key.strip()] = _parse_scalar_value(item_value.strip())
+            index += 1
+
+            while index < len(lines):
+                nested_line = lines[index]
+                if nested_line.startswith("    ") and ":" in nested_line.strip():
+                    nested_key, nested_value = nested_line.strip().split(":", 1)
+                    item[nested_key.strip()] = _parse_scalar_value(nested_value.strip())
+                    index += 1
+                    continue
+                break
+
+            items.append(item)
+
+        metadata[normalized_key] = items
+
+    return metadata
+
+
+def _split_front_matter(text: str) -> tuple[dict[str, object], str]:
+    stripped = text.lstrip()
+    if not stripped.startswith("---\n"):
+        return {}, text
+
+    _, _, remainder = stripped.partition("---\n")
+    front_matter, separator, body = remainder.partition("\n---\n")
+    if not separator:
+        return {}, text
+
+    return _parse_front_matter(front_matter), body
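Note (illustrative, not part of this commit): the front-matter parser accepts scalar keys, `true`/`false`, and item lists introduced by an empty-valued key, with indented continuation keys per item. A sketch of a document it would parse; the field values are invented:

```python
# Illustrative sketch only: the front-matter shape accepted by the parser,
# assuming the module's helpers are importable. Values are made up.
doc = """---
title: West Indian Manatee
publication_status: draft
is_archived: false
taxon_identifiers:
  - authority: itis
    identifier: 180684
---

## Summary
A placeholder summary paragraph.
"""
metadata, body = _split_front_matter(doc)
assert metadata["title"] == "West Indian Manatee"
assert metadata["is_archived"] is False
assert metadata["taxon_identifiers"][0]["authority"] == "itis"
```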
+
+
+def parse_markdown_document(text: str) -> StructuredDocument:
+    metadata, body = _split_front_matter(text)
+    root_nodes: list[DocumentNode] = []
+    stack: list[DocumentNode] = []
+    body_lines: list[str] = []
+
+    def flush_body() -> None:
+        if not stack:
+            body_lines.clear()
+            return
+        stack[-1].body = "\n".join(body_lines).strip()
+        body_lines.clear()
+
+    for raw_line in body.splitlines():
+        match = HEADING_PATTERN.match(raw_line)
+        if not match:
+            body_lines.append(raw_line)
+            continue
+
+        flush_body()
+        depth = len(match.group(1))
+        node = DocumentNode(
+            node_type="section",
+            title=match.group("title").strip(),
+            body="",
+            depth=depth,
+        )
+
+        while stack and stack[-1].depth >= depth:
+            stack.pop()
+
+        if stack:
+            stack[-1].children.append(node)
+        else:
+            root_nodes.append(node)
+        stack.append(node)
+
+    flush_body()
+    return StructuredDocument(metadata=metadata, nodes=root_nodes)
+
+
+def validate_markdown_document(text: str) -> list[str]:
+    errors: list[str] = []
+    metadata, body = _split_front_matter(text)
+    if not metadata:
+        errors.append("Markdown document must include YAML front matter.")
+
+    last_depth: int | None = None
+    for raw_line in body.splitlines():
+        match = HEADING_PATTERN.match(raw_line)
+        if not match:
+            continue
+        depth = len(match.group(1))
+        if last_depth is not None and depth > last_depth + 1:
+            errors.append(
+                f"Heading depth jumps from level {last_depth} to level {depth}: {match.group('title').strip()}"
+            )
+        last_depth = depth
+
+    return errors
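Note (illustrative, not part of this commit): the validator flags any heading that deepens by more than one level at a time. A minimal sketch of a document it would reject; the content is invented:

```python
# Illustrative sketch only: the heading-depth check, assuming the module's
# functions are importable. The document content is made up.
bad_doc = """---
title: Example
---

## Summary
Text.

#### Habitat
Text.
"""
errors = validate_markdown_document(bad_doc)
assert errors == ["Heading depth jumps from level 2 to level 4: Habitat"]
```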
+
+
+def _append_metadata_lines(lines: list[str], key: str, value: object) -> None:
+    if isinstance(value, list):
+        lines.append(f"{key}:")
+        for item in value:
+            if isinstance(item, dict) and item:
+                first = True
+                for item_key, item_value in item.items():
+                    rendered = "true" if item_value is True else "false" if item_value is False else str(item_value)
+                    prefix = "  - " if first else "    "
+                    lines.append(f"{prefix}{item_key}: {rendered}")
+                    first = False
+            else:
+                lines.append(f"  - {item}")
+        return
+
+    rendered = "true" if value is True else "false" if value is False else str(value)
+    lines.append(f"{key}: {rendered}")
+
+
+def export_markdown_document(document: StructuredDocument) -> str:
+    lines: list[str] = ["---"]
+    for key, value in document.metadata.items():
+        _append_metadata_lines(lines, key, value)
+    lines.append("---")
+    lines.append("")
+
+    def append_nodes(nodes: list[DocumentNode]) -> None:
+        for node in nodes:
+            lines.append(f"{'#' * node.depth} {node.title}")
+            if node.body:
+                lines.append(node.body)
+            lines.append("")
+            append_nodes(node.children)
+
+    append_nodes(document.nodes)
+    return "\n".join(lines).rstrip() + "\n"
+
+
+def flatten_document_nodes(document: StructuredDocument) -> list[dict[str, object]]:
+    flattened: list[dict[str, object]] = []
+
+    def visit(nodes: list[DocumentNode], parent_id: str | None) -> None:
+        for index, node in enumerate(nodes, start=1):
+            node_id = f"node-{len(flattened) + 1}"
+            flattened.append(
+                {
+                    "node_id": node_id,
+                    "parent_id": parent_id,
+                    "position": index,
+                    "depth": node.depth,
+                    "node_type": node.node_type,
+                    "title": node.title,
+                    "body_markdown": node.body,
+                    "body_plaintext": node.body,
+                }
+            )
+            visit(node.children, node_id)
+
+    visit(document.nodes, None)
+    return flattened
|
def document_to_json(document: StructuredDocument) -> str:
|
||||||
|
return json.dumps(asdict(document), ensure_ascii=True)
|
||||||
|
|
||||||
|
|
||||||
|
def build_document_from_species_payload(item: dict[str, object]) -> StructuredDocument:
|
||||||
|
legacy_identifiers: list[dict[str, object]] = []
|
||||||
|
if item.get("flelmr_code"):
|
||||||
|
legacy_identifiers.append(
|
||||||
|
{
|
||||||
|
"authority": "legacy-ecospecies",
|
||||||
|
"identifier": str(item.get("flelmr_code", "")),
|
||||||
|
"label": "FLELMR",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
metadata = {
|
||||||
|
"title": str(item.get("title", "")),
|
||||||
|
"common_name": str(item.get("common_name", "")),
|
||||||
|
"scientific_name": str(item.get("scientific_name", "")),
|
||||||
|
"legacy_identifiers": legacy_identifiers,
|
||||||
|
"taxon_identifiers": list(item.get("taxon_identifiers", [])),
|
||||||
|
"primary_taxon_authority": str(item.get("primary_taxon_authority", "")),
|
||||||
|
"source_file": str(item.get("source_file", "")),
|
||||||
|
"publication_status": str(item.get("publication_status", "published")),
|
||||||
|
"source_format": "ecospecies-markdown-v1",
|
||||||
|
}
|
||||||
|
|
||||||
|
nodes: list[DocumentNode] = []
|
||||||
|
summary = str(item.get("summary", "")).strip()
|
||||||
|
if summary:
|
||||||
|
nodes.append(
|
||||||
|
DocumentNode(
|
||||||
|
node_type="section",
|
||||||
|
title="Summary",
|
||||||
|
body=summary,
|
||||||
|
depth=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
for section in item.get("sections", []):
|
||||||
|
heading = str(section.get("heading", "")).strip()
|
||||||
|
if not heading or heading == "HEADER":
|
||||||
|
continue
|
||||||
|
nodes.append(
|
||||||
|
DocumentNode(
|
||||||
|
node_type="section",
|
||||||
|
title=heading,
|
||||||
|
body=str(section.get("content", "")).strip(),
|
||||||
|
depth=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return StructuredDocument(metadata=metadata, nodes=nodes)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_species_projection(document: StructuredDocument) -> dict[str, object]:
    metadata = document.metadata
    summary = ""
    sections: list[dict[str, object]] = []
    legacy_identifiers = metadata.get("legacy_identifiers", [])
    taxon_identifiers = metadata.get("taxon_identifiers", [])

    flelmr_code = ""
    if isinstance(legacy_identifiers, list):
        for item in legacy_identifiers:
            if not isinstance(item, dict):
                continue
            authority = str(item.get("authority", "")).strip().lower()
            label = str(item.get("label", "")).strip().lower()
            if authority == "legacy-ecospecies" or label == "flelmr":
                flelmr_code = str(item.get("identifier", "")).strip()
                if flelmr_code:
                    break
    if not flelmr_code:
        flelmr_code = str(metadata.get("species_code", "")).strip()

    def visit(nodes: list[DocumentNode], path: list[str]) -> None:
        nonlocal summary
        for node in nodes:
            current_path = [*path, node.title]
            if node.title.lower() == "summary" and not summary:
                summary = node.body.strip()
            else:
                sections.append(
                    {
                        "heading": " / ".join(current_path),
                        "content": node.body.strip(),
                    }
                )
            visit(node.children, current_path)

    visit(document.nodes, [])
    return {
        "title": metadata.get("title", ""),
        "common_name": metadata.get("common_name", ""),
        "scientific_name": metadata.get("scientific_name", ""),
        "flelmr_code": flelmr_code,
        "legacy_identifiers": legacy_identifiers if isinstance(legacy_identifiers, list) else [],
        "taxon_identifiers": taxon_identifiers if isinstance(taxon_identifiers, list) else [],
        "primary_taxon_authority": str(metadata.get("primary_taxon_authority", "")),
        "summary": summary,
        "sections": sections,
    }


def _is_citation_heading(title: str) -> bool:
    lowered = title.strip().rstrip(":").lower()
    return lowered in {
        "references",
        "reference",
        "citations",
        "citation",
        "bibliography",
        "related references",
        "related citations",
    }


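# Worked examples (editorial note): headings such as "References:",
# "Bibliography", or "Related Citations" all land in the set above once the
# trailing colon is stripped and case is folded, while a heading like
# "Reference List" or "Literature" does not and is treated as an ordinary section.

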
def _split_citation_lines(body: str) -> list[dict[str, str]]:
    entries: list[dict[str, str]] = []
    current: list[str] = []
    current_number = ""

    def flush() -> None:
        nonlocal current_number
        if not current:
            return
        compact = " ".join(part.strip() for part in current if part.strip()).strip()
        if compact:
            entries.append(
                {
                    "legacy_reference_number": current_number,
                    "raw_text": compact,
                }
            )
        current.clear()
        current_number = ""

    for raw_line in body.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            flush()
            continue

        leading_number_match = re.match(r"^(?P<num>\d+)\s*,\s*(?P<text>.+)$", stripped)
        if leading_number_match:
            flush()
            current_number = leading_number_match.group("num")
            current.append(leading_number_match.group("text"))
            continue

        bare_number_match = re.match(r"^(?P<num>\d+)\s+(?P<text>[A-Z].+)$", stripped)
        if bare_number_match:
            flush()
            current_number = bare_number_match.group("num")
            current.append(bare_number_match.group("text"))
            continue

        bullet_match = re.match(
            r"^(?:[-*]|\[(?P<bracket_num>\d+)\]|(?P<plain_num>\d+)[\.,])\s+(?P<text>.+)$",
            stripped,
        )
        if bullet_match:
            flush()
            current_number = bullet_match.group("bracket_num") or bullet_match.group("plain_num") or ""
            bullet_text = bullet_match.group("text")
            if not current_number:
                nested_number_match = re.match(r"^\[(?P<num>\d+)\]\s+(?P<text>.+)$", bullet_text)
                if nested_number_match:
                    current_number = nested_number_match.group("num")
                    bullet_text = nested_number_match.group("text")
                else:
                    nested_comma_match = re.match(r"^(?P<num>\d+)\s*,\s*(?P<text>.+)$", bullet_text)
                    if nested_comma_match:
                        current_number = nested_comma_match.group("num")
                        bullet_text = nested_comma_match.group("text")
            current.append(bullet_text)
            continue

        current.append(stripped)

    flush()
    return entries


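# Worked example (editorial sketch of the shapes matched above): an entry may
# begin as "12, Smith, J. ..." (comma-numbered), "12 Smith, J. ..." (a bare
# number followed by a capitalized author), or "- ...", "[12] ...", "12. ..."
# (bulleted / bracket-numbered / dot-numbered). Continuation lines are folded
# into the current entry with single spaces, and a blank line flushes it.

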
def extract_citation_entries(document: StructuredDocument) -> list[dict[str, object]]:
    entries: list[dict[str, object]] = []

    def visit(nodes: list[DocumentNode], path: list[str]) -> None:
        for node in nodes:
            current_path = [*path, node.title]
            if _is_citation_heading(node.title):
                section_heading = " / ".join(current_path)
                for item in _split_citation_lines(node.body):
                    raw_text = item["raw_text"]
                    doi_match = DOI_PATTERN.search(raw_text)
                    entries.append(
                        {
                            "section_heading": section_heading,
                            "legacy_reference_number": item["legacy_reference_number"],
                            "raw_text": raw_text,
                            "normalized_text": _normalize_whitespace(raw_text),
                            "doi": doi_match.group(0) if doi_match else "",
                        }
                    )
            visit(node.children, current_path)

    visit(document.nodes, [])
    return entries


def add_citation_to_document(
    document: StructuredDocument,
    citation_text: str,
    heading_title: str = "Related References",
) -> bool:
    normalized_citation = _normalize_whitespace(citation_text)
    if not normalized_citation:
        return False

    for node in document.nodes:
        if _is_citation_heading(node.title):
            existing = {_normalize_whitespace(item["raw_text"]) for item in _split_citation_lines(node.body)}
            if normalized_citation in existing:
                return False
            body = node.body.rstrip()
            node.body = f"{body}\n- {citation_text}".strip() if body else f"- {citation_text}"
            return True

    document.nodes.append(
        DocumentNode(
            node_type="section",
            title=heading_title,
            body=f"- {citation_text}",
            depth=2,
        )
    )
    return True

@@ -0,0 +1,267 @@
from __future__ import annotations

from sqlalchemy import select

from ecospecies_api.citegeist_bridge import extract_draft_citation
from ecospecies_api.document_format import (
    build_document_from_species_payload,
    document_to_json,
    extract_citation_entries,
    extract_species_projection,
    export_markdown_document,
    flatten_document_nodes,
    parse_markdown_document,
    validate_markdown_document,
)
from ecospecies_api.models import (
    DocumentSection,
    Species,
    SpeciesCitation,
    SpeciesDocument,
    SpeciesDocumentNode,
    SpeciesTaxonIdentifier,
)


def _persist_taxon_identifiers(session, species: Species, taxon_identifiers: list[dict[str, object]]) -> None:
    for identifier in list(species.taxon_identifiers):
        session.delete(identifier)
    session.flush()

    for position, item in enumerate(taxon_identifiers, start=1):
        authority = str(item.get("authority", "")).strip()
        identifier = str(item.get("identifier", "")).strip()
        if not authority or not identifier:
            continue
        session.add(
            SpeciesTaxonIdentifier(
                species_id=species.id,
                position=position,
                authority=authority,
                identifier=identifier,
                label=str(item.get("label", "")).strip(),
                is_primary=bool(item.get("primary") or item.get("is_primary")),
                source_url=str(item.get("source_url", "")).strip(),
            )
        )


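# Editorial note: identifiers are rebuilt from scratch on every sync; the
# flush() after the delete loop pushes the deletes to the database before the
# re-inserts, which avoids collisions if (species_id, position) is constrained
# unique (an assumption -- no such constraint is visible in this diff).

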
def _existing_taxon_identifier_payload(species: Species) -> list[dict[str, object]]:
    return [
        {
            "authority": item.authority,
            "identifier": item.identifier,
            "label": item.label,
            "primary": item.is_primary,
            "source_url": item.source_url,
        }
        for item in species.taxon_identifiers
    ]


def _citation_match_key(item: dict[str, object]) -> tuple[str, str, str]:
    return (
        str(item.get("section_heading", "")).strip(),
        str(item.get("legacy_reference_number", "")).strip(),
        str(item.get("raw_text", "")).strip(),
    )


def _persist_citations(session, species: Species, citations: list[dict[str, object]]) -> None:
    existing_by_key = {
        _citation_match_key(
            {
                "section_heading": citation.section_heading,
                "legacy_reference_number": citation.legacy_reference_number,
                "raw_text": citation.raw_text,
            }
        ): citation
        for citation in species.citations
    }
    retained_ids: set[int] = set()

    for position, item in enumerate(citations, start=1):
        raw_text = str(item.get("raw_text", "")).strip()
        if not raw_text:
            continue
        key = _citation_match_key(item)
        legacy_reference_number = str(item.get("legacy_reference_number", "")).strip()
        existing = existing_by_key.get(key)
        extracted_normalized = str(item.get("normalized_text", "")).strip()
        extracted_doi = str(item.get("doi", "")).strip()
        draft = extract_draft_citation(raw_text, legacy_reference_number)

        if existing is None:
            session.add(
                SpeciesCitation(
                    species_id=species.id,
                    position=position,
                    section_heading=str(item.get("section_heading", "")).strip(),
                    legacy_reference_number=legacy_reference_number,
                    citation_key=draft.citation_key if draft is not None else "",
                    entry_type=draft.entry_type if draft is not None else "misc",
                    raw_text=raw_text,
                    normalized_text=extracted_normalized,
                    abstract_text="",
                    draft_bibtex=draft.draft_bibtex if draft is not None else "",
                    doi=extracted_doi,
                    source_url="",
                    openalex_id="",
                    resolver_source_label="",
                    enrichment_status="pending",
                    enrichment_error="",
                    source_type="document_extract",
                    review_status="draft",
                )
            )
            continue

        existing.position = position
        existing.section_heading = str(item.get("section_heading", "")).strip()
        existing.legacy_reference_number = legacy_reference_number
        existing.raw_text = raw_text
        if existing.review_status == "draft":
            existing.normalized_text = extracted_normalized
            existing.abstract_text = ""
            existing.doi = extracted_doi
            existing.citation_key = draft.citation_key if draft is not None else ""
            existing.entry_type = draft.entry_type if draft is not None else "misc"
            existing.draft_bibtex = draft.draft_bibtex if draft is not None else ""
            existing.source_type = "document_extract"
            existing.enrichment_status = "pending"
            existing.enrichment_error = ""
            existing.resolver_source_label = ""
            existing.source_url = ""
            existing.openalex_id = ""
        retained_ids.add(existing.id)
        session.add(existing)

    for citation in list(species.citations):
        if citation.id not in retained_ids and citation.source_type in {"document_extract", "editor_review"}:
            session.delete(citation)


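# Editorial note on the reconciliation above: rows are matched on the
# (section_heading, legacy_reference_number, raw_text) key; matched rows keep
# their identity (and any non-draft review state), while unmatched rows are
# deleted only when their source_type is "document_extract" or
# "editor_review", so citations from other sources survive re-imports.

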
def _persist_document_model(session, species: Species, document_model, markdown_content: str, updated_by: str) -> None:
    ast_json = document_to_json(document_model)
    document = session.scalar(
        select(SpeciesDocument).where(SpeciesDocument.species_id == species.id)
    )
    if document is None:
        document = SpeciesDocument(
            species_id=species.id,
            source_format="ecospecies-markdown-v1",
            markdown_content=markdown_content,
            ast_json=ast_json,
            updated_by=updated_by,
        )
        session.add(document)
        session.flush()
    else:
        document.source_format = "ecospecies-markdown-v1"
        document.markdown_content = markdown_content
        document.ast_json = ast_json
        document.updated_by = updated_by
        session.add(document)

    for node in list(document.nodes):
        session.delete(node)
    session.flush()

    for node in flatten_document_nodes(document_model):
        session.add(
            SpeciesDocumentNode(
                document_id=document.id,
                parent_node_ref=node["parent_id"],
                node_ref=node["node_id"],
                position=node["position"],
                depth=node["depth"],
                node_type=node["node_type"],
                title=node["title"],
                body_markdown=node["body_markdown"],
                body_plaintext=node["body_plaintext"],
            )
        )


def sync_species_document(session, species: Species, item: dict[str, object]) -> None:
    payload = dict(item)
    if "taxon_identifiers" not in payload or not payload.get("taxon_identifiers"):
        payload["taxon_identifiers"] = _existing_taxon_identifier_payload(species)
    if "primary_taxon_authority" not in payload or not payload.get("primary_taxon_authority"):
        for identifier in payload["taxon_identifiers"]:
            if bool(identifier.get("primary")):
                payload["primary_taxon_authority"] = str(identifier.get("authority", "")).strip()
                break

    document_model = build_document_from_species_payload(payload)
    markdown_content = export_markdown_document(document_model)
    _persist_document_model(
        session,
        species,
        document_model,
        markdown_content,
        str(item.get("last_modified_by", "system-import")),
    )
    _persist_citations(session, species, extract_citation_entries(document_model))


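# Minimal usage sketch (editorial; assumes an open SQLAlchemy session and a
# persisted Species row -- the importer is the expected caller):
#
#     sync_species_document(session, species, record_payload)
#     session.commit()
#
# When the incoming payload carries no taxon identifiers, the identifiers
# already stored on the species are reused, and the primary authority is
# backfilled from whichever stored identifier is flagged primary.

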
def get_species_document_payload(session, slug: str) -> dict[str, object] | None:
    species = session.scalar(select(Species).where(Species.slug == slug))
    if species is None or species.document is None:
        return None

    document = species.document
    return {
        "slug": species.slug,
        "source_format": document.source_format,
        "markdown": document.markdown_content,
        "ast_json": document.ast_json,
        "node_count": len(document.nodes),
        "updated_by": document.updated_by,
    }


def save_species_document(session, species: Species, markdown: str, username: str) -> dict[str, object]:
    errors = validate_markdown_document(markdown)
    if errors:
        raise ValueError("; ".join(errors))

    document_model = parse_markdown_document(markdown)
    projection = extract_species_projection(document_model)
    _persist_document_model(session, species, document_model, markdown, username)
    _persist_citations(session, species, extract_citation_entries(document_model))

    if projection["title"]:
        species.title = str(projection["title"])
    if projection["common_name"]:
        species.common_name = str(projection["common_name"])
    if projection["scientific_name"]:
        species.scientific_name = str(projection["scientific_name"])
    if projection["flelmr_code"]:
        species.flelmr_code = str(projection["flelmr_code"])
    _persist_taxon_identifiers(session, species, list(projection["taxon_identifiers"]))
    species.summary = str(projection["summary"])
    species.section_count = len(projection["sections"])
    species.last_modified_by = username

    for section in list(species.sections):
        session.delete(section)
    session.flush()

    for position, section in enumerate(projection["sections"], start=1):
        session.add(
            DocumentSection(
                species_id=species.id,
                position=position,
                heading=str(section["heading"]),
                content=str(section["content"]),
            )
        )

    return {
        "slug": species.slug,
        "summary": species.summary,
        "section_count": species.section_count,
        "markdown": markdown,
        "updated_by": username,
    }

@@ -23,6 +23,9 @@ class Species(Base):
     publication_status: Mapped[str] = mapped_column(String(32), default="published", index=True)
     is_archived: Mapped[bool] = mapped_column(Boolean, default=False, index=True)
     editor_notes: Mapped[str] = mapped_column(Text, default="")
+    created_by: Mapped[str] = mapped_column(String(255), default="system-import")
+    owner_username: Mapped[str] = mapped_column(String(255), default="")
+    owner_role: Mapped[str] = mapped_column(String(32), default="")
     last_modified_by: Mapped[str] = mapped_column(String(255), default="system-import")
 
     sections: Mapped[list["DocumentSection"]] = relationship(
@@ -40,6 +43,21 @@ class Species(Base):
         cascade="all, delete-orphan",
         order_by="SpeciesAuditLog.id.desc()",
     )
+    document: Mapped["SpeciesDocument | None"] = relationship(
+        back_populates="species",
+        cascade="all, delete-orphan",
+        uselist=False,
+    )
+    taxon_identifiers: Mapped[list["SpeciesTaxonIdentifier"]] = relationship(
+        back_populates="species",
+        cascade="all, delete-orphan",
+        order_by="SpeciesTaxonIdentifier.position",
+    )
+    citations: Mapped[list["SpeciesCitation"]] = relationship(
+        back_populates="species",
+        cascade="all, delete-orphan",
+        order_by="SpeciesCitation.position",
+    )
 
 
 class DocumentSection(Base):
@@ -77,3 +95,93 @@ class SpeciesAuditLog(Base):
     details_json: Mapped[str] = mapped_column(Text)
 
     species: Mapped[Species] = relationship(back_populates="audit_entries")
+
+
+class SpeciesDocument(Base):
+    __tablename__ = "species_document"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), unique=True, index=True)
+    source_format: Mapped[str] = mapped_column(String(64), default="ecospecies-markdown-v1")
+    markdown_content: Mapped[str] = mapped_column(Text, default="")
+    ast_json: Mapped[str] = mapped_column(Text, default="")
+    updated_by: Mapped[str] = mapped_column(String(255), default="system-import")
+
+    species: Mapped[Species] = relationship(back_populates="document")
+    nodes: Mapped[list["SpeciesDocumentNode"]] = relationship(
+        back_populates="document",
+        cascade="all, delete-orphan",
+        order_by="SpeciesDocumentNode.position",
+    )
+
+
+class SpeciesDocumentNode(Base):
+    __tablename__ = "species_document_node"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    document_id: Mapped[int] = mapped_column(ForeignKey("species_document.id", ondelete="CASCADE"), index=True)
+    parent_node_ref: Mapped[str | None] = mapped_column(String(64), nullable=True, default=None)
+    node_ref: Mapped[str] = mapped_column(String(64), index=True)
+    position: Mapped[int] = mapped_column(Integer, default=1)
+    depth: Mapped[int] = mapped_column(Integer, default=2)
+    node_type: Mapped[str] = mapped_column(String(32), default="section")
+    title: Mapped[str] = mapped_column(String(255), default="")
+    body_markdown: Mapped[str] = mapped_column(Text, default="")
+    body_plaintext: Mapped[str] = mapped_column(Text, default="")
+    source_heading: Mapped[str] = mapped_column(String(255), default="")
+    source_span_start: Mapped[int | None] = mapped_column(Integer, nullable=True, default=None)
+    source_span_end: Mapped[int | None] = mapped_column(Integer, nullable=True, default=None)
+
+    document: Mapped[SpeciesDocument] = relationship(back_populates="nodes")
+
+
+class ContributorAccount(Base):
+    __tablename__ = "contributor_account"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    email: Mapped[str] = mapped_column(String(255), unique=True, index=True)
+    token_hash: Mapped[str] = mapped_column(String(128), unique=True, index=True)
+    age_gate_confirmed: Mapped[bool] = mapped_column(Boolean, default=False)
+    created_at: Mapped[str] = mapped_column(String(64), index=True)
+    is_active: Mapped[bool] = mapped_column(Boolean, default=True, index=True)
+
+
+class SpeciesTaxonIdentifier(Base):
+    __tablename__ = "species_taxon_identifier"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), index=True)
+    position: Mapped[int] = mapped_column(Integer, default=1)
+    authority: Mapped[str] = mapped_column(String(64), default="")
+    identifier: Mapped[str] = mapped_column(String(255), default="")
+    label: Mapped[str] = mapped_column(String(128), default="")
+    is_primary: Mapped[bool] = mapped_column(Boolean, default=False, index=True)
+    source_url: Mapped[str] = mapped_column(String(500), default="")
+
+    species: Mapped[Species] = relationship(back_populates="taxon_identifiers")
+
+
+class SpeciesCitation(Base):
+    __tablename__ = "species_citation"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), index=True)
+    position: Mapped[int] = mapped_column(Integer, default=1)
+    section_heading: Mapped[str] = mapped_column(String(255), default="")
+    legacy_reference_number: Mapped[str] = mapped_column(String(64), default="", index=True)
+    citation_key: Mapped[str] = mapped_column(String(255), default="", index=True)
+    entry_type: Mapped[str] = mapped_column(String(64), default="misc")
+    raw_text: Mapped[str] = mapped_column(Text, default="")
+    normalized_text: Mapped[str] = mapped_column(Text, default="")
+    abstract_text: Mapped[str] = mapped_column(Text, default="")
+    draft_bibtex: Mapped[str] = mapped_column(Text, default="")
+    doi: Mapped[str] = mapped_column(String(255), default="", index=True)
+    source_url: Mapped[str] = mapped_column(String(500), default="")
+    openalex_id: Mapped[str] = mapped_column(String(64), default="", index=True)
+    resolver_source_label: Mapped[str] = mapped_column(String(255), default="")
+    enrichment_status: Mapped[str] = mapped_column(String(32), default="pending", index=True)
+    enrichment_error: Mapped[str] = mapped_column(Text, default="")
+    source_type: Mapped[str] = mapped_column(String(64), default="document_extract")
+    review_status: Mapped[str] = mapped_column(String(32), default="draft", index=True)
+
+    species: Mapped[Species] = relationship(back_populates="citations")
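For orientation, a minimal sketch of navigating the relationships added above from a loaded `Species` row (the slug and session setup are illustrative assumptions, not part of this diff):

```python
from sqlalchemy import select

# Assumes an open SQLAlchemy session; attribute names match the models above.
species = session.scalar(select(Species).where(Species.slug == "example-species"))
if species is not None and species.document is not None:
    print(species.document.source_format)   # "ecospecies-markdown-v1"
    print(len(species.document.nodes))      # nodes ordered by position
    for citation in species.citations:      # ordered by SpeciesCitation.position
        print(citation.review_status, citation.doi)
```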
@@ -1,14 +1,18 @@
 from __future__ import annotations
 
+import hashlib
 import os
 import re
+from collections import Counter
 from dataclasses import dataclass
 from pathlib import Path
 
 
 SECTION_PATTERN = re.compile(r"^[A-Z][A-Z\s/&()-]{2,}$")
+TITLE_SECTION_PATTERN = re.compile(r"^[A-Z][A-Za-z\s/&()-]{2,}$")
 FIELD_PATTERN = re.compile(r"^(?P<key>[A-Za-z/ _-]+):\s*(?P<value>.*)$")
 SUMMARY_MARKER_PATTERN = re.compile(r"^(summary(?:/abstract)?|abstract|executive summary):?\s*$", re.IGNORECASE)
+SAFE_DIRECTORY_NAME_PATTERN = re.compile(r"^[A-Za-z0-9_-]+$")
 
 
 @dataclass
@@ -38,6 +42,10 @@ class SpeciesRecord:
     diagnostics: list[IngestDiagnostic]
 
 
+def get_repo_root() -> Path:
+    return Path(__file__).resolve().parents[4]
+
+
 def slugify(value: str) -> str:
     cleaned = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
     return cleaned or "unknown-species"
@@ -53,6 +61,33 @@ def normalize_whitespace(value: str) -> str:
     return re.sub(r"\s+", " ", value).strip()
 
 
+def is_section_heading(line: str) -> bool:
+    stripped = line.strip()
+    if not stripped:
+        return False
+    normalized = stripped[:-1].strip() if stripped.endswith(":") else stripped
+    if not normalized:
+        return False
+    if ":" in normalized:
+        return False
+    if SECTION_PATTERN.fullmatch(normalized):
+        return True
+    if not TITLE_SECTION_PATTERN.fullmatch(normalized):
+        return False
+
+    words = normalized.split()
+    if len(words) > 4:
+        return False
+    return all(word[0].isupper() for word in words if word and word[0].isalpha())
+
+
+def normalize_heading(line: str) -> str:
+    stripped = line.strip()
+    if stripped.endswith(":"):
+        return stripped[:-1].strip()
+    return stripped
+
+
 def split_sections(lines: list[str]) -> list[Section]:
     sections: list[Section] = []
     current_heading = "HEADER"
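A few concrete cases of `is_section_heading` as defined above may help (the strings are illustrative, not drawn from the corpus):

```python
assert is_section_heading("LIFE HISTORY")            # all-caps legacy heading
assert is_section_heading("Habitat Requirements:")   # short title case; colon dropped
assert not is_section_heading("Depth: 3-10 m")       # interior colon -> field line
assert not is_section_heading("A long descriptive sentence about habitat")  # > 4 words
```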
@@ -61,7 +96,7 @@ def split_sections(lines: list[str]) -> list[Section]:
     for raw_line in lines:
         line = raw_line.rstrip()
         stripped = line.strip()
-        if SECTION_PATTERN.fullmatch(stripped):
+        if is_section_heading(stripped):
             if current_lines:
                 sections.append(
                     Section(
@@ -69,7 +104,7 @@
                         content="\n".join(current_lines).strip(),
                     )
                 )
-            current_heading = stripped
+            current_heading = normalize_heading(stripped)
             current_lines = []
             continue
         current_lines.append(line)
@@ -96,8 +131,9 @@ def extract_metadata(lines: list[str]) -> dict[str, str]:
             value = match.group("value").strip()
             metadata[key] = value
 
-            # Legacy files vary between "FLELMR", "FLELMR Code", and similar labels.
-            if key.startswith("flelmr"):
+            # Legacy files vary between "FLELMR", "FLELMR Code", "EcoSpecies Code",
+            # and similar labels.
+            if key.startswith("flelmr") or key == "ecospecies code":
                 metadata["flelmr"] = value
 
     return metadata
@@ -127,7 +163,7 @@ def extract_summary(lines: list[str], sections: list[Section]) -> str:
             if summary_lines:
                 summary_lines.append("")
             continue
-        if SECTION_PATTERN.fullmatch(stripped):
+        if is_section_heading(stripped):
             break
         if stripped.startswith("[") and not summary_lines:
             break
@@ -202,23 +238,76 @@ def parse_species_file(path: Path) -> SpeciesRecord:
     )
 
 
+def ensure_unique_record_slugs(records: list[SpeciesRecord]) -> list[SpeciesRecord]:
+    slug_counts = Counter(record.slug for record in records)
+    used_slugs: set[str] = set()
+
+    for record in records:
+        base_slug = record.slug
+        if slug_counts[base_slug] == 1 and base_slug not in used_slugs:
+            used_slugs.add(base_slug)
+            continue
+
+        disambiguator = slugify(Path(record.source_file).stem)
+        if disambiguator == base_slug:
+            disambiguator = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:8]
+
+        candidate = f"{base_slug}-{disambiguator}"
+        if candidate in used_slugs:
+            source_hash = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:8]
+            candidate = f"{candidate}-{source_hash}"
+
+        suffix = 2
+        while candidate in used_slugs:
+            candidate = f"{base_slug}-{disambiguator}-{suffix}"
+            suffix += 1
+
+        record.slug = candidate
+        used_slugs.add(candidate)
+
+    return records
+
+
 def load_species_records(data_dir: str) -> list[SpeciesRecord]:
-    base = Path(data_dir)
+    base = resolve_data_dir(data_dir)
     if not base.exists():
         return []
 
     records: list[SpeciesRecord] = []
     for path in sorted(base.glob("*.txt")):
         records.append(parse_species_file(path))
-    return records
+    return ensure_unique_record_slugs(records)
+
+
+def resolve_data_dir(data_dir: str) -> Path:
+    repo_root = get_repo_root().resolve()
+    raw_value = data_dir.strip()
+    if not raw_value:
+        raise ValueError("Species data directory cannot be empty.")
+
+    candidate = Path(raw_value)
+    if candidate.is_absolute():
+        resolved = candidate.resolve()
+    else:
+        resolved = (repo_root / candidate).resolve()
+
+    try:
+        relative = resolved.relative_to(repo_root)
+    except ValueError as exc:
+        raise ValueError("Species data directory must stay within the codebase directory.") from exc
+
+    if not relative.parts:
+        raise ValueError("Species data directory must be a subdirectory of the codebase.")
+
+    for part in relative.parts:
+        if not SAFE_DIRECTORY_NAME_PATTERN.fullmatch(part):
+            raise ValueError(
+                f"Species data directory contains an unsafe directory name: {part!r}."
+            )
+
+    return resolved
+
+
 def get_default_data_dir() -> str:
-    return os.environ.get(
-        "ECOSPECIES_DATA_DIR",
-        str(
-            Path(__file__).resolve().parents[4].parent
-            / "01-legacy-code-and-data"
-            / "InputFiles - TXT"
-        ),
-    )
+    configured = os.environ.get("ECOSPECIES_DATA_DIR", "input-data/InputFiles")
+    return str(resolve_data_dir(configured))
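To make the containment rule concrete, a short sketch of how `resolve_data_dir` reacts to a few inputs (the paths are illustrative):

```python
resolve_data_dir("input-data/InputFiles")  # ok: a safe subdirectory of the repo root
resolve_data_dir("")                       # ValueError: directory cannot be empty
resolve_data_dir("../outside")             # ValueError: escapes the codebase directory
resolve_data_dir("input-data/bad name")    # ValueError: unsafe directory name
```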
File diff suppressed because it is too large
@@ -0,0 +1,21 @@
from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

TEST_PATH = ROOT / "tests" / "test_auth.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_auth", TEST_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
SPEC.loader.exec_module(MODULE)

for name in dir(MODULE):
    if name.startswith("Test") or name.endswith("Tests"):
        globals()[name] = getattr(MODULE, name)
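The next three hunks repeat this loader shim for the other test modules; the pattern reduces to the standard importlib recipe, restated here as a sketch (the helper name is illustrative):

```python
import importlib.util


def load_module_from(path, name):
    # Build a spec from a file path, materialize a module from it, then
    # execute the module body in that namespace -- exactly what each shim does
    # before re-exporting the Test* classes for unittest discovery.
    spec = importlib.util.spec_from_file_location(name, path)
    assert spec is not None and spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
```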
@@ -0,0 +1,21 @@
from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

TEST_PATH = ROOT / "tests" / "test_citation_enrichment.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_citation_enrichment", TEST_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
SPEC.loader.exec_module(MODULE)

for name in dir(MODULE):
    if name.startswith("Test") or name.endswith("Tests"):
        globals()[name] = getattr(MODULE, name)
@@ -0,0 +1,21 @@
from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

TEST_PATH = ROOT / "tests" / "test_document_format.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_document_format", TEST_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
SPEC.loader.exec_module(MODULE)

for name in dir(MODULE):
    if name.startswith("Test") or name.endswith("Tests"):
        globals()[name] = getattr(MODULE, name)
@@ -0,0 +1,21 @@
from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

TEST_PATH = ROOT / "tests" / "test_parser.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_parser", TEST_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
SPEC.loader.exec_module(MODULE)

for name in dir(MODULE):
    if name.startswith("Test") or name.endswith("Tests"):
        globals()[name] = getattr(MODULE, name)
@@ -0,0 +1,58 @@
from __future__ import annotations

import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from ecospecies_api import auth, repository


class ContributorAuthTests(unittest.TestCase):
    def setUp(self) -> None:
        self.tempdir = tempfile.TemporaryDirectory()
        db_path = Path(self.tempdir.name) / "test.db"
        self.engine = create_engine(f"sqlite:///{db_path}", future=True)
        self.session_local = sessionmaker(
            bind=self.engine,
            autoflush=False,
            autocommit=False,
            future=True,
        )
        self.repository_engine_patch = patch.object(repository, "create_db_engine", return_value=self.engine)
        self.repository_session_patch = patch.object(repository, "SessionLocal", self.session_local)
        self.auth_engine_patch = patch.object(auth, "create_db_engine", return_value=self.engine)
        self.auth_session_patch = patch.object(auth, "SessionLocal", self.session_local)
        self.repository_engine_patch.start()
        self.repository_session_patch.start()
        self.auth_engine_patch.start()
        self.auth_session_patch.start()

    def tearDown(self) -> None:
        self.auth_session_patch.stop()
        self.auth_engine_patch.stop()
        self.repository_session_patch.stop()
        self.repository_engine_patch.stop()
        self.engine.dispose()
        self.tempdir.cleanup()

    def test_contributor_token_resolves_to_contributor_session(self) -> None:
        registration = repository.register_contributor("author@example.org", True)

        session = auth.resolve_auth_session({"Authorization": f"Bearer {registration['token']}"})

        self.assertIsNotNone(session)
        assert session is not None
        self.assertEqual(session.username, "author@example.org")
        self.assertEqual(session.role, "contributor")

    def test_contributor_role_does_not_satisfy_editor(self) -> None:
        self.assertTrue(auth.role_satisfies("editor", "contributor"))
        self.assertFalse(auth.role_satisfies("contributor", "editor"))


if __name__ == "__main__":
    unittest.main()
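The role assertions above pin down the ordering: an editor satisfies a contributor requirement but not the reverse. A minimal rank-based sketch consistent with those assertions (the real `ecospecies_api.auth.role_satisfies` is not shown in this diff):

```python
_ROLE_RANKS = {"contributor": 1, "editor": 2, "admin": 3}


def role_satisfies(actual_role: str, required_role: str) -> bool:
    # True when the held role is at least as privileged as the required one;
    # unknown roles rank below everything.
    return _ROLE_RANKS.get(actual_role, 0) >= _ROLE_RANKS.get(required_role, 0)
```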
@@ -0,0 +1,527 @@
from __future__ import annotations

import unittest
from unittest.mock import patch

from ecospecies_api.citation_enrichment import (
    _crossref_message_to_entry,
    _datacite_item_to_entry,
    _openalex_work_to_entry,
    _render_normalized_text,
    apply_citation_candidate_selection,
    discover_citation_candidates,
    enrich_citation_payload,
    LocalBibEntry,
    LocalMetadataResolver,
    LocalResolution,
)
from ecospecies_api.citegeist_bridge import extract_draft_citation, render_single_bibtex


class CitationEnrichmentTests(unittest.TestCase):
    def test_render_normalized_text_includes_volume_number_and_pages(self) -> None:
        rendered = _render_normalized_text(
            "article",
            {
                "author": "Daniell, W.C.",
                "year": "1872",
                "title": "Letters referring to experiments of W.C. Daniell",
                "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                "volume": "2",
                "number": "4",
                "pages": "387-390",
                "doi": "10.1000/example",
            },
        )

        self.assertEqual(
            rendered,
            "Daniell, W.C. (1872). Letters referring to experiments of W.C. Daniell. Comm. Rept. U.S. Comm. Fish & Fish., 2(4): 387-390. DOI:10.1000/example",
        )

    def test_crossref_mapping_keeps_volume_issue_and_pages(self) -> None:
        entry = _crossref_message_to_entry(
            {
                "type": "journal-article",
                "title": ["Example Work"],
                "issued": {"date-parts": [[1872]]},
                "author": [{"family": "Daniell", "given": "W.C."}],
                "container-title": ["Comm. Rept. U.S. Comm. Fish & Fish."],
                "DOI": "10.1000/example",
                "URL": "https://doi.org/10.1000/example",
                "volume": "2",
                "issue": "4",
                "page": "387-390",
            }
        )

        self.assertEqual(entry.fields["volume"], "2")
        self.assertEqual(entry.fields["number"], "4")
        self.assertEqual(entry.fields["pages"], "387-390")

    def test_openalex_mapping_keeps_biblio_fields(self) -> None:
        entry = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W12345",
                "display_name": "OpenAlex Discovered Work",
                "publication_year": 2022,
                "type": "article",
                "doi": "https://doi.org/10.1000/example-openalex",
                "authorships": [{"author": {"display_name": "J S, Smith"}}],
                "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
                "biblio": {"volume": "12", "issue": "3", "first_page": "101", "last_page": "118"},
                "abstract_inverted_index": {"Graphs": [0], "support": [1], "learning": [2]},
            }
        )

        self.assertEqual(entry.fields["author"], "Smith, J. S.")
        self.assertEqual(entry.fields["volume"], "12")
        self.assertEqual(entry.fields["number"], "3")
        self.assertEqual(entry.fields["pages"], "101-118")
        self.assertEqual(entry.fields["abstract"], "Graphs support learning")

    def test_openalex_mapping_handles_null_source(self) -> None:
        entry = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W54321",
                "display_name": "OpenAlex Work Without Source",
                "publication_year": 2021,
                "type": "article",
                "doi": "https://doi.org/10.1000/example-null-source",
                "authorships": [{"author": {"display_name": "Jane Smith"}}],
                "primary_location": {"source": None},
                "biblio": {"volume": "5", "issue": "1", "first_page": "10", "last_page": "20"},
            }
        )

        self.assertEqual(entry.fields["title"], "OpenAlex Work Without Source")
        self.assertNotIn("journal", entry.fields)
        self.assertEqual(entry.fields["volume"], "5")
        self.assertEqual(entry.fields["number"], "1")
        self.assertEqual(entry.fields["pages"], "10-20")

    def test_datacite_mapping_keeps_container_and_pages(self) -> None:
        entry = _datacite_item_to_entry(
            {
                "attributes": {
                    "titles": [{"title": "DataCite Work"}],
                    "creators": [{"name": "J R, Rivera"}],
                    "publicationYear": "2021",
                    "doi": "10.1000/datacite-work",
                    "url": "https://doi.org/10.1000/datacite-work",
                    "container": "Journal of Metadata",
                    "volume": "7",
                    "issue": "2",
                    "firstPage": "44",
                    "lastPage": "59",
                    "descriptions": [
                        {"descriptionType": "Abstract", "description": "Abstract: Metadata makes reuse easier."}
                    ],
                }
            }
        )

        self.assertEqual(entry.fields["author"], "Rivera, J. R.")
        self.assertEqual(entry.fields["journal"], "Journal of Metadata")
        self.assertEqual(entry.fields["volume"], "7")
        self.assertEqual(entry.fields["number"], "2")
        self.assertEqual(entry.fields["pages"], "44-59")
        self.assertEqual(entry.fields["abstract"], "Metadata makes reuse easier.")
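    # Editorial sketch: `abstract_inverted_index` maps each word to its token
    # positions, so the abstract used in the OpenAlex test above is recovered
    # by inverting the map and joining words in position order:
    #
    #     positions = {pos: word for word, posns in index.items() for pos in posns}
    #     abstract = " ".join(word for _, word in sorted(positions.items()))
    #
    # e.g. {"Graphs": [0], "support": [1], "learning": [2]} -> "Graphs support learning"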
    def test_render_single_bibtex_preserves_balanced_braces_and_repairs_unmatched_ones(self) -> None:
        rendered = render_single_bibtex(
            "misc",
            "example",
            {
                "title": "Alpha_beta {Gamma}",
                "note": "raw_reference = {Alpha } beta}",
            },
        )

        self.assertIn("title = {Alpha_beta {Gamma}}", rendered)
        self.assertIn("note = {raw_reference = {Alpha } beta)}", rendered)

    def test_extract_draft_citation_repairs_report_style_reference_fields(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
                "year": "1872",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
                legacy_reference_number="160",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(draft.fields["author"], "Daniell, W.C")
        self.assertEqual(
            draft.fields["title"],
            "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
        )
        self.assertEqual(draft.fields["journal"], "Comm. Rept. U.S. Comm. Fish & Fish")
        self.assertEqual(draft.fields["volume"], "2")
        self.assertEqual(draft.fields["pages"], "387-390")
        self.assertEqual(draft.citation_key, "daniell1872lettersreferringexperiments")

    def test_extract_draft_citation_does_not_split_title_on_report_word(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200.",
                "year": "1999",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200.",
                legacy_reference_number="42",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(draft.fields["author"], "Smith, J")
        self.assertEqual(draft.fields["title"], "Habitat report synthesis for Alabama shad")
        self.assertEqual(draft.fields["howpublished"], "NOAA Tech. Memo. NMFS-SEFSC-200")
        self.assertNotIn("journal", draft.fields)

    def test_extract_draft_citation_repairs_proc_abbreviation_venue(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134.",
                "year": "1954",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134.",
                legacy_reference_number="26",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(
            draft.fields["title"],
            "Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes",
        )
        self.assertEqual(draft.fields["journal"], "Proc. Acad. Sci. Philad")
        self.assertEqual(draft.fields["volume"], "106")
        self.assertEqual(draft.fields["pages"], "109-134")

    def test_extract_draft_citation_repairs_occasional_paper_venue(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
                "year": "1950",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
                legacy_reference_number="41",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(
            draft.fields["title"],
            "Annotated list of the fauna of the Grand Isle region, 1928-1946",
        )
        self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
        self.assertEqual(draft.fields["volume"], "6")
        self.assertEqual(draft.fields["number"], "6")
        self.assertEqual(draft.fields["pages"], "1-66")

    def test_extract_draft_citation_repairs_partial_existing_venue_stub(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Annotated list of the fauna of the Grand Isle region, 1928-1946",
                "year": "1950",
                "howpublished": "Occas",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
                legacy_reference_number="41",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
        self.assertEqual(draft.fields["volume"], "6")
        self.assertEqual(draft.fields["number"], "6")
        self.assertEqual(draft.fields["pages"], "1-66")
    def test_falls_back_to_internal_resolver_when_citegeist_repo_is_unavailable(self) -> None:
        class MockResolver:
            def resolve_entry(self, entry):
                class Resolution:
                    source_label = "crossref:doi:10.1000/example"

                    class Entry:
                        entry_type = "article"
                        citation_key = "doi101000example"
                        fields = {
                            "author": "Smith, Jane",
                            "year": "2024",
                            "title": "Example Work",
                            "journal": "Journal of Examples",
                            "doi": "10.1000/example",
                            "url": "https://doi.org/10.1000/example",
                        }

                    entry = Entry()

                return Resolution()

        with patch(
            "ecospecies_api.citation_enrichment._load_citegeist_resolution_components",
            return_value=(None, None, None, None),
        ):
            result = enrich_citation_payload(
                {
                    "raw_text": "Smith, Jane. 2024. Example Work.",
                    "legacy_reference_number": "7",
                },
                resolver=MockResolver(),
            )

        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(result["doi"], "10.1000/example")
        self.assertEqual(result["source_url"], "https://doi.org/10.1000/example")
        self.assertEqual(result["resolver_source_label"], "crossref:doi:10.1000/example")
        self.assertIn("ecospecies_reference_number = {7}", result["draft_bibtex"])

    def test_enrichment_replaces_raw_reference_title_and_dedupes_legacy_note(self) -> None:
        class MockResolver:
            def resolve_entry(self, entry):
                class Resolution:
                    source_label = "crossref:search:Letters referring to experiments"

                    class Entry:
                        entry_type = "article"
                        citation_key = "daniell1872lettersshadalabama"
                        fields = {
                            "author": "Daniell, W.C.",
                            "year": "1872",
                            "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
                            "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                            "url": "",
                        }

                    entry = Entry()

                return Resolution()

        result = enrich_citation_payload(
            {
                "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
                "legacy_reference_number": "160",
                "citation_key": "daniell1948daniellwc",
            },
            resolver=MockResolver(),
        )

        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(result["citation_key"], "daniell1872lettersreferringexperiments")
        self.assertIn(
            "title = {Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River}",
            result["draft_bibtex"],
        )
        self.assertIn("year = {1872}", result["draft_bibtex"])
        self.assertEqual(result["draft_bibtex"].count("ecospecies_reference_number = {160}"), 1)

    def test_enrichment_rejects_conflicting_resolved_metadata(self) -> None:
        class MockResolver:
            def resolve_entry(self, entry):
                class Resolution:
                    source_label = "crossref:search:alabama-shad-false-positive"

                    class Entry:
                        entry_type = "article"
                        citation_key = "daniell2009habitatuseage"
                        fields = {
                            "author": "Daniell, W.C.",
                            "year": "2009",
                            "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
                            "journal": "Transactions of the American Fisheries Society",
                            "doi": "10.1111/j.1600-0633.2009.00395.x",
                            "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
                            "volume": "19",
                            "number": "1",
                            "pages": "107-115",
                        }

                    entry = Entry()

                return Resolution()

        result = enrich_citation_payload(
            {
                "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
                "legacy_reference_number": "160",
            },
            resolver=MockResolver(),
        )

        self.assertEqual(result["enrichment_status"], "unresolved")
        self.assertIn("conflicts with citation seed fields", result["enrichment_error"])

    def test_unresolved_enrichment_still_returns_refreshed_seed_fields(self) -> None:
        class MockResolver:
            def resolve_entry(self, entry):
                return None

        result = enrich_citation_payload(
            {
                "raw_text": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
                "legacy_reference_number": "41",
                "citation_key": "oldbadkey",
                "entry_type": "misc",
            },
            resolver=MockResolver(),
        )

        self.assertEqual(result["enrichment_status"], "unresolved")
        self.assertEqual(result["citation_key"], "behre1950annotatedlistfauna")
        self.assertIn("title = {Annotated list of the fauna of the Grand Isle region, 1928-1946}", result["draft_bibtex"])
        self.assertIn("Occas. Pap. Mar. Lab., LSU", result["draft_bibtex"])
def test_discover_citation_candidates_returns_scored_candidates(self) -> None:
|
||||||
|
class MockResolver:
|
||||||
|
def search_crossref_candidates(self, title):
|
||||||
|
return [
|
||||||
|
LocalResolution(
|
||||||
|
LocalBibEntry(
|
||||||
|
"article",
|
||||||
|
"daniell1872lettersreferringexperiments",
|
||||||
|
{
|
||||||
|
"author": "Daniell, W.C.",
|
||||||
|
"year": "1872",
|
||||||
|
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
|
||||||
|
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
|
||||||
|
"volume": "2",
|
||||||
|
"pages": "387-390",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
"crossref:search:1:daniell-good",
|
||||||
|
),
|
||||||
|
LocalResolution(
|
||||||
|
LocalBibEntry(
|
||||||
|
"article",
|
||||||
|
"daniell2009habitatuseage",
|
||||||
|
{
|
||||||
|
"author": "Daniell, W.C.",
|
||||||
|
"year": "2009",
|
||||||
|
"title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
|
||||||
|
"journal": "Transactions of the American Fisheries Society",
|
||||||
|
"volume": "19",
|
||||||
|
"number": "1",
|
||||||
|
"pages": "107-115",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
"crossref:search:2:daniell-bad",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
def search_datacite_candidates(self, title):
|
||||||
|
return []
|
||||||
|
|
||||||
|
def search_openalex_candidates(self, title):
|
||||||
|
return []
|
||||||
|
|
||||||
|
result = discover_citation_candidates(
|
||||||
|
{
|
||||||
|
"raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
|
||||||
|
"legacy_reference_number": "160",
|
||||||
|
},
|
||||||
|
resolver=MockResolver(),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(result["candidate_count"], 2)
|
||||||
|
self.assertGreater(result["candidates"][0]["score"], result["candidates"][1]["score"])
|
||||||
|
self.assertEqual(result["candidates"][0]["field_matches"]["year"]["status"], "exact")
|
||||||
|
self.assertEqual(result["candidates"][1]["field_matches"]["year"]["status"], "conflict")
|
||||||
|
|
||||||
|
def test_local_crossref_candidate_search_filters_placeholder_stub_entries(self) -> None:
|
||||||
|
resolver = LocalMetadataResolver()
|
||||||
|
resolver._safe_get_json = lambda url: {
|
||||||
|
"message": {
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"type": "journal-article",
|
||||||
|
"title": ["Referenced work 1"],
|
||||||
|
"issued": {"date-parts": [[2020]]},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "journal-article",
|
||||||
|
"title": ["Useful Paper"],
|
||||||
|
"issued": {"date-parts": [[2020]]},
|
||||||
|
"author": [{"family": "Smith", "given": "J S"}],
|
||||||
|
"container-title": ["Journal of Examples"],
|
||||||
|
"DOI": "10.1000/useful",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
results = resolver.search_crossref_candidates("Useful Paper")
|
||||||
|
|
||||||
|
self.assertEqual(len(results), 1)
|
||||||
|
self.assertEqual(results[0].entry.fields["title"], "Useful Paper")
|
||||||
|
|
||||||
|
def test_apply_citation_candidate_selection_uses_selected_candidate_fields(self) -> None:
|
||||||
|
result = apply_citation_candidate_selection(
|
||||||
|
{
|
||||||
|
"raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
|
||||||
|
"legacy_reference_number": "160",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source_label": "crossref:search:1:daniell-good",
|
||||||
|
"entry_type": "article",
|
||||||
|
"fields": {
|
||||||
|
"author": "Daniell, W.C.",
|
||||||
|
"year": "1872",
|
||||||
|
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
|
||||||
|
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
|
||||||
|
"volume": "2",
|
||||||
|
"pages": "387-390",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(result["enrichment_status"], "resolved")
|
||||||
|
self.assertEqual(result["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good")
|
||||||
|
self.assertIn("Comm. Rept. U.S. Comm. Fish & Fish., 2: 387-390", result["normalized_text"])
|
||||||
|
|
@@ -0,0 +1,195 @@
from __future__ import annotations

import json
import unittest

from ecospecies_api.document_format import (
    DocumentNode,
    StructuredDocument,
    build_document_from_species_payload,
    extract_citation_entries,
    extract_species_projection,
    export_markdown_document,
    parse_markdown_document,
    validate_markdown_document,
)


class StructuredMarkdownTests(unittest.TestCase):
    def test_round_trip_markdown_preserves_metadata_and_hierarchy(self) -> None:
        source = """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
taxon_identifiers:
  - authority: worms
    identifier: 159059
    label: AphiaID
    primary: true
primary_taxon_authority: worms
---

## Summary
Short abstract.

## Habitat

### Type
Estuarine.
"""

        document = parse_markdown_document(source)

        self.assertEqual(document.metadata["title"], "American Oyster")
        self.assertEqual(document.metadata["primary_taxon_authority"], "worms")
        self.assertEqual(document.metadata["legacy_identifiers"][0]["identifier"], "5192")
        self.assertEqual(document.metadata["taxon_identifiers"][0]["authority"], "worms")
        self.assertEqual(document.nodes[0].title, "Summary")
        self.assertEqual(document.nodes[1].children[0].title, "Type")
        self.assertIn("## Habitat", export_markdown_document(document))

    def test_build_document_from_species_payload_creates_markdown_sections(self) -> None:
        document = build_document_from_species_payload(
            {
                "title": "American Oyster",
                "common_name": "American Oyster",
                "scientific_name": "Crassostrea virginica",
                "flelmr_code": "5192",
                "source_file": "American Oyster.txt",
                "summary": "Short abstract.",
                "sections": [
                    {"heading": "HEADER", "content": "Ignored header"},
                    {"heading": "Habitat", "content": "Estuarine."},
                    {"heading": "Reproduction", "content": "Broadcast spawner."},
                ],
            }
        )

        self.assertEqual(document.metadata["legacy_identifiers"][0]["identifier"], "5192")
        self.assertEqual(document.metadata["legacy_identifiers"][0]["authority"], "legacy-ecospecies")
        self.assertEqual([node.title for node in document.nodes], ["Summary", "Habitat", "Reproduction"])
        self.assertEqual(document.nodes[1].body, "Estuarine.")

    def test_extract_species_projection_flattens_nested_headings(self) -> None:
        document = parse_markdown_document(
            """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
---

## Summary
Short abstract.

## Habitat
General habitat.

### Type
Estuarine.
"""
        )

        projection = extract_species_projection(document)

        self.assertEqual(projection["summary"], "Short abstract.")
        self.assertEqual(projection["flelmr_code"], "5192")
        self.assertEqual(
            [section["heading"] for section in projection["sections"]],
            ["Habitat", "Habitat / Type"],
        )

    def test_extract_species_projection_accepts_legacy_species_code_front_matter(self) -> None:
        document = parse_markdown_document(
            """---
title: Legacy Fish
common_name: Legacy Fish
scientific_name: Pisces historicus
species_code: 4242
---

## Habitat
Estuarine.
"""
        )

        projection = extract_species_projection(document)

        self.assertEqual(projection["flelmr_code"], "4242")

    def test_validate_markdown_document_rejects_missing_front_matter_and_depth_jump(self) -> None:
        errors = validate_markdown_document(
            """## Habitat
Text

#### Type
Nested too deeply.
"""
        )

        self.assertTrue(any("front matter" in error for error in errors))
        self.assertTrue(any("Heading depth jumps" in error for error in errors))

    def test_extract_citation_entries_strips_legacy_comma_number_prefix(self) -> None:
        document = parse_markdown_document(
            """---
title: Alabama Shad
common_name: Alabama Shad
scientific_name: Alosa alabamae
---

## References
160, Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
"""
        )

        citations = extract_citation_entries(document)

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "160")
        self.assertTrue(citations[0]["raw_text"].startswith("Daniell, W.C. 1872."))
        self.assertFalse(citations[0]["raw_text"].startswith("160,"))

    def test_extract_citation_entries_accepts_colon_terminated_citation_heading(self) -> None:
        citations = extract_citation_entries(
            StructuredDocument(
                metadata={},
                nodes=[
                    DocumentNode(
                        node_type="section",
                        title="Citations:",
                        body="7, Ahmed, M. 1975. Speciation in living oysters.",
                        depth=2,
                    )
                ],
            )
        )

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "7")

    def test_extract_citation_entries_accepts_bare_number_prefix(self) -> None:
        document = parse_markdown_document(
            """---
title: Eastern Mosquitofish
common_name: Eastern Mosquitofish
scientific_name: Gambusia holbrooki
---

## Citations
848 Gilmore, R.G. 1977. Fishes of the Indian River Lagoon and adjacent waters, Florida.
"""
        )

        citations = extract_citation_entries(document)

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "848")
        self.assertTrue(citations[0]["raw_text"].startswith("Gilmore, R.G. 1977."))
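

# Across these tests, the legacy prefix may be comma-separated ("160,") or a
# bare number ("848 "); extract_citation_entries strips either form and
# records it as legacy_reference_number.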
@@ -0,0 +1,109 @@
from __future__ import annotations

import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from ecospecies_api import parser


class ParserPathResolutionTests(unittest.TestCase):
    def test_ecospecies_code_is_treated_as_flelmr_code(self) -> None:
        metadata = parser.extract_metadata(
            [
                "Title: Test Fish",
                "EcoSpecies Code: 4242",
            ]
        )

        self.assertEqual(metadata["ecospecies code"], "4242")
        self.assertEqual(metadata["flelmr"], "4242")

    def test_title_case_headings_are_split_into_sections(self) -> None:
        sections = parser.split_sections(
            [
                "Species profile: American oyster (Crassostrea virginica)",
                "",
                "Classification",
                " Phylum: Mollusca",
                "Value",
                "Commercial: Important fishery.",
                "Habitat",
                "Type: Estuarine.",
            ]
        )

        self.assertEqual(
            [section.heading for section in sections],
            ["HEADER", "Classification", "Value", "Habitat"],
        )

    def test_colon_terminated_title_case_headings_are_split_into_sections(self) -> None:
        sections = parser.split_sections(
            [
                "Ecological Interactions and Notes",
                "Predator text.",
                "",
                "Reference Numbers:",
                "",
                "Citations:",
                "7, Ahmed, M. 1975. Speciation in living oysters.",
            ]
        )

        self.assertEqual(
            [section.heading for section in sections],
            ["HEADER", "Citations"],
        )

    def test_default_data_dir_uses_in_repo_path_without_spaces(self) -> None:
        with patch.dict("os.environ", {}, clear=True):
            resolved = Path(parser.get_default_data_dir())

        self.assertEqual(resolved, parser.get_repo_root() / "input-data" / "InputFiles")

    def test_relative_override_must_stay_within_repo(self) -> None:
        with self.assertRaisesRegex(ValueError, "within the codebase directory"):
            parser.resolve_data_dir("../input-data/InputFiles")

    def test_absolute_override_outside_repo_is_rejected(self) -> None:
        with tempfile.TemporaryDirectory() as tempdir:
            with self.assertRaisesRegex(ValueError, "within the codebase directory"):
                parser.resolve_data_dir(tempdir)

    def test_directory_names_with_spaces_are_rejected(self) -> None:
        with self.assertRaisesRegex(ValueError, "unsafe directory name"):
            parser.resolve_data_dir("input-data/Bad Name")

    def test_directory_names_with_special_characters_are_rejected(self) -> None:
        with self.assertRaisesRegex(ValueError, "unsafe directory name"):
            parser.resolve_data_dir("input-data/bad@name")
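
    # Taken together, the checks above imply resolve_data_dir accepts only
    # repo-relative paths with conservative directory names (no spaces or
    # characters like "@"); the exact allowlist is inferred from these tests.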

    def test_load_species_records_resolves_repo_relative_paths(self) -> None:
        records = parser.load_species_records("input-data/InputFiles")

        self.assertGreater(len(records), 0)

    def test_duplicate_source_records_receive_unique_stable_slugs(self) -> None:
        records = parser.load_species_records("input-data/InputFiles")
        slug_by_source = {record.source_file: record.slug for record in records}

        self.assertEqual(len(records), len(set(record.slug for record in records)))
        self.assertEqual(
            slug_by_source["Red Snapper_SLH_Outline2012_0722.txt"],
            "red-snapper-red-snapper-slh-outline2012-0722",
        )
        self.assertEqual(
            slug_by_source["RedSnapper_SLH_2012_0830_combined.txt"],
            "red-snapper-redsnapper-slh-2012-0830-combined",
        )
        self.assertEqual(
            slug_by_source["Sailfin Molly SLH RGG.txt"],
            "sailfin-molly-sailfin-molly-slh-rgg",
        )
        self.assertTrue(
            slug_by_source["Sailfin_Molly SLH RGG.txt"].startswith(
                "sailfin-molly-sailfin-molly-slh-rgg-"
            )
        )

@@ -112,6 +112,35 @@ class RepositoryWorkflowTests(unittest.TestCase):
         self.assertEqual(detail["section_count"], 2)
         self.assertEqual([section["position"] for section in detail["sections"]], [1, 2])
         self.assertEqual([item["code"] for item in detail["diagnostics"]], ["missing_citations"])
+        self.assertEqual(
+            detail["legacy_identifiers"],
+            [
+                {
+                    "authority": "legacy-ecospecies",
+                    "identifier": "9999",
+                    "label": "FLELMR",
+                }
+            ],
+        )
+
+    def test_species_detail_includes_structured_document_and_legacy_source(self) -> None:
+        input_dir = Path(self.tempdir.name) / "input-data" / "InputFiles"
+        input_dir.mkdir(parents=True, exist_ok=True)
+        (input_dir / "Test Shad.txt").write_text("HEADER\nLegacy header content\n", encoding="utf-8")
+
+        with patch.object(repository, "get_default_data_dir", return_value=str(input_dir)):
+            detail = repository.get_species_by_slug("test-shad")
+
+        self.assertIsNotNone(detail)
+        assert detail is not None
+        self.assertEqual(detail["structured_document"]["source_format"], "ecospecies-markdown-v1")
+        self.assertIn(
+            "HABITAT",
+            [node["title"] for node in detail["structured_document"]["ast"]["nodes"]],
+        )
+        self.assertEqual(detail["legacy_source"]["source_file"], "Test Shad.txt")
+        self.assertIn("Legacy header content", detail["legacy_source"]["text"])
+        self.assertEqual(detail["taxon_identifiers"], [])
+
     def test_editorial_update_changes_publication_visibility_and_creates_audit(self) -> None:
         result = repository.update_species_editorial(
@@ -207,6 +236,60 @@ class RepositoryWorkflowTests(unittest.TestCase):
         self.assertEqual(len(audit), 2)
         self.assertEqual([entry["action"] for entry in audit], ["section_update", "editorial_update"])
+
+    def test_reimport_preserves_persisted_taxon_identifiers(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad
+common_name: Test Shad
+scientific_name: Alosa testus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 9999
+    label: FLELMR
+taxon_identifiers:
+  - authority: gbif
+    identifier: 12345
+    label: taxonKey
+    primary: true
+primary_taxon_authority: gbif
+---
+
+## Summary
+Taxon-reviewed summary.
+""",
+            username="edith",
+        )
+
+        repository.import_species_payload(UPDATED_PAYLOAD)
+
+        detail = repository.get_editor_species_detail("test-shad")
+
+        self.assertIsNotNone(detail)
+        self.assertEqual(detail["primary_taxon_authority"], "gbif")
+        self.assertEqual(
+            detail["primary_taxon_identifier"],
+            {
+                "authority": "gbif",
+                "identifier": "12345",
+                "label": "taxonKey",
+                "primary": True,
+                "source_url": "",
+            },
+        )
+        self.assertEqual(
+            detail["taxon_identifiers"],
+            [
+                {
+                    "authority": "gbif",
+                    "identifier": "12345",
+                    "label": "taxonKey",
+                    "primary": True,
+                    "source_url": "",
+                }
+            ],
+        )
+
     def test_reimport_updates_summary_when_no_editorial_override_exists(self) -> None:
         repository.import_species_payload(UPDATED_PAYLOAD)
@@ -302,6 +385,583 @@ class RepositoryWorkflowTests(unittest.TestCase):
         self.assertEqual(audit[0]["action"], "import_restore")
         self.assertEqual(audit[0]["details"]["is_archived"], {"from": True, "to": False})
+
+    def test_document_markdown_update_refreshes_flat_projection(self) -> None:
+        result = repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+species_code: 4242
+---
+
+## Summary
+Markdown summary.
+
+## Habitat
+Open water.
+
+### Type
+Pelagic.
+""",
+            username="frank",
+        )
+
+        detail = repository.get_editor_species_detail("test-shad")
+        document = repository.get_species_document("test-shad")
+        audit = repository.list_species_audit("test-shad")
+
+        self.assertIsNotNone(result)
+        self.assertIsNotNone(detail)
+        self.assertIsNotNone(document)
+        self.assertEqual(detail["title"], "Test Shad Markdown")
+        self.assertEqual(detail["scientific_name"], "Alosa markdownus")
+        self.assertEqual(detail["flelmr_code"], "4242")
+        self.assertEqual(detail["summary"], "Markdown summary.")
+        self.assertEqual(
+            [section["heading"] for section in detail["sections"]],
+            ["Habitat", "Habitat / Type"],
+        )
+        self.assertEqual(document["updated_by"], "frank")
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "document_update")
+
+    def test_document_markdown_update_extracts_citations(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## Summary
+Markdown summary.
+
+## References
+
+- Smith, J. 2024. Example paper. doi:10.1000/example-doi
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+
+        detail = repository.get_editor_species_detail("test-shad")
+
+        self.assertIsNotNone(detail)
+        self.assertEqual(detail["citation_count"], 2)
+        self.assertEqual(detail["citations"][0]["section_heading"], "References")
+        self.assertEqual(detail["citations"][0]["legacy_reference_number"], "")
+        self.assertEqual(detail["citations"][0]["doi"], "10.1000/example-doi")
+        self.assertTrue(detail["citations"][0]["citation_key"])
+        self.assertIn("@", detail["citations"][0]["draft_bibtex"])
+        self.assertEqual(detail["citations"][0]["review_status"], "draft")
+        self.assertEqual(detail["citations"][1]["legacy_reference_number"], "7")
+        self.assertEqual(detail["citations"][1]["doi"], "")
+        self.assertIn("ecospecies_reference_number = {7}", detail["citations"][1]["draft_bibtex"])
+
+    def test_editor_can_review_citations_and_reviews_survive_document_save(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        citation = citations["citations"][0]
+
+        result = repository.update_species_citation_review(
+            slug="test-shad",
+            citation_id=citation["id"],
+            review_status="accepted",
+            normalized_text="Jones, A. (2022). Fisheries review.",
+            doi="10.1000/review-doi",
+            citation_key="jones2022review",
+            entry_type="article",
+            draft_bibtex="@article{jones2022review,\n doi = {10.1000/review-doi}\n}",
+            username="edith",
+        )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation"]["review_status"], "accepted")
+        self.assertEqual(result["citation"]["source_type"], "editor_review")
+
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+
+        citations = repository.get_editor_species_citations("test-shad")
+        audit = repository.list_species_audit("test-shad")
+
+        self.assertIsNotNone(citations)
+        self.assertEqual(citations["citation_count"], 1)
+        self.assertEqual(citations["citations"][0]["review_status"], "accepted")
+        self.assertEqual(citations["citations"][0]["doi"], "10.1000/review-doi")
+        self.assertEqual(citations["citations"][0]["citation_key"], "jones2022review")
+        self.assertEqual(citations["citations"][0]["entry_type"], "article")
+        self.assertIn("10.1000/review-doi", citations["citations"][0]["draft_bibtex"])
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[1]["action"], "citation_review_update")
+
+    def test_editor_can_run_citation_enrichment(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        citation = citations["citations"][0]
+
+        with patch.object(
+            repository,
+            "enrich_citation_payload",
+            return_value={
+                "citation_key": "jones2022review",
+                "entry_type": "article",
+                "normalized_text": "Jones, A. (2022). Fisheries review. Journal of Tests. DOI:10.1000/review-doi",
+                "draft_bibtex": "@article{jones2022review,\n doi = {10.1000/review-doi},\n}",
+                "doi": "10.1000/review-doi",
+                "source_url": "https://doi.org/10.1000/review-doi",
+                "openalex_id": "W12345",
+                "resolver_source_label": "crossref:doi:10.1000/review-doi",
+                "enrichment_status": "resolved",
+                "enrichment_error": "",
+                "conflicts": [],
+            },
+        ):
+            result = repository.update_species_citation_enrichment(
+                slug="test-shad",
+                citation_id=citation["id"],
+                username="edith",
+            )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation"]["enrichment_status"], "resolved")
+        self.assertEqual(result["citation"]["doi"], "10.1000/review-doi")
+        self.assertEqual(result["citation"]["openalex_id"], "W12345")
+        self.assertEqual(result["citation"]["resolver_source_label"], "crossref:doi:10.1000/review-doi")
+        self.assertEqual(result["citation"]["source_url"], "https://doi.org/10.1000/review-doi")
+
+        citations = repository.get_editor_species_citations("test-shad")
+        audit = repository.list_species_audit("test-shad")
+
+        self.assertIsNotNone(citations)
+        self.assertEqual(citations["citations"][0]["citation_key"], "jones2022review")
+        self.assertEqual(citations["citations"][0]["entry_type"], "article")
+        self.assertEqual(citations["citations"][0]["enrichment_status"], "resolved")
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "citation_enrichment")
+
+    def test_editor_can_run_batch_citation_enrichment(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+- [8] Smith, B. 2021. Estuarine habitat paper.
+""",
+            username="frank",
+        )
+
+        payloads = [
+            {
+                "citation_key": "jones2022review",
+                "entry_type": "article",
+                "normalized_text": "Jones, A. (2022). Fisheries review.",
+                "draft_bibtex": "@article{jones2022review,\n}",
+                "doi": "10.1000/review-doi",
+                "source_url": "https://doi.org/10.1000/review-doi",
+                "openalex_id": "W12345",
+                "resolver_source_label": "crossref:doi:10.1000/review-doi",
+                "enrichment_status": "resolved",
+                "enrichment_error": "",
+                "conflicts": [],
+            },
+            {
+                "citation_key": "smith2021estuarine",
+                "entry_type": "misc",
+                "normalized_text": "",
+                "draft_bibtex": "",
+                "doi": "",
+                "source_url": "",
+                "openalex_id": "",
+                "resolver_source_label": "",
+                "enrichment_status": "unresolved",
+                "enrichment_error": "No metadata match found from DOI, title, or authority identifiers.",
+                "conflicts": [],
+            },
+        ]
+
+        with patch.object(repository, "enrich_citation_payload", side_effect=payloads):
+            result = repository.update_species_citations_enrichment_batch(
+                slug="test-shad",
+                username="edith",
+            )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation_count"], 2)
+        self.assertEqual(result["changed_count"], 2)
+        self.assertEqual(result["resolved_count"], 1)
+        self.assertEqual(result["unresolved_count"], 1)
+        self.assertEqual(result["error_count"], 0)
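+        # One resolved payload and one unresolved payload both count as
+        # "changed"; an unresolved lookup is a normal outcome, not an error.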
+
+    def test_editor_can_review_and_apply_citation_candidates(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
+""",
+            username="frank",
+        )
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        citation = citations["citations"][0]
+
+        with patch.object(
+            repository,
+            "discover_citation_candidates",
+            return_value={
+                "seed": {
+                    "fields": {
+                        "author": "Daniell, W.C.",
+                        "year": "1872",
+                        "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
+                        "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
+                        "volume": "2",
+                        "pages": "387-390",
+                    }
+                },
+                "candidate_count": 1,
+                "candidates": [
+                    {
+                        "candidate_id": "crossref-search-1-daniell-good",
+                        "source_label": "crossref:search:1:daniell-good",
+                        "entry_type": "article",
+                        "citation_key": "daniell1872lettersreferringexperiments",
+                        "fields": {
+                            "author": "Daniell, W.C.",
+                            "year": "1872",
+                            "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
+                            "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
+                            "volume": "2",
+                            "pages": "387-390",
+                        },
+                    }
+                ],
+            },
+        ):
+            candidates = repository.get_species_citation_candidates("test-shad", citation["id"])
+
+        self.assertIsNotNone(candidates)
+        self.assertEqual(candidates["candidate_count"], 1)
+
+        result = repository.apply_species_citation_candidate_selection(
+            slug="test-shad",
+            citation_id=citation["id"],
+            candidate={
+                "source_label": "crossref:search:1:daniell-good",
+                "entry_type": "article",
+                "fields": {
+                    "author": "Daniell, W.C.",
+                    "year": "1872",
+                    "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
+                    "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
+                    "volume": "2",
+                    "pages": "387-390",
+                },
+            },
+            username="edith",
+        )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation"]["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good")
+        self.assertEqual(result["citation"]["source_type"], "editor_selected_candidate")
+        self.assertEqual(result["citation"]["review_status"], "accepted")
+        audit = repository.list_species_audit("test-shad")
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "citation_candidate_selection")
+
+    def test_editor_can_add_candidate_as_additional_citation_and_preserve_it(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
+""",
+            username="frank",
+        )
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        source_citation = citations["citations"][0]
+
+        result = repository.add_species_citation_from_candidate(
+            slug="test-shad",
+            citation_id=source_citation["id"],
+            candidate={
+                "source_label": "crossref:search:1:daniell-related",
+                "entry_type": "article",
+                "fields": {
+                    "author": "Jordan, F.",
+                    "year": "2009",
+                    "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
+                    "journal": "Transactions of the American Fisheries Society",
+                    "volume": "19",
+                    "number": "1",
+                    "pages": "107-115",
+                    "doi": "10.1111/j.1600-0633.2009.00395.x",
+                    "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
+                },
+            },
+            username="edith",
+        )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation"]["source_type"], "editor_added_candidate")
+        self.assertEqual(result["citation"]["review_status"], "accepted")
+
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        self.assertEqual(citations["citation_count"], 2)
+        self.assertEqual(citations["citations"][1]["section_heading"], "References")
+        document = repository.get_species_document("test-shad")
+        self.assertIsNotNone(document)
+        self.assertIn("Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA", document["markdown"])
+
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown=document["markdown"],
+            username="frank",
+        )
+
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        self.assertEqual(citations["citation_count"], 2)
+        self.assertEqual(citations["citations"][1]["source_type"], "editor_added_candidate")
+        audit = repository.list_species_audit("test-shad")
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "document_update")
+        self.assertEqual(audit[1]["action"], "citation_candidate_addition")
+
+    def test_contributor_can_view_only_owned_citations(self) -> None:
+        created = repository.create_contributor_species(
+            "writer@example.org",
+            """---
+title: Contributor Draft
+common_name: Contributor Fish
+scientific_name: Pisces contributoris
+species_code:
+---
+
+## References
+
+- [12] Example, A. 2025. Draft reference.
+""",
+        )
+
+        owned = repository.get_contributor_species_citations(created["slug"], "writer@example.org")
+        other = repository.get_contributor_species_citations(created["slug"], "other@example.org")
+
+        self.assertIsNotNone(owned)
+        self.assertEqual(owned["citation_count"], 1)
+        self.assertEqual(owned["citations"][0]["legacy_reference_number"], "12")
+        self.assertIsNone(other)
+
+    def test_public_bibliography_aggregates_species_citations(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        citation = citations["citations"][0]
+        repository.update_species_citation_review(
+            slug="test-shad",
+            citation_id=citation["id"],
+            review_status="accepted",
+            normalized_text="Jones, A. (2022). Fisheries review.",
+            doi="10.1000/review-doi",
+            citation_key="jones2022review",
+            entry_type="article",
+            draft_bibtex="@article{jones2022review,\n doi = {10.1000/review-doi}\n}",
+            username="edith",
+            abstract_text="A short abstract about fisheries review.",
+        )
+
+        bibliography = repository.list_public_bibliography()
+
+        self.assertEqual(len(bibliography), 1)
+        self.assertEqual(bibliography[0]["citation_key"], "jones2022review")
+        self.assertEqual(bibliography[0]["abstract_text"], "A short abstract about fisheries review.")
+        self.assertEqual(bibliography[0]["legacy_reference_numbers"], ["7"])
+        self.assertEqual(bibliography[0]["species_count"], 1)
+        self.assertEqual(bibliography[0]["species_refs"][0]["slug"], "test-shad")
+
+    def test_register_contributor_creates_token_and_enforces_age_gate(self) -> None:
+        with self.assertRaisesRegex(ValueError, "at least 13 years old"):
+            repository.register_contributor("person@example.org", False)
+
+        result = repository.register_contributor("Person@Example.org", True)
+
+        self.assertEqual(result["username"], "person@example.org")
+        self.assertEqual(result["role"], "contributor")
+        self.assertEqual(result["minimum_age"], 13)
+        self.assertTrue(result["token"])
+
+    def test_contributor_can_create_and_edit_only_owned_species(self) -> None:
+        created = repository.create_contributor_species(
+            "writer@example.org",
+            """---
+title: Contributor Draft
+common_name: Contributor Fish
+scientific_name: Pisces contributoris
+species_code:
+---
+
+## Summary
+Draft summary.
+
+## Habitat
+Mangroves.
+""",
+        )
+
+        detail = repository.get_contributor_species_detail(created["slug"], "writer@example.org")
+        public_detail = repository.get_species_by_slug(created["slug"])
+
+        self.assertIsNotNone(detail)
+        self.assertIsNone(public_detail)
+        self.assertEqual(detail["publication_status"], "draft")
+        self.assertEqual(detail["common_name"], "Contributor Fish")
+
+        updated = repository.update_contributor_species_document_markdown(
+            created["slug"],
+            """---
+title: Contributor Draft Revised
+common_name: Contributor Fish
+scientific_name: Pisces contributoris
+species_code:
+---
+
+## Summary
+Revised summary.
+
+## Habitat
+Seagrass.
+
+### Depth
+Shallow bays.
+""",
+            "writer@example.org",
+        )
+
+        self.assertIsNotNone(updated)
+        detail = repository.get_contributor_species_detail(created["slug"], "writer@example.org")
+        other_user_detail = repository.get_contributor_species_detail(created["slug"], "other@example.org")
+        audit = repository.list_species_audit(created["slug"])
+
+        self.assertIsNotNone(detail)
+        self.assertEqual(detail["summary"], "Revised summary.")
+        self.assertEqual(
+            [section["heading"] for section in detail["sections"]],
+            ["Habitat", "Habitat / Depth"],
+        )
+        self.assertIsNone(other_user_detail)
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "contributor_document_update")
+
+
 if __name__ == "__main__":
     unittest.main()
apps/web/app.js (1111 changed lines)
File diff suppressed because it is too large
@@ -0,0 +1,43 @@
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>EcoSpecies Bibliography</title>
  <link rel="stylesheet" href="./styles.css">
</head>
<body>
  <header class="site-header">
    <div class="site-header-inner">
      <div class="site-brand">
        <p class="site-brand-mark">Open Species Archive</p>
        <a href="./index.html" class="site-brand-link">EcoSpecies Atlas</a>
        <p class="site-brand-summary">Public field atlas for historical species life-history materials.</p>
      </div>
      <nav class="site-nav" aria-label="Primary">
        <a href="./index.html">Atlas</a>
        <a href="./bibliography.html">Bibliography</a>
      </nav>
    </div>
  </header>
  <main class="page">
    <section class="hero">
      <p class="eyebrow">EcoSpecies Atlas</p>
      <h1>Bibliography</h1>
      <p class="lede">
        A site-wide bibliography for the EcoSpecies atlas, including imported references and citations added during review.
      </p>
      <div class="auth-bar auth-panel-row">
        <input id="bibliography-search" type="search" placeholder="Search title, author, DOI, or abstract">
        <button id="bibliography-download" type="button" class="secondary-button">Download BibTeX</button>
        <p id="bibliography-status" class="auth-status">Loading bibliography...</p>
      </div>
    </section>

    <section class="panel">
      <div id="bibliography-list" class="public-citation-list"></div>
    </section>
  </main>
  <script src="./bibliography.js" defer></script>
</body>
</html>
@@ -0,0 +1,230 @@
function getAppBase() {
  const { pathname } = window.location;
  if (pathname === "/" || pathname === "/index.html") {
    return "";
  }
  if (pathname.endsWith("/index.html")) {
    return pathname.slice(0, -"/index.html".length);
  }
  return pathname.endsWith("/") ? pathname.slice(0, -1) : pathname;
}
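
// Illustrative behavior, inferred from the logic above (not in the original file):
//   "/bibliography.html"                 -> "/bibliography.html"
//   "/apps/ecospecies/bibliography.html" -> "/apps/ecospecies/bibliography.html"
// apiBase below then strips the trailing /bibliography.html, so a
// path-prefixed deployment yields apiBase = "/apps/ecospecies".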

const apiBase = getAppBase().replace(/\/bibliography\.html$/, "");
const bibliographyList = document.querySelector("#bibliography-list");
const bibliographySearch = document.querySelector("#bibliography-search");
const bibliographyStatus = document.querySelector("#bibliography-status");
const bibliographyDownload = document.querySelector("#bibliography-download");
let currentBibliographyItems = [];

function escapeHtml(value) {
  return String(value)
    .replaceAll("&", "&amp;")
    .replaceAll('"', "&quot;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;");
}
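
// escapeHtml is applied to every untrusted string before it is interpolated
// into the innerHTML templates below.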

function normalizeAbstractForDisplay(value) {
  const raw = String(value || "").trim();
  if (!raw) {
    return "";
  }
  const temp = document.createElement("div");
  temp.innerHTML = raw;
  return temp.textContent
    .replace(/^abstract\s*[:.\-]?\s*/i, "")
    .replace(/\s+/g, " ")
    .trim();
}

function parseBibtexFields(draftBibtex) {
  const fields = {};
  const text = String(draftBibtex || "");
  const pattern = /([a-zA-Z_]+)\s*=\s*\{([^}]*)\}/g;
  let match = pattern.exec(text);
  while (match) {
    fields[match[1].toLowerCase()] = match[2].trim();
    match = pattern.exec(text);
  }
  return fields;
}
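
// Example (illustrative): parseBibtexFields("@article{x,\n  year = {2022}\n}")
// yields { year: "2022" }. Values containing nested braces are not captured
// by this simple pattern.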

function collectBibtexRecords(items) {
  const seen = new Set();
  const records = [];
  for (const item of items || []) {
    const draftBibtex = String(item && item.draft_bibtex ? item.draft_bibtex : "").trim();
    if (!draftBibtex || seen.has(draftBibtex)) {
      continue;
    }
    seen.add(draftBibtex);
    records.push(draftBibtex);
  }
  return records;
}

function downloadBibtexRecords(items, filenameStem) {
  const records = collectBibtexRecords(items);
  if (!records.length) {
    return false;
  }
  const blob = new Blob([`${records.join("\n\n")}\n`], { type: "application/x-bibtex;charset=utf-8" });
  const url = URL.createObjectURL(blob);
  const link = document.createElement("a");
  link.href = url;
  link.download = `${filenameStem}.bib`;
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  window.setTimeout(() => URL.revokeObjectURL(url), 0);
  return true;
}
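
// Standard client-side download pattern: serialize to a Blob, click a
// temporary object-URL link, then revoke the URL on the next tick.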

function syncDownloadButton(items) {
  if (!bibliographyDownload) {
    return;
  }
  const recordCount = collectBibtexRecords(items).length;
  bibliographyDownload.disabled = !recordCount;
  bibliographyDownload.textContent = recordCount
    ? `Download BibTeX (${recordCount})`
    : "Download BibTeX";
}

function buildCitationText(item) {
  const fields = parseBibtexFields(item.draft_bibtex || "");
  if (item.normalized_text) {
    return escapeHtml(item.normalized_text);
  }
  const author = fields.author || "";
  const year = fields.year || "";
  const title = fields.title || "";
  const venue = fields.journal || fields.booktitle || fields.publisher || "";
  const volume = fields.volume || "";
  const issue = fields.number || "";
  const pages = fields.pages || "";
  const parts = [];
  const lead = [author, year ? `(${year})` : ""].filter(Boolean).join(" ");
  if (lead) {
    parts.push(lead);
  }
  if (title) {
    parts.push(title);
  }
  const venueBits = [venue, volume ? `${volume}${issue ? `(${issue})` : ""}` : issue ? `(${issue})` : "", pages]
    .filter(Boolean)
    .join(", ");
  if (venueBits) {
    parts.push(venueBits);
  }
  return escapeHtml(parts.join(". ").trim() || item.raw_text || "");
}

function renderSpeciesRefs(refs) {
  return refs
    .map(
      (ref) =>
        `<a href="./index.html#${escapeHtml(ref.slug)}">${escapeHtml(ref.common_name || ref.slug)}</a>`,
    )
    .join(", ");
}

function renderAbstractBlock(text) {
  const abstract = normalizeAbstractForDisplay(text);
  if (!abstract) {
    return "";
  }
  return `
    <div class="citation-abstract-shell">
      <button type="button" class="secondary-button citation-abstract-toggle" aria-expanded="false">
        Show Abstract
      </button>
      <div class="citation-abstract-display hidden">
        <p class="public-citation-abstract">${escapeHtml(abstract)}</p>
      </div>
    </div>
  `;
}

function attachCitationAbstractToggles(root) {
  for (const toggle of root.querySelectorAll(".citation-abstract-toggle")) {
    const shell = toggle.parentElement;
    const display = shell && shell.querySelector(".citation-abstract-display");
    if (!display) {
      continue;
    }
    toggle.addEventListener("click", () => {
      const hidden = display.classList.toggle("hidden");
      toggle.setAttribute("aria-expanded", hidden ? "false" : "true");
      toggle.textContent = hidden ? "Show Abstract" : "Hide Abstract";
    });
  }
}

function renderBibliography(items) {
  bibliographyList.innerHTML = "";
  if (!items.length) {
    bibliographyList.innerHTML = `<p class="editor-status">No bibliography entries match the current search.</p>`;
    return;
  }

  for (const item of items) {
    const links = [
      item.doi ? `<a href="https://doi.org/${encodeURIComponent(String(item.doi).replace(/^https?:\/\/doi\.org\//, ""))}" target="_blank" rel="noopener noreferrer">DOI</a>` : "",
      item.source_url ? `<a href="${escapeHtml(item.source_url)}" target="_blank" rel="noopener noreferrer">Source</a>` : "",
      item.openalex_id ? `<a href="https://openalex.org/${escapeHtml(String(item.openalex_id).replace(/^https?:\/\/openalex\.org\//, ""))}" target="_blank" rel="noopener noreferrer">OpenAlex</a>` : "",
    ]
      .filter(Boolean)
      .join(" · ");

    const article = document.createElement("article");
    article.className = "public-citation-entry";
    article.innerHTML = `
      <p class="public-citation-text">${buildCitationText(item)}</p>
      ${renderAbstractBlock(item.abstract_text || "")}
      <p class="public-citation-meta">
        Appears in ${item.species_count} species record${item.species_count === 1 ? "" : "s"}
        ${item.legacy_reference_numbers && item.legacy_reference_numbers.length ? ` • Imported references: ${item.legacy_reference_numbers.map((value) => escapeHtml(value)).join(", ")}` : ""}
      </p>
      <p class="public-citation-meta">Species: ${renderSpeciesRefs(item.species_refs || [])}</p>
      ${links ? `<p class="public-citation-links">${links}</p>` : ""}
    `;
    attachCitationAbstractToggles(article);
    bibliographyList.appendChild(article);
  }
}

async function loadBibliography(search = "") {
  bibliographyStatus.textContent = "Loading bibliography...";
  const query = search ? `?search=${encodeURIComponent(search)}` : "";
  const response = await fetch(`${apiBase}/api/bibliography${query}`);
  const data = await response.json();
  if (!response.ok) {
    bibliographyList.innerHTML = `<p class="error">${escapeHtml(data.error || "Unable to load bibliography.")}</p>`;
    bibliographyStatus.textContent = data.error || "Bibliography load failed";
    return;
  }

  currentBibliographyItems = data.items || [];
  renderBibliography(currentBibliographyItems);
  syncDownloadButton(currentBibliographyItems);
  bibliographyStatus.textContent = `${data.count || 0} bibliography entr${data.count === 1 ? "y" : "ies"}`;
}

bibliographySearch.addEventListener("input", async (event) => {
  await loadBibliography(event.target.value);
});

loadBibliography().catch((error) => {
  bibliographyList.innerHTML = `<p class="error">Failed to load bibliography: ${escapeHtml(String(error))}</p>`;
  bibliographyStatus.textContent = "Bibliography load failed";
});

if (bibliographyDownload) {
  bibliographyDownload.addEventListener("click", () => {
    const downloaded = downloadBibtexRecords(currentBibliographyItems, "ecospecies-bibliography");
    if (!downloaded) {
      bibliographyStatus.textContent = "No BibTeX records are available for download yet.";
    }
  });
}
@@ -7,20 +7,31 @@
   <link rel="stylesheet" href="./styles.css">
 </head>
 <body>
+  <header class="site-header">
+    <div class="site-header-inner">
+      <div class="site-brand">
+        <p class="site-brand-mark">Open Species Archive</p>
+        <a href="./index.html" class="site-brand-link">EcoSpecies Atlas</a>
+        <p class="site-brand-summary">Public field atlas for historical species life-history materials.</p>
+      </div>
+      <nav class="site-nav" aria-label="Primary">
+        <a href="./index.html">Atlas</a>
+        <a href="./bibliography.html">Bibliography</a>
+      </nav>
+    </div>
+  </header>
   <main class="page">
     <section class="hero">
-      <p class="eyebrow">Marine Species Knowledge System</p>
+      <p class="eyebrow">Open Biodiversity Reference</p>
-      <h1>EcoSpecies</h1>
+      <h1>EcoSpecies Atlas</h1>
       <p class="lede">
-        A modern follow-on for the legacy EcoSpecies archive, starting with direct ingestion
-        of historical Species Life History text files.
+        A modern follow-on for the legacy EcoSpecies archive, built as an open ecology and
+        biodiversity reference workspace.
+      </p>
+      <p class="hero-context">
+        Use EcoSpecies Atlas for species profiles, habitat evidence, ecological reading, and
+        citation-aware exploration grounded in the migrated legacy corpus.
       </p>
-      <div class="auth-bar">
-        <input id="auth-token" type="password" placeholder="Bearer token for editor access">
-        <button id="auth-save" type="button">Use Token</button>
-        <button id="auth-clear" type="button" class="secondary-button">Clear</button>
-        <p id="auth-status" class="auth-status">Public access</p>
-      </div>
       <div class="hero-stats">
         <div class="stat">
           <span id="species-count">0</span>

@@ -38,6 +49,7 @@
       <div class="panel-header">
         <h2>Species</h2>
         <input id="search" type="search" placeholder="Search common or scientific name">
+        <button id="contributor-create" type="button" class="secondary-button hidden">Create New Draft</button>
         <div id="archive-filter-group" class="archive-filter-group hidden">
           <button type="button" class="archive-filter-button is-active" data-archive-filter="active">Active</button>
           <button type="button" class="archive-filter-button" data-archive-filter="all">All</button>

@@ -66,16 +78,59 @@
             This record is archived. It is hidden from public endpoints but remains available to editors for audit and recovery.
           </p>
         </header>
-        <section id="editor-panel" class="detail-section editor-panel hidden">
-          <h3>Editor Controls</h3>
+        <div id="detail-sections" class="detail-sections"></div>
+        <div class="workflow-panels">
+          <section id="legacy-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Legacy Materials Under Review">
+            <div class="collapsible-header">
+              <h3>Legacy Materials Under Review</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="legacy-panel" data-label="Legacy Materials Under Review" aria-expanded="false">
+                Show Legacy Materials Under Review
+              </button>
+            </div>
+            <div class="collapsible-body">
+              <p id="legacy-source-meta" class="editor-status"></p>
+              <pre id="legacy-source-text" class="legacy-source"></pre>
+            </div>
+          </section>
+          <section id="access-panel" class="detail-section collapsible-panel collapsed" data-label="Access and Contribution">
+            <div class="collapsible-header">
+              <h3>Access and Contribution</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="access-panel" data-label="Access and Contribution" aria-expanded="false">
+                Show Access and Contribution
+              </button>
+            </div>
+            <div class="collapsible-body">
+              <div class="auth-bar auth-panel-row">
+                <input id="auth-token" type="password" placeholder="Bearer token for editor access">
+                <button id="auth-save" type="button">Use Token</button>
+                <button id="auth-clear" type="button" class="secondary-button">Clear</button>
+                <p id="auth-status" class="auth-status">Public access</p>
+              </div>
+              <div class="auth-bar contributor-signup auth-panel-row">
+                <input id="contributor-email" type="email" placeholder="Email for contributor access">
+                <label class="archive-toggle contributor-age-gate">
+                  <input id="contributor-age-gate" type="checkbox">
+                  <span>I confirm I am at least <span id="contributor-age-label">13</span> years old</span>
+                </label>
+                <button id="contributor-register" type="button" class="secondary-button">Become Contributor</button>
+                <p id="contributor-status" class="auth-status"></p>
+              </div>
+            </div>
+          </section>
+          <section id="editor-panel" class="detail-section collapsible-panel editor-panel collapsed hidden" data-label="Editing Workflow">
+            <div class="collapsible-header">
+              <h3>Editing Workflow</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="editor-panel" data-label="Editing Workflow" aria-expanded="false">
+                Show Editing Workflow
+              </button>
+            </div>
+            <div class="collapsible-body">
              <label class="editor-label" for="editor-publication-status">Publication Status</label>
              <select id="editor-publication-status">
                <option value="draft">Draft</option>
                <option value="review">Review</option>
                <option value="published">Published</option>
              </select>
-             <label class="editor-label" for="editor-summary">Summary</label>
-             <textarea id="editor-summary" rows="5" placeholder="Write a concise executive summary."></textarea>
              <label class="editor-label" for="editor-notes">Editor Notes</label>
              <textarea id="editor-notes" rows="4" placeholder="Internal editorial notes"></textarea>
              <label class="archive-toggle">

@@ -86,24 +141,103 @@
              <button id="editor-save" type="button">Save Editorial Changes</button>
              <p id="editor-status" class="editor-status"></p>
            </div>
+            </div>
          </section>
-        <section id="audit-panel" class="detail-section hidden">
+          <section id="document-panel" class="detail-section collapsible-panel editor-panel collapsed hidden" data-label="Metadata and Document Workflow">
+            <div class="collapsible-header">
+              <h3>Metadata and Document Workflow</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="document-panel" data-label="Metadata and Document Workflow" aria-expanded="false">
+                Show Metadata and Document Workflow
+              </button>
+            </div>
+            <div class="collapsible-body">
+              <div class="document-panel-header">
+                <div>
+                  <p class="editor-status">
+                    Markdown is the editable source of truth for hierarchy. Front matter and headings are validated on save.
+                  </p>
+                </div>
+                <div class="editor-actions">
+                  <button id="document-save" type="button">Save Document</button>
+                  <p id="document-status" class="editor-status"></p>
+                </div>
+              </div>
+              <label class="editor-label" for="document-markdown">Markdown Source</label>
+              <textarea id="document-markdown" class="document-editor" rows="18" spellcheck="false"></textarea>
+              <details class="document-preview-shell" open>
+                <summary>Outline Preview</summary>
+                <div id="document-preview" class="document-preview"></div>
+              </details>
+            </div>
+          </section>
+          <section id="citation-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Review Workflow">
+            <div class="collapsible-header">
+              <h3>Review Workflow</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="citation-panel" data-label="Review Workflow" aria-expanded="false">
+                Show Review Workflow
+              </button>
+            </div>
+            <div class="collapsible-body">
+              <div class="document-panel-header">
+                <div>
+                  <p id="citation-status" class="editor-status">
+                    Extracted bibliography entries and draft BibTeX records.
+                  </p>
+                </div>
+                <div class="editor-actions">
+                  <button id="citation-backfill-species" type="button" class="secondary-button hidden">Backfill This Species</button>
+                  <button id="citation-enrich-all" type="button" class="secondary-button hidden">Run Enrichment For All Citations</button>
+                </div>
+              </div>
+              <div id="citation-list" class="citation-list"></div>
+            </div>
+          </section>
+          <section id="audit-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Audit History">
+            <div class="collapsible-header">
               <h3>Audit History</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="audit-panel" data-label="Audit History" aria-expanded="false">
+                Show Audit History
+              </button>
+            </div>
+            <div class="collapsible-body">
               <div id="audit-list" class="audit-list"></div>
+            </div>
          </section>
-        <div id="detail-sections" class="detail-sections"></div>
+        </div>
       </article>
     </section>
   </section>

   <footer class="footer">
     <p>
-      This migration path preserves attribution for Dr. Peter Rubec, Dr. Diane Blackwood,
+      EcoSpecies Atlas preserves attribution for Dr. Peter Rubec, Dr. Diane Blackwood,
       Dr. Welsbery R. Elsberry, and the Florida Fish and Wildlife Research Institute context
       documented in the legacy project materials.
     </p>
   </footer>
 </main>
+  <section id="citation-match-dialog" class="match-dialog-shell hidden" aria-hidden="true">
+    <div class="match-dialog-backdrop"></div>
+    <article class="match-dialog-card" role="dialog" aria-modal="true" aria-labelledby="citation-match-title">
+      <div class="match-dialog-header">
+        <div>
+          <h2 id="citation-match-title">Citation Candidate Review</h2>
+          <p id="citation-match-status" class="editor-status">Compare the parsed source citation against candidate metadata.</p>
+        </div>
+        <button id="citation-match-close" type="button" class="secondary-button">Close</button>
+      </div>
+      <div class="match-dialog-grid">
+        <section class="detail-section">
+          <h3>Parsed Source Metadata</h3>
+          <div id="citation-match-seed" class="match-seed"></div>
+        </section>
+        <section class="detail-section">
+          <h3>Candidate Matches</h3>
+          <div id="citation-match-candidates" class="match-candidates"></div>
+        </section>
+      </div>
+    </article>
+  </section>
 <script src="./app.js" defer></script>
 </body>
 </html>

@@ -5,6 +5,10 @@ server {
     root /usr/share/nginx/html;
     index index.html;

+    location = /apps/ecospecies {
+        return 301 /apps/ecospecies/;
+    }
+
     location /api/ {
         proxy_pass http://api:8000/api/;
         proxy_http_version 1.1;

@@ -14,19 +18,46 @@ server {
         proxy_set_header X-Forwarded-Proto $scheme;
     }

+    location /apps/ecospecies/api/ {
+        rewrite ^/apps/ecospecies/api/(.*)$ /api/$1 break;
+        proxy_pass http://api:8000;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+
     location /healthz {
         proxy_pass http://api:8000/healthz;
         proxy_http_version 1.1;
         proxy_set_header Host $host;
     }

+    location /apps/ecospecies/healthz {
+        proxy_pass http://api:8000/healthz;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+
     location /readyz {
         proxy_pass http://api:8000/readyz;
         proxy_http_version 1.1;
         proxy_set_header Host $host;
     }

+    location /apps/ecospecies/readyz {
+        proxy_pass http://api:8000/readyz;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+
     location / {
         try_files $uri $uri/ /index.html;
     }
+
+    location /apps/ecospecies/ {
+        rewrite ^/apps/ecospecies/(.*)$ /$1 break;
+        try_files $uri $uri/ /index.html;
+    }
 }

@@ -1,12 +1,12 @@
 :root {
-  --bg: #f4efe6;
+  --bg: #f4f7fb;
-  --paper: rgba(255, 252, 247, 0.78);
+  --paper: rgba(255, 255, 255, 0.88);
-  --ink: #16251f;
+  --ink: #182433;
-  --muted: #58655f;
+  --muted: #5f6b7d;
-  --accent: #0f766e;
+  --accent: #2457a6;
-  --accent-2: #bc6c25;
+  --accent-2: #1f7a5a;
-  --line: rgba(22, 37, 31, 0.12);
+  --line: rgba(24, 36, 51, 0.11);
-  --shadow: 0 24px 70px rgba(24, 35, 30, 0.15);
+  --shadow: 0 24px 70px rgba(33, 52, 84, 0.14);
 }

 * {

@@ -15,12 +15,83 @@
 body {
   margin: 0;
-  font-family: Georgia, "Times New Roman", serif;
+  font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif;
   color: var(--ink);
   background:
-    radial-gradient(circle at top left, rgba(15, 118, 110, 0.14), transparent 28%),
+    radial-gradient(circle at top left, rgba(36, 87, 166, 0.14), transparent 26%),
-    radial-gradient(circle at top right, rgba(188, 108, 37, 0.16), transparent 24%),
+    radial-gradient(circle at top right, rgba(31, 122, 90, 0.12), transparent 24%),
-    linear-gradient(180deg, #f8f4ec, #efe6d7 70%, #e7dcc9);
+    linear-gradient(180deg, #f4f7fb, #e4edf6 72%, #d9e6ef);
 }

+.site-header {
+  width: min(1320px, calc(100vw - 32px));
+  margin: 0 auto;
+  padding-top: 24px;
+}
+
+.site-header-inner {
+  display: flex;
+  gap: 18px;
+  align-items: center;
+  justify-content: space-between;
+  padding: 18px 22px;
+  border-radius: 24px;
+  backdrop-filter: blur(10px);
+  background: var(--paper);
+  border: 1px solid var(--line);
+  box-shadow: var(--shadow);
+}
+
+.site-brand {
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+
+.site-brand-mark {
+  margin: 0;
+  color: var(--accent);
+  text-transform: uppercase;
+  letter-spacing: 0.18em;
+  font-size: 0.76rem;
+}
+
+.site-brand-link {
+  color: var(--ink);
+  font-size: 1.5rem;
+  font-weight: 700;
+  text-decoration: none;
+}
+
+.site-brand-summary {
+  margin: 0;
+  color: var(--muted);
+  font-size: 0.94rem;
+}
+
+.site-nav {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 10px;
+  justify-content: flex-end;
+}
+
+.site-nav a {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  border-radius: 999px;
+  padding: 11px 16px;
+  text-decoration: none;
+  color: var(--ink);
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.72);
+  transition: transform 160ms ease, border-color 160ms ease;
+}
+
+.site-nav a:hover {
+  transform: translateY(-1px);
+  border-color: rgba(15, 118, 110, 0.45);
+}

 .page {

@@ -42,6 +113,9 @@ body {
 .hero {
   padding: 28px;
   margin-bottom: 20px;
+  background:
+    linear-gradient(135deg, rgba(255, 255, 255, 0.95), rgba(234, 244, 240, 0.92)),
+    var(--paper);
 }

 .eyebrow {

@@ -56,6 +130,7 @@ h1 {
   margin: 0;
   font-size: clamp(2.8rem, 7vw, 5.6rem);
   line-height: 0.92;
+  letter-spacing: -0.03em;
 }

 .lede {

@@ -64,6 +139,12 @@ h1 {
   font-size: 1.08rem;
 }

+.hero-context {
+  max-width: 68ch;
+  color: var(--muted);
+  line-height: 1.58;
+}
+
 .hero-stats {
   display: flex;
   gap: 16px;

@@ -79,6 +160,15 @@ h1 {
   margin-top: 18px;
 }

+.auth-panel-row {
+  margin-top: 0;
+}
+
+.contributor-signup {
+  padding-top: 14px;
+  border-top: 1px solid var(--line);
+}
+
 .auth-bar input {
   min-width: min(360px, 100%);
   flex: 1;

@@ -93,7 +183,7 @@ h1 {
   min-width: 180px;
   padding: 14px 16px;
   border-radius: 18px;
-  background: rgba(255, 255, 255, 0.6);
+  background: linear-gradient(180deg, rgba(255, 255, 255, 0.95), rgba(232, 242, 239, 0.92));
   border: 1px solid var(--line);
 }

@@ -158,6 +248,16 @@ input[type="search"] {
   background: rgba(255, 255, 255, 0.9);
 }

+input[type="text"],
+input[type="email"],
+input[type="password"] {
+  border: 1px solid var(--line);
+  border-radius: 18px;
+  padding: 12px 14px;
+  font: inherit;
+  background: rgba(255, 255, 255, 0.92);
+}
+
 select,
 textarea,
 button {

@@ -201,7 +301,7 @@ button {
   padding: 14px;
   border-radius: 18px;
   border: 1px solid var(--line);
-  background: linear-gradient(180deg, rgba(255, 255, 255, 0.95), rgba(241, 237, 230, 0.95));
+  background: linear-gradient(180deg, rgba(255, 255, 255, 0.97), rgba(239, 246, 244, 0.94));
   cursor: pointer;
   transition: transform 160ms ease, border-color 160ms ease;
 }

@@ -213,7 +313,7 @@ button {
 .species-card-archived {
   border-style: dashed;
-  background: linear-gradient(180deg, rgba(247, 241, 231, 0.98), rgba(233, 226, 214, 0.98));
+  background: linear-gradient(180deg, rgba(243, 247, 249, 0.98), rgba(227, 236, 242, 0.98));
 }

 .species-name,

@@ -273,6 +373,32 @@ button {
   display: none;
 }

+.match-dialog-shell {
+  position: fixed;
+  inset: 0;
+  z-index: 50;
+}
+
+.match-dialog-backdrop {
+  position: absolute;
+  inset: 0;
+  background: rgba(12, 20, 18, 0.46);
+}
+
+.match-dialog-card {
+  position: relative;
+  z-index: 1;
+  width: min(1180px, calc(100vw - 32px));
+  max-height: calc(100vh - 40px);
+  overflow: auto;
+  margin: 20px auto;
+  padding: 18px;
+  border-radius: 24px;
+  background: #fbf8f1;
+  border: 1px solid var(--line);
+  box-shadow: var(--shadow);
+}
+
 .detail-header {
   padding-bottom: 16px;
   border-bottom: 1px solid var(--line);

@@ -313,6 +439,12 @@ button {
   margin-top: 18px;
 }

+.workflow-panels {
+  display: grid;
+  gap: 16px;
+  margin-top: 20px;
+}
+
 .detail-section {
   padding: 16px;
   border-radius: 18px;

@@ -329,6 +461,44 @@ button {
   margin-top: 18px;
 }

+.workflow-panels .editor-panel,
+.workflow-panels .detail-section {
+  margin-top: 0;
+}
+
+.collapsible-panel {
+  padding-top: 14px;
+}
+
+.collapsible-header {
+  display: flex;
+  gap: 12px;
+  align-items: center;
+  justify-content: space-between;
+  flex-wrap: wrap;
+}
+
+.collapsible-header h3 {
+  margin-bottom: 0;
+}
+
+.collapsible-body {
+  margin-top: 16px;
+}
+
+.collapsible-panel.collapsed .collapsible-body {
+  display: none;
+}
+
+.document-panel-header {
+  display: flex;
+  gap: 16px;
+  align-items: flex-start;
+  justify-content: space-between;
+  flex-wrap: wrap;
+  margin-bottom: 14px;
+}
+
 .editor-label {
   display: block;
   margin: 0 0 8px;

@@ -349,6 +519,11 @@ button {
   font-weight: 700;
 }

+.contributor-age-gate {
+  margin: 0;
+  font-weight: 400;
+}
+
 .archive-toggle input {
   width: 18px;
   height: 18px;

@@ -372,6 +547,149 @@ button {
   gap: 12px;
 }

+.citation-list {
+  display: grid;
+  gap: 14px;
+}
+
+.citation-entry {
+  padding: 14px;
+  border-radius: 16px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.76);
+}
+
+.citation-entry-meta {
+  margin: 0 0 10px;
+  color: var(--muted);
+  font-size: 0.92rem;
+}
+
+.citation-entry-raw {
+  margin: 0 0 12px;
+  line-height: 1.5;
+}
+
+.citation-bibtex,
+.citation-bibtex-editor {
+  font-family: "Courier New", monospace;
+  font-size: 0.9rem;
+  line-height: 1.45;
+}
+
+.citation-abstract-shell {
+  display: grid;
+  gap: 8px;
+  margin: 4px 0 10px;
+}
+
+.citation-detail-shell {
+  display: grid;
+  gap: 8px;
+  margin: 4px 0 10px;
+}
+
+.citation-abstract-display {
+  padding: 10px 12px;
+  border-radius: 12px;
+  border: 1px solid var(--line);
+  background: rgba(15, 118, 110, 0.05);
+}
+
+.citation-detail-display {
+  padding: 10px 12px;
+  border-radius: 12px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.78);
+}
+
+.match-dialog-header,
+.match-dialog-grid,
+.match-candidate-header,
+.match-candidates,
+.match-candidate-card,
+.match-seed,
+.match-table {
+  display: grid;
+  gap: 12px;
+}
+
+.match-dialog-header {
+  grid-template-columns: minmax(0, 1fr) auto;
+  align-items: start;
+}
+
+.match-dialog-grid {
+  grid-template-columns: minmax(260px, 0.9fr) minmax(0, 1.6fr);
+  margin-top: 16px;
+}
+
+.match-candidate-card {
+  padding: 14px;
+  border-radius: 16px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.84);
+}
+
+.match-candidate-header {
+  grid-template-columns: minmax(0, 1fr) auto;
+  align-items: baseline;
+}
+
+.match-score {
+  font-weight: 700;
+  color: var(--accent);
+}
+
+.match-table {
+  border: 1px solid var(--line);
+  border-radius: 14px;
+  overflow: hidden;
+}
+
+.match-row {
+  display: grid;
+  grid-template-columns: 120px 110px minmax(0, 1fr) minmax(0, 1fr);
+  gap: 10px;
+  padding: 10px 12px;
+  border-top: 1px solid var(--line);
+  font-size: 0.92rem;
+}
+
+.match-row:first-child {
+  border-top: 0;
+}
+
+.match-row-head {
+  background: rgba(15, 118, 110, 0.08);
+  font-weight: 700;
+}
+
+.match-label {
+  color: var(--muted);
+  font-weight: 700;
+}
+
+.match-status {
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  font-size: 0.78rem;
+}
+
+.match-status-exact {
+  color: var(--accent);
+}
+
+.match-status-partial,
+.match-status-seed-missing,
+.match-status-candidate-missing {
+  color: var(--accent-2);
+}
+
+.match-status-conflict {
+  color: #a12626;
+}
+
 .audit-entry {
   padding: 14px;
   border-radius: 16px;

@@ -394,6 +712,62 @@ button {
   line-height: 1.45;
 }

+.document-editor,
+.document-preview {
+  font-family: "Courier New", monospace;
+  font-size: 0.92rem;
+  line-height: 1.5;
+}
+
+.document-editor {
+  min-height: 420px;
+  margin-bottom: 14px;
+  white-space: pre;
+  overflow: auto;
+}
+
+.document-preview-shell {
+  border: 1px solid var(--line);
+  border-radius: 18px;
+  background: rgba(255, 255, 255, 0.72);
+  overflow: hidden;
+}
+
+.document-preview-shell summary {
+  cursor: pointer;
+  padding: 12px 16px;
+  font-weight: 700;
+  color: var(--accent);
+}
+
+.document-preview {
+  padding: 0 16px 16px;
+}
+
+.document-preview-empty {
+  color: var(--muted);
+}
+
+.document-preview-list {
+  margin: 0;
+  padding-left: 22px;
+}
+
+.document-preview-list li + li {
+  margin-top: 8px;
+}
+
+.document-preview-metadata {
+  margin: 0 0 14px;
+  padding: 0;
+  list-style: none;
+  color: var(--muted);
+}
+
+.document-preview-metadata li + li {
+  margin-top: 6px;
+}
+
 .diagnostic-list {
   margin: 0;
   padding-left: 18px;

@@ -403,6 +777,100 @@ button {
   margin-top: 8px;
 }

+.structured-node {
+  display: grid;
+  gap: 12px;
+  background: linear-gradient(180deg, rgba(255, 255, 255, 0.84), rgba(242, 247, 252, 0.88));
+}
+
+.structured-node + .structured-node {
+  margin-top: 4px;
+}
+
+.structured-node h3,
+.structured-node h4,
+.structured-node h5,
+.structured-node h6 {
+  line-height: 1.18;
+  letter-spacing: -0.01em;
+}
+
+.structured-node-body {
+  margin: 0;
+  line-height: 1.58;
+  color: var(--ink);
+}
+
+.structured-node-children {
+  display: grid;
+  gap: 12px;
+  padding: 4px 0 0 18px;
+  border-left: 2px solid rgba(36, 87, 166, 0.12);
+}
+
+.public-citation-list {
+  display: grid;
+  gap: 14px;
+}
+
+.public-bibliography-actions {
+  display: flex;
+  gap: 12px;
+  align-items: center;
+  flex-wrap: wrap;
+}
+
+.public-bibliography-note {
+  margin: 0;
+  color: var(--muted);
+  font-size: 0.92rem;
+}
+
+.public-citation-entry {
+  display: grid;
+  gap: 8px;
+  padding: 14px;
+  border-radius: 16px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.76);
+}
+
+.public-citation-text,
+.public-citation-meta,
+.public-citation-links,
+.public-citation-abstract {
+  margin: 0;
+}
+
+.public-citation-text {
+  line-height: 1.56;
+}
+
+.public-citation-meta,
+.public-citation-links {
+  color: var(--muted);
+  font-size: 0.92rem;
+}
+
+.public-citation-links a {
+  color: var(--accent);
+}
+
+.public-citation-abstract {
+  padding-top: 2px;
+  color: var(--muted);
+  line-height: 1.58;
+}
+
+.legacy-source {
+  max-height: 28rem;
+  overflow: auto;
+  padding: 14px;
+  border-radius: 16px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.76);
+}
+
 pre {
   margin: 0;
   white-space: pre-wrap;

@@ -417,6 +885,15 @@ pre {
 }

 @media (max-width: 960px) {
+  .site-header-inner {
+    flex-direction: column;
+    align-items: stretch;
+  }
+
+  .site-nav {
+    justify-content: flex-start;
+  }
+
   .workspace {
     grid-template-columns: 1fr;
   }

@@ -424,4 +901,12 @@ pre {
   .species-list {
     max-height: 40vh;
   }
+
+  .match-dialog-grid {
+    grid-template-columns: 1fr;
+  }
+
+  .match-row {
+    grid-template-columns: 1fr;
+  }
 }

@@ -1,5 +1,6 @@
 services:
   db:
+    container_name: ecospecies-db
     image: postgres:16-alpine
     environment:
       POSTGRES_DB: ecospecies

@@ -17,6 +18,7 @@ services:
       - postgres_data:/var/lib/postgresql/data

   importer:
+    container_name: ecospecies-importer
     image: python:3.12-slim
     depends_on:
       db:

@@ -30,11 +32,12 @@ services:
     command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
     volumes:
       - .:/workspace
-      - ../01-legacy-code-and-data:/legacy-data:ro
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../legacy-corpus}:/legacy-data:ro
       - python_venv:/workspace/.docker/venv
       - pip_cache:/root/.cache/pip

   api:
+    container_name: ecospecies-api
     image: python:3.12-slim
     restart: unless-stopped
     depends_on:

@@ -56,11 +59,12 @@ services:
       - "${ECOSPECIES_API_PORT:-8000}:8000"
     volumes:
       - .:/workspace
-      - ../01-legacy-code-and-data:/legacy-data:ro
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../legacy-corpus}:/legacy-data:ro
       - python_venv:/workspace/.docker/venv
       - pip_cache:/root/.cache/pip

   web:
+    container_name: ecospecies-web
     image: nginx:1.27-alpine
     restart: unless-stopped
    depends_on:

@@ -0,0 +1,110 @@
## CiteGeist Review Notes

These notes capture parser issues seen while integrating CiteGeist-style extraction into EcoSpecies.

### Report-style references

Observed failure shape:

- references like `Daniell, W.C. 1872. Letters referring ... Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.`
- extracted `title` may contain the full raw bibliography string
- abbreviated venue names such as `Comm. Rept.` are not separated cleanly from the title

Suggested upstream change in `citegeist.extract` (a minimal sketch follows this list):

- add a report-style parser path after year detection
- prefer sentence-boundary venue detection before naive keyword splits so words like `report` inside a real title do not trigger an early cut
- support abbreviation-heavy venue starters such as:
  - `comm. rept.`
  - `rept.`
  - `proc.`
  - `occas. pap.`
  - `bulletin`
  - `bull.`
  - `memoir`
- strip trailing volume/page blobs like `2: 387-390` from the venue field
- when a first parse leaves a partial venue stub such as `Occas`, reparse the full raw reference line and prefer the fuller repaired venue/title split
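
A sketch of the sentence-boundary split, assuming a plain post-year reference tail; the helper name and regexes here are illustrative, not existing `citegeist` API:

```python
import re

# Hypothetical sketch of the suggested venue split; names are illustrative.
VENUE_STARTERS = (
    "comm. rept.", "rept.", "proc.", "occas. pap.",
    "bulletin", "bull.", "memoir",
)

# Trailing volume/page blob such as "2: 387-390." at the end of a venue string.
VOLUME_PAGES_RE = re.compile(r"\s*\d+\s*:\s*\d+(?:-\d+)?\.?\s*$")

def split_title_and_venue(tail: str) -> tuple[str, str]:
    """Split the post-year reference tail into (title, venue).

    Only cut at a sentence boundary that is immediately followed by a
    known venue starter, so a word like "report" inside a real title
    does not trigger an early cut.
    """
    lowered = tail.lower()
    for boundary in re.finditer(r"\.\s+", tail):
        if lowered[boundary.end():].startswith(VENUE_STARTERS):
            title = tail[: boundary.start() + 1].strip()
            venue = VOLUME_PAGES_RE.sub("", tail[boundary.end():]).strip()
            return title, venue
    return tail.strip(), ""
```

For the `Daniell` example above, this keeps the full title intact and yields venue `Comm. Rept. U.S. Comm. Fish & Fish.` with the `2: 387-390` blob stripped.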

### Placeholder title merge behavior

Observed failure shape:

- a raw bibliography string may survive as `title` even after DOI/title resolution finds a better title

Suggested upstream change in `citegeist.resolve.merge_entries_with_conflicts` (sketched below):

- treat titles that look like raw bibliography strings as placeholders
- example heuristic:
  - starts with `Surname, ... YEAR.`
  - unusually long for a title
  - contains a resolved shorter title as a substring after punctuation normalization
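
One way that heuristic could look; the length thresholds and regex are assumptions, not settled values:

```python
import re
import string

# Hypothetical placeholder-title detector for merge_entries_with_conflicts.
RAW_REFERENCE_RE = re.compile(r"^[A-Z][\w'-]+,\s.*\b(1[5-9]\d{2}|20\d{2})\.")

def _normalize(text: str) -> str:
    stripped = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(stripped.lower().split())

def looks_like_placeholder_title(title: str, resolved_title: str | None) -> bool:
    """True when `title` looks like a raw bibliography string, not a real title."""
    if RAW_REFERENCE_RE.match(title) and len(title) > 120:
        return True
    if resolved_title and _normalize(resolved_title) in _normalize(title):
        # The shorter resolved title survives inside the raw string.
        return len(title) > 1.5 * len(resolved_title)
    return False
```

A merge pass would then prefer the resolved title whenever `looks_like_placeholder_title` fires.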

### Legacy note deduplication

Observed failure shape:

- note fragments like `ecospecies_reference_number = {160}` can be appended more than once downstream when re-merging enriched metadata

Suggested upstream change (sketched below):

- when joining note fragments, split on `;`, normalize whitespace, and dedupe per fragment rather than per whole note string
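
A minimal sketch of per-fragment deduplication, assuming notes are plain `;`-joined strings:

```python
def merge_note_fragments(existing: str, incoming: str) -> str:
    """Join two note strings on ';', deduping per normalized fragment."""
    merged: list[str] = []
    for part in f"{existing};{incoming}".split(";"):
        fragment = " ".join(part.split())  # normalize internal whitespace
        if fragment and fragment not in merged:
            merged.append(fragment)
    return "; ".join(merged)
```

With this, re-merging `ecospecies_reference_number = {160}` into a note that already carries it is a no-op.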

### Unresolved entries should still refresh local parses

Observed failure shape:

- parser improvements may correctly rebuild `title`, venue, `volume`, `number`, and `pages`
- but if no remote metadata source matches, the stored draft BibTeX can remain unchanged unless unresolved enrichment also writes the refreshed local seed back out

Suggested upstream change (see the sketch after this list):

- unresolved enrichment should still return the rebuilt local draft entry
- keep `citation_key`, normalized text, and draft BibTeX synchronized with the current local parser even when resolver status remains `unresolved`
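
A rough shape for that control flow, with hypothetical names (`resolvers` is assumed to be an iterable of callables returning a metadata dict or `None`):

```python
def enrich_entry(seed_entry: dict, resolvers) -> dict:
    """Return a refreshed draft entry even when every resolver misses."""
    for resolve in resolvers:
        candidate = resolve(seed_entry)
        if candidate is not None:
            return {**seed_entry, **candidate, "status": "resolved"}
    # No remote match: still hand back the rebuilt local parse so the stored
    # draft BibTeX tracks the current parser instead of a stale earlier run.
    return {**seed_entry, "status": "unresolved"}
```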

### Returned metadata not carried through

Observed concern:

- resolver/source payloads may include bibliographic details such as:
  - `volume`
  - `issue` / BibTeX `number`
  - `page` / BibTeX `pages`
- these should be preserved into the BibTeX entry whenever available

Current note:

- CiteGeist Crossref mapping already includes `volume`, `number`, and `pages`
- verify that all resolver paths, storage round-trips, and exports preserve those fields consistently
- OpenAlex/DataCite mappings should also be checked for analogous bibliographic fields in `biblio` / attribute payloads

### False-positive title-search acceptance

Observed failure shape:

- title search can return a thematically related but bibliographically different work
- downstream acceptance may keep some seed fields while adopting conflicting DOI/title/volume/pages from the returned match
- this is especially risky for historical references with sparse or abbreviated venue names

Suggested upstream change in `citegeist.resolve` and any title-search ranking path (sketched after this list):

- do not fall back to the first search hit when no strong title match exists
- prefer exact or near-exact title matches only
- reject a candidate when structured seed metadata conflicts on strong fields such as:
  - `year`
  - venue / journal
  - `volume`
  - `number`
  - `pages`
- treat those fields as match-validation inputs, not just merge-time metadata
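
A minimal validation gate along those lines; the field tuple and flat-dict record shape are assumptions:

```python
STRONG_FIELDS = ("year", "journal", "volume", "number", "pages")

def candidate_is_acceptable(seed: dict, candidate: dict) -> bool:
    """Reject a search hit that conflicts with the structured seed metadata.

    A conflict on any strong field that both records actually specify
    disqualifies the candidate outright, rather than being smoothed over
    later at merge time.
    """
    for field in STRONG_FIELDS:
        seed_value = str(seed.get(field) or "").strip().lower()
        candidate_value = str(candidate.get(field) or "").strip().lower()
        if seed_value and candidate_value and seed_value != candidate_value:
            return False
    return True
```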

### OpenAlex null-source handling

Observed failure shape:

- some OpenAlex works have `primary_location` present but `source: null`
- downstream mapping can crash if it assumes `source` is always a dictionary

Suggested upstream change (sketched below):

- treat null `source` payloads as empty dictionaries
- continue mapping title, year, DOI, and `biblio` fields even when venue/source is missing
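
A null-safe mapping sketch; the field names follow the public OpenAlex work shape, while the function itself is illustrative:

```python
def map_openalex_work(work: dict) -> dict:
    """Map an OpenAlex work record defensively; `source` may be null."""
    location = work.get("primary_location") or {}
    source = location.get("source") or {}  # null source becomes an empty dict
    biblio = work.get("biblio") or {}
    pages = "-".join(p for p in (biblio.get("first_page"), biblio.get("last_page")) if p)
    return {
        "title": work.get("title"),
        "year": work.get("publication_year"),
        "doi": work.get("doi"),
        "venue": source.get("display_name"),
        "volume": biblio.get("volume"),
        "number": biblio.get("issue"),
        "pages": pages or None,
    }
```

Venue simply comes back as `None` for null-source works instead of raising, and the `biblio` fields still flow through.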

@@ -0,0 +1,89 @@
services:
  db:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${ECOSPECIES_DB_NAME:-ecospecies}
      POSTGRES_USER: ${ECOSPECIES_DB_USER:-ecospecies}
      POSTGRES_PASSWORD: ${ECOSPECIES_DB_PASSWORD:?set ECOSPECIES_DB_PASSWORD}
      PGDATA: /var/lib/postgresql/data/pgdata
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${ECOSPECIES_DB_USER:-ecospecies} -d ${ECOSPECIES_DB_NAME:-ecospecies}"]
      interval: 5s
      timeout: 5s
      retries: 10
    volumes:
      - postgres_data:/var/lib/postgresql/data

  importer:
    image: python:3.12-slim
    restart: "no"
    depends_on:
      db:
        condition: service_healthy
    working_dir: /workspace
    environment:
      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/legacy-data/InputFiles - TXT}
      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
      PYTHONPATH: /workspace/apps/api/src
    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
    volumes:
      - ..:/workspace
      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
      - python_venv:/workspace/.docker/venv
      - pip_cache:/root/.cache/pip

  api:
    image: python:3.12-slim
    restart: unless-stopped
    depends_on:
      db:
        condition: service_healthy
      importer:
        condition: service_completed_successfully
    working_dir: /workspace
    environment:
      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/legacy-data/InputFiles - TXT}
      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
      ECOSPECIES_HOST: 0.0.0.0
      ECOSPECIES_PORT: "8000"
      ECOSPECIES_AUTH_TOKENS: ${ECOSPECIES_AUTH_TOKENS:-}
      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
      PYTHONPATH: /workspace/apps/api/src
    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-api.sh"]
    volumes:
      - ..:/workspace
      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
      - python_venv:/workspace/.docker/venv
      - pip_cache:/root/.cache/pip

  web:
    image: nginx:1.27-alpine
    restart: unless-stopped
    depends_on:
      api:
        condition: service_started
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=${TRAEFIK_NETWORK:-traefik-network}"
      - "traefik.http.routers.ecospecies-atlas.rule=Host(`${ECOSPECIES_HOSTNAME:?set ECOSPECIES_HOSTNAME}`)"
      - "traefik.http.routers.ecospecies-atlas.entrypoints=${TRAEFIK_ENTRYPOINTS:-websecure}"
      - "traefik.http.routers.ecospecies-atlas.tls.certresolver=${TRAEFIK_CERTRESOLVER:-myresolver}"
      - "traefik.http.services.ecospecies-atlas.loadbalancer.server.port=80"
    volumes:
      - ../apps/web:/usr/share/nginx/html:ro
      - ../apps/web/nginx.conf:/etc/nginx/conf.d/default.conf:ro
    networks:
      - default
      - traefik-network

volumes:
  postgres_data:
  python_venv:
  pip_cache:

networks:
  traefik-network:
    external: true
    name: ${TRAEFIK_NETWORK:-traefik-network}

@@ -0,0 +1,20 @@
# Required
ECOSPECIES_HOSTNAME=example.org
ECOSPECIES_BASE_PATH=/apps/ecospecies
ECOSPECIES_DB_PASSWORD=replace-with-strong-password

# Optional database settings
ECOSPECIES_DB_NAME=ecospecies
ECOSPECIES_DB_USER=ecospecies

# Optional application settings
ECOSPECIES_AUTH_TOKENS=
ECOSPECIES_DATA_DIR=/workspace/input-data/InputFiles

# Optional host path to the legacy corpus if it is not at ../path-to-legacy-corpus
ECOSPECIES_LEGACY_DATA_DIR=../path-to-legacy-corpus

# Optional Traefik settings
TRAEFIK_NETWORK=traefik-network
TRAEFIK_ENTRYPOINTS=websecure
TRAEFIK_CERTRESOLVER=myresolver

@@ -0,0 +1,93 @@
services:
  db:
    container_name: ecospecies-db
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${ECOSPECIES_DB_NAME:-ecospecies}
      POSTGRES_USER: ${ECOSPECIES_DB_USER:-ecospecies}
      POSTGRES_PASSWORD: ${ECOSPECIES_DB_PASSWORD:?set ECOSPECIES_DB_PASSWORD}
      PGDATA: /var/lib/postgresql/data/pgdata
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${ECOSPECIES_DB_USER:-ecospecies} -d ${ECOSPECIES_DB_NAME:-ecospecies}"]
      interval: 5s
      timeout: 5s
      retries: 10
    volumes:
      - postgres_data:/var/lib/postgresql/data

  importer:
    container_name: ecospecies-importer
    image: python:3.12-slim
    restart: "no"
    depends_on:
      db:
        condition: service_healthy
    working_dir: /workspace
    environment:
      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/workspace/input-data/InputFiles}
      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
      PYTHONPATH: /workspace/apps/api/src
    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
    volumes:
      - ..:/workspace
      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
      - python_venv:/workspace/.docker/venv
      - pip_cache:/root/.cache/pip

  api:
    container_name: ecospecies-api
    image: python:3.12-slim
    restart: unless-stopped
    depends_on:
      db:
        condition: service_healthy
      importer:
        condition: service_completed_successfully
    working_dir: /workspace
    environment:
      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/workspace/input-data/InputFiles}
      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
      ECOSPECIES_HOST: 0.0.0.0
      ECOSPECIES_PORT: "8000"
      ECOSPECIES_AUTH_TOKENS: ${ECOSPECIES_AUTH_TOKENS:-}
      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
      PYTHONPATH: /workspace/apps/api/src
    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-api.sh"]
    volumes:
      - ..:/workspace
      - ${ECOSPECIES_LEGACY_DATA_DIR:-/input-data}:/legacy-data:ro
      - python_venv:/workspace/.docker/venv
      - pip_cache:/root/.cache/pip

  web:
    container_name: ecospecies-web
    image: nginx:1.27-alpine
    restart: unless-stopped
    depends_on:
      api:
        condition: service_started
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=${TRAEFIK_NETWORK:-traefik-network}"
      - "traefik.http.routers.ecospecies-atlas.rule=Host(`${ECOSPECIES_HOSTNAME:?set ECOSPECIES_HOSTNAME}`) && PathPrefix(`${ECOSPECIES_BASE_PATH:-/}`)"
      - "traefik.http.routers.ecospecies-atlas.entrypoints=${TRAEFIK_ENTRYPOINTS:-websecure}"
      - "traefik.http.routers.ecospecies-atlas.tls.certresolver=${TRAEFIK_CERTRESOLVER:-myresolver}"
      - "traefik.http.services.ecospecies-atlas.loadbalancer.server.port=80"
    volumes:
      - ../apps/web:/usr/share/nginx/html:ro
      - ../apps/web/nginx.conf:/etc/nginx/conf.d/default.conf:ro
    networks:
      - default
      - traefik-network

volumes:
  postgres_data:
  python_venv:
  pip_cache:

networks:
  traefik-network:
    external: true
    name: ${TRAEFIK_NETWORK:-traefik-network}

@@ -0,0 +1,48 @@
# PostgreSQL Backup Notes

This note applies to deployments that use the PostgreSQL volume defined by the Compose stack, including the Traefik deployment variant.

## What Needs Backup

At minimum, back up:

- the PostgreSQL data volume
- the deployment env file that contains the database credentials

For the Traefik deployment variant, that usually means:

- the Docker volume `postgres_data`
- `docs/docker-compose-traefik.env`

## Logical Backup

From the repository root, create a SQL dump with:

```bash
./scripts/backup-postgres.sh
```

To write to a specific file:

```bash
./scripts/backup-postgres.sh /path/to/ecospecies-backup.sql
```

## Restore From Logical Backup

Restore a SQL dump with:

```bash
./scripts/restore-postgres.sh /path/to/ecospecies-backup.sql
```

## Volume-Level Backup

If the host backup system can snapshot Docker volumes safely, include the PostgreSQL volume in that schedule. A volume snapshot is useful for full recovery, but a logical dump is still recommended for portability and validation.

## Operational Guidance

- Run backups on a schedule instead of relying on ad hoc dumps.
- Test restore procedures before relying on the backup policy.
- Keep backup artifacts outside the live Docker host when possible.
- The backup and restore scripts default to `docs/docker-compose-traefik.env` and `docs/docker-compose-traefik.yml`, but both can be overridden with `ECOSPECIES_ENV_FILE` and `ECOSPECIES_COMPOSE_FILE`.
115
docs/roadmap.md
115
docs/roadmap.md
|
|
@ -1,5 +1,22 @@
# EcoSpecies Modernization Roadmap

## Current Status

As of 2026-03-27, the repo is no longer at the pure planning stage. The following pieces are already implemented and working in the live stack:

- Docker Compose deployment with explicit `ecospecies-...` container names
- path-based hosting support for `/apps/ecospecies`
- in-repo-only source directory resolution with safe path validation
- legacy SLH ingest into PostgreSQL-backed species, sections, citations, audit, and document records
- editor/admin workflows for draft, review, publish, archive, and audit history
- contributor registration and draft-authoring workflow with token-based access
- structured Markdown document storage and editor/API round-trip
- persisted taxon identifier scaffolding with legacy identifiers separated from future-facing external identifiers
- citation extraction, review, enrichment, batch enrichment, candidate matching, and reviewed-candidate selection/addition
- citation persistence back into the structured Markdown source of truth

The roadmap below has been updated to reflect that actual state.

## Target Product

Create a Docker Compose-based, open-source EcoSpecies successor that:

@ -31,48 +48,91 @@ Create a Docker Compose-based, open-source EcoSpecies successor that:

### Phase 0: Discovery and migration planning

Status: completed

- Inventory legacy assets and user-facing capabilities.
- Capture the replacement architecture and ingestion strategy.
- Define acknowledgements, provenance, and licensing boundaries.

### Phase 1: Ingestion foundation

Status: substantially complete, with parser refinement ongoing

- Parse legacy `.txt` SLH inputs into structured JSON records.
- Normalize common metadata: title, scientific name, common name, FLELMR/EcoSpecies code, headings, references.
- Create ingest diagnostics to flag malformed files and missing metadata.
- Continue parser refinement for legacy edge cases in headings, citations, and historical bibliography formats.

### Phase 2: Public read experience

Status: implemented baseline

- Species listing and search.
- Species detail view with section navigation.
- Provenance and acknowledgement display.
- Summary metrics on corpus coverage.
- Path-based deployment under `/apps/ecospecies`.

### Phase 3: Structured persistence and editorial workflow

Status: implemented baseline, with editor UX still maturing

- PostgreSQL-backed persistence for species, sections, citations, documents, taxon identifiers, and audit history.
- Editor-safe import jobs and audit metadata.
- Raw-source preservation alongside normalized records.
- Authentication and role-based access for admin/editor/contributor workflows.
- Persisted editorial workflow state for draft, review, published, and archived records.
- Structured Markdown document storage and round-trip editing.
- Citation review, enrichment, candidate selection, and reviewed-candidate addition.
- Contributor draft creation and owner-scoped editing.

### Phase 4: Standards-aware identity and bibliography

Status: partially implemented

- Preserve legacy local identifiers as provenance.
- Persist taxon identifiers separately from legacy identifiers.
- Expose `legacy_identifiers`, `taxon_identifiers`, and `primary_taxon_*` API fields.
- Persist structured citation records with DOI/OpenAlex/DataCite-style enrichment fields.
- Continue toward multi-authority identifier review, richer citation entities, and CiteGeist-backed bibliography expansion.

### Phase 5: Editor ergonomics and advanced review

Status: in progress

- Structured Markdown editor is live.
- Citation match-review dialog is live.
- Remaining work:
  - CodeMirror-based Markdown editor with folding
  - inline parser diagnostics in the editor
  - richer citation diff/review affordances
  - clearer document-node and citation provenance in the UI

### Phase 6: Linkages and visualization

Status: not started

- Model predator/prey, habitat, and ecological association edges.
- Add graph endpoints and species-relationship views.
- Support public-friendly visual explanations and expert filters.

### Phase 7: Reports and export

Status: partially implemented

- JSON and Markdown exports exist through the API/document model.
- Structured Markdown is now the primary human-readable editor/export format.
- Remaining work:
  - recreate legacy-like text/RTF export
  - support export profiles for legacy compatibility and standards-forward outputs
  - improve citation/bibliography export fidelity

### Phase 8: Assisted research workflows

Status: planned

- Add local-LLM-assisted extraction and drafting in a human-review loop.
- Integrate bibliography tooling for citation consolidation and topic expansion.
- Support candidate-species intake for records not yet in the historical corpus.
- Restrict assisted drafting and publication actions to authenticated editorial roles.

@ -84,6 +144,9 @@ Initial core entities:

- `source_document`
- `document_section`
- `citation`
- `taxon_identifier`
- `citation_identifier`
- `bibliography_topic`
- `taxon`
- `linkage`
- `media_asset`

@ -95,6 +158,7 @@ Key design rules:

- retain provenance and import timestamps
- separate public published records from draft/editor states
- make sections addressable for citation and graph linking
- prefer a canonical document AST over direct projection from free-form source text

## LLM Extension Strategy

@ -103,6 +167,8 @@ Use local models only for assistive tasks, never silent publication:

- extracting candidate structured fields from new SLH text
- suggesting missing headings or linkage labels
- clustering similar citations
- resolving bibliography entries toward DOI/OpenAlex/DataCite where available
- treating local legacy codes as provenance, not canonical identifiers
- drafting summaries for editor review

Guardrails:

@ -111,16 +177,19 @@ Guardrails:

- all generated content is marked as draft
- every automated extraction stores source spans where possible

## Near-Term Priorities

1. Add CodeMirror-based folding and structure-aware editing to the Markdown document editor.
2. Expand taxon identifier review workflows for WoRMS, GBIF, Catalogue of Life, and related authorities.
3. Deepen citation quality controls, including better parsed-field visibility and stricter/manual review loops where resolver confidence is weak.
4. Add CiteGeist-style topic expansion and bibliography-suggestion review for under-cited species.
5. Improve document export fidelity so reviewed citations and standards-based identifiers are clearly represented in Markdown and downstream exports.
6. Begin the first ecological-linkage data model and API endpoints once citation/identifier workflows stabilize.

## Definition Of Done For The Initial Milestone

- `docker compose up` starts a working API and frontend.
- The system can enumerate the legacy corpus and show parsed species detail for real SLH files.
- Editors can curate structured Markdown documents and citations through authenticated workflows.
- Contributors can register, create drafts, and edit only their own submissions.
- Project docs describe both the implemented modernization state and the next phases.

@ -0,0 +1,315 @@
# EcoSpecies Standards Migration Plan

## Problem

The current EcoSpecies ingest and document model still treats legacy local fields such as `FLELMR code` / `species_code` as if they were primary identifiers. That is useful for historical provenance, but it is the wrong long-term center of gravity for a broader, modern biodiversity knowledge system.

The same problem exists for citations:

- legacy plaintext reference blocks are treated as local document text,
- citation identity is weak or missing,
- bibliography growth is tied to what happened to appear in the historical SLH file.

The new system should preserve legacy local identifiers and references, but it should not be structurally bound to them.

## Direction

Treat legacy local codes and freeform references as import-era artifacts, not canonical future-facing identifiers.

Going forward, EcoSpecies should prefer broadly recognized identifiers and registries:

- taxonomic name authority and taxon identifiers:
  - Catalogue of Life IDs and release DOIs
  - GBIF taxon keys
  - WoRMS AphiaIDs for marine taxa
  - ITIS TSNs where relevant
  - optional NCBI Taxonomy IDs for research interoperability
- literature and dataset identifiers:
  - DOI as the primary publication/dataset identifier
  - ISBN/ISSN where DOI is absent
  - OpenAlex IDs and DataCite metadata as enrichment layers
- contributor identity:
  - email-based local contributor accounts now
  - optional ORCID linkage later for editor and contributor identity

The system should be marine-forward because that matches the historical corpus, but not marine-exclusive. Identifier strategy should therefore be authority-aware rather than tied to a single domain-specific registry.

## Authority Selection Strategy

Choose the primary taxon authority by best-fit coverage, not by a single global rule.

- marine taxa:
  - prefer WoRMS AphiaID as primary when confidently matched
  - retain GBIF and Catalogue of Life as crosswalks
- non-marine or mixed-domain taxa:
  - prefer Catalogue of Life or GBIF as primary, depending on match quality and coverage
  - retain ITIS and other relevant identifiers as crosswalks
- unresolved or conflicting cases:
  - store all candidate identifiers
  - require editorial review before a primary identifier is asserted

This keeps the project ready for terrestrial expansion without discarding the value of WoRMS for the present corpus.
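A minimal sketch of this selection rule, assuming hypothetical `TaxonIdentifier` records and a caller that already knows whether the taxon is marine; the preference order and confidence threshold below are illustrative, not settled policy:

```python
# Sketch only: TaxonIdentifier and the 0.9 threshold are assumptions.
from dataclasses import dataclass


@dataclass
class TaxonIdentifier:
    authority: str           # e.g. "worms", "gbif", "col", "itis"
    identifier: str
    match_confidence: float  # resolver confidence in [0.0, 1.0]
    review_status: str       # "pending", "reviewed", "conflicting"


def choose_primary(candidates: list[TaxonIdentifier], marine: bool) -> TaxonIdentifier | None:
    """Pick a primary identifier by domain fit, confidence, and review state."""
    confident = [c for c in candidates if c.match_confidence >= 0.9]
    preferred = ["worms", "col", "gbif"] if marine else ["col", "gbif", "itis"]
    for authority in preferred:
        for candidate in confident:
            if candidate.authority == authority and candidate.review_status != "conflicting":
                return candidate
    # No confident best-fit match: keep every candidate and defer to editorial review.
    return None
```

Returning `None` rather than guessing is the point: unresolved cases stay visible for the editor review step above.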
## Important Taxonomic Note

PhyloCode is relevant for clade naming, not as a general-purpose replacement for species-level registry IDs. It should not become the primary EcoSpecies species identifier layer. It may be useful later for clade-aware ontology and higher-level phylogenetic naming, but not as the main substitute for local `species_code` values.

## Core Design Rules

1. Legacy local identifiers remain preserved exactly as imported.
2. Canonical taxon identity becomes multi-authority, not single-local-code.
3. Citations become first-class structured entities, not just text inside a section.
4. Bibliographies can be extended by topic and citation graph, not only by source-document inheritance.
5. Exports keep provenance visible so readers can distinguish legacy source metadata from normalized external identifiers.

## Schema Changes

### Species metadata

Retain `flelmr_code` for provenance, but demote it to a legacy metadata field.

Add a taxon-identity layer:

- `taxon_name_usage`
- `taxon_identifier`
- `taxon_authority`
- `taxon_match_review`

Suggested fields:

- `taxon_identifier.authority`
- `taxon_identifier.identifier`
- `taxon_identifier.rank`
- `taxon_identifier.label`
- `taxon_identifier.is_primary`
- `taxon_identifier.source_url`
- `taxon_identifier.asserted_by`
- `taxon_identifier.match_confidence`
- `taxon_identifier.review_status`

Examples:

- `authority = "worms", identifier = "159059", label = "AphiaID"`
- `authority = "gbif", identifier = "2290910", label = "taxonKey"`
- `authority = "col", identifier = "5T7L7", label = "taxonID"`
- `authority = "itis", identifier = "161989", label = "TSN"`
- `authority = "legacy-ecospecies", identifier = "5192", label = "FLELMR"`

### Citation model

Move from section text to structured bibliography entities:

- `citation`
- `citation_identifier`
- `citation_relation`
- `species_citation`
- `document_node_citation`
- `bibliography_topic`

Suggested citation identifier types:

- DOI
- ISBN
- ISSN
- PMID
- arXiv
- OpenAlex
- URL

## Markdown / AST Changes

Update the constrained Markdown profile so metadata stops implying that `species_code` is canonical.

Replace the current front matter recommendation:

```md
species_code: 5192
```

with a provenance-oriented shape:

```md
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
taxon_identifiers:
  - authority: worms
    identifier: 159059
    label: AphiaID
    primary: true
  - authority: gbif
    identifier: 2290910
    label: taxonKey
```

Also add explicit bibliography sections:

```md
## References

- id: doi:10.1000/example
  text: Smith, J. 2024. Example paper...
  relation: cites

## Suggested Reading

- topic: estuarine ecology
```

The AST should preserve:

- legacy identifiers
- normalized taxon identifiers
- structured references
- topic links used for bibliography expansion

## Import Pipeline Changes

### Species identity

Import should produce:

1. raw imported name fields,
2. legacy local identifiers,
3. unresolved candidate taxon identifiers,
4. optional matched external identifiers,
5. a review state for unresolved or conflicting authority matches.

Do not block ingest if no external authority match exists. Store the unresolved state explicitly.

Primary identifier assignment should be determined by:

1. domain fit of the authority
2. confidence of the match
3. editorial review status
4. future ability to crosswalk to other authorities

### Citations

Split citation processing into stages (the early stages are sketched after this list):

1. detect bibliography/reference sections in the imported SLH text,
2. extract plaintext reference strings,
3. convert plaintext references into draft structured entries,
4. enrich identifiers and metadata,
5. assign accepted citations back to species and document nodes,
6. optionally expand bibliography by topic and citation graph.
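A minimal sketch of stages 1-3, assuming reference sections are introduced by a plain `References` or `Literature Cited` heading and that draft entries carry the same `review_status`/`enrichment_status` fields used elsewhere in this plan; both the detection heuristic and the draft shape are illustrative:

```python
# Sketch only: real detection and enrichment live in the importer and resolver.
import re


def extract_reference_strings(slh_text: str) -> list[str]:
    """Stages 1-2: find a references heading and collect the lines under it."""
    refs: list[str] = []
    in_refs = False
    for line in slh_text.splitlines():
        if re.match(r"^\s*(references|literature cited)\s*$", line, re.IGNORECASE):
            in_refs = True
            continue
        if in_refs and line.strip():
            refs.append(line.strip())
    return refs


def draft_citations(reference_strings: list[str]) -> list[dict]:
    """Stage 3: wrap each plaintext reference as a draft structured entry."""
    return [
        {"raw_text": text, "review_status": "draft", "enrichment_status": "pending"}
        for text in reference_strings
    ]
```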
## CiteGeist Integration

`../CiteGeist` is a strong fit for this migration.

Observed capabilities in that repo already cover much of what EcoSpecies needs:

- extracting references from plaintext,
- converting rough references into draft structured entries,
- DOI/Crossref/DataCite/OpenAlex enrichment,
- citation graph expansion,
- topic-based bibliography expansion,
- duplicate clustering and canonicalization.

### Recommended integration boundary

Do not embed CiteGeist logic directly into the EcoSpecies parser.

Instead:

1. EcoSpecies exports candidate plaintext references and topic phrases (a payload sketch follows this list).
2. CiteGeist processes and enriches them into structured bibliography data.
3. EcoSpecies imports reviewed citation outputs into its own `citation` tables.
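The interchange format itself is still to be defined (see Immediate Next Steps), so the field names below are assumptions rather than a settled contract; the reference text reuses the citation example from the Structured Markdown plan in this same commit:

```python
# Hypothetical export payload for step 1; none of these field names are final.
import json

export_payload = {
    "species_slug": "american-oyster",
    "references": [
        "Ahmed, M. 1975. Speciation in living oysters. "
        "Advances in Marine Biology 13:357-397.",
    ],
    "topics": ["estuarine ecology"],
}

print(json.dumps(export_payload, indent=2))
```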
### First integration targets

- species-level bibliography cleanup from `References` sections
- DOI resolution and identifier assignment
- duplicate detection across species bibliographies
- topic expansion for subject areas such as habitat, trophic ecology, reproduction, invasive biology, and fisheries context

### Later integration targets

- node-level citation attachment
- bibliography review UI
- suggested-reading generation per species
- topic-seeded bibliography augmentation for under-cited species drafts

## API Changes

Add standards-aware endpoints:

- `/api/species/<slug>/identifiers`
- `/api/species/<slug>/citations`
- `/api/species/<slug>/bibliography/topics`
- `/api/editor/species/<slug>/identifier-review`
- `/api/editor/species/<slug>/citation-review`

Do not remove legacy fields immediately. Keep `flelmr_code` in payloads for compatibility while introducing (an example payload follows the list):

- `legacy_identifiers`
- `taxon_identifiers`
- `primary_taxon_identifier`
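An illustrative species payload during the compatibility window, reusing identifier values from the examples above; the surrounding field names are assumptions for the sketch, not the final API shape:

```python
# Compatibility-window sketch: legacy and standards-aware fields side by side.
species_payload = {
    "slug": "american-oyster",
    "flelmr_code": "5192",  # legacy field retained for compatibility
    "legacy_identifiers": [
        {"authority": "legacy-ecospecies", "identifier": "5192", "label": "FLELMR"},
    ],
    "taxon_identifiers": [
        {"authority": "worms", "identifier": "159059", "label": "AphiaID", "is_primary": True},
    ],
    "primary_taxon_identifier": {"authority": "worms", "identifier": "159059"},
}
```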
## UI Changes

The species detail page should distinguish:

- scientific name
- primary external taxon identifier
- legacy local identifiers
- bibliography
- suggested reading

Editors should see:

- unresolved authority matches
- conflicting taxon IDs
- citation enrichment candidates
- duplicate-reference clusters

Contributors should only author content and draft references; identifier normalization and bibliography publication remain editorial functions.

## Migration Phases

### Phase A: Demote legacy code

- Rename internal presentation from “species code” to “legacy identifier”.
- Keep `flelmr_code` only as legacy provenance.
- Add `legacy_identifiers` to Markdown export and AST.

### Phase B: Add external taxon identifiers

- Create taxon-identifier tables and API payloads.
- Add editor review workflows for selecting a primary authority identifier.
- Default marine taxa review toward WoRMS where available.
- Default broader cross-domain review toward Catalogue of Life and GBIF where WoRMS is not the right authority.
- Keep the model open to terrestrial species from the beginning rather than treating them as out-of-scope exceptions.

### Phase C: Structured bibliography

- Create citation tables.
- Extract plaintext references from imported documents.
- Store draft citations separately from accepted citations.

### Phase D: CiteGeist bridge

- Define import/export format between EcoSpecies and CiteGeist.
- Run draft-reference normalization and DOI enrichment.
- Import reviewed structured citations back into EcoSpecies.

### Phase E: Topic-aware bibliography growth

- Store species topic phrases.
- Use CiteGeist topic expansion for bibliography augmentation.
- Keep added citations flagged by source type:
  - imported
  - resolved
  - topic-expanded
  - editor-added

## Immediate Next Steps

1. Update the Markdown profile to replace `species_code` with `legacy_identifiers` plus `taxon_identifiers`.
2. Add `legacy_identifiers` and `taxon_identifiers` to the AST/document model.
3. Introduce taxon identifier tables in the PostgreSQL schema.
4. Define a minimal EcoSpecies-to-CiteGeist interchange format for plaintext references and topic phrases.
5. Add editor-facing citation review before attempting automatic bibliography publication.
@ -0,0 +1,338 @@
# Structured Markdown Document Plan

## Goal

Replace the current flat, parser-heavy free-form text handling with a document model that is:

- human-readable in plaintext
- editable in the browser with hierarchy folding
- permissive-license friendly
- suitable for first-pass conversion from legacy SLH text files
- suitable as the primary export format for a species life history
- able to project cleanly into a flexible database model with greater hierarchical depth

## Recommendation

Adopt a constrained Markdown-based authoring format as the primary human-facing document format, backed by an internal hierarchical document AST and a relational projection layer in PostgreSQL.

Use this three-layer model:

1. Source and export format: constrained EcoSpecies Markdown
2. Canonical application representation: hierarchical AST
3. Database representation: relational projection for querying, indexing, publishing, and editorial workflows

This avoids treating raw free-form text as both the storage format and the parser input.

## Why Markdown Instead Of Org

Markdown is the better fit for this codebase and licensing requirement because:

- it is familiar to most users
- it is easier to constrain than Org
- it maps naturally to hierarchical headings
- it works well with CodeMirror folding
- it does not require adopting GPL or AGPL editor code

Org-style authoring remains conceptually attractive, but embedding Org-specific tooling such as organice would introduce copyleft code, which is not aligned with a permissive-only implementation strategy.

## EcoSpecies Markdown Profile

The format should be Markdown-like, but intentionally narrower than unrestricted Markdown.

### Metadata

Use YAML front matter for canonical metadata fields:

```md
---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
taxon_identifiers:
  - authority: worms
    identifier: 159059
    label: AphiaID
    primary: true
source_file: American Oyster SLH NOAA SEA.txt
publication_status: published
---
```

Recommended canonical fields:

- `title`
- `common_name`
- `scientific_name`
- `legacy_identifiers`
- `taxon_identifiers`
- `primary_taxon_authority`
- `source_file`
- `publication_status`
- `source_format`
- `legacy_import_id`

### Hierarchy

Use headings as the sole structure-bearing primitive.

Example:

```md
---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
---

## Summary

Short editor-reviewed abstract.

## Habitat

### Type

Estuarine.

### Substrate

Hard bottom, shell, mud flats, and other suitable settlement surfaces.

## Reproduction

### Season

Spawning occurs from spring through fall in much of the Gulf.
```

Rules (a validation sketch for the skip-level rule follows this list):

- Heading depth is meaningful.
- Skip-level headings should be rejected or normalized.
- Body text belongs to the nearest preceding heading.
- `#` level is optional if the document title already exists in front matter.
- Tables, lists, and citations are allowed only where explicitly supported.
- Arbitrary embedded HTML should be disallowed.
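A minimal validation sketch for the skip-level rule, assuming front matter has already been stripped; a full validator would also cover the front matter and citation rules:

```python
# Sketch only: flags headings that skip a level relative to the previous one.
import re


def find_heading_jumps(markdown: str) -> list[str]:
    diagnostics: list[str] = []
    previous_depth = 1  # the document title may live in front matter
    for line in markdown.splitlines():
        match = re.match(r"^(#{1,6})\s+(.*)$", line)
        if not match:
            continue
        depth = len(match.group(1))
        if depth > previous_depth + 1:
            diagnostics.append(f"skip-level heading: {match.group(2)!r} (depth {depth})")
        previous_depth = depth
    return diagnostics
```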
### Citations

Keep citations readable in Markdown but structured enough to parse.

Preferred first-pass shape:

```md
## Citations

- [7] Ahmed, M. 1975. Speciation in living oysters. Advances in Marine Biology 13:357-397.
- [15] Andrews, J.D. 1979. Pelecypoda: Ostreidae. Reproduction of Marine Invertebrates...
```

This is intentionally simpler than trying to infer citations from arbitrary prose.
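Because the shape is this regular, the first-pass parser can be small; a sketch, where the bracketed number is the legacy citation index kept for provenance:

```python
# Sketch only: parses the "- [N] text" citation lines shown above.
import re

CITATION_LINE = re.compile(r"^- \[(\d+)\]\s+(.+)$")


def parse_citation_lines(section_body: str) -> list[dict[str, str]]:
    entries: list[dict[str, str]] = []
    for line in section_body.splitlines():
        match = CITATION_LINE.match(line.strip())
        if match:
            entries.append({"legacy_index": match.group(1), "text": match.group(2)})
    return entries
```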
## Canonical AST

Markdown should not be the sole internal representation. Parse it into an AST that preserves hierarchy explicitly.

Example conceptual shape:

```json
{
  "metadata": {
    "title": "American Oyster",
    "common_name": "American Oyster",
    "scientific_name": "Crassostrea virginica",
    "legacy_identifiers": [
      {
        "authority": "legacy-ecospecies",
        "identifier": "5192",
        "label": "FLELMR"
      }
    ]
  },
  "nodes": [
    {
      "id": "n1",
      "type": "section",
      "depth": 2,
      "title": "Summary",
      "body": "Short editor-reviewed abstract.",
      "children": []
    },
    {
      "id": "n2",
      "type": "section",
      "depth": 2,
      "title": "Habitat",
      "body": "",
      "children": [
        {
          "id": "n3",
          "type": "section",
          "depth": 3,
          "title": "Type",
          "body": "Estuarine.",
          "children": []
        }
      ]
    }
  ]
}
```

Required AST properties:

- arbitrary hierarchical depth
- stable node identifiers
- separate metadata from body structure
- support for editor audit and provenance
- support for extracting source spans from imported legacy text when available

## Database Direction

The current flat `document_section` model should evolve into a general document tree.

Suggested core tables:

- `species_document`
- `species_document_node`
- `species_document_node_revision`
- `species_document_metadata`
- `citation`
- `species_document_export`

Suggested `species_document_node` fields:

- `id`
- `document_id`
- `parent_id`
- `position`
- `depth`
- `node_type`
- `title`
- `body_markdown`
- `body_plaintext`
- `source_heading`
- `source_span_start`
- `source_span_end`

This enables:

- greater hierarchical depth
- stable editor operations on subtrees
- future insertion of machine-extracted nested content
- simplified export back to Markdown (a tree-reconstruction sketch follows this list)
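A sketch of how flat `species_document_node` rows reassemble into the tree, using only the `id`, `parent_id`, and `position` fields above; rows are simplified to plain dicts:

```python
# Sketch only: rebuilds the document tree from flat adjacency-list rows.
def build_tree(rows: list[dict]) -> list[dict]:
    nodes = {row["id"]: {**row, "children": []} for row in rows}
    roots: list[dict] = []
    # Sorting by position keeps siblings (and roots) in document order.
    for node in sorted(nodes.values(), key=lambda n: n["position"]):
        parent_id = node.get("parent_id")
        if parent_id in nodes:
            nodes[parent_id]["children"].append(node)
        else:
            roots.append(node)
    return roots
```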
## Import Flow

The legacy text parser should no longer attempt to infer the final database structure directly.

Instead:

1. Parse raw legacy text into a best-effort intermediate tree.
2. Normalize extracted metadata.
3. Emit constrained Markdown.
4. Parse constrained Markdown into AST.
5. Persist AST and project relationally.
6. Record diagnostics on uncertain conversions.

This changes the parser’s role from “infer final structure perfectly” to “produce a reviewable first draft”.
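The six steps as a control-flow sketch; every step body here is a trivial stand-in so the flow is runnable end to end, while the real implementations live in the importer and API service:

```python
# Sketch only: each numbered step is a placeholder for the real stage.
def import_legacy_file(raw_text: str) -> dict:
    first_line = raw_text.splitlines()[0].strip() if raw_text.strip() else ""
    tree = {"title": first_line, "sections": []}          # 1. intermediate tree
    metadata = {"title": tree["title"]}                   # 2. normalized metadata
    markdown = f"---\ntitle: {metadata['title']}\n---\n"  # 3. constrained Markdown
    ast = {"metadata": metadata, "nodes": []}             # 4. canonical AST
    # 5. persistence would store `ast` and project it into relational tables
    diagnostics = [] if metadata["title"] else ["missing title"]  # 6. diagnostics
    return {"markdown": markdown, "ast": ast, "diagnostics": diagnostics}
```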
## Editor Flow

The web editor should operate primarily on the Markdown representation, with a structured parse running on save or preview.

Recommended behavior:

- fold by heading depth in CodeMirror
- validate front matter and heading structure
- preview rendered sections
- show parser diagnostics inline
- save both Markdown source and parsed AST

The editor should reject or flag:

- invalid front matter
- duplicate canonical metadata keys
- heading depth jumps
- malformed citation entries in structured sections

## Export Policy

Markdown should be the primary export format for a species life history.

Export targets:

- constrained Markdown for editorial interchange
- JSON AST for machine workflows
- derived relational/API payloads for the application
- optional report-oriented exports later

The export path should be:

- database document tree -> canonical AST -> constrained Markdown

This ensures the exported plaintext remains stable and human-readable.
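A sketch of the AST-to-Markdown leg of that path, using the node shape from the Canonical AST example; the real exporter would also emit front matter and the structured citation sections:

```python
# Sketch only: serializes section nodes back to constrained Markdown headings.
def ast_to_markdown(nodes: list[dict]) -> str:
    lines: list[str] = []

    def emit(node: dict) -> None:
        lines.append(f"{'#' * node['depth']} {node['title']}")
        if node.get("body"):
            lines.append("")
            lines.append(node["body"])
        lines.append("")
        for child in node.get("children", []):
            emit(child)

    for node in nodes:
        emit(node)
    return "\n".join(lines).rstrip() + "\n"
```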
## Migration Strategy

### Stage 1: Introduce the document model

- add AST schema and persistence tables
- keep existing section-based reads working
- build Markdown import/export helpers

### Stage 2: Convert current parser output

- map current parsed sections into Markdown drafts
- preserve existing metadata and diagnostics
- store generated Markdown alongside current records

### Stage 3: Introduce Markdown editor

- add CodeMirror-based editor with heading folding
- add validation for front matter and heading structure
- add round-trip save through AST

### Stage 4: Move public reads to the new document model

- generate current API responses from the hierarchical document tree
- keep compatibility shims for legacy flat sections where needed

### Stage 5: Expand structured extraction

- add deeper parsing for habitat, reproduction, citations, and linkages
- add richer projections from AST to relational tables

## Immediate Implementation Tasks

Recommended first engineering tasks:

1. Define the constrained Markdown grammar and validation rules.
2. Design the AST schema and PostgreSQL tables.
3. Add Markdown import/export utilities in the API service.
4. Prototype a CodeMirror editor with heading folding.
5. Add a migration command that converts current species records into Markdown drafts.
6. Preserve current endpoints while introducing the document-tree backing model.

## Non-Goals For The First Pass

- full unrestricted Markdown feature support
- WYSIWYG editing
- arbitrary embedded HTML
- perfect citation parsing from all legacy free text
- replacing every existing API shape immediately

## Decision Summary

The planned direction is:

- constrained Markdown as the editable and exportable document format
- internal AST as the canonical application representation
- relational projection for queryable application state
- CodeMirror-based browser editing with heading folding

This is the most practical path toward human-editable hierarchy, permissive-only implementation, cleaner parsing, and deeper long-term document structure.
@ -0,0 +1,79 @@
# Traefik Deployment Notes

This note applies to the reverse-proxy deployment variant in `docs/docker-compose-traefik.yml`.

## Start The Stack

From the repository root:

```bash
cp docs/docker-compose-traefik.env.example docs/docker-compose-traefik.env
# edit docs/docker-compose-traefik.env
docker compose \
  --env-file docs/docker-compose-traefik.env \
  -f docs/docker-compose-traefik.yml \
  up -d
```

## Common Failure Modes

### Traefik cannot reach the web container

Check:

- the external Docker network named by `TRAEFIK_NETWORK` exists
- the Traefik instance is attached to that same Docker network
- the hostname in `ECOSPECIES_HOSTNAME` matches the Traefik router rule you expect
- the path in `ECOSPECIES_BASE_PATH` matches the published application prefix, for example `/apps/ecospecies`

### The site opens but the API fails

Check:

- the `api` service is healthy and running
- the `web` service is using the repo's `apps/web/nginx.conf`
- the `api` service finished waiting for `importer`
- the request path is under `ECOSPECIES_BASE_PATH` if you are publishing the app below a domain root

### Importer fails on startup

Check:

- `ECOSPECIES_LEGACY_DATA_DIR` points to a real host path
- that path contains `InputFiles - TXT`
- the mount is readable by Docker on the target host

### Database does not initialize

Check:

- `ECOSPECIES_DB_PASSWORD` is set
- the PostgreSQL volume is writable
- an old incompatible volume is not being reused unintentionally

### Editor login works but no editor state is available

Check:

- `ECOSPECIES_AUTH_TOKENS` is set on the `api` service
- the token you entered matches the configured value exactly

## Operational Notes

- This deployment variant intentionally exposes only the `web` container to Traefik.
- The `api`, `db`, and `importer` services stay on the internal Compose network.
- The `importer` runs before the API starts and seeds or synchronizes the dataset.
- The web container serves both the domain root and `/apps/ecospecies/`, but the Traefik router should target the intended public path.

## Apache Front Door

If Apache is the public front door for the hostname in `ECOSPECIES_HOSTNAME`, it must proxy the configured `ECOSPECIES_BASE_PATH` onward. Otherwise Apache can return its own `Not Found` page before the EcoSpecies stack sees the request.

Example Apache directives:

```apache
ProxyPass /apps/ecospecies http://127.0.0.1:80/apps/ecospecies
ProxyPassReverse /apps/ecospecies http://127.0.0.1:80/apps/ecospecies
```

Point the backend address at the actual Traefik listener on the host if it is not `127.0.0.1:80`, and adjust the published path if `ECOSPECIES_BASE_PATH` is different.
@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""Backfill citation enrichment across the species corpus in bounded, resumable passes."""
from __future__ import annotations

import argparse
from pathlib import Path

from ecospecies_api.repository import (
    get_editor_species_citations,
    get_editor_species_list,
    update_species_citation_enrichment,
)


def should_backfill(citation: dict[str, object], include_accepted: bool) -> bool:
    """Return True when a citation still looks like it needs enrichment."""
    review_status = str(citation.get("review_status", "")).strip().lower()
    source_type = str(citation.get("source_type", "")).strip().lower()
    enrichment_status = str(citation.get("enrichment_status", "")).strip().lower()
    normalized_text = str(citation.get("normalized_text", "")).strip()
    abstract_text = str(citation.get("abstract_text", "")).strip()

    # Editor-curated citations are skipped unless --include-accepted is set.
    if not include_accepted and review_status == "accepted":
        return False
    if source_type in {"editor_selected_candidate", "editor_added_candidate"} and not include_accepted:
        return False

    return (
        source_type in {"document_extract", "editor_review", ""}
        or enrichment_status in {"pending", "unresolved", "error", ""}
        or not normalized_text
        or not abstract_text
    )


def reorder_species_with_cursor(
    species_items: list[dict[str, object]],
    state_file: Path | None,
) -> list[dict[str, object]]:
    """Rotate the species list so a scheduled run resumes after the last processed slug."""
    if not state_file or not species_items:
        return species_items

    try:
        last_slug = state_file.read_text(encoding="utf-8").strip()
    except FileNotFoundError:
        return species_items

    if not last_slug:
        return species_items

    for index, item in enumerate(species_items):
        if str(item.get("slug", "")).strip() == last_slug:
            return species_items[index + 1 :] + species_items[: index + 1]
    return species_items


def write_cursor(state_file: Path | None, slug: str) -> None:
    """Record the last processed slug so the next run can resume after it."""
    if not state_file or not slug:
        return
    state_file.parent.mkdir(parents=True, exist_ok=True)
    state_file.write_text(f"{slug}\n", encoding="utf-8")


def main() -> int:
    parser = argparse.ArgumentParser(description="Backfill EcoSpecies citation enrichment.")
    parser.add_argument("--slug", help="Limit the backfill to a single species slug.")
    parser.add_argument("--username", default="citation-backfill", help="Audit username to record.")
    parser.add_argument(
        "--include-accepted",
        action="store_true",
        help="Also rerun accepted/editor-curated citations.",
    )
    parser.add_argument(
        "--max-species",
        type=int,
        default=0,
        help="Stop after this many species with eligible citations. 0 means no limit.",
    )
    parser.add_argument(
        "--max-citations",
        type=int,
        default=0,
        help="Stop after this many citations overall. 0 means no limit.",
    )
    parser.add_argument(
        "--state-file",
        help="Optional cursor file used to rotate scheduled runs through the species list.",
    )
    args = parser.parse_args()

    state_file = Path(args.state_file).expanduser() if args.state_file else None
    species_items = (
        [item for item in get_editor_species_list() if item["slug"] == args.slug]
        if args.slug
        else get_editor_species_list()
    )
    if not args.slug:
        # Rotate through the corpus so bounded scheduled runs eventually cover everything.
        species_items = reorder_species_with_cursor(species_items, state_file)

    if args.slug and not species_items:
        print(f"Species not found: {args.slug}")
        return 1

    species_count = 0
    citation_count = 0
    changed_count = 0
    resolved_count = 0
    unresolved_count = 0
    error_count = 0
    last_seen_slug = ""

    for species in species_items:
        if args.max_species and species_count >= args.max_species:
            break
        slug = str(species["slug"])
        last_seen_slug = slug
        citation_payload = get_editor_species_citations(slug)
        if citation_payload is None:
            continue

        eligible = [
            citation
            for citation in citation_payload["citations"]
            if should_backfill(citation, include_accepted=args.include_accepted)
        ]
        if not eligible:
            continue

        species_count += 1
        print(f"[{slug}] backfilling {len(eligible)} citation(s)", flush=True)

        for citation in eligible:
            if args.max_citations and citation_count >= args.max_citations:
                write_cursor(state_file, last_seen_slug)
                print("citation limit reached; stopping early", flush=True)
                print(
                    "summary:"
                    f" species={species_count}"
                    f" citations={citation_count}"
                    f" changed={changed_count}"
                    f" resolved={resolved_count}"
                    f" unresolved={unresolved_count}"
                    f" errors={error_count}",
                    flush=True,
                )
                return 0
            citation_count += 1
            result = update_species_citation_enrichment(
                slug=slug,
                citation_id=int(citation["id"]),
                username=args.username,
            )
            if result is None:
                print(f" - citation {citation['id']}: skipped (not found)", flush=True)
                continue

            changed_fields = result.get("changed_fields", {})
            status = str(result["citation"].get("enrichment_status", "")).strip().lower()
            if changed_fields:
                changed_count += 1
            if status == "resolved":
                resolved_count += 1
            elif status == "unresolved":
                unresolved_count += 1
            elif status == "error":
                error_count += 1
            print(
                f" - citation {citation['id']}: {status or 'unknown'}"
                + (f" ({len(changed_fields)} field changes)" if changed_fields else ""),
                flush=True,
            )

    write_cursor(state_file, last_seen_slug)
    print(
        "summary:"
        f" species={species_count}"
        f" citations={citation_count}"
        f" changed={changed_count}"
        f" resolved={resolved_count}"
        f" unresolved={unresolved_count}"
        f" errors={error_count}",
        flush=True,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@ -0,0 +1,28 @@
#!/bin/sh
set -eu

ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
ENV_FILE="${ECOSPECIES_ENV_FILE:-$ROOT_DIR/docs/docker-compose-traefik.env}"
COMPOSE_FILE="${ECOSPECIES_COMPOSE_FILE:-$ROOT_DIR/docs/docker-compose-traefik.yml}"
OUTPUT_FILE="${1:-$ROOT_DIR/ecospecies-backup.sql}"

if [ ! -f "$ENV_FILE" ]; then
  echo "Missing env file: $ENV_FILE" >&2
  exit 1
fi

set -a
. "$ENV_FILE"
set +a

DB_USER="${ECOSPECIES_DB_USER:-ecospecies}"
DB_NAME="${ECOSPECIES_DB_NAME:-ecospecies}"

docker compose \
  --env-file "$ENV_FILE" \
  -f "$COMPOSE_FILE" \
  exec -T db \
  pg_dump -U "$DB_USER" "$DB_NAME" \
  > "$OUTPUT_FILE"

printf 'Backup written to %s\n' "$OUTPUT_FILE"
@ -0,0 +1,37 @@
#!/bin/sh
set -eu

ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
ENV_FILE="${ECOSPECIES_ENV_FILE:-$ROOT_DIR/docs/docker-compose-traefik.env}"
COMPOSE_FILE="${ECOSPECIES_COMPOSE_FILE:-$ROOT_DIR/docs/docker-compose-traefik.yml}"
INPUT_FILE="${1:-}"

if [ -z "$INPUT_FILE" ]; then
  echo "Usage: $0 <sql-backup-file>" >&2
  exit 1
fi

if [ ! -f "$ENV_FILE" ]; then
  echo "Missing env file: $ENV_FILE" >&2
  exit 1
fi

if [ ! -f "$INPUT_FILE" ]; then
  echo "Missing backup file: $INPUT_FILE" >&2
  exit 1
fi

set -a
. "$ENV_FILE"
set +a

DB_USER="${ECOSPECIES_DB_USER:-ecospecies}"
DB_NAME="${ECOSPECIES_DB_NAME:-ecospecies}"

docker compose \
  --env-file "$ENV_FILE" \
  -f "$COMPOSE_FILE" \
  exec -T db \
  psql -U "$DB_USER" "$DB_NAME" \
  < "$INPUT_FILE"

printf 'Restore completed from %s\n' "$INPUT_FILE"
@ -0,0 +1,21 @@
#!/bin/sh
set -eu

ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
LOG_DIR="${ECOSPECIES_BACKFILL_LOG_DIR:-$ROOT_DIR/var/logs}"
STATE_FILE="${ECOSPECIES_BACKFILL_STATE_FILE:-$ROOT_DIR/var/citation-backfill.cursor}"
LOCK_DIR="${ECOSPECIES_BACKFILL_LOCK_DIR:-$ROOT_DIR/var/citation-backfill.lock}"
MAX_SPECIES="${ECOSPECIES_BACKFILL_MAX_SPECIES:-3}"

mkdir -p "$LOG_DIR"
mkdir -p "$ROOT_DIR/var"

# mkdir is atomic, so the lock directory guards against overlapping scheduled runs.
if ! mkdir "$LOCK_DIR" 2>/dev/null; then
  echo "citation backfill already running; skipping"
  exit 0
fi

trap 'rmdir "$LOCK_DIR"' EXIT INT TERM

# Run docker without exec'ing over this shell: exec would replace the process,
# the EXIT trap would never fire, and the lock directory would never be released.
docker exec ecospecies-api /bin/sh -lc \
  "PYTHONPATH=/workspace/apps/api/src /workspace/.docker/venv/bin/python -u /workspace/scripts/backfill-citations.py --username citation-backfill --max-species ${MAX_SPECIES} --state-file ${STATE_FILE}"