From 1143f9bfcc52b91f00e1d18a002961bf027b40f5 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 10 Apr 2026 04:44:45 +0000 Subject: [PATCH] Prepare public-safe repo update --- .gitignore | 9 + README.md | 19 +- apps/api/src/ecospecies_api/app.py | 417 +++++- apps/api/src/ecospecies_api/auth.py | 47 +- .../src/ecospecies_api/citation_enrichment.py | 1018 +++++++++++++++ .../src/ecospecies_api/citegeist_bridge.py | 387 ++++++ .../api/src/ecospecies_api/document_format.py | 480 +++++++ .../src/ecospecies_api/document_repository.py | 267 ++++ apps/api/src/ecospecies_api/models.py | 108 ++ apps/api/src/ecospecies_api/parser.py | 119 +- apps/api/src/ecospecies_api/repository.py | 1146 ++++++++++++++++- apps/api/test_auth.py | 21 + apps/api/test_citation_enrichment.py | 21 + apps/api/test_document_format.py | 21 + apps/api/test_parser.py | 21 + apps/api/tests/test_auth.py | 58 + apps/api/tests/test_citation_enrichment.py | 527 ++++++++ apps/api/tests/test_document_format.py | 195 +++ apps/api/tests/test_parser.py | 109 ++ apps/api/tests/test_repository.py | 660 ++++++++++ apps/web/app.js | 1113 +++++++++++++++- apps/web/bibliography.html | 43 + apps/web/bibliography.js | 230 ++++ apps/web/index.html | 206 ++- apps/web/nginx.conf | 31 + apps/web/styles.css | 515 +++++++- docker-compose.yml | 8 +- docs/citegeist-review-notes.md | 110 ++ docs/dc-orig.yml | 89 ++ docs/docker-compose-traefik.env.example | 20 + docs/docker-compose-traefik.yml | 93 ++ docs/postgres-backup.md | 48 + docs/roadmap.md | 115 +- docs/standards-migration-plan.md | 315 +++++ docs/structured-markdown-plan.md | 338 +++++ docs/traefik-deploy.md | 79 ++ scripts/backfill-citations.py | 185 +++ scripts/backup-postgres.sh | 28 + scripts/restore-postgres.sh | 37 + scripts/run-citation-backfill.sh | 21 + 40 files changed, 9099 insertions(+), 175 deletions(-) create mode 100644 apps/api/src/ecospecies_api/citation_enrichment.py create mode 100644 apps/api/src/ecospecies_api/citegeist_bridge.py create mode 100644 apps/api/src/ecospecies_api/document_format.py create mode 100644 apps/api/src/ecospecies_api/document_repository.py create mode 100644 apps/api/test_auth.py create mode 100644 apps/api/test_citation_enrichment.py create mode 100644 apps/api/test_document_format.py create mode 100644 apps/api/test_parser.py create mode 100644 apps/api/tests/test_auth.py create mode 100644 apps/api/tests/test_citation_enrichment.py create mode 100644 apps/api/tests/test_document_format.py create mode 100644 apps/api/tests/test_parser.py create mode 100644 apps/web/bibliography.html create mode 100644 apps/web/bibliography.js create mode 100644 docs/citegeist-review-notes.md create mode 100644 docs/dc-orig.yml create mode 100644 docs/docker-compose-traefik.env.example create mode 100644 docs/docker-compose-traefik.yml create mode 100644 docs/postgres-backup.md create mode 100644 docs/standards-migration-plan.md create mode 100644 docs/structured-markdown-plan.md create mode 100644 docs/traefik-deploy.md create mode 100644 scripts/backfill-citations.py create mode 100644 scripts/backup-postgres.sh create mode 100644 scripts/restore-postgres.sh create mode 100644 scripts/run-citation-backfill.sh diff --git a/.gitignore b/.gitignore index 6401844..a3d8156 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,12 @@ __pycache__/ node_modules/ test-results/ playwright-report/ +*~ +*.env +secrets* +codex* +restart.sh +*lock.json +input-data/ +legacy-data +var/logs/ diff --git a/README.md b/README.md index 1c731e8..180d960 100644 --- a/README.md +++ 
b/README.md @@ -21,7 +21,7 @@ Docker Compose owns all runtime dependencies: - Python services run in `python:3.12-slim` - the Python virtual environment is created in a Docker-managed volume mounted at `/workspace/.docker/venv` - dependencies are installed from `apps/api/requirements.txt` inside that virtual environment -- the legacy corpus is mounted read-only from `../01-legacy-code-and-data` +- the legacy corpus is mounted read-only from a sibling directory, defaulting to `../legacy-corpus` No host Python packages are required for the Compose workflow. @@ -48,6 +48,13 @@ Endpoints: - editor section detail/update: `/api/editor/species/<slug>/sections/<section>` (requires `editor` or `admin`) - editor audit history: `/api/editor/species/<slug>/audit` (requires `editor` or `admin`) +The app can also be served under a URL prefix. A reverse-proxy deployment can publish it at a host and path such as: + +- `ECOSPECIES_HOSTNAME=example.org` +- `ECOSPECIES_BASE_PATH=/apps/ecospecies` + +When the site is served below a path prefix, the frontend derives its API base from the current page URL, and nginx serves both the UI and the proxied API under that same prefix. + If those host ports are already in use, override them when starting Compose, for example: ```bash @@ -87,6 +94,14 @@ Run the browser-level smoke test against the real Compose stack with: ./scripts/check-ui-stack-smoke.sh ``` +Run a bounded citation backfill pass with: + +```bash +./scripts/run-citation-backfill.sh +``` + +The wrapper runs inside `ecospecies-api`, keeps a rotating cursor in `var/citation-backfill.cursor`, and skips a run if another backfill is already active (see the sketch after this README diff). + ## Notes - The importer seeds PostgreSQL from the legacy text corpus before the API starts and now synchronizes by slug instead of truncating the full dataset. @@ -98,6 +113,8 @@ Run the browser-level smoke test against the real Compose stack with: - Initial editor auth uses `ECOSPECIES_AUTH_TOKENS` in the format `token:username:role[,token2:username2:role2]`, where `role` is `viewer`, `editor`, or `admin`. - Editorial workflow state is persisted per species with `draft`, `review`, and `published` statuses. Public endpoints return only `published` records; editor endpoints can inspect and update all records. - Editors can curate top-level metadata and section content from the web UI, and every editorial or section change is recorded in per-species audit history. +- Citation backfill can be scheduled externally, such as with a nightly cron job that runs `./scripts/run-citation-backfill.sh`. Use `ECOSPECIES_BACKFILL_LOG_DIR` if logs should go somewhere other than `var/logs`. +- Citation enrichment that finds no remote metadata match still refreshes the locally parsed BibTeX and normalized citation text, so parser improvements propagate even for unresolved citations. - Summary authoring guidance for future FLELMR-compatible records is in `docs/flelmr-authoring.md`. - Legacy survey and roadmap artifacts are in `docs/`.
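The wrapper's cursor-and-lock behavior described above can be sketched in a few lines of Python. This is a minimal illustration under stated assumptions, not the shipped script: the cursor path matches the README, while the lock-file name, the `run_batch` callable, and the default batch size are hypothetical stand-ins.

```python
import fcntl
import sys
from pathlib import Path

CURSOR_FILE = Path("var/citation-backfill.cursor")
LOCK_FILE = Path("var/citation-backfill.lock")  # hypothetical lock path


def run_batch(slug: str) -> None:
    """Placeholder for the real per-species enrichment call."""
    print(f"backfilling {slug}")


def run_bounded_backfill(slugs: list[str], batch_size: int = 25) -> None:
    """Process the next batch_size species after the stored cursor, wrapping around."""
    if not slugs:
        return
    LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
    with LOCK_FILE.open("w") as lock:
        try:
            # Skip the run entirely if another backfill already holds the lock.
            fcntl.flock(lock, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            print("backfill already active; skipping", file=sys.stderr)
            return
        cursor = CURSOR_FILE.read_text().strip() if CURSOR_FILE.exists() else ""
        start = (slugs.index(cursor) + 1) % len(slugs) if cursor in slugs else 0
        for offset in range(min(batch_size, len(slugs))):
            slug = slugs[(start + offset) % len(slugs)]  # rotate through the corpus
            run_batch(slug)
            CURSOR_FILE.write_text(slug)  # persist the rotating cursor after each species
```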
diff --git a/apps/api/src/ecospecies_api/app.py b/apps/api/src/ecospecies_api/app.py index f8c91ed..c75e12e 100644 --- a/apps/api/src/ecospecies_api/app.py +++ b/apps/api/src/ecospecies_api/app.py @@ -15,17 +15,36 @@ from ecospecies_api.auth import ( ) from ecospecies_api.parser import get_default_data_dir, load_species_records from ecospecies_api.repository import ( + add_species_citation_from_candidate, + apply_species_citation_candidate_selection, + create_contributor_species, + get_contributor_species_citations, + get_contributor_species_detail, + get_contributor_species_document, + get_contributor_species_list, + get_species_citation_candidates, + get_editor_species_citations, get_editor_species_detail, + get_species_document, get_editor_species_list, get_editor_species_workflow, + get_minimum_contributor_age, get_species_by_slug, list_species_audit, + list_public_bibliography, get_readiness_status, get_summary_metrics, has_species_data, import_species_payload, list_diagnostics, list_species, + register_contributor, + update_species_citation_enrichment, + backfill_species_citations, + update_species_citations_enrichment_batch, + update_species_citation_review, + update_contributor_species_document_markdown, + update_species_document_markdown, update_species_section, update_species_editorial, ) @@ -99,6 +118,7 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): { "authenticated": session is not None, "auth_configured": auth_is_configured(), + "minimum_contributor_age": get_minimum_contributor_age(), "user": ( {"username": session.username, "role": session.role} if session is not None @@ -108,6 +128,23 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): ) return + if path == "/api/contributor/status": + if not self.require_role(session, "contributor"): + return + self.write_json( + { + "status": "ok", + "contributor_access": True, + "user": {"username": session.username, "role": session.role}, + "minimum_age": get_minimum_contributor_age(), + "capabilities": [ + "create_species_draft", + "edit_owned_drafts", + ], + } + ) + return + if path == "/api/editor/status": if not self.require_role(session, "editor"): return @@ -135,10 +172,42 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): "slug": item["slug"], "title": item["title"], "common_name": item["common_name"], + "scientific_name": item["scientific_name"], + "legacy_identifiers": item["legacy_identifiers"], + "taxon_identifiers": item["taxon_identifiers"], + "primary_taxon_authority": item["primary_taxon_authority"], + "primary_taxon_identifier": item["primary_taxon_identifier"], "publication_status": item["publication_status"], "is_archived": item["is_archived"], "last_modified_by": item["last_modified_by"], "diagnostic_count": len(item["diagnostics"]), + "summary": item["summary"], + } + for item in items + ] + self.write_json({"items": compact, "count": len(compact)}) + return + + if path == "/api/contributor/species": + if not self.require_role(session, "contributor"): + return + search = query.get("search", [""])[0].strip().lower() + items = get_contributor_species_list(session.username, search) + compact = [ + { + "slug": item["slug"], + "title": item["title"], + "common_name": item["common_name"], + "scientific_name": item["scientific_name"], + "legacy_identifiers": item["legacy_identifiers"], + "taxon_identifiers": item["taxon_identifiers"], + "primary_taxon_authority": item["primary_taxon_authority"], + "primary_taxon_identifier": item["primary_taxon_identifier"], + "publication_status": item["publication_status"], + 
"is_archived": item["is_archived"], + "last_modified_by": item["last_modified_by"], + "diagnostic_count": len(item["diagnostics"]), + "summary": item["summary"], } for item in items ] @@ -176,7 +245,68 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) return - if path.startswith("/api/editor/species/") and not path.endswith("/workflow") and not path.endswith("/editorial") and not path.endswith("/audit"): + if path.startswith("/api/editor/species/") and path.endswith("/document"): + if not self.require_role(session, "editor"): + return + slug = path[len("/api/editor/species/") : -len("/document")].strip("/") + item = get_species_document(slug) + if item is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + self.write_json(item) + return + + if path.startswith("/api/editor/species/") and path.endswith("/citations"): + if not self.require_role(session, "editor"): + return + slug = path[len("/api/editor/species/") : -len("/citations")].strip("/") + item = get_editor_species_citations(slug) + if item is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + self.write_json(item) + return + + if path.startswith("/api/editor/species/") and "/citations/" in path and path.endswith("/candidates"): + if not self.require_role(session, "editor"): + return + slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/") + citation_tail = tail[: -len("/candidates")].strip("/") + try: + citation_id = int(citation_tail) + except ValueError: + self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST) + return + item = get_species_citation_candidates(slug.strip("/"), citation_id) + if item is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + self.write_json(item) + return + + if path.startswith("/api/contributor/species/") and path.endswith("/document"): + if not self.require_role(session, "contributor"): + return + slug = path[len("/api/contributor/species/") : -len("/document")].strip("/") + item = get_contributor_species_document(slug, session.username) + if item is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + self.write_json(item) + return + + if path.startswith("/api/contributor/species/") and path.endswith("/citations"): + if not self.require_role(session, "contributor"): + return + slug = path[len("/api/contributor/species/") : -len("/citations")].strip("/") + item = get_contributor_species_citations(slug, session.username) + if item is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + self.write_json(item) + return + + if path.startswith("/api/editor/species/") and not path.endswith("/workflow") and not path.endswith("/editorial") and not path.endswith("/audit") and not path.endswith("/document"): if not self.require_role(session, "editor"): return slug = path[len("/api/editor/species/") :].strip("/") @@ -187,6 +317,17 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): self.write_json(item) return + if path.startswith("/api/contributor/species/") and not path.endswith("/document"): + if not self.require_role(session, "contributor"): + return + slug = path[len("/api/contributor/species/") :].strip("/") + item = get_contributor_species_detail(slug, session.username) + if item is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + self.write_json(item) + return + if 
path.startswith("/api/editor/species/") and path.endswith("/workflow"): if not self.require_role(session, "editor"): return @@ -215,6 +356,12 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): self.write_json({"items": flagged, "count": len(flagged)}) return + if path == "/api/bibliography": + search = query.get("search", [""])[0].strip() + items = list_public_bibliography(search=search) + self.write_json({"items": items, "count": len(items)}) + return + if path == "/api/species": search = query.get("search", [""])[0].strip().lower() species = list_species(search) @@ -225,6 +372,10 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): "common_name": item["common_name"], "scientific_name": item["scientific_name"], "flelmr_code": item["flelmr_code"], + "legacy_identifiers": item["legacy_identifiers"], + "taxon_identifiers": item["taxon_identifiers"], + "primary_taxon_authority": item["primary_taxon_authority"], + "primary_taxon_identifier": item["primary_taxon_identifier"], "summary": item["summary"], "section_count": item["section_count"], "diagnostic_count": len(item["diagnostics"]), @@ -250,6 +401,47 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): path = parsed.path session = resolve_auth_session(self.headers) + if path == "/api/contributor/register": + payload = self.read_json_body() + if payload is None: + return + email = payload.get("email") + age_gate_confirmed = payload.get("age_gate_confirmed") + if not isinstance(email, str): + self.write_json({"error": "email must be a string"}, status=HTTPStatus.BAD_REQUEST) + return + if not isinstance(age_gate_confirmed, bool): + self.write_json( + {"error": "age_gate_confirmed must be a boolean"}, + status=HTTPStatus.BAD_REQUEST, + ) + return + try: + result = register_contributor(email=email, age_gate_confirmed=age_gate_confirmed) + except ValueError as exc: + self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST) + return + self.write_json({"status": "ok", **result}, status=HTTPStatus.CREATED) + return + + if path == "/api/contributor/species": + if not self.require_role(session, "contributor"): + return + payload = self.read_json_body() + if payload is None: + return + markdown = payload.get("markdown") + if markdown is not None and not isinstance(markdown, str): + self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST) + return + try: + result = create_contributor_species(session.username, markdown) + except ValueError as exc: + self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST) + return + self.write_json({"status": "ok", **result}, status=HTTPStatus.CREATED) + return + if path.startswith("/api/editor/species/") and path.endswith("/workflow"): if not self.require_role(session, "editor"): return @@ -341,6 +533,229 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler): self.write_json({"status": "ok", **result}) return + if path.startswith("/api/editor/species/") and path.endswith("/document"): + if not self.require_role(session, "editor"): + return + + payload = self.read_json_body() + if payload is None: + return + + markdown = payload.get("markdown") + if not isinstance(markdown, str): + self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST) + return + + slug = path[len("/api/editor/species/") : -len("/document")].strip("/") + try: + result = update_species_document_markdown( + slug=slug, + markdown=markdown, + username=session.username, + ) + except ValueError as exc: + self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST) + return 
+ + if result is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + + self.write_json({"status": "ok", **result}) + return + + if ( + path.startswith("/api/editor/species/") + and "/citations/" in path + and not path.endswith("/citations/enrich") + and not path.endswith("/citations/backfill") + ): + if not self.require_role(session, "editor"): + return + + payload = self.read_json_body() + if payload is None: + return + + slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/") + if tail.endswith("/enrich"): + citation_tail = tail[: -len("/enrich")].strip("/") + try: + citation_id = int(citation_tail) + except ValueError: + self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST) + return + + result = update_species_citation_enrichment( + slug=slug.strip("/"), + citation_id=citation_id, + username=session.username, + ) + if result is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + + self.write_json({"status": "ok", **result}) + return + + if tail.endswith("/apply-match"): + citation_tail = tail[: -len("/apply-match")].strip("/") + try: + citation_id = int(citation_tail) + except ValueError: + self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST) + return + candidate = payload.get("candidate") + if not isinstance(candidate, dict): + self.write_json({"error": "candidate must be an object"}, status=HTTPStatus.BAD_REQUEST) + return + result = apply_species_citation_candidate_selection( + slug=slug.strip("/"), + citation_id=citation_id, + candidate=candidate, + username=session.username, + ) + if result is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + self.write_json({"status": "ok", **result}) + return + + if tail.endswith("/add-match"): + citation_tail = tail[: -len("/add-match")].strip("/") + try: + citation_id = int(citation_tail) + except ValueError: + self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST) + return + candidate = payload.get("candidate") + if not isinstance(candidate, dict): + self.write_json({"error": "candidate must be an object"}, status=HTTPStatus.BAD_REQUEST) + return + result = add_species_citation_from_candidate( + slug=slug.strip("/"), + citation_id=citation_id, + candidate=candidate, + username=session.username, + ) + if result is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + self.write_json({"status": "ok", **result}) + return + + try: + citation_id = int(tail.strip("/")) + except ValueError: + self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST) + return + + for field in ("review_status", "normalized_text", "abstract_text", "doi", "citation_key", "entry_type", "draft_bibtex"): + value = payload.get(field) + if value is not None and not isinstance(value, str): + self.write_json( + {"error": f"{field} must be a string"}, + status=HTTPStatus.BAD_REQUEST, + ) + return + + try: + result = update_species_citation_review( + slug=slug.strip("/"), + citation_id=citation_id, + review_status=payload.get("review_status"), + normalized_text=payload.get("normalized_text"), + doi=payload.get("doi"), + citation_key=payload.get("citation_key"), + entry_type=payload.get("entry_type"), + draft_bibtex=payload.get("draft_bibtex"), + abstract_text=payload.get("abstract_text"), + username=session.username, + ) + except ValueError as exc: + self.write_json({"error": str(exc)}, 
status=HTTPStatus.BAD_REQUEST) + return + + if result is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + + self.write_json({"status": "ok", **result}) + return + + if path.startswith("/api/editor/species/") and path.endswith("/citations/enrich"): + if not self.require_role(session, "editor"): + return + + payload = self.read_json_body() + if payload is None: + return + + slug = path[len("/api/editor/species/") : -len("/citations/enrich")].strip("/") + result = update_species_citations_enrichment_batch( + slug=slug, + username=session.username, + ) + if result is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + + self.write_json({"status": "ok", **result}) + return + + if path.startswith("/api/editor/species/") and path.endswith("/citations/backfill"): + if not self.require_role(session, "editor"): + return + + payload = self.read_json_body() + if payload is None: + return + + slug = path[len("/api/editor/species/") : -len("/citations/backfill")].strip("/") + include_accepted = bool(payload.get("include_accepted", False)) + result = backfill_species_citations( + slug=slug, + username=session.username, + include_accepted=include_accepted, + ) + if result is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + + self.write_json({"status": "ok", **result}) + return + + if path.startswith("/api/contributor/species/") and path.endswith("/document"): + if not self.require_role(session, "contributor"): + return + + payload = self.read_json_body() + if payload is None: + return + + markdown = payload.get("markdown") + if not isinstance(markdown, str): + self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST) + return + + slug = path[len("/api/contributor/species/") : -len("/document")].strip("/") + try: + result = update_contributor_species_document_markdown( + slug=slug, + markdown=markdown, + username=session.username, + ) + except ValueError as exc: + self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST) + return + + if result is None: + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) + return + + self.write_json({"status": "ok", **result}) + return + self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND) def log_message(self, format: str, *args: object) -> None: diff --git a/apps/api/src/ecospecies_api/auth.py b/apps/api/src/ecospecies_api/auth.py index 9d7f86c..a03fb8e 100644 --- a/apps/api/src/ecospecies_api/auth.py +++ b/apps/api/src/ecospecies_api/auth.py @@ -1,14 +1,21 @@ from __future__ import annotations +import hashlib import os from dataclasses import dataclass from typing import Mapping +from sqlalchemy import select + +from ecospecies_api.db import SessionLocal, create_db_engine +from ecospecies_api.models import Base, ContributorAccount + ROLE_ORDER = { "viewer": 1, - "editor": 2, - "admin": 3, + "contributor": 2, + "editor": 3, + "admin": 4, } @@ -41,17 +48,28 @@ def _parse_token_entry(entry: str) -> tuple[str, AuthSession]: def get_token_registry() -> dict[str, AuthSession]: - configured = os.environ.get("ECOSPECIES_AUTH_TOKENS", "").strip() - if not configured: - return {} - registry: dict[str, AuthSession] = {} - for raw_entry in configured.split(","): - entry = raw_entry.strip() - if not entry: - continue - token, session = _parse_token_entry(entry) - registry[token] = session + configured = os.environ.get("ECOSPECIES_AUTH_TOKENS", "").strip() + registry: dict[str, AuthSession] = {} + if configured: + for raw_entry in
configured.split(","): + entry = raw_entry.strip() + if not entry: + continue + token, session = _parse_token_entry(entry) + registry[token] = session + + engine = create_db_engine() + Base.metadata.create_all(engine) + with SessionLocal() as session: + for account in session.scalars( + select(ContributorAccount).where(ContributorAccount.is_active.is_(True)) + ): + registry[account.token_hash] = AuthSession( + token=account.token_hash, + username=account.email, + role="contributor", + ) return registry @@ -70,7 +87,11 @@ def resolve_auth_session(headers: Mapping[str, str]) -> AuthSession | None: token = get_bearer_token(headers) if not token: return None - return registry.get(token) + direct = registry.get(token) + if direct is not None: + return direct + token_hash = hashlib.sha256(token.encode("utf-8")).hexdigest() + return registry.get(token_hash) def auth_is_configured() -> bool: diff --git a/apps/api/src/ecospecies_api/citation_enrichment.py b/apps/api/src/ecospecies_api/citation_enrichment.py new file mode 100644 index 0000000..9b13156 --- /dev/null +++ b/apps/api/src/ecospecies_api/citation_enrichment.py @@ -0,0 +1,1018 @@ +from __future__ import annotations + +import json +from pathlib import Path +import os +import re +import sys +import urllib.error +import urllib.parse +import urllib.request + +from ecospecies_api.citegeist_bridge import ( + DraftCitation, + HISTORICAL_YEAR_PATTERN, + build_standard_citation_key, + extract_draft_citation, + render_single_bibtex, +) + + +def _load_citegeist_resolution_components(): + citegeist_src = Path(__file__).resolve().parents[5] / "CiteGeist" / "src" + if citegeist_src.exists() and str(citegeist_src) not in sys.path: + sys.path.insert(0, str(citegeist_src)) + try: + from citegeist.bibtex import BibEntry # type: ignore + from citegeist.resolve import MetadataResolver, merge_entries_with_conflicts # type: ignore + from citegeist.sources import SourceClient # type: ignore + except ImportError: + return None, None, None, None + return BibEntry, MetadataResolver, SourceClient, merge_entries_with_conflicts + + +class LocalSourceClient: + def __init__(self, user_agent: str) -> None: + self.user_agent = user_agent + + def get_json(self, url: str) -> dict: + request = urllib.request.Request(url, headers={"User-Agent": self.user_agent}) + with urllib.request.urlopen(request, timeout=15) as response: + return json.loads(response.read().decode("utf-8")) + + +class LocalBibEntry: + def __init__(self, entry_type: str, citation_key: str, fields: dict[str, str]) -> None: + self.entry_type = entry_type + self.citation_key = citation_key + self.fields = fields + + +def _get_cache_dir() -> Path: + configured = os.environ.get("ECOSPECIES_CITEGEIST_CACHE_DIR", "").strip() + if configured: + return Path(configured) + return Path("/tmp/ecospecies-citegeist-cache") + + +def _get_fixtures_dir() -> Path | None: + configured = os.environ.get("ECOSPECIES_CITEGEIST_FIXTURES_DIR", "").strip() + if configured: + return Path(configured) + return None + + +def _normalize_openalex_id(raw_value: str) -> str: + value = raw_value.strip() + if not value: + return "" + if value.startswith("https://openalex.org/"): + return value.rsplit("/", 1)[-1] + return value + + +def _normalize_text(value: str) -> str: + normalized = " ".join(str(value or "").split()) + normalized = re.sub(r"([\(\[\{])\s+", r"\1", normalized) + normalized = re.sub(r"\s+([\)\]\},.;:!?])", r"\1", normalized) + return normalized.strip() + + +def _normalize_abstract_text(value: str) -> str: + normalized = 
_normalize_text(value) + return re.sub(r"^abstract\s*[:.\-]?\s*", "", normalized, flags=re.IGNORECASE) + + +def _normalize_person_display_name(value: str) -> str: + normalized = _normalize_text(value) + if "," not in normalized: + return normalized + + left, right = [part.strip() for part in normalized.split(",", 1)] + if not (_looks_like_initial_block(left) and right): + return normalized + + right_tokens = right.split() + trailing_initials: list[str] = [] + while right_tokens and _looks_like_initial_block(right_tokens[-1]): + trailing_initials.insert(0, right_tokens.pop()) + if not right_tokens: + return normalized + + family = " ".join(right_tokens).strip() + given_parts = [ + _initial_block_to_given_names(" ".join(trailing_initials)), + _initial_block_to_given_names(left), + ] + given = " ".join(part for part in given_parts if part).strip() + return f"{family}, {given}" if given else family + + +def _looks_like_initial_block(value: str) -> bool: + letters = re.sub(r"[^A-Za-z]+", "", value) + return 0 < len(letters) <= 4 and letters.upper() == letters + + +def _initial_block_to_given_names(value: str) -> str: + letters = re.findall(r"[A-Za-z]", value) + return " ".join(f"{letter.upper()}." for letter in letters) + + +def _openalex_abstract_text(inverted_index: dict) -> str: + positions: dict[int, str] = {} + for word, indexes in inverted_index.items(): + for index in indexes: + positions[int(index)] = word + text = _normalize_text(" ".join(word for _, word in sorted(positions.items()))) + return "" if _looks_like_openalex_page_blob(text) else text + + +def _looks_like_openalex_page_blob(text: str) -> bool: + lowered = text.casefold() + blob_markers = ( + "research article|", + "download citation file", + "this content is only available via pdf", + "get citation alerts", + "views icon", + "toolbar search", + "publisher site get access", + "authors info & claims", + "publication history", + "copyright ", + ) + return len(text) > 60 and any(marker in lowered for marker in blob_markers) + + +def _crossref_message_to_entry(message: dict) -> LocalBibEntry: + authors = [] + for author in message.get("author", []): + family = str(author.get("family", "")).strip() + given = str(author.get("given", "")).strip() + full_name = ", ".join(part for part in (family, given) if part) + if full_name: + authors.append(full_name) + title = "" + title_values = message.get("title", []) + if isinstance(title_values, list) and title_values: + title = str(title_values[0]).strip() + year_parts = ( + message.get("issued", {}).get("date-parts", [[None]]) + if isinstance(message.get("issued"), dict) + else [[None]] + ) + year = str(year_parts[0][0] or "").strip() + doi = str(message.get("DOI", "")).strip() + journal = "" + container = message.get("container-title", []) + if isinstance(container, list) and container: + journal = str(container[0]).strip() + abstract = _normalize_abstract_text(str(message.get("abstract", "")).strip()) + fields = { + "author": " and ".join(_normalize_person_display_name(name) for name in authors if name), + "year": year, + "title": _normalize_text(title), + "journal": _normalize_text(journal), + "doi": doi, + "url": str(message.get("URL", "")).strip(), + "volume": str(message.get("volume", "")).strip(), + "number": str(message.get("issue", "")).strip(), + "pages": str(message.get("page", "")).strip(), + "abstract": abstract, + } + citation_key = build_standard_citation_key( + authors=fields.get("author", ""), + year=year, + title=title, + fallback_text=title, + ) + return 
LocalBibEntry("article" if journal else "misc", citation_key, {key: value for key, value in fields.items() if value}) + + +def _datacite_item_to_entry(data: dict) -> LocalBibEntry: + attributes = data.get("attributes", {}) if isinstance(data.get("attributes"), dict) else {} + titles = attributes.get("titles", []) + title = str(titles[0].get("title", "")).strip() if titles else "" + creators = [] + for creator in attributes.get("creators", []): + family = str(creator.get("familyName", "")).strip() + given = str(creator.get("givenName", "")).strip() + name = ", ".join(part for part in (family, given) if part) or str(creator.get("name", "")).strip() + if name: + creators.append(_normalize_person_display_name(name)) + year = str(attributes.get("publicationYear", "")).strip() + doi = str(attributes.get("doi", "")).strip() + publisher = str(attributes.get("publisher", "")).strip() + url = str(attributes.get("url", "")).strip() + container = str(attributes.get("container", "")).strip() + first_page = str(attributes.get("firstPage", "")).strip() + last_page = str(attributes.get("lastPage", "")).strip() + volume = str(attributes.get("volume", "")).strip() + issue = str(attributes.get("issue", "")).strip() + pages = "" + if first_page and last_page: + pages = f"{first_page}-{last_page}" + elif first_page: + pages = first_page + abstract = "" + for description in attributes.get("descriptions", []): + if str(description.get("descriptionType", "")).strip().lower() == "abstract": + abstract = _normalize_abstract_text(str(description.get("description", "")).strip()) + if abstract: + break + fields = { + "author": " and ".join(creators), + "year": year, + "title": _normalize_text(title), + "publisher": _normalize_text(publisher), + "doi": doi, + "url": url, + "journal": _normalize_text(container), + "volume": volume, + "number": issue, + "pages": pages, + "abstract": abstract, + } + citation_key = build_standard_citation_key( + authors=fields.get("author", ""), + year=year, + title=title, + fallback_text=title, + ) + return LocalBibEntry("book" if publisher else "misc", citation_key, {key: value for key, value in fields.items() if value}) + + +def _openalex_work_to_entry(work: dict) -> LocalBibEntry: + authors = [] + for authorship in work.get("authorships", []): + author_name = _normalize_person_display_name(str(authorship.get("author", {}).get("display_name", "")).strip()) + if author_name: + authors.append(author_name) + doi = str(work.get("doi", "")).strip().removeprefix("https://doi.org/") + primary_location = work.get("primary_location", {}) + source = primary_location.get("source", {}) if isinstance(primary_location, dict) else {} + if not isinstance(source, dict): + source = {} + title = str(work.get("display_name", "")).strip() + year = str(work.get("publication_year", "")).strip() + journal = str(source.get("display_name", "")).strip() + openalex_id = _normalize_openalex_id(str(work.get("id", ""))) + biblio = work.get("biblio", {}) if isinstance(work.get("biblio"), dict) else {} + first_page = str(biblio.get("first_page", "")).strip() + last_page = str(biblio.get("last_page", "")).strip() + pages = "" + if first_page and last_page: + pages = f"{first_page}-{last_page}" + elif first_page: + pages = first_page + abstract = "" + if isinstance(work.get("abstract_inverted_index"), dict): + abstract = _openalex_abstract_text(work.get("abstract_inverted_index", {})) + fields = { + "author": " and ".join(authors), + "year": year, + "title": _normalize_text(title), + "journal": _normalize_text(journal), 
+ "doi": doi, + "openalex": openalex_id, + "url": f"https://openalex.org/{openalex_id}" if openalex_id else "", + "volume": str(biblio.get("volume", "")).strip(), + "number": str(biblio.get("issue", "")).strip(), + "pages": pages, + "abstract": abstract, + } + citation_key = build_standard_citation_key( + authors=fields.get("author", ""), + year=year, + title=title, + fallback_text=title or openalex_id, + ) + return LocalBibEntry("article" if journal else "misc", citation_key, {key: value for key, value in fields.items() if value}) + + +def _normalized_title(value: str) -> str: + return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip() + + +def _normalized_tokens(value: str) -> list[str]: + return [token for token in _normalized_title(value).split() if token] + + +def _title_similarity(query_title: str, candidate_title: str) -> float: + query_tokens = _normalized_tokens(query_title) + candidate_tokens = _normalized_tokens(candidate_title) + if not query_tokens or not candidate_tokens: + return 0.0 + overlap = len(set(query_tokens) & set(candidate_tokens)) + longest = max(len(set(query_tokens)), len(set(candidate_tokens)), 1) + return overlap / longest + + +def _select_best_title_match(entries: list[LocalBibEntry], title: str, year: str = "") -> LocalBibEntry | None: + normalized_query = _normalized_title(title) + best_entry: LocalBibEntry | None = None + best_score = 0.0 + for entry in entries: + entry_title = _normalized_title(entry.fields.get("title", "")) + if not entry_title: + continue + if entry_title == normalized_query: + if year and entry.fields.get("year", "") and entry.fields.get("year", "") != year: + continue + return entry + if year and entry.fields.get("year", "") and entry.fields.get("year", "") != year: + continue + score = _title_similarity(title, entry.fields.get("title", "")) + if score > best_score: + best_score = score + best_entry = entry + if best_score >= 0.85: + return best_entry + return None + + +class LocalResolution: + def __init__(self, entry: LocalBibEntry, source_label: str) -> None: + self.entry = entry + self.source_label = source_label + + +class LocalMetadataResolver: + def __init__(self, user_agent: str = "ecospecies/0.1 (citation enrichment)") -> None: + self.source_client = LocalSourceClient(user_agent=user_agent) + + def resolve_entry(self, entry: LocalBibEntry) -> LocalResolution | None: + doi = entry.fields.get("doi", "").strip() + if doi: + resolved = self.resolve_doi(doi) + if resolved is not None: + return resolved + resolved = self.resolve_datacite_doi(doi) + if resolved is not None: + return resolved + + openalex_id = entry.fields.get("openalex", "").strip() + if openalex_id: + resolved = self.resolve_openalex(openalex_id) + if resolved is not None: + return resolved + + title = entry.fields.get("title", "").strip() + if title: + year = entry.fields.get("year", "").strip() + resolved = self.search_crossref_best_match(title, year=year) + if resolved is not None: + return resolved + resolved = self.search_datacite_best_match(title, year=year) + if resolved is not None: + return resolved + resolved = self.search_openalex_best_match(title, year=year) + if resolved is not None: + return resolved + return None + + def resolve_doi(self, doi: str) -> LocalResolution | None: + encoded = urllib.parse.quote(doi, safe="") + payload = self._safe_get_json(f"https://api.crossref.org/works/{encoded}") + if payload is None: + return None + message = payload.get("message", {}) + if not message: + return None + return 
LocalResolution(_crossref_message_to_entry(message), f"crossref:doi:{doi}") + + def resolve_datacite_doi(self, doi: str) -> LocalResolution | None: + encoded = urllib.parse.quote(doi, safe="") + payload = self._safe_get_json(f"https://api.datacite.org/dois/{encoded}") + if payload is None: + return None + data = payload.get("data", {}) + if not data: + return None + return LocalResolution(_datacite_item_to_entry(data), f"datacite:doi:{doi}") + + def resolve_openalex(self, openalex_id: str) -> LocalResolution | None: + normalized = _normalize_openalex_id(openalex_id) + payload = self._safe_get_json(f"https://api.openalex.org/works/{normalized}") + if payload is None or not payload: + return None + return LocalResolution(_openalex_work_to_entry(payload), f"openalex:id:{normalized}") + + def search_crossref_best_match(self, title: str, year: str = "") -> LocalResolution | None: + entries = self.search_crossref_candidates(title) + best = _select_best_title_match([item.entry for item in entries], title=title, year=year) + if best is None: + return None + for candidate in entries: + if candidate.entry is best: + return candidate + return None + + def search_datacite_best_match(self, title: str, year: str = "") -> LocalResolution | None: + entries = self.search_datacite_candidates(title) + best = _select_best_title_match([item.entry for item in entries], title=title, year=year) + if best is None: + return None + for candidate in entries: + if candidate.entry is best: + return candidate + return None + + def search_openalex_best_match(self, title: str, year: str = "") -> LocalResolution | None: + entries = self.search_openalex_candidates(title) + best = _select_best_title_match([item.entry for item in entries], title=title, year=year) + if best is None: + return None + for candidate in entries: + if candidate.entry is best: + return candidate + return None + + def search_crossref_candidates(self, title: str) -> list[LocalResolution]: + query = urllib.parse.urlencode({"query.title": title, "rows": 5}) + payload = self._safe_get_json(f"https://api.crossref.org/works?{query}") + if payload is None: + return [] + results: list[LocalResolution] = [] + for index, item in enumerate(payload.get("message", {}).get("items", []), start=1): + entry = _crossref_message_to_entry(item) + if not _should_keep_candidate_entry(entry): + continue + results.append(LocalResolution(entry, _candidate_source_label("crossref:search", entry, index))) + return results + + def search_datacite_candidates(self, title: str) -> list[LocalResolution]: + query = urllib.parse.urlencode({"query": title, "page[size]": 5}) + payload = self._safe_get_json(f"https://api.datacite.org/dois?{query}") + if payload is None: + return [] + results: list[LocalResolution] = [] + for index, item in enumerate(payload.get("data", []), start=1): + entry = _datacite_item_to_entry(item) + if not _should_keep_candidate_entry(entry): + continue + results.append(LocalResolution(entry, _candidate_source_label("datacite:search", entry, index))) + return results + + def search_openalex_candidates(self, title: str) -> list[LocalResolution]: + query = urllib.parse.urlencode({"search": title, "per-page": 5}) + payload = self._safe_get_json(f"https://api.openalex.org/works?{query}") + if payload is None: + return [] + results: list[LocalResolution] = [] + for index, item in enumerate(payload.get("results", []), start=1): + entry = _openalex_work_to_entry(item) + if not _should_keep_candidate_entry(entry): + continue + results.append(LocalResolution(entry, 
_candidate_source_label("openalex:search", entry, index))) + return results + + def _safe_get_json(self, url: str) -> dict | None: + try: + return self.source_client.get_json(url) + except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError): + return None + + +def _candidate_source_label(prefix: str, entry: LocalBibEntry, index: int) -> str: + basis = ( + entry.fields.get("doi", "").strip() + or entry.fields.get("openalex", "").strip() + or entry.citation_key.strip() + or entry.fields.get("title", "").strip() + or str(index) + ) + suffix = re.sub(r"[^a-z0-9]+", "-", basis.lower()).strip("-")[:80] or str(index) + return f"{prefix}:{index}:{suffix}" + + +def _should_keep_candidate_entry(entry: LocalBibEntry) -> bool: + title = _normalize_text(entry.fields.get("title", "")) + if not title: + return False + lowered = title.lower() + if lowered.startswith("referenced work ") or lowered.startswith("untitled"): + return False + if entry.entry_type == "misc" and not ( + entry.fields.get("doi", "").strip() + or entry.fields.get("openalex", "").strip() + or entry.fields.get("journal", "").strip() + or entry.fields.get("booktitle", "").strip() + ): + return False + return True + + +def _merge_entries(base_entry, resolved_entry) -> tuple[LocalBibEntry, list[dict[str, str]]]: + merged_fields = dict(base_entry.fields) + conflicts: list[dict[str, str]] = [] + for key, value in resolved_entry.fields.items(): + if not value: + continue + current_value = merged_fields.get(key, "") + if _is_placeholder_field_value(key, current_value): + merged_fields[key] = value + continue + if not current_value: + merged_fields[key] = value + continue + if current_value != value: + conflicts.append( + { + "field_name": key, + "current_value": current_value, + "proposed_value": value, + } + ) + return LocalBibEntry( + entry_type=(base_entry.entry_type if base_entry.entry_type != "misc" else resolved_entry.entry_type), + citation_key=base_entry.citation_key, + fields=merged_fields, + ), conflicts + + +def _is_placeholder_field_value(field_name: str, value: str) -> bool: + normalized = " ".join((value or "").split()).strip() + if not normalized: + return True + lowered = normalized.lower() + if field_name == "title": + if lowered.startswith("referenced work ") or lowered.startswith("untitled"): + return True + if re.match(rf"^[^,]+,\s*.+?\s{HISTORICAL_YEAR_PATTERN}\.\s+", normalized): + return True + if field_name == "author": + if lowered in {"reference", "unknown", "unknown author"}: + return True + return False + + +def _dedupe_note_parts(parts: list[str]) -> list[str]: + deduped: list[str] = [] + seen: set[str] = set() + for part in parts: + for segment in part.split(";"): + compact = segment.strip() + if not compact or compact in seen: + continue + seen.add(compact) + deduped.append(compact) + return deduped + + +def _looks_like_raw_reference_title(current_value: str, resolved_value: str) -> bool: + normalized_current = " ".join((current_value or "").split()).strip() + normalized_resolved = " ".join((resolved_value or "").split()).strip() + if not normalized_current or not normalized_resolved: + return False + if normalized_current == normalized_resolved: + return False + if _is_placeholder_field_value("title", normalized_current): + return True + if re.match(r"^[^,]+,\s+.+", normalized_current) and re.search(rf"\b{HISTORICAL_YEAR_PATTERN}\.\s+", normalized_current[:48]): + return True + comparison_current = re.sub(r"[^a-z0-9]+", " ", normalized_current.lower()).strip() + comparison_resolved = 
re.sub(r"[^a-z0-9]+", " ", normalized_resolved.lower()).strip() + if comparison_resolved and comparison_resolved in comparison_current and re.match(rf"^[^,]+,.*\b{HISTORICAL_YEAR_PATTERN}\.\s+", normalized_current): + return True + return False + + +def _apply_placeholder_overrides(merged_fields: dict[str, str], base_fields: dict[str, str], resolved_fields: dict[str, str]) -> None: + for field_name in ("title", "author"): + current_value = str(base_fields.get(field_name, "")).strip() + resolved_value = str(resolved_fields.get(field_name, "")).strip() + if field_name == "title" and resolved_value and _looks_like_raw_reference_title(current_value, resolved_value): + merged_fields[field_name] = resolved_value + continue + if _is_placeholder_field_value(field_name, current_value) and resolved_value: + merged_fields[field_name] = resolved_value + + +def _citation_raw_text_looks_like_reference(raw_text: str) -> bool: + normalized = " ".join((raw_text or "").split()).strip() + return bool( + normalized + and ( + (re.match(r"^[^,]+,\s+.+", normalized) and re.search(rf"\b{HISTORICAL_YEAR_PATTERN}\.\s+", normalized[:48])) + or len(normalized) > 80 + ) + ) + + +def _build_base_citation(citation: dict[str, object]) -> DraftCitation | None: + raw_text = str(citation.get("raw_text", "")).strip() + legacy_reference_number = str(citation.get("legacy_reference_number", "")).strip() + if not raw_text: + return None + draft = extract_draft_citation(raw_text, legacy_reference_number) + if draft is None: + return None + if citation.get("doi"): + draft.fields["doi"] = str(citation.get("doi", "")).strip() + if citation.get("openalex_id"): + draft.fields["openalex"] = str(citation.get("openalex_id", "")).strip() + if citation.get("source_url"): + draft.fields["url"] = str(citation.get("source_url", "")).strip() + if citation.get("normalized_text") and not draft.fields.get("note"): + draft.fields["note"] = str(citation.get("normalized_text", "")).strip() + return draft + + +def _render_normalized_text(entry_type: str, fields: dict[str, str]) -> str: + parts: list[str] = [] + author = fields.get("author", "").strip() + year = fields.get("year", "").strip() + title = fields.get("title", "").strip() + venue = ( + fields.get("journal", "").strip() + or fields.get("booktitle", "").strip() + or fields.get("publisher", "").strip() + or fields.get("howpublished", "").strip() + ) + volume = fields.get("volume", "").strip() + number = fields.get("number", "").strip() + pages = fields.get("pages", "").strip() + doi = fields.get("doi", "").strip() + url = fields.get("url", "").strip() + + if author: + parts.append(author) + if year: + parts.append(f"({year})") + if title: + parts.append(title) + if venue: + venue_detail = venue + if volume: + venue_detail += f", {volume}" + if number: + venue_detail += f"({number})" + elif number: + venue_detail += f", ({number})" + if pages: + venue_detail += f": {pages}" + parts.append(venue_detail) + elif pages: + parts.append(f"pp. {pages}") + if entry_type == "book" and fields.get("publisher", "").strip() and fields.get("publisher", "").strip() not in venue: + parts.append(fields["publisher"].strip()) + if doi: + parts.append(f"DOI:{doi}") + elif url: + parts.append(url) + return ". 
".join(part.strip(" .") for part in parts if part).strip() + + +def _normalize_venue(value: str) -> str: + lowered = value.lower() + lowered = re.sub(r"\b(comm|rept|rep|proc|trans)\.\b", "", lowered) + lowered = re.sub(r"\b(commission|report|proceedings|transactions|journal|bulletin|review|letters)\b", "", lowered) + return re.sub(r"[^a-z0-9]+", " ", lowered).strip() + + +def _normalize_pages(value: str) -> str: + return re.sub(r"\s+", "", value).replace("--", "-").strip() + + +def _venue_fields(fields: dict[str, str]) -> str: + return ( + fields.get("journal", "").strip() + or fields.get("booktitle", "").strip() + or fields.get("publisher", "").strip() + or fields.get("howpublished", "").strip() + ) + + +def _author_overlap_score(base_author: str, candidate_author: str) -> float: + base_tokens = {token for token in _normalized_tokens(base_author) if len(token) > 1} + candidate_tokens = {token for token in _normalized_tokens(candidate_author) if len(token) > 1} + if not base_tokens or not candidate_tokens: + return 0.0 + return len(base_tokens & candidate_tokens) / max(len(base_tokens), len(candidate_tokens), 1) + + +def _venue_overlap_score(base_venue: str, candidate_venue: str) -> float: + base_tokens = set(_normalize_venue(base_venue).split()) + candidate_tokens = set(_normalize_venue(candidate_venue).split()) + if not base_tokens or not candidate_tokens: + return 0.0 + return len(base_tokens & candidate_tokens) / max(len(base_tokens), len(candidate_tokens), 1) + + +def _text_overlap_score(base_text: str, candidate_text: str) -> float: + base_tokens = {token for token in _normalized_tokens(base_text) if len(token) > 2} + candidate_tokens = {token for token in _normalized_tokens(candidate_text) if len(token) > 2} + if not base_tokens or not candidate_tokens: + return 0.0 + return len(base_tokens & candidate_tokens) / max(len(base_tokens), len(candidate_tokens), 1) + + +def _compare_field(seed_value: str, candidate_value: str, *, similarity: float = 0.0) -> str: + if not seed_value: + return "seed-missing" + if not candidate_value: + return "candidate-missing" + if seed_value == candidate_value: + return "exact" + if similarity >= 0.6: + return "partial" + return "conflict" + + +def _build_match_details(seed_fields: dict[str, str], candidate_fields: dict[str, str]) -> tuple[dict[str, dict[str, object]], float]: + title_similarity = _title_similarity(seed_fields.get("title", ""), candidate_fields.get("title", "")) + author_similarity = _author_overlap_score(seed_fields.get("author", ""), candidate_fields.get("author", "")) + venue_similarity = _venue_overlap_score(_venue_fields(seed_fields), _venue_fields(candidate_fields)) + abstract_similarity = _text_overlap_score(seed_fields.get("abstract", ""), candidate_fields.get("abstract", "")) + + comparisons = [ + ("author", seed_fields.get("author", "").strip(), candidate_fields.get("author", "").strip(), 10.0, author_similarity), + ("year", seed_fields.get("year", "").strip(), candidate_fields.get("year", "").strip(), 16.0, 1.0 if seed_fields.get("year", "").strip() == candidate_fields.get("year", "").strip() and seed_fields.get("year", "").strip() else 0.0), + ("title", seed_fields.get("title", "").strip(), candidate_fields.get("title", "").strip(), 34.0, title_similarity), + ("abstract", seed_fields.get("abstract", "").strip(), candidate_fields.get("abstract", "").strip(), 8.0, abstract_similarity), + ("venue", _venue_fields(seed_fields), _venue_fields(candidate_fields), 16.0, venue_similarity), + ("volume", seed_fields.get("volume", 
"").strip(), candidate_fields.get("volume", "").strip(), 10.0, 1.0 if seed_fields.get("volume", "").strip() == candidate_fields.get("volume", "").strip() and seed_fields.get("volume", "").strip() else 0.0), + ("number", seed_fields.get("number", "").strip(), candidate_fields.get("number", "").strip(), 4.0, 1.0 if seed_fields.get("number", "").strip() == candidate_fields.get("number", "").strip() and seed_fields.get("number", "").strip() else 0.0), + ("pages", _normalize_pages(seed_fields.get("pages", "")), _normalize_pages(candidate_fields.get("pages", "")), 10.0, 1.0 if _normalize_pages(seed_fields.get("pages", "")) == _normalize_pages(candidate_fields.get("pages", "")) and _normalize_pages(seed_fields.get("pages", "")) else 0.0), + ] + + score = 0.0 + details: dict[str, dict[str, object]] = {} + for field_name, seed_value, candidate_value, weight, similarity in comparisons: + status = _compare_field(seed_value, candidate_value, similarity=similarity) + field_score = 0.0 + if status == "exact": + field_score = weight + elif status == "partial": + field_score = round(weight * min(similarity, 1.0), 2) + elif status == "seed-missing": + field_score = round(weight * 0.35, 2) if candidate_value else 0.0 + score += field_score + details[field_name] = { + "seed": seed_value, + "candidate": candidate_value, + "status": status, + "weight": weight, + "score": field_score, + } + + return details, round(score, 2) + + +def _seed_metadata_conflict(base_fields: dict[str, str], resolved_fields: dict[str, str]) -> str: + base_year = base_fields.get("year", "").strip() + resolved_year = resolved_fields.get("year", "").strip() + if base_year and resolved_year and base_year != resolved_year: + return f"year mismatch: seed {base_year}, resolved {resolved_year}" + + base_venue = _normalize_venue(_venue_fields(base_fields)) + resolved_venue = _normalize_venue(_venue_fields(resolved_fields)) + if base_venue and resolved_venue and base_venue != resolved_venue: + base_tokens = set(base_venue.split()) + resolved_tokens = set(resolved_venue.split()) + if not (base_tokens and resolved_tokens and base_tokens & resolved_tokens): + return "venue mismatch between citation seed and resolved metadata" + + base_volume = base_fields.get("volume", "").strip() + resolved_volume = resolved_fields.get("volume", "").strip() + if base_volume and resolved_volume and base_volume != resolved_volume: + return f"volume mismatch: seed {base_volume}, resolved {resolved_volume}" + + base_number = base_fields.get("number", "").strip() + resolved_number = resolved_fields.get("number", "").strip() + if base_number and resolved_number and base_number != resolved_number: + return f"issue mismatch: seed {base_number}, resolved {resolved_number}" + + base_pages = _normalize_pages(base_fields.get("pages", "")) + resolved_pages = _normalize_pages(resolved_fields.get("pages", "")) + if base_pages and resolved_pages and base_pages != resolved_pages: + return f"pages mismatch: seed {base_pages}, resolved {resolved_pages}" + + return "" + + +def _candidate_to_payload(seed: DraftCitation, resolution: LocalResolution) -> dict[str, object]: + field_matches, score = _build_match_details(seed.fields, resolution.entry.fields) + candidate_fields = {key: value for key, value in resolution.entry.fields.items() if value} + citation_key = build_standard_citation_key( + authors=candidate_fields.get("author", ""), + year=candidate_fields.get("year", ""), + title=candidate_fields.get("title", ""), + fallback_text=candidate_fields.get("title", "") or 
seed.fields.get("title", ""), + ) + draft_bibtex = render_single_bibtex(resolution.entry.entry_type, citation_key, candidate_fields) + return { + "candidate_id": re.sub(r"[^a-z0-9]+", "-", resolution.source_label.lower()).strip("-"), + "source_label": resolution.source_label, + "entry_type": resolution.entry.entry_type, + "citation_key": citation_key, + "fields": candidate_fields, + "abstract_text": candidate_fields.get("abstract", "").strip(), + "normalized_text": _render_normalized_text(resolution.entry.entry_type, candidate_fields), + "draft_bibtex": draft_bibtex, + "score": score, + "field_matches": field_matches, + "conflict_reason": _seed_metadata_conflict(seed.fields, resolution.entry.fields), + } + + +def discover_citation_candidates( + citation: dict[str, object], + resolver: LocalMetadataResolver | None = None, +) -> dict[str, object]: + base = _build_base_citation(citation) + if base is None: + return {"error": "Citation has no raw text to enrich."} + if resolver is None: + resolver = LocalMetadataResolver() + + title = base.fields.get("title", "").strip() + candidates: list[dict[str, object]] = [] + seen_keys: set[str] = set() + if title: + for resolution in ( + resolver.search_crossref_candidates(title) + + resolver.search_datacite_candidates(title) + + resolver.search_openalex_candidates(title) + ): + identity = ( + resolution.entry.fields.get("doi", "").strip() + or resolution.entry.fields.get("openalex", "").strip() + or f"{_normalized_title(resolution.entry.fields.get('title', ''))}:{resolution.entry.fields.get('year', '').strip()}" + ) + if identity in seen_keys: + continue + seen_keys.add(identity) + candidates.append(_candidate_to_payload(base, resolution)) + + candidates.sort( + key=lambda item: ( + -float(item.get("score", 0.0)), + len(str(item.get("conflict_reason", "")).strip()), + str(item.get("source_label", "")), + ) + ) + seed_payload = { + "entry_type": base.entry_type, + "citation_key": base.citation_key, + "fields": dict(base.fields), + "abstract_text": base.fields.get("abstract", "").strip(), + "normalized_text": _render_normalized_text(base.entry_type, base.fields), + "draft_bibtex": render_single_bibtex(base.entry_type, base.citation_key, base.fields), + } + return { + "seed": seed_payload, + "candidate_count": len(candidates), + "candidates": candidates[:8], + } + + +def apply_citation_candidate_selection( + citation: dict[str, object], + candidate: dict[str, object], +) -> dict[str, object]: + base = _build_base_citation(citation) + if base is None: + return { + "enrichment_status": "error", + "enrichment_error": "Citation has no raw text to enrich.", + } + + selected_fields = { + str(key): str(value).strip() + for key, value in dict(candidate.get("fields", {})).items() + if str(value).strip() + } + entry_type = str(candidate.get("entry_type", "")).strip() or "misc" + merged_fields = dict(base.fields) + for key, value in selected_fields.items(): + merged_fields[key] = value + + if citation.get("legacy_reference_number"): + note_parts = [merged_fields.get("note", "").strip()] + note_parts.append( + f"ecospecies_reference_number = {{{str(citation.get('legacy_reference_number', '')).strip()}}}" + ) + merged_fields["note"] = "; ".join(_dedupe_note_parts(note_parts)) + + citation_key = build_standard_citation_key( + authors=merged_fields.get("author", ""), + year=merged_fields.get("year", ""), + title=merged_fields.get("title", ""), + fallback_text=str(citation.get("raw_text", "")).strip(), + ) + draft_bibtex = render_single_bibtex(entry_type, citation_key, 
merged_fields) + return { + "citation_key": citation_key, + "entry_type": entry_type, + "normalized_text": _render_normalized_text(entry_type, merged_fields), + "abstract_text": merged_fields.get("abstract", "").strip(), + "draft_bibtex": draft_bibtex, + "doi": merged_fields.get("doi", "").strip(), + "source_url": merged_fields.get("url", "").strip(), + "openalex_id": merged_fields.get("openalex", "").strip(), + "resolver_source_label": f"editor:selected:{str(candidate.get('source_label', '')).strip()}", + "enrichment_status": "resolved", + "enrichment_error": "", + "conflicts": [], + } + + +def enrich_citation_payload( + citation: dict[str, object], + resolver=None, +) -> dict[str, object]: + base = _build_base_citation(citation) + if base is None: + return { + "enrichment_status": "error", + "enrichment_error": "Citation has no raw text to enrich.", + } + + seed_payload = { + "citation_key": base.citation_key, + "entry_type": base.entry_type, + "normalized_text": _render_normalized_text(base.entry_type, base.fields), + "abstract_text": base.fields.get("abstract", "").strip(), + "draft_bibtex": render_single_bibtex(base.entry_type, base.citation_key, base.fields), + "doi": base.fields.get("doi", "").strip(), + "source_url": base.fields.get("url", "").strip(), + "openalex_id": base.fields.get("openalex", "").strip(), + "resolver_source_label": "", + } + + BibEntry, MetadataResolver, SourceClient, merge_entries_with_conflicts = _load_citegeist_resolution_components() + if MetadataResolver is not None and SourceClient is not None and BibEntry is not None and merge_entries_with_conflicts is not None: + if resolver is None: + resolver = MetadataResolver( + user_agent="ecospecies/0.1 (citation enrichment)", + source_client=SourceClient( + user_agent="ecospecies/0.1 (citation enrichment)", + cache_dir=_get_cache_dir(), + fixtures_dir=_get_fixtures_dir(), + ), + ) + resolution = resolver.resolve_entry( + BibEntry(entry_type=base.entry_type, citation_key=base.citation_key, fields=dict(base.fields)) + ) + merger = merge_entries_with_conflicts + base_entry = BibEntry(entry_type=base.entry_type, citation_key=base.citation_key, fields=dict(base.fields)) + else: + if resolver is None: + resolver = LocalMetadataResolver() + resolution = resolver.resolve_entry( + LocalBibEntry(entry_type=base.entry_type, citation_key=base.citation_key, fields=dict(base.fields)) + ) + merger = _merge_entries + base_entry = LocalBibEntry(entry_type=base.entry_type, citation_key=base.citation_key, fields=dict(base.fields)) + + if resolution is None: + return { + **seed_payload, + "enrichment_status": "unresolved", + "enrichment_error": "No metadata match found from DOI, title, or authority identifiers.", + } + + seed_conflict = _seed_metadata_conflict(base_entry.fields, resolution.entry.fields) + if seed_conflict: + return { + **seed_payload, + "enrichment_status": "unresolved", + "enrichment_error": f"Resolved metadata conflicts with citation seed fields: {seed_conflict}.", + } + + merged, conflicts = merger(base_entry, resolution.entry) + _apply_placeholder_overrides(merged.fields, base_entry.fields, resolution.entry.fields) + resolved_title = str(resolution.entry.fields.get("title", "")).strip() + raw_text = str(citation.get("raw_text", "")).strip() + if resolved_title and raw_text and len(resolved_title) < len(raw_text) and _citation_raw_text_looks_like_reference(raw_text): + merged.fields["title"] = resolved_title + if citation.get("legacy_reference_number"): + note_parts = [merged.fields.get("note", "").strip()] + 
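+        # Preserve the legacy reference number inside the BibTeX note so provenance survives enrichment.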
note_parts.append( + f"ecospecies_reference_number = {{{str(citation.get('legacy_reference_number', '')).strip()}}}" + ) + merged.fields["note"] = "; ".join(_dedupe_note_parts(note_parts)) + + citation_key = build_standard_citation_key( + authors=merged.fields.get("author", ""), + year=merged.fields.get("year", ""), + title=merged.fields.get("title", ""), + fallback_text=str(citation.get("raw_text", "")).strip(), + ) + draft_bibtex = render_single_bibtex(merged.entry_type, citation_key, merged.fields) + return { + "citation_key": citation_key, + "entry_type": merged.entry_type, + "normalized_text": _render_normalized_text(merged.entry_type, merged.fields), + "abstract_text": merged.fields.get("abstract", "").strip(), + "draft_bibtex": draft_bibtex, + "doi": merged.fields.get("doi", "").strip(), + "source_url": merged.fields.get("url", "").strip(), + "openalex_id": merged.fields.get("openalex", "").strip(), + "resolver_source_label": resolution.source_label, + "enrichment_status": "resolved", + "enrichment_error": "", + "conflicts": conflicts, + } diff --git a/apps/api/src/ecospecies_api/citegeist_bridge.py b/apps/api/src/ecospecies_api/citegeist_bridge.py new file mode 100644 index 0000000..98cc67e --- /dev/null +++ b/apps/api/src/ecospecies_api/citegeist_bridge.py @@ -0,0 +1,387 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +import re +import sys + + +def _load_citegeist_extract(): + citegeist_src = Path(__file__).resolve().parents[5] / "CiteGeist" / "src" + if citegeist_src.exists() and str(citegeist_src) not in sys.path: + sys.path.insert(0, str(citegeist_src)) + try: + from citegeist.extract import extract_references # type: ignore + except ImportError: + return None + return extract_references + + +@dataclass +class DraftCitation: + citation_key: str + entry_type: str + fields: dict[str, str] + draft_bibtex: str + + +STOPWORD_TOKENS = { + "a", + "an", + "and", + "for", + "from", + "in", + "of", + "on", + "the", + "to", + "with", +} +HISTORICAL_YEAR_PATTERN = r"(1\d{3}|20\d{2})" + + +def build_standard_citation_key( + authors: str = "", + year: str = "", + title: str = "", + fallback_text: str = "", +) -> str: + family_name = _family_name_stem(authors or fallback_text) + year_stem = re.sub(r"[^0-9]+", "", year)[:4] + topic_stem = _topic_stem(title or fallback_text) + key = f"{family_name}{year_stem}{topic_stem}" + return key or "reference" + + +def extract_draft_citation(raw_text: str, legacy_reference_number: str = "") -> DraftCitation | None: + extractor = _load_citegeist_extract() + if extractor is None: + return _fallback_citation(raw_text, legacy_reference_number) + + entries = extractor(raw_text) + if not entries: + return _fallback_citation(raw_text, legacy_reference_number) + + entry = entries[0] + fields = dict(entry.fields) + fields = _repair_reference_fields(raw_text, fields) + citation_key = build_standard_citation_key( + authors=str(fields.get("author", "")), + year=str(fields.get("year", "")), + title=str(fields.get("title", "")), + fallback_text=raw_text, + ) + note_parts = [fields.get("note", "").strip()] if fields.get("note") else [] + if legacy_reference_number: + note_parts.append(f"ecospecies_reference_number = {{{legacy_reference_number}}}") + fields["note"] = "; ".join(part for part in note_parts if part) + draft_bibtex = render_single_bibtex(entry.entry_type, citation_key, fields) + return DraftCitation( + citation_key=citation_key, + entry_type=entry.entry_type, + fields=fields, + draft_bibtex=draft_bibtex, 
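+        # Draft record: extractor fields repaired against the raw text, plus a deterministic citation key.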
+ ) + + +def _fallback_citation(raw_text: str, legacy_reference_number: str) -> DraftCitation: + year_match = re.search(rf"\b{HISTORICAL_YEAR_PATTERN}\b", raw_text) + year = year_match.group(0) if year_match else "" + fields = _repair_reference_fields( + raw_text, + { + "title": raw_text.strip(), + "year": year, + }, + ) + title = str(fields.get("title", "")).strip() or raw_text.strip() + citation_key = build_standard_citation_key(year=year, title=title, fallback_text=raw_text) + fields["note"] = f"raw_reference = {{{raw_text}}}" + if legacy_reference_number: + fields["note"] += f"; ecospecies_reference_number = {{{legacy_reference_number}}}" + draft_bibtex = render_single_bibtex("misc", citation_key, fields) + return DraftCitation( + citation_key=citation_key, + entry_type="misc", + fields=fields, + draft_bibtex=draft_bibtex, + ) + + +def _family_name_stem(raw_text: str) -> str: + compact = raw_text.strip() + if not compact: + return "ref" + if "," in compact: + compact = compact.split(",", 1)[0] + else: + compact = compact.split()[0] + compact = re.sub(r"[^A-Za-z0-9]+", "", compact).lower() + return compact or "ref" + + +def _topic_stem(raw_text: str) -> str: + tokens = [ + token + for token in re.findall(r"[A-Za-z0-9]+", raw_text.lower()) + if token not in STOPWORD_TOKENS and not token.isdigit() + ] + topic_tokens = tokens[:3] or ["topic"] + return "".join(topic_tokens) + + +def _repair_reference_fields(raw_text: str, fields: dict[str, str]) -> dict[str, str]: + repaired = dict(fields) + title = str(repaired.get("title", "")).strip() + raw = raw_text.strip() + if not raw: + return repaired + + parsed = _parse_report_style_reference(raw) + if parsed is None: + return repaired + + current_venue = ( + str(repaired.get("journal", "")).strip() + or str(repaired.get("howpublished", "")).strip() + or str(repaired.get("booktitle", "")).strip() + or str(repaired.get("publisher", "")).strip() + ) + parsed_venue = str(parsed.get("venue", "")).strip() + needs_structural_repair = bool( + parsed_venue + and ( + not current_venue + or len(current_venue) < max(8, len(parsed_venue) // 2) + or current_venue.lower() not in parsed_venue.lower() + or (parsed.get("volume") and not str(repaired.get("volume", "")).strip()) + or (parsed.get("number") and not str(repaired.get("number", "")).strip()) + or (parsed.get("pages") and not str(repaired.get("pages", "")).strip()) + ) + ) + if title and not _title_looks_like_raw_reference(title) and not needs_structural_repair: + return repaired + + if parsed.get("author"): + repaired["author"] = parsed["author"] + if parsed.get("year"): + repaired["year"] = parsed["year"] + if parsed.get("title"): + repaired["title"] = parsed["title"] + venue = parsed.get("venue", "") + if venue: + repaired.pop("howpublished", None) + if _venue_looks_journal_like(venue): + repaired["journal"] = venue + else: + repaired["howpublished"] = venue + if parsed.get("volume"): + repaired["volume"] = parsed["volume"] + if parsed.get("number"): + repaired["number"] = parsed["number"] + if parsed.get("pages"): + repaired["pages"] = parsed["pages"] + return repaired + + +def _title_looks_like_raw_reference(title: str) -> bool: + compact = " ".join(title.split()).strip() + if not compact: + return True + if len(compact) > 120: + return True + return bool(re.match(rf"^[^,]+,\s+.+\b{HISTORICAL_YEAR_PATTERN}\.\s+", compact)) + + +def _parse_report_style_reference(raw_text: str) -> dict[str, str] | None: + match = re.match( + rf"^(?P.+?)\s+(?P{HISTORICAL_YEAR_PATTERN})\.\s+(?P.+)$", + raw_text.strip(), 
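+        # Expected named groups: author block, four-digit year, then the remainder (title plus venue text).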
+ ) + if match is None: + return None + + author = match.group("author").strip(" .") + year = match.group("year").strip() + remainder = match.group("remainder").strip() + if not author or not remainder: + return None + + venue_start = _find_venue_start(remainder) + if venue_start is None: + return { + "author": author, + "year": year, + "title": remainder.strip(" ."), + "venue": "", + } + + title = remainder[:venue_start].strip(" .") + venue_part = remainder[venue_start:].strip(" .") + venue, volume, number, pages = _split_venue_and_locator(venue_part) + return { + "author": author, + "year": year, + "title": title, + "venue": venue, + "volume": volume, + "number": number, + "pages": pages, + } + + +def _split_venue_and_locator(venue_part: str) -> tuple[str, str, str, str]: + compact = venue_part.strip(" .") + if not compact: + return "", "", "", "" + + match = re.search( + r"(?P.+?)\.\s+(?P\d+)(?:\((?P[^)]+)\))?\s*:\s*(?P\d+(?:-\d+)?)\.?$", + compact, + ) + if match is None: + match = re.search( + r"(?P.+?)\s+(?P\d+)(?:\((?P[^)]+)\))?\s*:\s*(?P\d+(?:-\d+)?)\.?$", + compact, + ) + if match is None: + return compact, "", "", "" + + return ( + match.group("venue").strip(" ."), + (match.group("volume") or "").strip(), + (match.group("number") or "").strip(), + (match.group("pages") or "").strip(), + ) + + +def _find_venue_start(remainder: str) -> int | None: + for match in re.finditer(r"\.\s+", remainder): + candidate_start = match.end() + candidate = remainder[candidate_start:].strip() + if _looks_like_publication_segment(candidate): + return candidate_start + + lowered = remainder.lower() + markers = ( + "comm. rept.", + "rept.", + "proc.", + "procs.", + "journal", + "transactions", + "proceedings", + "bulletin", + "bull.", + "occas. pap.", + "pap.", + "memoir", + "memorandum", + "memo.", + "tech. memo.", + "tech memo", + "technical memorandum", + "technical report", + "noaa", + ) + positions = [lowered.find(marker) for marker in markers if lowered.find(marker) > 0] + if positions: + return min(positions) + return None + + +def _looks_like_publication_segment(candidate: str) -> bool: + compact = candidate.strip(" .") + if not compact: + return False + + venue, volume, number, pages = _split_venue_and_locator(compact) + if venue and (volume or number or pages) and _starts_with_publication_marker(compact): + return True + + return _starts_with_publication_marker(compact) + + +def _starts_with_publication_marker(text: str) -> bool: + lowered = text.lower() + publication_starts = ( + "comm. rept.", + "rept.", + "proc.", + "procs.", + "journal", + "transactions", + "proceedings", + "bulletin", + "bull.", + "occas. pap.", + "pap.", + "memoir", + "memorandum", + "memo.", + "tech. memo.", + "tech memo", + "technical memorandum", + "technical report", + "noaa", + "u.s.", + ) + return lowered.startswith(publication_starts) + + +def _venue_looks_journal_like(venue: str) -> bool: + lowered = venue.lower() + return any( + token in lowered + for token in ( + "journal", + "transactions", + "review", + "letters", + "comm. rept.", + "rept.", + "proc.", + "proceedings", + "occas. 
pap.", + "pap.", + ) + ) + + +def render_single_bibtex(entry_type: str, citation_key: str, fields: dict[str, str]) -> str: + lines = [f"@{entry_type}{{{citation_key},"] + for key in sorted(fields): + value = _sanitize_bibtex_value(fields[key]) + lines.append(f" {key} = {{{value}}},") + lines.append("}") + return "\n".join(lines) + + +def _sanitize_bibtex_value(value: str) -> str: + depth = 0 + parts: list[str] = [] + for char in value: + if char == "{": + depth += 1 + parts.append(char) + continue + if char == "}": + if depth == 0: + parts.append(")") + else: + depth -= 1 + parts.append(char) + continue + parts.append(char) + if depth > 0: + open_count = depth + normalized: list[str] = [] + for char in parts: + if char == "{" and open_count > 0: + normalized.append("(") + open_count -= 1 + else: + normalized.append(char) + return "".join(normalized) + return "".join(parts) diff --git a/apps/api/src/ecospecies_api/document_format.py b/apps/api/src/ecospecies_api/document_format.py new file mode 100644 index 0000000..d1b2cf3 --- /dev/null +++ b/apps/api/src/ecospecies_api/document_format.py @@ -0,0 +1,480 @@ +from __future__ import annotations + +import json +import re +from dataclasses import asdict, dataclass, field + + +HEADING_PATTERN = re.compile(r"^(#{2,6})\s+(?P.+?)\s*$") +INDENTED_ITEM_PATTERN = re.compile(r"^\s*-\s*(?P<body>.+?)\s*$") +DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Za-z0-9]+\b") + + +@dataclass +class DocumentNode: + node_type: str + title: str + body: str + depth: int + children: list["DocumentNode"] = field(default_factory=list) + + +@dataclass +class StructuredDocument: + metadata: dict[str, object] + nodes: list[DocumentNode] + + +def _parse_scalar_value(value: str) -> object: + stripped = value.strip() + if not stripped: + return "" + if stripped.lower() == "true": + return True + if stripped.lower() == "false": + return False + if stripped.startswith("{") or stripped.startswith("["): + try: + return json.loads(stripped) + except json.JSONDecodeError: + return stripped + return stripped + + +def _normalize_whitespace(value: str) -> str: + return re.sub(r"\s+", " ", value).strip() + + +def _parse_front_matter(front_matter: str) -> dict[str, object]: + metadata: dict[str, object] = {} + lines = front_matter.splitlines() + index = 0 + + while index < len(lines): + raw_line = lines[index] + if not raw_line.strip() or raw_line.lstrip().startswith("#"): + index += 1 + continue + if ":" not in raw_line: + index += 1 + continue + + key, value = raw_line.split(":", 1) + normalized_key = key.strip() + stripped_value = value.strip() + if stripped_value: + metadata[normalized_key] = _parse_scalar_value(stripped_value) + index += 1 + continue + + items: list[dict[str, object]] = [] + index += 1 + while index < len(lines): + item_line = lines[index] + if not item_line.strip(): + index += 1 + continue + if not item_line.startswith(" - "): + break + + match = INDENTED_ITEM_PATTERN.match(item_line) + if not match: + break + item: dict[str, object] = {} + first_body = match.group("body") + if ":" in first_body: + item_key, item_value = first_body.split(":", 1) + item[item_key.strip()] = _parse_scalar_value(item_value.strip()) + index += 1 + + while index < len(lines): + nested_line = lines[index] + if nested_line.startswith(" ") and ":" in nested_line.strip(): + nested_key, nested_value = nested_line.strip().split(":", 1) + item[nested_key.strip()] = _parse_scalar_value(nested_value.strip()) + index += 1 + continue + break + + items.append(item) + + metadata[normalized_key] 
= items + + return metadata + + +def _split_front_matter(text: str) -> tuple[dict[str, object], str]: + stripped = text.lstrip() + if not stripped.startswith("---\n"): + return {}, text + + _, _, remainder = stripped.partition("---\n") + front_matter, separator, body = remainder.partition("\n---\n") + if not separator: + return {}, text + + return _parse_front_matter(front_matter), body + + +def parse_markdown_document(text: str) -> StructuredDocument: + metadata, body = _split_front_matter(text) + root_nodes: list[DocumentNode] = [] + stack: list[DocumentNode] = [] + body_lines: list[str] = [] + + def flush_body() -> None: + if not stack: + body_lines.clear() + return + stack[-1].body = "\n".join(body_lines).strip() + body_lines.clear() + + for raw_line in body.splitlines(): + match = HEADING_PATTERN.match(raw_line) + if not match: + body_lines.append(raw_line) + continue + + flush_body() + depth = len(match.group(1)) + node = DocumentNode( + node_type="section", + title=match.group("title").strip(), + body="", + depth=depth, + ) + + while stack and stack[-1].depth >= depth: + stack.pop() + + if stack: + stack[-1].children.append(node) + else: + root_nodes.append(node) + stack.append(node) + + flush_body() + return StructuredDocument(metadata=metadata, nodes=root_nodes) + + +def validate_markdown_document(text: str) -> list[str]: + errors: list[str] = [] + metadata, body = _split_front_matter(text) + if not metadata: + errors.append("Markdown document must include YAML front matter.") + + last_depth: int | None = None + for raw_line in body.splitlines(): + match = HEADING_PATTERN.match(raw_line) + if not match: + continue + depth = len(match.group(1)) + if last_depth is not None and depth > last_depth + 1: + errors.append( + f"Heading depth jumps from level {last_depth} to level {depth}: {match.group('title').strip()}" + ) + last_depth = depth + + return errors + + +def _append_metadata_lines(lines: list[str], key: str, value: object) -> None: + if isinstance(value, list): + lines.append(f"{key}:") + for item in value: + if isinstance(item, dict) and item: + first = True + for item_key, item_value in item.items(): + rendered = "true" if item_value is True else "false" if item_value is False else str(item_value) + prefix = " - " if first else " " + lines.append(f"{prefix}{item_key}: {rendered}") + first = False + else: + lines.append(f" - {item}") + return + + rendered = "true" if value is True else "false" if value is False else str(value) + lines.append(f"{key}: {rendered}") + + +def export_markdown_document(document: StructuredDocument) -> str: + lines: list[str] = ["---"] + for key, value in document.metadata.items(): + _append_metadata_lines(lines, key, value) + lines.append("---") + lines.append("") + + def append_nodes(nodes: list[DocumentNode]) -> None: + for node in nodes: + lines.append(f"{'#' * node.depth} {node.title}") + if node.body: + lines.append(node.body) + lines.append("") + append_nodes(node.children) + + append_nodes(document.nodes) + return "\n".join(lines).rstrip() + "\n" + + +def flatten_document_nodes(document: StructuredDocument) -> list[dict[str, object]]: + flattened: list[dict[str, object]] = [] + + def visit(nodes: list[DocumentNode], parent_id: str | None) -> None: + for index, node in enumerate(nodes, start=1): + node_id = f"node-{len(flattened) + 1}" + flattened.append( + { + "node_id": node_id, + "parent_id": parent_id, + "position": index, + "depth": node.depth, + "node_type": node.node_type, + "title": node.title, + "body_markdown": node.body, + 
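+                    # body_plaintext mirrors the markdown body for now; no plaintext stripping is applied yet.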
"body_plaintext": node.body, + } + ) + visit(node.children, node_id) + + visit(document.nodes, None) + return flattened + + +def document_to_json(document: StructuredDocument) -> str: + return json.dumps(asdict(document), ensure_ascii=True) + + +def build_document_from_species_payload(item: dict[str, object]) -> StructuredDocument: + legacy_identifiers: list[dict[str, object]] = [] + if item.get("flelmr_code"): + legacy_identifiers.append( + { + "authority": "legacy-ecospecies", + "identifier": str(item.get("flelmr_code", "")), + "label": "FLELMR", + } + ) + + metadata = { + "title": str(item.get("title", "")), + "common_name": str(item.get("common_name", "")), + "scientific_name": str(item.get("scientific_name", "")), + "legacy_identifiers": legacy_identifiers, + "taxon_identifiers": list(item.get("taxon_identifiers", [])), + "primary_taxon_authority": str(item.get("primary_taxon_authority", "")), + "source_file": str(item.get("source_file", "")), + "publication_status": str(item.get("publication_status", "published")), + "source_format": "ecospecies-markdown-v1", + } + + nodes: list[DocumentNode] = [] + summary = str(item.get("summary", "")).strip() + if summary: + nodes.append( + DocumentNode( + node_type="section", + title="Summary", + body=summary, + depth=2, + ) + ) + + for section in item.get("sections", []): + heading = str(section.get("heading", "")).strip() + if not heading or heading == "HEADER": + continue + nodes.append( + DocumentNode( + node_type="section", + title=heading, + body=str(section.get("content", "")).strip(), + depth=2, + ) + ) + + return StructuredDocument(metadata=metadata, nodes=nodes) + + +def extract_species_projection(document: StructuredDocument) -> dict[str, object]: + metadata = document.metadata + summary = "" + sections: list[dict[str, object]] = [] + legacy_identifiers = metadata.get("legacy_identifiers", []) + taxon_identifiers = metadata.get("taxon_identifiers", []) + + flelmr_code = "" + if isinstance(legacy_identifiers, list): + for item in legacy_identifiers: + if not isinstance(item, dict): + continue + authority = str(item.get("authority", "")).strip().lower() + label = str(item.get("label", "")).strip().lower() + if authority == "legacy-ecospecies" or label == "flelmr": + flelmr_code = str(item.get("identifier", "")).strip() + if flelmr_code: + break + if not flelmr_code: + flelmr_code = str(metadata.get("species_code", "")).strip() + + def visit(nodes: list[DocumentNode], path: list[str]) -> None: + nonlocal summary + for node in nodes: + current_path = [*path, node.title] + if node.title.lower() == "summary" and not summary: + summary = node.body.strip() + else: + sections.append( + { + "heading": " / ".join(current_path), + "content": node.body.strip(), + } + ) + visit(node.children, current_path) + + visit(document.nodes, []) + return { + "title": metadata.get("title", ""), + "common_name": metadata.get("common_name", ""), + "scientific_name": metadata.get("scientific_name", ""), + "flelmr_code": flelmr_code, + "legacy_identifiers": legacy_identifiers if isinstance(legacy_identifiers, list) else [], + "taxon_identifiers": taxon_identifiers if isinstance(taxon_identifiers, list) else [], + "primary_taxon_authority": str(metadata.get("primary_taxon_authority", "")), + "summary": summary, + "sections": sections, + } + + +def _is_citation_heading(title: str) -> bool: + lowered = title.strip().rstrip(":").lower() + return lowered in { + "references", + "reference", + "citations", + "citation", + "bibliography", + "related references", + 
"related citations", + } + + +def _split_citation_lines(body: str) -> list[str]: + entries: list[dict[str, str]] = [] + current: list[str] = [] + current_number = "" + + def flush() -> None: + nonlocal current_number + if not current: + return + compact = " ".join(part.strip() for part in current if part.strip()).strip() + if compact: + entries.append( + { + "legacy_reference_number": current_number, + "raw_text": compact, + } + ) + current.clear() + current_number = "" + + for raw_line in body.splitlines(): + stripped = raw_line.strip() + if not stripped: + flush() + continue + + leading_number_match = re.match(r"^(?P<num>\d+)\s*,\s*(?P<text>.+)$", stripped) + if leading_number_match: + flush() + current_number = leading_number_match.group("num") + current.append(leading_number_match.group("text")) + continue + + bare_number_match = re.match(r"^(?P<num>\d+)\s+(?P<text>[A-Z].+)$", stripped) + if bare_number_match: + flush() + current_number = bare_number_match.group("num") + current.append(bare_number_match.group("text")) + continue + + bullet_match = re.match( + r"^(?:[-*]|\[(?P<bracket_num>\d+)\]|(?P<plain_num>\d+)[\.,])\s+(?P<text>.+)$", + stripped, + ) + if bullet_match: + flush() + current_number = bullet_match.group("bracket_num") or bullet_match.group("plain_num") or "" + bullet_text = bullet_match.group("text") + if not current_number: + nested_number_match = re.match(r"^\[(?P<num>\d+)\]\s+(?P<text>.+)$", bullet_text) + if nested_number_match: + current_number = nested_number_match.group("num") + bullet_text = nested_number_match.group("text") + else: + nested_comma_match = re.match(r"^(?P<num>\d+)\s*,\s*(?P<text>.+)$", bullet_text) + if nested_comma_match: + current_number = nested_comma_match.group("num") + bullet_text = nested_comma_match.group("text") + current.append(bullet_text) + continue + + current.append(stripped) + + flush() + return entries + + +def extract_citation_entries(document: StructuredDocument) -> list[dict[str, object]]: + entries: list[dict[str, object]] = [] + + def visit(nodes: list[DocumentNode], path: list[str]) -> None: + for node in nodes: + current_path = [*path, node.title] + if _is_citation_heading(node.title): + section_heading = " / ".join(current_path) + for item in _split_citation_lines(node.body): + raw_text = item["raw_text"] + doi_match = DOI_PATTERN.search(raw_text) + entries.append( + { + "section_heading": section_heading, + "legacy_reference_number": item["legacy_reference_number"], + "raw_text": raw_text, + "normalized_text": _normalize_whitespace(raw_text), + "doi": doi_match.group(0) if doi_match else "", + } + ) + visit(node.children, current_path) + + visit(document.nodes, []) + return entries + + +def add_citation_to_document( + document: StructuredDocument, + citation_text: str, + heading_title: str = "Related References", +) -> bool: + normalized_citation = _normalize_whitespace(citation_text) + if not normalized_citation: + return False + + for node in document.nodes: + if _is_citation_heading(node.title): + existing = {_normalize_whitespace(item["raw_text"]) for item in _split_citation_lines(node.body)} + if normalized_citation in existing: + return False + body = node.body.rstrip() + node.body = f"{body}\n- {citation_text}".strip() if body else f"- {citation_text}" + return True + + document.nodes.append( + DocumentNode( + node_type="section", + title=heading_title, + body=f"- {citation_text}", + depth=2, + ) + ) + return True diff --git a/apps/api/src/ecospecies_api/document_repository.py 
b/apps/api/src/ecospecies_api/document_repository.py new file mode 100644 index 0000000..e4ebaeb --- /dev/null +++ b/apps/api/src/ecospecies_api/document_repository.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +from sqlalchemy import select + +from ecospecies_api.citegeist_bridge import extract_draft_citation +from ecospecies_api.document_format import ( + build_document_from_species_payload, + document_to_json, + extract_citation_entries, + extract_species_projection, + export_markdown_document, + flatten_document_nodes, + parse_markdown_document, + validate_markdown_document, +) +from ecospecies_api.models import ( + DocumentSection, + Species, + SpeciesCitation, + SpeciesDocument, + SpeciesDocumentNode, + SpeciesTaxonIdentifier, +) + + +def _persist_taxon_identifiers(session, species: Species, taxon_identifiers: list[dict[str, object]]) -> None: + for identifier in list(species.taxon_identifiers): + session.delete(identifier) + session.flush() + + for position, item in enumerate(taxon_identifiers, start=1): + authority = str(item.get("authority", "")).strip() + identifier = str(item.get("identifier", "")).strip() + if not authority or not identifier: + continue + session.add( + SpeciesTaxonIdentifier( + species_id=species.id, + position=position, + authority=authority, + identifier=identifier, + label=str(item.get("label", "")).strip(), + is_primary=bool(item.get("primary") or item.get("is_primary")), + source_url=str(item.get("source_url", "")).strip(), + ) + ) + + +def _existing_taxon_identifier_payload(species: Species) -> list[dict[str, object]]: + return [ + { + "authority": item.authority, + "identifier": item.identifier, + "label": item.label, + "primary": item.is_primary, + "source_url": item.source_url, + } + for item in species.taxon_identifiers + ] + + +def _citation_match_key(item: dict[str, object]) -> tuple[str, str, str]: + return ( + str(item.get("section_heading", "")).strip(), + str(item.get("legacy_reference_number", "")).strip(), + str(item.get("raw_text", "")).strip(), + ) + + +def _persist_citations(session, species: Species, citations: list[dict[str, object]]) -> None: + existing_by_key = { + _citation_match_key( + { + "section_heading": citation.section_heading, + "legacy_reference_number": citation.legacy_reference_number, + "raw_text": citation.raw_text, + } + ): citation + for citation in species.citations + } + retained_ids: set[int] = set() + + for position, item in enumerate(citations, start=1): + raw_text = str(item.get("raw_text", "")).strip() + if not raw_text: + continue + key = _citation_match_key(item) + legacy_reference_number = str(item.get("legacy_reference_number", "")).strip() + existing = existing_by_key.get(key) + extracted_normalized = str(item.get("normalized_text", "")).strip() + extracted_doi = str(item.get("doi", "")).strip() + draft = extract_draft_citation(raw_text, legacy_reference_number) + + if existing is None: + session.add( + SpeciesCitation( + species_id=species.id, + position=position, + section_heading=str(item.get("section_heading", "")).strip(), + legacy_reference_number=legacy_reference_number, + citation_key=draft.citation_key if draft is not None else "", + entry_type=draft.entry_type if draft is not None else "misc", + raw_text=raw_text, + normalized_text=extracted_normalized, + abstract_text="", + draft_bibtex=draft.draft_bibtex if draft is not None else "", + doi=extracted_doi, + source_url="", + openalex_id="", + resolver_source_label="", + enrichment_status="pending", + enrichment_error="", + 
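+                    # New extract rows start as pending; the citation-enrichment backfill resolves them later.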
source_type="document_extract", + review_status="draft", + ) + ) + continue + + existing.position = position + existing.section_heading = str(item.get("section_heading", "")).strip() + existing.legacy_reference_number = legacy_reference_number + existing.raw_text = raw_text + if existing.review_status == "draft": + existing.normalized_text = extracted_normalized + existing.abstract_text = "" + existing.doi = extracted_doi + existing.citation_key = draft.citation_key if draft is not None else "" + existing.entry_type = draft.entry_type if draft is not None else "misc" + existing.draft_bibtex = draft.draft_bibtex if draft is not None else "" + existing.source_type = "document_extract" + existing.enrichment_status = "pending" + existing.enrichment_error = "" + existing.resolver_source_label = "" + existing.source_url = "" + existing.openalex_id = "" + retained_ids.add(existing.id) + session.add(existing) + + for citation in list(species.citations): + if citation.id not in retained_ids and citation.source_type in {"document_extract", "editor_review"}: + session.delete(citation) + + +def _persist_document_model(session, species: Species, document_model, markdown_content: str, updated_by: str) -> None: + ast_json = document_to_json(document_model) + document = session.scalar( + select(SpeciesDocument).where(SpeciesDocument.species_id == species.id) + ) + if document is None: + document = SpeciesDocument( + species_id=species.id, + source_format="ecospecies-markdown-v1", + markdown_content=markdown_content, + ast_json=ast_json, + updated_by=updated_by, + ) + session.add(document) + session.flush() + else: + document.source_format = "ecospecies-markdown-v1" + document.markdown_content = markdown_content + document.ast_json = ast_json + document.updated_by = updated_by + session.add(document) + + for node in list(document.nodes): + session.delete(node) + session.flush() + + for node in flatten_document_nodes(document_model): + session.add( + SpeciesDocumentNode( + document_id=document.id, + parent_node_ref=node["parent_id"], + node_ref=node["node_id"], + position=node["position"], + depth=node["depth"], + node_type=node["node_type"], + title=node["title"], + body_markdown=node["body_markdown"], + body_plaintext=node["body_plaintext"], + ) + ) + + +def sync_species_document(session, species: Species, item: dict[str, object]) -> None: + payload = dict(item) + if "taxon_identifiers" not in payload or not payload.get("taxon_identifiers"): + payload["taxon_identifiers"] = _existing_taxon_identifier_payload(species) + if "primary_taxon_authority" not in payload or not payload.get("primary_taxon_authority"): + for identifier in payload["taxon_identifiers"]: + if bool(identifier.get("primary")): + payload["primary_taxon_authority"] = str(identifier.get("authority", "")).strip() + break + + document_model = build_document_from_species_payload(payload) + markdown_content = export_markdown_document(document_model) + _persist_document_model( + session, + species, + document_model, + markdown_content, + str(item.get("last_modified_by", "system-import")), + ) + _persist_citations(session, species, extract_citation_entries(document_model)) + + +def get_species_document_payload(session, slug: str) -> dict[str, object] | None: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None or species.document is None: + return None + + document = species.document + return { + "slug": species.slug, + "source_format": document.source_format, + "markdown": document.markdown_content, + 
"ast_json": document.ast_json, + "node_count": len(document.nodes), + "updated_by": document.updated_by, + } + + +def save_species_document(session, species: Species, markdown: str, username: str) -> dict[str, object]: + errors = validate_markdown_document(markdown) + if errors: + raise ValueError("; ".join(errors)) + + document_model = parse_markdown_document(markdown) + projection = extract_species_projection(document_model) + _persist_document_model(session, species, document_model, markdown, username) + _persist_citations(session, species, extract_citation_entries(document_model)) + + if projection["title"]: + species.title = str(projection["title"]) + if projection["common_name"]: + species.common_name = str(projection["common_name"]) + if projection["scientific_name"]: + species.scientific_name = str(projection["scientific_name"]) + if projection["flelmr_code"]: + species.flelmr_code = str(projection["flelmr_code"]) + _persist_taxon_identifiers(session, species, list(projection["taxon_identifiers"])) + species.summary = str(projection["summary"]) + species.section_count = len(projection["sections"]) + species.last_modified_by = username + + for section in list(species.sections): + session.delete(section) + session.flush() + + for position, section in enumerate(projection["sections"], start=1): + session.add( + DocumentSection( + species_id=species.id, + position=position, + heading=str(section["heading"]), + content=str(section["content"]), + ) + ) + + return { + "slug": species.slug, + "summary": species.summary, + "section_count": species.section_count, + "markdown": markdown, + "updated_by": username, + } diff --git a/apps/api/src/ecospecies_api/models.py b/apps/api/src/ecospecies_api/models.py index 89248cd..238c43e 100644 --- a/apps/api/src/ecospecies_api/models.py +++ b/apps/api/src/ecospecies_api/models.py @@ -23,6 +23,9 @@ class Species(Base): publication_status: Mapped[str] = mapped_column(String(32), default="published", index=True) is_archived: Mapped[bool] = mapped_column(Boolean, default=False, index=True) editor_notes: Mapped[str] = mapped_column(Text, default="") + created_by: Mapped[str] = mapped_column(String(255), default="system-import") + owner_username: Mapped[str] = mapped_column(String(255), default="") + owner_role: Mapped[str] = mapped_column(String(32), default="") last_modified_by: Mapped[str] = mapped_column(String(255), default="system-import") sections: Mapped[list["DocumentSection"]] = relationship( @@ -40,6 +43,21 @@ class Species(Base): cascade="all, delete-orphan", order_by="SpeciesAuditLog.id.desc()", ) + document: Mapped["SpeciesDocument | None"] = relationship( + back_populates="species", + cascade="all, delete-orphan", + uselist=False, + ) + taxon_identifiers: Mapped[list["SpeciesTaxonIdentifier"]] = relationship( + back_populates="species", + cascade="all, delete-orphan", + order_by="SpeciesTaxonIdentifier.position", + ) + citations: Mapped[list["SpeciesCitation"]] = relationship( + back_populates="species", + cascade="all, delete-orphan", + order_by="SpeciesCitation.position", + ) class DocumentSection(Base): @@ -77,3 +95,93 @@ class SpeciesAuditLog(Base): details_json: Mapped[str] = mapped_column(Text) species: Mapped[Species] = relationship(back_populates="audit_entries") + + +class SpeciesDocument(Base): + __tablename__ = "species_document" + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), unique=True, index=True) + source_format: Mapped[str] 
= mapped_column(String(64), default="ecospecies-markdown-v1") + markdown_content: Mapped[str] = mapped_column(Text, default="") + ast_json: Mapped[str] = mapped_column(Text, default="") + updated_by: Mapped[str] = mapped_column(String(255), default="system-import") + + species: Mapped[Species] = relationship(back_populates="document") + nodes: Mapped[list["SpeciesDocumentNode"]] = relationship( + back_populates="document", + cascade="all, delete-orphan", + order_by="SpeciesDocumentNode.position", + ) + + +class SpeciesDocumentNode(Base): + __tablename__ = "species_document_node" + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + document_id: Mapped[int] = mapped_column(ForeignKey("species_document.id", ondelete="CASCADE"), index=True) + parent_node_ref: Mapped[str | None] = mapped_column(String(64), nullable=True, default=None) + node_ref: Mapped[str] = mapped_column(String(64), index=True) + position: Mapped[int] = mapped_column(Integer, default=1) + depth: Mapped[int] = mapped_column(Integer, default=2) + node_type: Mapped[str] = mapped_column(String(32), default="section") + title: Mapped[str] = mapped_column(String(255), default="") + body_markdown: Mapped[str] = mapped_column(Text, default="") + body_plaintext: Mapped[str] = mapped_column(Text, default="") + source_heading: Mapped[str] = mapped_column(String(255), default="") + source_span_start: Mapped[int | None] = mapped_column(Integer, nullable=True, default=None) + source_span_end: Mapped[int | None] = mapped_column(Integer, nullable=True, default=None) + + document: Mapped[SpeciesDocument] = relationship(back_populates="nodes") + + +class ContributorAccount(Base): + __tablename__ = "contributor_account" + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + email: Mapped[str] = mapped_column(String(255), unique=True, index=True) + token_hash: Mapped[str] = mapped_column(String(128), unique=True, index=True) + age_gate_confirmed: Mapped[bool] = mapped_column(Boolean, default=False) + created_at: Mapped[str] = mapped_column(String(64), index=True) + is_active: Mapped[bool] = mapped_column(Boolean, default=True, index=True) + + +class SpeciesTaxonIdentifier(Base): + __tablename__ = "species_taxon_identifier" + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), index=True) + position: Mapped[int] = mapped_column(Integer, default=1) + authority: Mapped[str] = mapped_column(String(64), default="") + identifier: Mapped[str] = mapped_column(String(255), default="") + label: Mapped[str] = mapped_column(String(128), default="") + is_primary: Mapped[bool] = mapped_column(Boolean, default=False, index=True) + source_url: Mapped[str] = mapped_column(String(500), default="") + + species: Mapped[Species] = relationship(back_populates="taxon_identifiers") + + +class SpeciesCitation(Base): + __tablename__ = "species_citation" + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), index=True) + position: Mapped[int] = mapped_column(Integer, default=1) + section_heading: Mapped[str] = mapped_column(String(255), default="") + legacy_reference_number: Mapped[str] = mapped_column(String(64), default="", index=True) + citation_key: Mapped[str] = mapped_column(String(255), default="", index=True) + entry_type: Mapped[str] = mapped_column(String(64), default="misc") + raw_text: Mapped[str] = mapped_column(Text, default="") + 
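+    # raw_text preserves the legacy citation string; the fields below are derived or enriched from it.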
normalized_text: Mapped[str] = mapped_column(Text, default="") + abstract_text: Mapped[str] = mapped_column(Text, default="") + draft_bibtex: Mapped[str] = mapped_column(Text, default="") + doi: Mapped[str] = mapped_column(String(255), default="", index=True) + source_url: Mapped[str] = mapped_column(String(500), default="") + openalex_id: Mapped[str] = mapped_column(String(64), default="", index=True) + resolver_source_label: Mapped[str] = mapped_column(String(255), default="") + enrichment_status: Mapped[str] = mapped_column(String(32), default="pending", index=True) + enrichment_error: Mapped[str] = mapped_column(Text, default="") + source_type: Mapped[str] = mapped_column(String(64), default="document_extract") + review_status: Mapped[str] = mapped_column(String(32), default="draft", index=True) + + species: Mapped[Species] = relationship(back_populates="citations") diff --git a/apps/api/src/ecospecies_api/parser.py b/apps/api/src/ecospecies_api/parser.py index 1a02e42..727ef7c 100644 --- a/apps/api/src/ecospecies_api/parser.py +++ b/apps/api/src/ecospecies_api/parser.py @@ -1,14 +1,18 @@ from __future__ import annotations +import hashlib import os import re +from collections import Counter from dataclasses import dataclass from pathlib import Path SECTION_PATTERN = re.compile(r"^[A-Z][A-Z\s/&()-]{2,}$") +TITLE_SECTION_PATTERN = re.compile(r"^[A-Z][A-Za-z\s/&()-]{2,}$") FIELD_PATTERN = re.compile(r"^(?P<key>[A-Za-z/ _-]+):\s*(?P<value>.*)$") SUMMARY_MARKER_PATTERN = re.compile(r"^(summary(?:/abstract)?|abstract|executive summary):?\s*$", re.IGNORECASE) +SAFE_DIRECTORY_NAME_PATTERN = re.compile(r"^[A-Za-z0-9_-]+$") @dataclass @@ -38,6 +42,10 @@ class SpeciesRecord: diagnostics: list[IngestDiagnostic] +def get_repo_root() -> Path: + return Path(__file__).resolve().parents[4] + + def slugify(value: str) -> str: cleaned = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-") return cleaned or "unknown-species" @@ -53,6 +61,33 @@ def normalize_whitespace(value: str) -> str: return re.sub(r"\s+", " ", value).strip() +def is_section_heading(line: str) -> bool: + stripped = line.strip() + if not stripped: + return False + normalized = stripped[:-1].strip() if stripped.endswith(":") else stripped + if not normalized: + return False + if ":" in normalized: + return False + if SECTION_PATTERN.fullmatch(normalized): + return True + if not TITLE_SECTION_PATTERN.fullmatch(normalized): + return False + + words = normalized.split() + if len(words) > 4: + return False + return all(word[0].isupper() for word in words if word and word[0].isalpha()) + + +def normalize_heading(line: str) -> str: + stripped = line.strip() + if stripped.endswith(":"): + return stripped[:-1].strip() + return stripped + + def split_sections(lines: list[str]) -> list[Section]: sections: list[Section] = [] current_heading = "HEADER" @@ -61,7 +96,7 @@ def split_sections(lines: list[str]) -> list[Section]: for raw_line in lines: line = raw_line.rstrip() stripped = line.strip() - if SECTION_PATTERN.fullmatch(stripped): + if is_section_heading(stripped): if current_lines: sections.append( Section( @@ -69,7 +104,7 @@ def split_sections(lines: list[str]) -> list[Section]: content="\n".join(current_lines).strip(), ) ) - current_heading = stripped + current_heading = normalize_heading(stripped) current_lines = [] continue current_lines.append(line) @@ -96,8 +131,9 @@ def extract_metadata(lines: list[str]) -> dict[str, str]: value = match.group("value").strip() metadata[key] = value - # Legacy files vary between "FLELMR", "FLELMR Code", 
and similar labels. - if key.startswith("flelmr"): + # Legacy files vary between "FLELMR", "FLELMR Code", "EcoSpecies Code", + # and similar labels. + if key.startswith("flelmr") or key == "ecospecies code": metadata["flelmr"] = value return metadata @@ -127,7 +163,7 @@ def extract_summary(lines: list[str], sections: list[Section]) -> str: if summary_lines: summary_lines.append("") continue - if SECTION_PATTERN.fullmatch(stripped): + if is_section_heading(stripped): break if stripped.startswith("[") and not summary_lines: break @@ -202,23 +238,76 @@ def parse_species_file(path: Path) -> SpeciesRecord: ) +def ensure_unique_record_slugs(records: list[SpeciesRecord]) -> list[SpeciesRecord]: + slug_counts = Counter(record.slug for record in records) + used_slugs: set[str] = set() + + for record in records: + base_slug = record.slug + if slug_counts[base_slug] == 1 and base_slug not in used_slugs: + used_slugs.add(base_slug) + continue + + disambiguator = slugify(Path(record.source_file).stem) + if disambiguator == base_slug: + disambiguator = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:8] + + candidate = f"{base_slug}-{disambiguator}" + if candidate in used_slugs: + source_hash = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:8] + candidate = f"{candidate}-{source_hash}" + + suffix = 2 + while candidate in used_slugs: + candidate = f"{base_slug}-{disambiguator}-{suffix}" + suffix += 1 + + record.slug = candidate + used_slugs.add(candidate) + + return records + + def load_species_records(data_dir: str) -> list[SpeciesRecord]: - base = Path(data_dir) + base = resolve_data_dir(data_dir) if not base.exists(): return [] records: list[SpeciesRecord] = [] for path in sorted(base.glob("*.txt")): records.append(parse_species_file(path)) - return records + return ensure_unique_record_slugs(records) + + +def resolve_data_dir(data_dir: str) -> Path: + repo_root = get_repo_root().resolve() + raw_value = data_dir.strip() + if not raw_value: + raise ValueError("Species data directory cannot be empty.") + + candidate = Path(raw_value) + if candidate.is_absolute(): + resolved = candidate.resolve() + else: + resolved = (repo_root / candidate).resolve() + + try: + relative = resolved.relative_to(repo_root) + except ValueError as exc: + raise ValueError("Species data directory must stay within the codebase directory.") from exc + + if not relative.parts: + raise ValueError("Species data directory must be a subdirectory of the codebase.") + + for part in relative.parts: + if not SAFE_DIRECTORY_NAME_PATTERN.fullmatch(part): + raise ValueError( + f"Species data directory contains an unsafe directory name: {part!r}." 
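+                # Path segments must match [A-Za-z0-9_-]+ so lookups cannot escape the repo tree.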
+ ) + + return resolved def get_default_data_dir() -> str: - return os.environ.get( - "ECOSPECIES_DATA_DIR", - str( - Path(__file__).resolve().parents[4].parent - / "01-legacy-code-and-data" - / "InputFiles - TXT" - ), - ) + configured = os.environ.get("ECOSPECIES_DATA_DIR", "input-data/InputFiles") + return str(resolve_data_dir(configured)) diff --git a/apps/api/src/ecospecies_api/repository.py b/apps/api/src/ecospecies_api/repository.py index 1be8d43..42159d5 100644 --- a/apps/api/src/ecospecies_api/repository.py +++ b/apps/api/src/ecospecies_api/repository.py @@ -3,15 +3,45 @@ from __future__ import annotations from collections import Counter from datetime import datetime, timezone import json +import hashlib +import os +from pathlib import Path +import re +import secrets from sqlalchemy import inspect, select, text from sqlalchemy.exc import SQLAlchemyError +from ecospecies_api.citation_enrichment import ( + apply_citation_candidate_selection, + discover_citation_candidates, + enrich_citation_payload, +) +from ecospecies_api.document_format import extract_species_projection, parse_markdown_document +from ecospecies_api.document_format import add_citation_to_document, export_markdown_document +from ecospecies_api.document_repository import ( + get_species_document_payload, + save_species_document, + sync_species_document, +) from ecospecies_api.db import SessionLocal, create_db_engine -from ecospecies_api.models import Base, DocumentSection, IngestDiagnosticRecord, Species, SpeciesAuditLog +from ecospecies_api.models import ( + Base, + ContributorAccount, + DocumentSection, + IngestDiagnosticRecord, + Species, + SpeciesAuditLog, + SpeciesCitation, + SpeciesTaxonIdentifier, +) +from ecospecies_api.parser import get_default_data_dir, slugify WORKFLOW_STATUSES = {"draft", "review", "published"} +CITATION_REVIEW_STATUSES = {"draft", "reviewed", "accepted", "rejected"} SYSTEM_IMPORT_USER = "system-import" +CONTRIBUTOR_SUBMISSION_PREFIX = "contributor-submission" +EMAIL_PATTERN = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$") def ensure_schema() -> None: @@ -26,6 +56,12 @@ def ensure_schema() -> None: statements.append("ALTER TABLE species ADD COLUMN is_archived BOOLEAN DEFAULT FALSE") if "editor_notes" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN editor_notes TEXT DEFAULT ''") + if "created_by" not in species_columns: + statements.append("ALTER TABLE species ADD COLUMN created_by VARCHAR(255) DEFAULT 'system-import'") + if "owner_username" not in species_columns: + statements.append("ALTER TABLE species ADD COLUMN owner_username VARCHAR(255) DEFAULT ''") + if "owner_role" not in species_columns: + statements.append("ALTER TABLE species ADD COLUMN owner_role VARCHAR(32) DEFAULT ''") if "last_modified_by" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN last_modified_by VARCHAR(255) DEFAULT 'system-import'") if statements: @@ -37,10 +73,144 @@ def ensure_schema() -> None: "UPDATE species SET publication_status = COALESCE(publication_status, 'published'), " "is_archived = COALESCE(is_archived, FALSE), " "editor_notes = COALESCE(editor_notes, ''), " + "created_by = COALESCE(created_by, 'system-import'), " + "owner_username = COALESCE(owner_username, ''), " + "owner_role = COALESCE(owner_role, ''), " "last_modified_by = COALESCE(last_modified_by, 'system-import')" ) ) + tables = set(inspector.get_table_names()) + if "species_citation" in tables: + citation_columns = { + column["name"] for column in inspector.get_columns("species_citation") + } + 
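+        # Additive, idempotent migrations: each missing column gets an ALTER TABLE with a safe default.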
citation_statements: list[str] = [] + if "legacy_reference_number" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN legacy_reference_number VARCHAR(64) DEFAULT ''" + ) + if "citation_key" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN citation_key VARCHAR(255) DEFAULT ''" + ) + if "entry_type" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN entry_type VARCHAR(64) DEFAULT 'misc'" + ) + if "draft_bibtex" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN draft_bibtex TEXT DEFAULT ''" + ) + if "abstract_text" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN abstract_text TEXT DEFAULT ''" + ) + if "source_url" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN source_url VARCHAR(500) DEFAULT ''" + ) + if "openalex_id" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN openalex_id VARCHAR(64) DEFAULT ''" + ) + if "resolver_source_label" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN resolver_source_label VARCHAR(255) DEFAULT ''" + ) + if "enrichment_status" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN enrichment_status VARCHAR(32) DEFAULT 'pending'" + ) + if "enrichment_error" not in citation_columns: + citation_statements.append( + "ALTER TABLE species_citation ADD COLUMN enrichment_error TEXT DEFAULT ''" + ) + if citation_statements: + with engine.begin() as connection: + for statement in citation_statements: + connection.execute(text(statement)) + connection.execute( + text( + "UPDATE species_citation SET " + "legacy_reference_number = COALESCE(legacy_reference_number, ''), " + "citation_key = COALESCE(citation_key, ''), " + "entry_type = COALESCE(entry_type, 'misc'), " + "abstract_text = COALESCE(abstract_text, ''), " + "draft_bibtex = COALESCE(draft_bibtex, ''), " + "source_url = COALESCE(source_url, ''), " + "openalex_id = COALESCE(openalex_id, ''), " + "resolver_source_label = COALESCE(resolver_source_label, ''), " + "enrichment_status = COALESCE(enrichment_status, 'pending'), " + "enrichment_error = COALESCE(enrichment_error, '')" + ) + ) + + +def _citation_to_payload(citation: SpeciesCitation) -> dict[str, object]: + return { + "id": citation.id, + "position": citation.position, + "section_heading": citation.section_heading, + "legacy_reference_number": citation.legacy_reference_number, + "citation_key": citation.citation_key, + "entry_type": citation.entry_type, + "raw_text": citation.raw_text, + "normalized_text": citation.normalized_text, + "abstract_text": citation.abstract_text, + "draft_bibtex": citation.draft_bibtex, + "doi": citation.doi, + "source_url": citation.source_url, + "openalex_id": citation.openalex_id, + "resolver_source_label": citation.resolver_source_label, + "enrichment_status": citation.enrichment_status, + "enrichment_error": citation.enrichment_error, + "source_type": citation.source_type, + "review_status": citation.review_status, + } + + +def _structured_document_to_payload(species: Species) -> dict[str, object] | None: + if species.document is None: + return None + + ast: dict[str, object] | None = None + raw_ast = str(species.document.ast_json or "").strip() + if raw_ast: + try: + parsed = json.loads(raw_ast) + if 
isinstance(parsed, dict): + ast = parsed + except json.JSONDecodeError: + ast = None + + return { + "source_format": species.document.source_format, + "updated_by": species.document.updated_by, + "node_count": len(species.document.nodes), + "ast": ast, + } + + +def _legacy_source_to_payload(species: Species) -> dict[str, object] | None: + source_file = str(species.source_file or "").strip() + if not source_file: + return None + + try: + data_dir = Path(get_default_data_dir()).resolve() + candidate = (data_dir / source_file).resolve() + if candidate.parent != data_dir or not candidate.is_file(): + return None + text = candidate.read_text(encoding="utf-8", errors="replace") + except (OSError, ValueError): + return None + + return { + "source_file": source_file, + "text": text, + } + def import_species_payload(payload: list[dict[str, object]]) -> None: ensure_schema() @@ -66,6 +236,9 @@ def import_species_payload(payload: list[dict[str, object]]) -> None: publication_status="published", is_archived=False, editor_notes="", + created_by=SYSTEM_IMPORT_USER, + owner_username="", + owner_role="", last_modified_by=SYSTEM_IMPORT_USER, ) session.add(species) @@ -142,7 +315,7 @@ def import_species_payload(payload: list[dict[str, object]]) -> None: ) ) - session.add(species) + sync_species_document(session, species, item) for slug, species in existing_species.items(): if slug in incoming_slugs: @@ -161,11 +334,15 @@ def import_species_payload(payload: list[dict[str, object]]) -> None: ), ) ) - session.add(species) - session.commit() +def get_species_document(slug: str) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + return get_species_document_payload(session, slug) + + def _get_editor_preservation_state(species: Species) -> tuple[set[str], set[int]]: editorial_fields: set[str] = set() section_positions: set[int] = set() @@ -215,6 +392,30 @@ def get_readiness_status() -> dict[str, object]: def _species_to_payload(species: Species, include_sections: bool = True) -> dict[str, object]: + legacy_identifiers: list[dict[str, object]] = [] + if species.flelmr_code: + legacy_identifiers.append( + { + "authority": "legacy-ecospecies", + "identifier": species.flelmr_code, + "label": "FLELMR", + } + ) + taxon_identifiers = [ + { + "authority": item.authority, + "identifier": item.identifier, + "label": item.label, + "primary": item.is_primary, + "source_url": item.source_url, + } + for item in species.taxon_identifiers + ] + primary_taxon_identifier = next( + (item for item in taxon_identifiers if bool(item.get("primary"))), + None, + ) + return { "slug": species.slug, "source_file": species.source_file, @@ -222,8 +423,15 @@ def _species_to_payload(species: Species, include_sections: bool = True) -> dict "common_name": species.common_name, "scientific_name": species.scientific_name, "flelmr_code": species.flelmr_code, + "legacy_identifiers": legacy_identifiers, + "taxon_identifiers": taxon_identifiers, + "primary_taxon_authority": ( + str(primary_taxon_identifier.get("authority", "")) if primary_taxon_identifier else "" + ), + "primary_taxon_identifier": primary_taxon_identifier, "summary": species.summary, "section_count": species.section_count, + "citation_count": len(species.citations), "publication_status": species.publication_status, "is_archived": species.is_archived, "editor_notes": species.editor_notes, @@ -232,6 +440,11 @@ def _species_to_payload(species: Species, include_sections: bool = True) -> dict {"level": diagnostic.level, "code": diagnostic.code, "message": 
diagnostic.message} for diagnostic in species.diagnostics ], + "citations": [ + _citation_to_payload(citation) for citation in species.citations + ], + "structured_document": _structured_document_to_payload(species) if include_sections else None, + "legacy_source": _legacy_source_to_payload(species) if include_sections else None, "sections": ( [ { @@ -290,6 +503,7 @@ def get_species_by_slug( return None _ = species.sections _ = species.diagnostics + _ = species.citations return _species_to_payload(species, include_sections=True) @@ -321,10 +535,135 @@ def list_diagnostics() -> list[dict[str, object]]: ] +def list_public_bibliography(search: str = "") -> list[dict[str, object]]: + ensure_schema() + with SessionLocal() as session: + species_records = list( + session.scalars( + select(Species) + .where( + Species.publication_status == "published", + Species.is_archived.is_(False), + ) + .order_by(Species.common_name, Species.title) + ) + ) + + entries: dict[str, dict[str, object]] = {} + for species in species_records: + _ = species.citations + for citation in species.citations: + doi_key = str(citation.doi).strip().lower() + openalex_key = str(citation.openalex_id).strip().lower() + citation_key = str(citation.citation_key).strip().lower() + normalized_key = " ".join(str(citation.normalized_text).split()).strip().lower() + raw_key = " ".join(str(citation.raw_text).split()).strip().lower() + dedupe_key = ( + f"doi:{doi_key}" if doi_key else "" + ) or ( + f"openalex:{openalex_key}" if openalex_key else "" + ) or ( + f"key:{citation_key}" if citation_key else "" + ) or ( + f"normalized:{normalized_key}" if normalized_key else "" + ) or ( + f"raw:{raw_key}" if raw_key else "" + ) + if not dedupe_key: + continue + + entry = entries.get(dedupe_key) + if entry is None: + entry = { + **_citation_to_payload(citation), + "species_refs": [], + "_species_ref_keys": set(), + "_legacy_reference_numbers": set(), + } + entries[dedupe_key] = entry + + if not entry.get("normalized_text") and citation.normalized_text: + entry["normalized_text"] = citation.normalized_text + if not entry.get("abstract_text") and citation.abstract_text: + entry["abstract_text"] = citation.abstract_text + if not entry.get("draft_bibtex") and citation.draft_bibtex: + entry["draft_bibtex"] = citation.draft_bibtex + if not entry.get("doi") and citation.doi: + entry["doi"] = citation.doi + if not entry.get("source_url") and citation.source_url: + entry["source_url"] = citation.source_url + if not entry.get("openalex_id") and citation.openalex_id: + entry["openalex_id"] = citation.openalex_id + + species_ref_key = species.slug + if species_ref_key not in entry["_species_ref_keys"]: + entry["_species_ref_keys"].add(species_ref_key) + entry["species_refs"].append( + { + "slug": species.slug, + "common_name": species.common_name, + "scientific_name": species.scientific_name, + } + ) + if citation.legacy_reference_number: + entry["_legacy_reference_numbers"].add(citation.legacy_reference_number) + + items: list[dict[str, object]] = [] + needle = search.strip().lower() + for entry in entries.values(): + legacy_numbers = sorted(entry.pop("_legacy_reference_numbers")) + entry.pop("_species_ref_keys", None) + entry["legacy_reference_numbers"] = legacy_numbers + entry["species_count"] = len(entry["species_refs"]) + + if needle: + haystack = " ".join( + [ + str(entry.get("normalized_text", "")), + str(entry.get("raw_text", "")), + str(entry.get("citation_key", "")), + str(entry.get("doi", "")), + str(entry.get("abstract_text", "")), + 
str(entry.get("draft_bibtex", "")), + ] + ).lower() + if needle not in haystack: + continue + items.append(entry) + + items.sort(key=lambda item: (str(item.get("normalized_text", "") or item.get("raw_text", "")).lower(), str(item.get("citation_key", "")).lower())) + return items + + def get_editor_species_list(search: str = "") -> list[dict[str, object]]: return list_species(search=search, include_unpublished=True, include_archived=True) +def get_contributor_species_list(username: str, search: str = "") -> list[dict[str, object]]: + ensure_schema() + with SessionLocal() as session: + query = ( + select(Species) + .where( + Species.owner_role == "contributor", + Species.owner_username == username, + ) + .order_by(Species.common_name, Species.title) + ) + species = list(session.scalars(query)) + payload = [_species_to_payload(item, include_sections=False) for item in species] + if search: + needle = search.lower() + payload = [ + item + for item in payload + if needle in item["common_name"].lower() + or needle in item["scientific_name"].lower() + or needle in item["title"].lower() + ] + return payload + + def get_editor_species_workflow(slug: str) -> dict[str, object] | None: item = get_species_by_slug(slug, include_unpublished=True, include_archived=True) if item is None: @@ -345,6 +684,61 @@ def get_editor_species_detail(slug: str) -> dict[str, object] | None: return get_species_by_slug(slug, include_unpublished=True, include_archived=True) +def get_contributor_species_detail(slug: str, username: str) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar( + select(Species).where( + Species.slug == slug, + Species.owner_role == "contributor", + Species.owner_username == username, + ) + ) + if species is None: + return None + _ = species.sections + _ = species.diagnostics + _ = species.citations + return _species_to_payload(species, include_sections=True) + + +def _citation_list_payload(species: Species) -> dict[str, object]: + return { + "slug": species.slug, + "title": species.title, + "common_name": species.common_name, + "scientific_name": species.scientific_name, + "citation_count": len(species.citations), + "citations": [_citation_to_payload(citation) for citation in species.citations], + } + + +def get_editor_species_citations(slug: str) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return None + _ = species.citations + return _citation_list_payload(species) + + +def get_contributor_species_citations(slug: str, username: str) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar( + select(Species).where( + Species.slug == slug, + Species.owner_role == "contributor", + Species.owner_username == username, + ) + ) + if species is None: + return None + _ = species.citations + return _citation_list_payload(species) + + def list_species_audit(slug: str) -> list[dict[str, object]] | None: ensure_schema() with SessionLocal() as session: @@ -441,6 +835,158 @@ def update_species_editorial( } +def _normalize_email(email: str) -> str: + normalized = email.strip().lower() + if not EMAIL_PATTERN.fullmatch(normalized): + raise ValueError("Contributor username must be a valid email address.") + return normalized + + +def get_minimum_contributor_age() -> int: + configured = os.environ.get("ECOSPECIES_MINIMUM_CONTRIBUTOR_AGE", "13").strip() + try: + minimum_age = 
int(configured) + except ValueError as exc: # pragma: no cover - misconfiguration path + raise ValueError("ECOSPECIES_MINIMUM_CONTRIBUTOR_AGE must be an integer.") from exc + if minimum_age < 1: + raise ValueError("ECOSPECIES_MINIMUM_CONTRIBUTOR_AGE must be positive.") + return minimum_age + + +def register_contributor(email: str, age_gate_confirmed: bool) -> dict[str, object]: + ensure_schema() + normalized_email = _normalize_email(email) + minimum_age = get_minimum_contributor_age() + if not age_gate_confirmed: + raise ValueError( + f"Contributors must confirm they are at least {minimum_age} years old." + ) + + token = secrets.token_urlsafe(24) + token_hash = hashlib.sha256(token.encode("utf-8")).hexdigest() + now = datetime.now(timezone.utc).isoformat() + + with SessionLocal() as session: + existing = session.scalar( + select(ContributorAccount).where(ContributorAccount.email == normalized_email) + ) + if existing is not None: + raise ValueError("A contributor account already exists for that email address.") + + session.add( + ContributorAccount( + email=normalized_email, + token_hash=token_hash, + age_gate_confirmed=True, + created_at=now, + is_active=True, + ) + ) + session.commit() + + return { + "username": normalized_email, + "role": "contributor", + "token": token, + "minimum_age": minimum_age, + "warning": "Store this token now. You will not be able to access your contributed species later without it.", + } + + +def _build_initial_contributor_markdown(email: str) -> str: + title = "New Species Draft" + return ( + "---\n" + f"title: {title}\n" + "common_name: \n" + "scientific_name: \n" + "species_code: \n" + f"source_file: {CONTRIBUTOR_SUBMISSION_PREFIX}.md\n" + "publication_status: draft\n" + "source_format: ecospecies-markdown-v1\n" + "---\n\n" + "## Summary\n" + "Provide a concise summary.\n\n" + "## Habitat\n" + "Describe habitat.\n" + ) + + +def _next_unique_slug(session, base_slug: str) -> str: + candidate = base_slug + suffix = 2 + while session.scalar(select(Species.id).where(Species.slug == candidate)) is not None: + candidate = f"{base_slug}-{suffix}" + suffix += 1 + return candidate + + +def create_contributor_species(username: str, markdown: str | None = None) -> dict[str, object]: + ensure_schema() + normalized_email = _normalize_email(username) + source_markdown = (markdown or _build_initial_contributor_markdown(normalized_email)).strip() + if not source_markdown.endswith("\n"): + source_markdown += "\n" + + with SessionLocal() as session: + document_model = parse_markdown_document(source_markdown) + projection = extract_species_projection(document_model) + slug_base = slugify( + str(projection.get("common_name") or projection.get("title") or CONTRIBUTOR_SUBMISSION_PREFIX) + ) + slug = _next_unique_slug(session, slug_base) + species = Species( + slug=slug, + source_file=f"{CONTRIBUTOR_SUBMISSION_PREFIX}-{slug}.md", + title=str(projection.get("title") or "New Species Draft"), + common_name=str(projection.get("common_name") or ""), + scientific_name=str(projection.get("scientific_name") or ""), + flelmr_code=str(projection.get("flelmr_code") or ""), + summary=str(projection.get("summary") or ""), + section_count=len(projection["sections"]), + publication_status="draft", + is_archived=False, + editor_notes="", + created_by=normalized_email, + owner_username=normalized_email, + owner_role="contributor", + last_modified_by=normalized_email, + ) + session.add(species) + session.flush() + save_species_document(session, species, source_markdown, normalized_email) + 
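A standalone sketch of the dedupe-key cascade used by `list_public_bibliography` above: the first non-empty identifier wins, and whitespace in the text-based keys is collapsed before comparison. `dedupe_key` is an illustrative rename for exposition, not a function in this patch.

```python
# Illustrative only: mirrors the chained `or` dedupe keys in
# list_public_bibliography; the first non-empty identifier wins.
def dedupe_key(doi: str, openalex_id: str, citation_key: str,
               normalized_text: str, raw_text: str) -> str:
    for prefix, value in (
        ("doi", doi.strip().lower()),
        ("openalex", openalex_id.strip().lower()),
        ("key", citation_key.strip().lower()),
        ("normalized", " ".join(normalized_text.split()).strip().lower()),
        ("raw", " ".join(raw_text.split()).strip().lower()),
    ):
        if value:
            return f"{prefix}:{value}"
    return ""  # caller skips citations with no usable key


assert dedupe_key("10.1000/X", "", "", "", "") == "doi:10.1000/x"
assert dedupe_key("", "", "", "  Smith   1999. ", "") == "normalized:smith 1999."
assert dedupe_key("", "", "", "", "") == ""
```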
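And a minimal sketch of the slug-collision loop that `create_contributor_species` relies on, with an in-memory set standing in for the `Species.slug` lookup; `next_unique_slug` here is a hypothetical stand-in for `_next_unique_slug`.

```python
# Sketch: same loop as _next_unique_slug, but probing a set instead of
# issuing a SELECT per candidate slug.
def next_unique_slug(existing: set[str], base_slug: str) -> str:
    candidate = base_slug
    suffix = 2
    while candidate in existing:
        candidate = f"{base_slug}-{suffix}"
        suffix += 1
    return candidate


taken = {"new-species-draft", "new-species-draft-2"}
assert next_unique_slug(taken, "new-species-draft") == "new-species-draft-3"
assert next_unique_slug(taken, "mangrove-snapper") == "mangrove-snapper"
```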
session.add( + SpeciesAuditLog( + species_id=species.id, + changed_by=normalized_email, + changed_at=datetime.now(timezone.utc).isoformat(), + action="contributor_create", + details_json=json.dumps({"publication_status": "draft"}, ensure_ascii=True), + ) + ) + session.commit() + return { + "slug": species.slug, + "publication_status": species.publication_status, + "last_modified_by": species.last_modified_by, + } + + +def get_contributor_species_document(slug: str, username: str) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar( + select(Species).where( + Species.slug == slug, + Species.owner_role == "contributor", + Species.owner_username == username, + ) + ) + if species is None: + return None + return get_species_document_payload(session, slug) + + def update_species_section( slug: str, section_position: int, @@ -506,3 +1052,595 @@ def update_species_section( "last_modified_by": species.last_modified_by, "changed_fields": changed_fields, } + + +def update_species_document_markdown( + slug: str, + markdown: str, + username: str, +) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return None + + result = save_species_document(session, species, markdown, username) + session.add( + SpeciesAuditLog( + species_id=species.id, + changed_by=username, + changed_at=datetime.now(timezone.utc).isoformat(), + action="document_update", + details_json=json.dumps( + {"source_format": "ecospecies-markdown-v1"}, + ensure_ascii=True, + ), + ) + ) + session.commit() + return result + + +def update_species_citation_review( + slug: str, + citation_id: int, + review_status: str | None, + normalized_text: str | None, + doi: str | None, + citation_key: str | None, + entry_type: str | None, + draft_bibtex: str | None, + username: str, + *, + abstract_text: str | None = None, +) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return None + + citation = session.scalar( + select(SpeciesCitation).where( + SpeciesCitation.species_id == species.id, + SpeciesCitation.id == citation_id, + ) + ) + if citation is None: + return None + + before = _citation_to_payload(citation) + if review_status is not None: + normalized_status = review_status.strip().lower() + if normalized_status not in CITATION_REVIEW_STATUSES: + raise ValueError( + f"Unsupported review_status: {review_status}. 
" + f"Expected one of {sorted(CITATION_REVIEW_STATUSES)}" + ) + citation.review_status = normalized_status + if normalized_text is not None: + citation.normalized_text = normalized_text.strip() + if abstract_text is not None: + citation.abstract_text = abstract_text.strip() + if doi is not None: + citation.doi = doi.strip() + if citation_key is not None: + citation.citation_key = citation_key.strip() + if entry_type is not None: + citation.entry_type = entry_type.strip() or "misc" + if draft_bibtex is not None: + citation.draft_bibtex = draft_bibtex.strip() + citation.source_type = "editor_review" + + after = _citation_to_payload(citation) + changed_fields = { + field: {"from": before[field], "to": after[field]} + for field in ( + "review_status", + "normalized_text", + "abstract_text", + "doi", + "citation_key", + "entry_type", + "draft_bibtex", + "source_type", + ) + if before[field] != after[field] + } + + if changed_fields: + species.last_modified_by = username + session.add( + SpeciesAuditLog( + species_id=species.id, + changed_by=username, + changed_at=datetime.now(timezone.utc).isoformat(), + action="citation_review_update", + details_json=json.dumps( + {"citation_id": citation.id, **changed_fields}, + ensure_ascii=True, + ), + ) + ) + + session.add(citation) + session.add(species) + session.commit() + session.refresh(citation) + + return { + "slug": species.slug, + "citation": _citation_to_payload(citation), + "last_modified_by": species.last_modified_by, + "changed_fields": changed_fields, + } + + +def get_species_citation_candidates(slug: str, citation_id: int) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return None + + citation = session.scalar( + select(SpeciesCitation).where( + SpeciesCitation.species_id == species.id, + SpeciesCitation.id == citation_id, + ) + ) + if citation is None: + return None + + candidates = discover_citation_candidates(_citation_to_payload(citation)) + return { + "slug": species.slug, + "citation": _citation_to_payload(citation), + **candidates, + } + + +def _apply_citation_enrichment( + session, + species: Species, + citation: SpeciesCitation, + username: str, +) -> dict[str, object]: + before = _citation_to_payload(citation) + enrichment = enrich_citation_payload(before) + + for field in ( + "citation_key", + "entry_type", + "normalized_text", + "abstract_text", + "draft_bibtex", + "doi", + "source_url", + "openalex_id", + "resolver_source_label", + "enrichment_status", + "enrichment_error", + ): + if field in enrichment: + setattr(citation, field, str(enrichment.get(field, "")).strip()) + + after = _citation_to_payload(citation) + changed_fields = { + field: {"from": before[field], "to": after[field]} + for field in ( + "citation_key", + "entry_type", + "normalized_text", + "abstract_text", + "draft_bibtex", + "doi", + "source_url", + "openalex_id", + "resolver_source_label", + "enrichment_status", + "enrichment_error", + ) + if before[field] != after[field] + } + conflicts = enrichment.get("conflicts") + if conflicts: + changed_fields["resolver_conflicts"] = list(conflicts) + + if changed_fields: + species.last_modified_by = username + session.add( + SpeciesAuditLog( + species_id=species.id, + changed_by=username, + changed_at=datetime.now(timezone.utc).isoformat(), + action="citation_enrichment", + details_json=json.dumps( + {"citation_id": citation.id, **changed_fields}, + ensure_ascii=True, + ), + ) + ) + + 
session.add(citation) + session.add(species) + return { + "citation": _citation_to_payload(citation), + "changed_fields": changed_fields, + } + + +def _next_citation_position(species: Species) -> int: + if not species.citations: + return 1 + return max(citation.position for citation in species.citations) + 1 + + +def update_species_citation_enrichment( + slug: str, + citation_id: int, + username: str, +) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return None + + citation = session.scalar( + select(SpeciesCitation).where( + SpeciesCitation.species_id == species.id, + SpeciesCitation.id == citation_id, + ) + ) + if citation is None: + return None + + result = _apply_citation_enrichment(session, species, citation, username) + session.commit() + session.refresh(citation) + + return { + "slug": species.slug, + "citation": result["citation"], + "last_modified_by": species.last_modified_by, + "changed_fields": result["changed_fields"], + } + + +def apply_species_citation_candidate_selection( + slug: str, + citation_id: int, + candidate: dict[str, object], + username: str, +) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return None + + citation = session.scalar( + select(SpeciesCitation).where( + SpeciesCitation.species_id == species.id, + SpeciesCitation.id == citation_id, + ) + ) + if citation is None: + return None + + before = _citation_to_payload(citation) + enrichment = apply_citation_candidate_selection(before, candidate) + for field in ( + "citation_key", + "entry_type", + "normalized_text", + "abstract_text", + "draft_bibtex", + "doi", + "source_url", + "openalex_id", + "resolver_source_label", + "enrichment_status", + "enrichment_error", + ): + if field in enrichment: + setattr(citation, field, str(enrichment.get(field, "")).strip()) + citation.source_type = "editor_selected_candidate" + citation.review_status = "accepted" + + after = _citation_to_payload(citation) + changed_fields = { + field: {"from": before[field], "to": after[field]} + for field in ( + "citation_key", + "entry_type", + "normalized_text", + "abstract_text", + "draft_bibtex", + "doi", + "source_url", + "openalex_id", + "resolver_source_label", + "enrichment_status", + "enrichment_error", + "source_type", + "review_status", + ) + if before[field] != after[field] + } + + if changed_fields: + species.last_modified_by = username + session.add( + SpeciesAuditLog( + species_id=species.id, + changed_by=username, + changed_at=datetime.now(timezone.utc).isoformat(), + action="citation_candidate_selection", + details_json=json.dumps( + { + "citation_id": citation.id, + "selected_source_label": str(candidate.get("source_label", "")).strip(), + **changed_fields, + }, + ensure_ascii=True, + ), + ) + ) + + session.add(citation) + session.add(species) + session.commit() + session.refresh(citation) + return { + "slug": species.slug, + "citation": _citation_to_payload(citation), + "last_modified_by": species.last_modified_by, + "changed_fields": changed_fields, + } + + +def add_species_citation_from_candidate( + slug: str, + citation_id: int, + candidate: dict[str, object], + username: str, +) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return 
None + + citation = session.scalar( + select(SpeciesCitation).where( + SpeciesCitation.species_id == species.id, + SpeciesCitation.id == citation_id, + ) + ) + if citation is None: + return None + + enrichment = apply_citation_candidate_selection(_citation_to_payload(citation), candidate) + raw_text = ( + str(enrichment.get("normalized_text", "")).strip() + or str(candidate.get("fields", {}).get("title", "")).strip() + or str(citation.raw_text).strip() + ) + + document_markdown = species.document.markdown_content if species.document is not None else "" + document_model = parse_markdown_document(document_markdown) + added = add_citation_to_document(document_model, raw_text, heading_title="Related References") + updated_markdown = export_markdown_document(document_model) + save_species_document(session, species, updated_markdown, username) + + new_citation = session.scalar( + select(SpeciesCitation).where( + SpeciesCitation.species_id == species.id, + SpeciesCitation.raw_text == raw_text, + ) + ) + if new_citation is None: + return None + + new_citation.source_type = "editor_added_candidate" + new_citation.review_status = "accepted" + new_citation.citation_key = str(enrichment.get("citation_key", "")).strip() + new_citation.entry_type = str(enrichment.get("entry_type", "misc")).strip() or "misc" + new_citation.normalized_text = str(enrichment.get("normalized_text", "")).strip() + new_citation.abstract_text = str(enrichment.get("abstract_text", "")).strip() + new_citation.draft_bibtex = str(enrichment.get("draft_bibtex", "")).strip() + new_citation.doi = str(enrichment.get("doi", "")).strip() + new_citation.source_url = str(enrichment.get("source_url", "")).strip() + new_citation.openalex_id = str(enrichment.get("openalex_id", "")).strip() + new_citation.resolver_source_label = str(enrichment.get("resolver_source_label", "")).strip() + new_citation.enrichment_status = str(enrichment.get("enrichment_status", "resolved")).strip() + new_citation.enrichment_error = str(enrichment.get("enrichment_error", "")).strip() + + if not added: + new_citation.source_type = "editor_added_candidate" + new_citation.review_status = "accepted" + + session.add(new_citation) + species.last_modified_by = username + session.add( + SpeciesAuditLog( + species_id=species.id, + changed_by=username, + changed_at=datetime.now(timezone.utc).isoformat(), + action="citation_candidate_addition", + details_json=json.dumps( + { + "source_citation_id": citation.id, + "selected_source_label": str(candidate.get("source_label", "")).strip(), + "new_citation_key": new_citation.citation_key, + }, + ensure_ascii=True, + ), + ) + ) + session.commit() + session.refresh(new_citation) + return { + "slug": species.slug, + "citation": _citation_to_payload(new_citation), + "last_modified_by": species.last_modified_by, + } + + +def update_species_citations_enrichment_batch( + slug: str, + username: str, +) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return None + + _ = species.citations + updated_citations: list[dict[str, object]] = [] + changed_count = 0 + resolved_count = 0 + unresolved_count = 0 + error_count = 0 + + for citation in species.citations: + result = _apply_citation_enrichment(session, species, citation, username) + updated_citations.append(result["citation"]) + if result["changed_fields"]: + changed_count += 1 + status = str(result["citation"].get("enrichment_status", "")).strip() + if status == 
"resolved": + resolved_count += 1 + elif status == "unresolved": + unresolved_count += 1 + elif status == "error": + error_count += 1 + + session.commit() + + return { + "slug": species.slug, + "citation_count": len(updated_citations), + "changed_count": changed_count, + "resolved_count": resolved_count, + "unresolved_count": unresolved_count, + "error_count": error_count, + "citations": updated_citations, + "last_modified_by": species.last_modified_by, + } + + +def _should_backfill_citation(citation: SpeciesCitation, include_accepted: bool = False) -> bool: + review_status = str(citation.review_status or "").strip().lower() + source_type = str(citation.source_type or "").strip().lower() + enrichment_status = str(citation.enrichment_status or "").strip().lower() + normalized_text = str(citation.normalized_text or "").strip() + abstract_text = str(citation.abstract_text or "").strip() + + if not include_accepted and review_status == "accepted": + return False + if source_type in {"editor_selected_candidate", "editor_added_candidate"} and not include_accepted: + return False + + return ( + source_type in {"document_extract", "editor_review", ""} + or enrichment_status in {"pending", "unresolved", "error", ""} + or not normalized_text + or not abstract_text + ) + + +def backfill_species_citations( + slug: str, + username: str, + include_accepted: bool = False, +) -> dict[str, object] | None: + ensure_schema() + with SessionLocal() as session: + species = session.scalar(select(Species).where(Species.slug == slug)) + if species is None: + return None + + _ = species.citations + updated_citations: list[dict[str, object]] = [] + changed_count = 0 + resolved_count = 0 + unresolved_count = 0 + error_count = 0 + backfilled_count = 0 + + for citation in species.citations: + if _should_backfill_citation(citation, include_accepted=include_accepted): + backfilled_count += 1 + result = _apply_citation_enrichment(session, species, citation, username) + payload = result["citation"] + if result["changed_fields"]: + changed_count += 1 + else: + payload = _citation_to_payload(citation) + updated_citations.append(payload) + status = str(payload.get("enrichment_status", "")).strip() + if status == "resolved": + resolved_count += 1 + elif status == "unresolved": + unresolved_count += 1 + elif status == "error": + error_count += 1 + + session.commit() + + return { + "slug": species.slug, + "citation_count": len(updated_citations), + "backfilled_count": backfilled_count, + "changed_count": changed_count, + "resolved_count": resolved_count, + "unresolved_count": unresolved_count, + "error_count": error_count, + "citations": updated_citations, + "last_modified_by": species.last_modified_by, + } + + +def update_contributor_species_document_markdown( + slug: str, + markdown: str, + username: str, +) -> dict[str, object] | None: + ensure_schema() + normalized_email = _normalize_email(username) + with SessionLocal() as session: + species = session.scalar( + select(Species).where( + Species.slug == slug, + Species.owner_role == "contributor", + Species.owner_username == normalized_email, + ) + ) + if species is None: + return None + + result = save_species_document(session, species, markdown, normalized_email) + if species.publication_status == "published": + species.publication_status = "review" + session.add( + SpeciesAuditLog( + species_id=species.id, + changed_by=normalized_email, + changed_at=datetime.now(timezone.utc).isoformat(), + action="contributor_document_update", + details_json=json.dumps( + 
{"source_format": "ecospecies-markdown-v1"}, + ensure_ascii=True, + ), + ) + ) + session.commit() + return { + **result, + "publication_status": species.publication_status, + } diff --git a/apps/api/test_auth.py b/apps/api/test_auth.py new file mode 100644 index 0000000..9be4678 --- /dev/null +++ b/apps/api/test_auth.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent +SRC = ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +TEST_PATH = ROOT / "tests" / "test_auth.py" +SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_auth", TEST_PATH) +MODULE = importlib.util.module_from_spec(SPEC) +assert SPEC is not None and SPEC.loader is not None +SPEC.loader.exec_module(MODULE) + +for name in dir(MODULE): + if name.startswith("Test") or name.endswith("Tests"): + globals()[name] = getattr(MODULE, name) diff --git a/apps/api/test_citation_enrichment.py b/apps/api/test_citation_enrichment.py new file mode 100644 index 0000000..e9d9eaf --- /dev/null +++ b/apps/api/test_citation_enrichment.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent +SRC = ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +TEST_PATH = ROOT / "tests" / "test_citation_enrichment.py" +SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_citation_enrichment", TEST_PATH) +MODULE = importlib.util.module_from_spec(SPEC) +assert SPEC is not None and SPEC.loader is not None +SPEC.loader.exec_module(MODULE) + +for name in dir(MODULE): + if name.startswith("Test") or name.endswith("Tests"): + globals()[name] = getattr(MODULE, name) diff --git a/apps/api/test_document_format.py b/apps/api/test_document_format.py new file mode 100644 index 0000000..151b0d5 --- /dev/null +++ b/apps/api/test_document_format.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent +SRC = ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +TEST_PATH = ROOT / "tests" / "test_document_format.py" +SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_document_format", TEST_PATH) +MODULE = importlib.util.module_from_spec(SPEC) +assert SPEC is not None and SPEC.loader is not None +SPEC.loader.exec_module(MODULE) + +for name in dir(MODULE): + if name.startswith("Test") or name.endswith("Tests"): + globals()[name] = getattr(MODULE, name) diff --git a/apps/api/test_parser.py b/apps/api/test_parser.py new file mode 100644 index 0000000..c15455d --- /dev/null +++ b/apps/api/test_parser.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent +SRC = ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +TEST_PATH = ROOT / "tests" / "test_parser.py" +SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_parser", TEST_PATH) +MODULE = importlib.util.module_from_spec(SPEC) +assert SPEC is not None and SPEC.loader is not None +SPEC.loader.exec_module(MODULE) + +for name in dir(MODULE): + if name.startswith("Test") or name.endswith("Tests"): + globals()[name] = getattr(MODULE, name) diff --git a/apps/api/tests/test_auth.py b/apps/api/tests/test_auth.py new file mode 100644 index 0000000..5a9a736 --- 
/dev/null +++ b/apps/api/tests/test_auth.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from ecospecies_api import auth, repository + + +class ContributorAuthTests(unittest.TestCase): + def setUp(self) -> None: + self.tempdir = tempfile.TemporaryDirectory() + db_path = Path(self.tempdir.name) / "test.db" + self.engine = create_engine(f"sqlite:///{db_path}", future=True) + self.session_local = sessionmaker( + bind=self.engine, + autoflush=False, + autocommit=False, + future=True, + ) + self.repository_engine_patch = patch.object(repository, "create_db_engine", return_value=self.engine) + self.repository_session_patch = patch.object(repository, "SessionLocal", self.session_local) + self.auth_engine_patch = patch.object(auth, "create_db_engine", return_value=self.engine) + self.auth_session_patch = patch.object(auth, "SessionLocal", self.session_local) + self.repository_engine_patch.start() + self.repository_session_patch.start() + self.auth_engine_patch.start() + self.auth_session_patch.start() + + def tearDown(self) -> None: + self.auth_session_patch.stop() + self.auth_engine_patch.stop() + self.repository_session_patch.stop() + self.repository_engine_patch.stop() + self.engine.dispose() + self.tempdir.cleanup() + + def test_contributor_token_resolves_to_contributor_session(self) -> None: + registration = repository.register_contributor("author@example.org", True) + + session = auth.resolve_auth_session({"Authorization": f"Bearer {registration['token']}"}) + + self.assertIsNotNone(session) + assert session is not None + self.assertEqual(session.username, "author@example.org") + self.assertEqual(session.role, "contributor") + + def test_contributor_role_does_not_satisfy_editor(self) -> None: + self.assertTrue(auth.role_satisfies("editor", "contributor")) + self.assertFalse(auth.role_satisfies("contributor", "editor")) + + +if __name__ == "__main__": + unittest.main() diff --git a/apps/api/tests/test_citation_enrichment.py b/apps/api/tests/test_citation_enrichment.py new file mode 100644 index 0000000..501941a --- /dev/null +++ b/apps/api/tests/test_citation_enrichment.py @@ -0,0 +1,527 @@ +from __future__ import annotations + +import unittest +from unittest.mock import patch + +from ecospecies_api.citation_enrichment import ( + _crossref_message_to_entry, + _datacite_item_to_entry, + _openalex_work_to_entry, + _render_normalized_text, + apply_citation_candidate_selection, + discover_citation_candidates, + enrich_citation_payload, + LocalBibEntry, + LocalMetadataResolver, + LocalResolution, +) +from ecospecies_api.citegeist_bridge import extract_draft_citation, render_single_bibtex + + +class CitationEnrichmentTests(unittest.TestCase): + def test_render_normalized_text_includes_volume_number_and_pages(self) -> None: + rendered = _render_normalized_text( + "article", + { + "author": "Daniell, W.C.", + "year": "1872", + "title": "Letters referring to experiments of W.C. Daniell", + "journal": "Comm. Rept. U.S. Comm. Fish & Fish.", + "volume": "2", + "number": "4", + "pages": "387-390", + "doi": "10.1000/example", + }, + ) + + self.assertEqual( + rendered, + "Daniell, W.C. (1872). Letters referring to experiments of W.C. Daniell. Comm. Rept. U.S. Comm. Fish & Fish., 2(4): 387-390. 
DOI:10.1000/example", + ) + + def test_crossref_mapping_keeps_volume_issue_and_pages(self) -> None: + entry = _crossref_message_to_entry( + { + "type": "journal-article", + "title": ["Example Work"], + "issued": {"date-parts": [[1872]]}, + "author": [{"family": "Daniell", "given": "W.C."}], + "container-title": ["Comm. Rept. U.S. Comm. Fish & Fish."], + "DOI": "10.1000/example", + "URL": "https://doi.org/10.1000/example", + "volume": "2", + "issue": "4", + "page": "387-390", + } + ) + + self.assertEqual(entry.fields["volume"], "2") + self.assertEqual(entry.fields["number"], "4") + self.assertEqual(entry.fields["pages"], "387-390") + + def test_openalex_mapping_keeps_biblio_fields(self) -> None: + entry = _openalex_work_to_entry( + { + "id": "https://openalex.org/W12345", + "display_name": "OpenAlex Discovered Work", + "publication_year": 2022, + "type": "article", + "doi": "https://doi.org/10.1000/example-openalex", + "authorships": [{"author": {"display_name": "J S, Smith"}}], + "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}}, + "biblio": {"volume": "12", "issue": "3", "first_page": "101", "last_page": "118"}, + "abstract_inverted_index": {"Graphs": [0], "support": [1], "learning": [2]}, + } + ) + + self.assertEqual(entry.fields["author"], "Smith, J. S.") + self.assertEqual(entry.fields["volume"], "12") + self.assertEqual(entry.fields["number"], "3") + self.assertEqual(entry.fields["pages"], "101-118") + self.assertEqual(entry.fields["abstract"], "Graphs support learning") + + def test_openalex_mapping_handles_null_source(self) -> None: + entry = _openalex_work_to_entry( + { + "id": "https://openalex.org/W54321", + "display_name": "OpenAlex Work Without Source", + "publication_year": 2021, + "type": "article", + "doi": "https://doi.org/10.1000/example-null-source", + "authorships": [{"author": {"display_name": "Jane Smith"}}], + "primary_location": {"source": None}, + "biblio": {"volume": "5", "issue": "1", "first_page": "10", "last_page": "20"}, + } + ) + + self.assertEqual(entry.fields["title"], "OpenAlex Work Without Source") + self.assertNotIn("journal", entry.fields) + self.assertEqual(entry.fields["volume"], "5") + self.assertEqual(entry.fields["number"], "1") + self.assertEqual(entry.fields["pages"], "10-20") + + def test_datacite_mapping_keeps_container_and_pages(self) -> None: + entry = _datacite_item_to_entry( + { + "attributes": { + "titles": [{"title": "DataCite Work"}], + "creators": [{"name": "J R, Rivera"}], + "publicationYear": "2021", + "doi": "10.1000/datacite-work", + "url": "https://doi.org/10.1000/datacite-work", + "container": "Journal of Metadata", + "volume": "7", + "issue": "2", + "firstPage": "44", + "lastPage": "59", + "descriptions": [ + {"descriptionType": "Abstract", "description": "Abstract: Metadata makes reuse easier."} + ], + } + } + ) + + self.assertEqual(entry.fields["author"], "Rivera, J. 
R.") + self.assertEqual(entry.fields["journal"], "Journal of Metadata") + self.assertEqual(entry.fields["volume"], "7") + self.assertEqual(entry.fields["number"], "2") + self.assertEqual(entry.fields["pages"], "44-59") + self.assertEqual(entry.fields["abstract"], "Metadata makes reuse easier.") + + def test_render_single_bibtex_preserves_balanced_braces_and_repairs_unmatched_ones(self) -> None: + rendered = render_single_bibtex( + "misc", + "example", + { + "title": "Alpha_beta {Gamma}", + "note": "raw_reference = {Alpha } beta}", + }, + ) + + self.assertIn("title = {Alpha_beta {Gamma}}", rendered) + self.assertIn("note = {raw_reference = {Alpha } beta)}", rendered) + + def test_extract_draft_citation_repairs_report_style_reference_fields(self) -> None: + class MockEntry: + entry_type = "misc" + citation_key = "badkey" + fields = { + "title": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.", + "year": "1872", + "note": "extracted_reference = {true}", + } + + with patch( + "ecospecies_api.citegeist_bridge._load_citegeist_extract", + return_value=lambda text: [MockEntry()], + ): + draft = extract_draft_citation( + "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.", + legacy_reference_number="160", + ) + + self.assertIsNotNone(draft) + assert draft is not None + self.assertEqual(draft.fields["author"], "Daniell, W.C") + self.assertEqual( + draft.fields["title"], + "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River", + ) + self.assertEqual(draft.fields["journal"], "Comm. Rept. U.S. Comm. Fish & Fish") + self.assertEqual(draft.fields["volume"], "2") + self.assertEqual(draft.fields["pages"], "387-390") + self.assertEqual(draft.citation_key, "daniell1872lettersreferringexperiments") + + def test_extract_draft_citation_does_not_split_title_on_report_word(self) -> None: + class MockEntry: + entry_type = "misc" + citation_key = "badkey" + fields = { + "title": "Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200.", + "year": "1999", + "note": "extracted_reference = {true}", + } + + with patch( + "ecospecies_api.citegeist_bridge._load_citegeist_extract", + return_value=lambda text: [MockEntry()], + ): + draft = extract_draft_citation( + "Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200.", + legacy_reference_number="42", + ) + + self.assertIsNotNone(draft) + assert draft is not None + self.assertEqual(draft.fields["author"], "Smith, J") + self.assertEqual(draft.fields["title"], "Habitat report synthesis for Alabama shad") + self.assertEqual(draft.fields["howpublished"], "NOAA Tech. Memo. NMFS-SEFSC-200") + self.assertNotIn("journal", draft.fields) + + def test_extract_draft_citation_repairs_proc_abbreviation_venue(self) -> None: + class MockEntry: + entry_type = "misc" + citation_key = "badkey" + fields = { + "title": "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134.", + "year": "1954", + "note": "extracted_reference = {true}", + } + + with patch( + "ecospecies_api.citegeist_bridge._load_citegeist_extract", + return_value=lambda text: [MockEntry()], + ): + draft = extract_draft_citation( + "Bailey, R.M., H.E. 
Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134.", + legacy_reference_number="26", + ) + + self.assertIsNotNone(draft) + assert draft is not None + self.assertEqual( + draft.fields["title"], + "Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes", + ) + self.assertEqual(draft.fields["journal"], "Proc. Acad. Sci. Philad") + self.assertEqual(draft.fields["volume"], "106") + self.assertEqual(draft.fields["pages"], "109-134") + + def test_extract_draft_citation_repairs_occasional_paper_venue(self) -> None: + class MockEntry: + entry_type = "misc" + citation_key = "badkey" + fields = { + "title": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.", + "year": "1950", + "note": "extracted_reference = {true}", + } + + with patch( + "ecospecies_api.citegeist_bridge._load_citegeist_extract", + return_value=lambda text: [MockEntry()], + ): + draft = extract_draft_citation( + "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.", + legacy_reference_number="41", + ) + + self.assertIsNotNone(draft) + assert draft is not None + self.assertEqual( + draft.fields["title"], + "Annotated list of the fauna of the Grand Isle region, 1928-1946", + ) + self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU") + self.assertEqual(draft.fields["volume"], "6") + self.assertEqual(draft.fields["number"], "6") + self.assertEqual(draft.fields["pages"], "1-66") + + def test_extract_draft_citation_repairs_partial_existing_venue_stub(self) -> None: + class MockEntry: + entry_type = "misc" + citation_key = "badkey" + fields = { + "title": "Annotated list of the fauna of the Grand Isle region, 1928-1946", + "year": "1950", + "howpublished": "Occas", + "note": "extracted_reference = {true}", + } + + with patch( + "ecospecies_api.citegeist_bridge._load_citegeist_extract", + return_value=lambda text: [MockEntry()], + ): + draft = extract_draft_citation( + "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.", + legacy_reference_number="41", + ) + + self.assertIsNotNone(draft) + assert draft is not None + self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU") + self.assertEqual(draft.fields["volume"], "6") + self.assertEqual(draft.fields["number"], "6") + self.assertEqual(draft.fields["pages"], "1-66") + + def test_falls_back_to_internal_resolver_when_citegeist_repo_is_unavailable(self) -> None: + class MockResolver: + def resolve_entry(self, entry): + class Resolution: + source_label = "crossref:doi:10.1000/example" + + class Entry: + entry_type = "article" + citation_key = "doi101000example" + fields = { + "author": "Smith, Jane", + "year": "2024", + "title": "Example Work", + "journal": "Journal of Examples", + "doi": "10.1000/example", + "url": "https://doi.org/10.1000/example", + } + + entry = Entry() + + return Resolution() + + with patch( + "ecospecies_api.citation_enrichment._load_citegeist_resolution_components", + return_value=(None, None, None, None), + ): + result = enrich_citation_payload( + { + "raw_text": "Smith, Jane. 2024. 
Example Work.", + "legacy_reference_number": "7", + }, + resolver=MockResolver(), + ) + + self.assertEqual(result["enrichment_status"], "resolved") + self.assertEqual(result["doi"], "10.1000/example") + self.assertEqual(result["source_url"], "https://doi.org/10.1000/example") + self.assertEqual(result["resolver_source_label"], "crossref:doi:10.1000/example") + self.assertIn("ecospecies_reference_number = {7}", result["draft_bibtex"]) + + def test_enrichment_replaces_raw_reference_title_and_dedupes_legacy_note(self) -> None: + class MockResolver: + def resolve_entry(self, entry): + class Resolution: + source_label = "crossref:search:Letters referring to experiments" + + class Entry: + entry_type = "article" + citation_key = "daniell1872lettersshadalabama" + fields = { + "author": "Daniell, W.C.", + "year": "1872", + "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River", + "journal": "Comm. Rept. U.S. Comm. Fish & Fish.", + "url": "", + } + + entry = Entry() + + return Resolution() + + result = enrich_citation_payload( + { + "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.", + "legacy_reference_number": "160", + "citation_key": "daniell1948daniellwc", + }, + resolver=MockResolver(), + ) + + self.assertEqual(result["enrichment_status"], "resolved") + self.assertEqual(result["citation_key"], "daniell1872lettersreferringexperiments") + self.assertIn( + "title = {Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River}", + result["draft_bibtex"], + ) + self.assertIn("year = {1872}", result["draft_bibtex"]) + self.assertEqual(result["draft_bibtex"].count("ecospecies_reference_number = {160}"), 1) + + def test_enrichment_rejects_conflicting_resolved_metadata(self) -> None: + class MockResolver: + def resolve_entry(self, entry): + class Resolution: + source_label = "crossref:search:alabama-shad-false-positive" + + class Entry: + entry_type = "article" + citation_key = "daniell2009habitatuseage" + fields = { + "author": "Daniell, W.C.", + "year": "2009", + "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA", + "journal": "Transactions of the American Fisheries Society", + "doi": "10.1111/j.1600-0633.2009.00395.x", + "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x", + "volume": "19", + "number": "1", + "pages": "107-115", + } + + entry = Entry() + + return Resolution() + + result = enrich_citation_payload( + { + "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.", + "legacy_reference_number": "160", + }, + resolver=MockResolver(), + ) + + self.assertEqual(result["enrichment_status"], "unresolved") + self.assertIn("conflicts with citation seed fields", result["enrichment_error"]) + + def test_unresolved_enrichment_still_returns_refreshed_seed_fields(self) -> None: + class MockResolver: + def resolve_entry(self, entry): + return None + + result = enrich_citation_payload( + { + "raw_text": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. 
Lab., LSU 6(6): 1-66.", + "legacy_reference_number": "41", + "citation_key": "oldbadkey", + "entry_type": "misc", + }, + resolver=MockResolver(), + ) + + self.assertEqual(result["enrichment_status"], "unresolved") + self.assertEqual(result["citation_key"], "behre1950annotatedlistfauna") + self.assertIn("title = {Annotated list of the fauna of the Grand Isle region, 1928-1946}", result["draft_bibtex"]) + self.assertIn("Occas. Pap. Mar. Lab., LSU", result["draft_bibtex"]) + + def test_discover_citation_candidates_returns_scored_candidates(self) -> None: + class MockResolver: + def search_crossref_candidates(self, title): + return [ + LocalResolution( + LocalBibEntry( + "article", + "daniell1872lettersreferringexperiments", + { + "author": "Daniell, W.C.", + "year": "1872", + "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River", + "journal": "Comm. Rept. U.S. Comm. Fish & Fish.", + "volume": "2", + "pages": "387-390", + }, + ), + "crossref:search:1:daniell-good", + ), + LocalResolution( + LocalBibEntry( + "article", + "daniell2009habitatuseage", + { + "author": "Daniell, W.C.", + "year": "2009", + "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA", + "journal": "Transactions of the American Fisheries Society", + "volume": "19", + "number": "1", + "pages": "107-115", + }, + ), + "crossref:search:2:daniell-bad", + ), + ] + + def search_datacite_candidates(self, title): + return [] + + def search_openalex_candidates(self, title): + return [] + + result = discover_citation_candidates( + { + "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.", + "legacy_reference_number": "160", + }, + resolver=MockResolver(), + ) + + self.assertEqual(result["candidate_count"], 2) + self.assertGreater(result["candidates"][0]["score"], result["candidates"][1]["score"]) + self.assertEqual(result["candidates"][0]["field_matches"]["year"]["status"], "exact") + self.assertEqual(result["candidates"][1]["field_matches"]["year"]["status"], "conflict") + + def test_local_crossref_candidate_search_filters_placeholder_stub_entries(self) -> None: + resolver = LocalMetadataResolver() + resolver._safe_get_json = lambda url: { + "message": { + "items": [ + { + "type": "journal-article", + "title": ["Referenced work 1"], + "issued": {"date-parts": [[2020]]}, + }, + { + "type": "journal-article", + "title": ["Useful Paper"], + "issued": {"date-parts": [[2020]]}, + "author": [{"family": "Smith", "given": "J S"}], + "container-title": ["Journal of Examples"], + "DOI": "10.1000/useful", + }, + ] + } + } + + results = resolver.search_crossref_candidates("Useful Paper") + + self.assertEqual(len(results), 1) + self.assertEqual(results[0].entry.fields["title"], "Useful Paper") + + def test_apply_citation_candidate_selection_uses_selected_candidate_fields(self) -> None: + result = apply_citation_candidate_selection( + { + "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.", + "legacy_reference_number": "160", + }, + { + "source_label": "crossref:search:1:daniell-good", + "entry_type": "article", + "fields": { + "author": "Daniell, W.C.", + "year": "1872", + "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River", + "journal": "Comm. Rept. U.S. 
Comm. Fish & Fish.", + "volume": "2", + "pages": "387-390", + }, + }, + ) + + self.assertEqual(result["enrichment_status"], "resolved") + self.assertEqual(result["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good") + self.assertIn("Comm. Rept. U.S. Comm. Fish & Fish., 2: 387-390", result["normalized_text"]) diff --git a/apps/api/tests/test_document_format.py b/apps/api/tests/test_document_format.py new file mode 100644 index 0000000..b657580 --- /dev/null +++ b/apps/api/tests/test_document_format.py @@ -0,0 +1,195 @@ +from __future__ import annotations + +import json +import unittest + +from ecospecies_api.document_format import ( + DocumentNode, + StructuredDocument, + build_document_from_species_payload, + extract_citation_entries, + extract_species_projection, + export_markdown_document, + parse_markdown_document, + validate_markdown_document, +) + + +class StructuredMarkdownTests(unittest.TestCase): + def test_round_trip_markdown_preserves_metadata_and_hierarchy(self) -> None: + source = """--- +title: American Oyster +common_name: American Oyster +scientific_name: Crassostrea virginica +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 5192 + label: FLELMR +taxon_identifiers: + - authority: worms + identifier: 159059 + label: AphiaID + primary: true +primary_taxon_authority: worms +--- + +## Summary +Short abstract. + +## Habitat + +### Type +Estuarine. +""" + + document = parse_markdown_document(source) + + self.assertEqual(document.metadata["title"], "American Oyster") + self.assertEqual(document.metadata["primary_taxon_authority"], "worms") + self.assertEqual(document.metadata["legacy_identifiers"][0]["identifier"], "5192") + self.assertEqual(document.metadata["taxon_identifiers"][0]["authority"], "worms") + self.assertEqual(document.nodes[0].title, "Summary") + self.assertEqual(document.nodes[1].children[0].title, "Type") + self.assertIn("## Habitat", export_markdown_document(document)) + + def test_build_document_from_species_payload_creates_markdown_sections(self) -> None: + document = build_document_from_species_payload( + { + "title": "American Oyster", + "common_name": "American Oyster", + "scientific_name": "Crassostrea virginica", + "flelmr_code": "5192", + "source_file": "American Oyster.txt", + "summary": "Short abstract.", + "sections": [ + {"heading": "HEADER", "content": "Ignored header"}, + {"heading": "Habitat", "content": "Estuarine."}, + {"heading": "Reproduction", "content": "Broadcast spawner."}, + ], + } + ) + + self.assertEqual(document.metadata["legacy_identifiers"][0]["identifier"], "5192") + self.assertEqual(document.metadata["legacy_identifiers"][0]["authority"], "legacy-ecospecies") + self.assertEqual([node.title for node in document.nodes], ["Summary", "Habitat", "Reproduction"]) + self.assertEqual(document.nodes[1].body, "Estuarine.") + + def test_extract_species_projection_flattens_nested_headings(self) -> None: + document = parse_markdown_document( + """--- +title: American Oyster +common_name: American Oyster +scientific_name: Crassostrea virginica +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 5192 + label: FLELMR +--- + +## Summary +Short abstract. + +## Habitat +General habitat. + +### Type +Estuarine. 
+""" + ) + + projection = extract_species_projection(document) + + self.assertEqual(projection["summary"], "Short abstract.") + self.assertEqual(projection["flelmr_code"], "5192") + self.assertEqual( + [section["heading"] for section in projection["sections"]], + ["Habitat", "Habitat / Type"], + ) + + def test_extract_species_projection_accepts_legacy_species_code_front_matter(self) -> None: + document = parse_markdown_document( + """--- +title: Legacy Fish +common_name: Legacy Fish +scientific_name: Pisces historicus +species_code: 4242 +--- + +## Habitat +Estuarine. +""" + ) + + projection = extract_species_projection(document) + + self.assertEqual(projection["flelmr_code"], "4242") + + def test_validate_markdown_document_rejects_missing_front_matter_and_depth_jump(self) -> None: + errors = validate_markdown_document( + """## Habitat +Text + +#### Type +Nested too deeply. +""" + ) + + self.assertTrue(any("front matter" in error for error in errors)) + self.assertTrue(any("Heading depth jumps" in error for error in errors)) + + def test_extract_citation_entries_strips_legacy_comma_number_prefix(self) -> None: + document = parse_markdown_document( + """--- +title: Alabama Shad +common_name: Alabama Shad +scientific_name: Alosa alabamae +--- + +## References +160, Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390. +""" + ) + + citations = extract_citation_entries(document) + + self.assertEqual(len(citations), 1) + self.assertEqual(citations[0]["legacy_reference_number"], "160") + self.assertTrue(citations[0]["raw_text"].startswith("Daniell, W.C. 1872.")) + self.assertFalse(citations[0]["raw_text"].startswith("160,")) + + def test_extract_citation_entries_accepts_colon_terminated_citation_heading(self) -> None: + citations = extract_citation_entries( + StructuredDocument( + metadata={}, + nodes=[ + DocumentNode( + node_type="section", + title="Citations:", + body="7, Ahmed, M. 1975. Speciation in living oysters.", + depth=2, + ) + ], + ) + ) + + self.assertEqual(len(citations), 1) + self.assertEqual(citations[0]["legacy_reference_number"], "7") + + def test_extract_citation_entries_accepts_bare_number_prefix(self) -> None: + document = parse_markdown_document( + """--- +title: Eastern Mosquitofish +common_name: Eastern Mosquitofish +scientific_name: Gambusia holbrooki +--- + +## Citations +848 Gilmore, R.G. 1977. Fishes of the Indian River Lagoon and adjacent waters, Florida. +""" + ) + + citations = extract_citation_entries(document) + + self.assertEqual(len(citations), 1) + self.assertEqual(citations[0]["legacy_reference_number"], "848") + self.assertTrue(citations[0]["raw_text"].startswith("Gilmore, R.G. 
1977.")) diff --git a/apps/api/tests/test_parser.py b/apps/api/tests/test_parser.py new file mode 100644 index 0000000..a53eedd --- /dev/null +++ b/apps/api/tests/test_parser.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from ecospecies_api import parser + + +class ParserPathResolutionTests(unittest.TestCase): + def test_ecospecies_code_is_treated_as_flelmr_code(self) -> None: + metadata = parser.extract_metadata( + [ + "Title: Test Fish", + "EcoSpecies Code: 4242", + ] + ) + + self.assertEqual(metadata["ecospecies code"], "4242") + self.assertEqual(metadata["flelmr"], "4242") + + def test_title_case_headings_are_split_into_sections(self) -> None: + sections = parser.split_sections( + [ + "Species profile: American oyster (Crassostrea virginica)", + "", + "Classification", + " Phylum: Mollusca", + "Value", + "Commercial: Important fishery.", + "Habitat", + "Type: Estuarine.", + ] + ) + + self.assertEqual( + [section.heading for section in sections], + ["HEADER", "Classification", "Value", "Habitat"], + ) + + def test_colon_terminated_title_case_headings_are_split_into_sections(self) -> None: + sections = parser.split_sections( + [ + "Ecological Interactions and Notes", + "Predator text.", + "", + "Reference Numbers:", + "", + "Citations:", + "7, Ahmed, M. 1975. Speciation in living oysters.", + ] + ) + + self.assertEqual( + [section.heading for section in sections], + ["HEADER", "Citations"], + ) + + def test_default_data_dir_uses_in_repo_path_without_spaces(self) -> None: + with patch.dict("os.environ", {}, clear=True): + resolved = Path(parser.get_default_data_dir()) + + self.assertEqual(resolved, parser.get_repo_root() / "input-data" / "InputFiles") + + def test_relative_override_must_stay_within_repo(self) -> None: + with self.assertRaisesRegex(ValueError, "within the codebase directory"): + parser.resolve_data_dir("../input-data/InputFiles") + + def test_absolute_override_outside_repo_is_rejected(self) -> None: + with tempfile.TemporaryDirectory() as tempdir: + with self.assertRaisesRegex(ValueError, "within the codebase directory"): + parser.resolve_data_dir(tempdir) + + def test_directory_names_with_spaces_are_rejected(self) -> None: + with self.assertRaisesRegex(ValueError, "unsafe directory name"): + parser.resolve_data_dir("input-data/Bad Name") + + def test_directory_names_with_special_characters_are_rejected(self) -> None: + with self.assertRaisesRegex(ValueError, "unsafe directory name"): + parser.resolve_data_dir("input-data/bad@name") + + def test_load_species_records_resolves_repo_relative_paths(self) -> None: + records = parser.load_species_records("input-data/InputFiles") + + self.assertGreater(len(records), 0) + + def test_duplicate_source_records_receive_unique_stable_slugs(self) -> None: + records = parser.load_species_records("input-data/InputFiles") + slug_by_source = {record.source_file: record.slug for record in records} + + self.assertEqual(len(records), len(set(record.slug for record in records))) + self.assertEqual( + slug_by_source["Red Snapper_SLH_Outline2012_0722.txt"], + "red-snapper-red-snapper-slh-outline2012-0722", + ) + self.assertEqual( + slug_by_source["RedSnapper_SLH_2012_0830_combined.txt"], + "red-snapper-redsnapper-slh-2012-0830-combined", + ) + self.assertEqual( + slug_by_source["Sailfin Molly SLH RGG.txt"], + "sailfin-molly-sailfin-molly-slh-rgg", + ) + self.assertTrue( + slug_by_source["Sailfin_Molly SLH RGG.txt"].startswith( + 
"sailfin-molly-sailfin-molly-slh-rgg-" + ) + ) diff --git a/apps/api/tests/test_repository.py b/apps/api/tests/test_repository.py index 1219234..bab05a6 100644 --- a/apps/api/tests/test_repository.py +++ b/apps/api/tests/test_repository.py @@ -112,6 +112,35 @@ class RepositoryWorkflowTests(unittest.TestCase): self.assertEqual(detail["section_count"], 2) self.assertEqual([section["position"] for section in detail["sections"]], [1, 2]) self.assertEqual([item["code"] for item in detail["diagnostics"]], ["missing_citations"]) + self.assertEqual( + detail["legacy_identifiers"], + [ + { + "authority": "legacy-ecospecies", + "identifier": "9999", + "label": "FLELMR", + } + ], + ) + + def test_species_detail_includes_structured_document_and_legacy_source(self) -> None: + input_dir = Path(self.tempdir.name) / "input-data" / "InputFiles" + input_dir.mkdir(parents=True, exist_ok=True) + (input_dir / "Test Shad.txt").write_text("HEADER\nLegacy header content\n", encoding="utf-8") + + with patch.object(repository, "get_default_data_dir", return_value=str(input_dir)): + detail = repository.get_species_by_slug("test-shad") + + self.assertIsNotNone(detail) + assert detail is not None + self.assertEqual(detail["structured_document"]["source_format"], "ecospecies-markdown-v1") + self.assertIn( + "HABITAT", + [node["title"] for node in detail["structured_document"]["ast"]["nodes"]], + ) + self.assertEqual(detail["legacy_source"]["source_file"], "Test Shad.txt") + self.assertIn("Legacy header content", detail["legacy_source"]["text"]) + self.assertEqual(detail["taxon_identifiers"], []) def test_editorial_update_changes_publication_visibility_and_creates_audit(self) -> None: result = repository.update_species_editorial( @@ -207,6 +236,60 @@ class RepositoryWorkflowTests(unittest.TestCase): self.assertEqual(len(audit), 2) self.assertEqual([entry["action"] for entry in audit], ["section_update", "editorial_update"]) + def test_reimport_preserves_persisted_taxon_identifiers(self) -> None: + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad +common_name: Test Shad +scientific_name: Alosa testus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 9999 + label: FLELMR +taxon_identifiers: + - authority: gbif + identifier: 12345 + label: taxonKey + primary: true +primary_taxon_authority: gbif +--- + +## Summary +Taxon-reviewed summary. 
+""", + username="edith", + ) + + repository.import_species_payload(UPDATED_PAYLOAD) + + detail = repository.get_editor_species_detail("test-shad") + + self.assertIsNotNone(detail) + self.assertEqual(detail["primary_taxon_authority"], "gbif") + self.assertEqual( + detail["primary_taxon_identifier"], + { + "authority": "gbif", + "identifier": "12345", + "label": "taxonKey", + "primary": True, + "source_url": "", + }, + ) + self.assertEqual( + detail["taxon_identifiers"], + [ + { + "authority": "gbif", + "identifier": "12345", + "label": "taxonKey", + "primary": True, + "source_url": "", + } + ], + ) + def test_reimport_updates_summary_when_no_editorial_override_exists(self) -> None: repository.import_species_payload(UPDATED_PAYLOAD) @@ -302,6 +385,583 @@ class RepositoryWorkflowTests(unittest.TestCase): self.assertEqual(audit[0]["action"], "import_restore") self.assertEqual(audit[0]["details"]["is_archived"], {"from": True, "to": False}) + def test_document_markdown_update_refreshes_flat_projection(self) -> None: + result = repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +species_code: 4242 +--- + +## Summary +Markdown summary. + +## Habitat +Open water. + +### Type +Pelagic. +""", + username="frank", + ) + + detail = repository.get_editor_species_detail("test-shad") + document = repository.get_species_document("test-shad") + audit = repository.list_species_audit("test-shad") + + self.assertIsNotNone(result) + self.assertIsNotNone(detail) + self.assertIsNotNone(document) + self.assertEqual(detail["title"], "Test Shad Markdown") + self.assertEqual(detail["scientific_name"], "Alosa markdownus") + self.assertEqual(detail["flelmr_code"], "4242") + self.assertEqual(detail["summary"], "Markdown summary.") + self.assertEqual( + [section["heading"] for section in detail["sections"]], + ["Habitat", "Habitat / Type"], + ) + self.assertEqual(document["updated_by"], "frank") + self.assertIsNotNone(audit) + self.assertEqual(audit[0]["action"], "document_update") + + def test_document_markdown_update_extracts_citations(self) -> None: + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 4242 + label: FLELMR +--- + +## Summary +Markdown summary. + +## References + +- Smith, J. 2024. Example paper. doi:10.1000/example-doi +- [7] Jones, A. 2022. Fisheries review. 
+""", + username="frank", + ) + + detail = repository.get_editor_species_detail("test-shad") + + self.assertIsNotNone(detail) + self.assertEqual(detail["citation_count"], 2) + self.assertEqual(detail["citations"][0]["section_heading"], "References") + self.assertEqual(detail["citations"][0]["legacy_reference_number"], "") + self.assertEqual(detail["citations"][0]["doi"], "10.1000/example-doi") + self.assertTrue(detail["citations"][0]["citation_key"]) + self.assertIn("@", detail["citations"][0]["draft_bibtex"]) + self.assertEqual(detail["citations"][0]["review_status"], "draft") + self.assertEqual(detail["citations"][1]["legacy_reference_number"], "7") + self.assertEqual(detail["citations"][1]["doi"], "") + self.assertIn("ecospecies_reference_number = \\{7\\}", detail["citations"][1]["draft_bibtex"]) + + def test_editor_can_review_citations_and_reviews_survive_document_save(self) -> None: + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 4242 + label: FLELMR +--- + +## References + +- [7] Jones, A. 2022. Fisheries review. +""", + username="frank", + ) + + citations = repository.get_editor_species_citations("test-shad") + self.assertIsNotNone(citations) + citation = citations["citations"][0] + + result = repository.update_species_citation_review( + slug="test-shad", + citation_id=citation["id"], + review_status="accepted", + normalized_text="Jones, A. (2022). Fisheries review.", + doi="10.1000/review-doi", + citation_key="jones2022review", + entry_type="article", + draft_bibtex="@article{jones2022review,\n doi = {10.1000/review-doi}\n}", + username="edith", + ) + + self.assertIsNotNone(result) + self.assertEqual(result["citation"]["review_status"], "accepted") + self.assertEqual(result["citation"]["source_type"], "editor_review") + + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 4242 + label: FLELMR +--- + +## References + +- [7] Jones, A. 2022. Fisheries review. +""", + username="frank", + ) + + citations = repository.get_editor_species_citations("test-shad") + audit = repository.list_species_audit("test-shad") + + self.assertIsNotNone(citations) + self.assertEqual(citations["citation_count"], 1) + self.assertEqual(citations["citations"][0]["review_status"], "accepted") + self.assertEqual(citations["citations"][0]["doi"], "10.1000/review-doi") + self.assertEqual(citations["citations"][0]["citation_key"], "jones2022review") + self.assertEqual(citations["citations"][0]["entry_type"], "article") + self.assertIn("10.1000/review-doi", citations["citations"][0]["draft_bibtex"]) + self.assertIsNotNone(audit) + self.assertEqual(audit[1]["action"], "citation_review_update") + + def test_editor_can_run_citation_enrichment(self) -> None: + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 4242 + label: FLELMR +--- + +## References + +- [7] Jones, A. 2022. Fisheries review. 
+""", + username="frank", + ) + citations = repository.get_editor_species_citations("test-shad") + self.assertIsNotNone(citations) + citation = citations["citations"][0] + + with patch.object( + repository, + "enrich_citation_payload", + return_value={ + "citation_key": "jones2022review", + "entry_type": "article", + "normalized_text": "Jones, A. (2022). Fisheries review. Journal of Tests. DOI:10.1000/review-doi", + "draft_bibtex": "@article{jones2022review,\n doi = {10.1000/review-doi},\n}", + "doi": "10.1000/review-doi", + "source_url": "https://doi.org/10.1000/review-doi", + "openalex_id": "W12345", + "resolver_source_label": "crossref:doi:10.1000/review-doi", + "enrichment_status": "resolved", + "enrichment_error": "", + "conflicts": [], + }, + ): + result = repository.update_species_citation_enrichment( + slug="test-shad", + citation_id=citation["id"], + username="edith", + ) + + self.assertIsNotNone(result) + self.assertEqual(result["citation"]["enrichment_status"], "resolved") + self.assertEqual(result["citation"]["doi"], "10.1000/review-doi") + self.assertEqual(result["citation"]["openalex_id"], "W12345") + self.assertEqual(result["citation"]["resolver_source_label"], "crossref:doi:10.1000/review-doi") + self.assertEqual(result["citation"]["source_url"], "https://doi.org/10.1000/review-doi") + + citations = repository.get_editor_species_citations("test-shad") + audit = repository.list_species_audit("test-shad") + + self.assertIsNotNone(citations) + self.assertEqual(citations["citations"][0]["citation_key"], "jones2022review") + self.assertEqual(citations["citations"][0]["entry_type"], "article") + self.assertEqual(citations["citations"][0]["enrichment_status"], "resolved") + self.assertIsNotNone(audit) + self.assertEqual(audit[0]["action"], "citation_enrichment") + + def test_editor_can_run_batch_citation_enrichment(self) -> None: + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 4242 + label: FLELMR +--- + +## References + +- [7] Jones, A. 2022. Fisheries review. +- [8] Smith, B. 2021. Estuarine habitat paper. +""", + username="frank", + ) + + payloads = [ + { + "citation_key": "jones2022review", + "entry_type": "article", + "normalized_text": "Jones, A. (2022). 
Fisheries review.", + "draft_bibtex": "@article{jones2022review,\n}", + "doi": "10.1000/review-doi", + "source_url": "https://doi.org/10.1000/review-doi", + "openalex_id": "W12345", + "resolver_source_label": "crossref:doi:10.1000/review-doi", + "enrichment_status": "resolved", + "enrichment_error": "", + "conflicts": [], + }, + { + "citation_key": "smith2021estuarine", + "entry_type": "misc", + "normalized_text": "", + "draft_bibtex": "", + "doi": "", + "source_url": "", + "openalex_id": "", + "resolver_source_label": "", + "enrichment_status": "unresolved", + "enrichment_error": "No metadata match found from DOI, title, or authority identifiers.", + "conflicts": [], + }, + ] + + with patch.object(repository, "enrich_citation_payload", side_effect=payloads): + result = repository.update_species_citations_enrichment_batch( + slug="test-shad", + username="edith", + ) + + self.assertIsNotNone(result) + self.assertEqual(result["citation_count"], 2) + self.assertEqual(result["changed_count"], 2) + self.assertEqual(result["resolved_count"], 1) + self.assertEqual(result["unresolved_count"], 1) + self.assertEqual(result["error_count"], 0) + + def test_editor_can_review_and_apply_citation_candidates(self) -> None: + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 4242 + label: FLELMR +--- + +## References + +- [7] Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390. +""", + username="frank", + ) + citations = repository.get_editor_species_citations("test-shad") + self.assertIsNotNone(citations) + citation = citations["citations"][0] + + with patch.object( + repository, + "discover_citation_candidates", + return_value={ + "seed": { + "fields": { + "author": "Daniell, W.C.", + "year": "1872", + "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River", + "journal": "Comm. Rept. U.S. Comm. Fish & Fish.", + "volume": "2", + "pages": "387-390", + } + }, + "candidate_count": 1, + "candidates": [ + { + "candidate_id": "crossref-search-1-daniell-good", + "source_label": "crossref:search:1:daniell-good", + "entry_type": "article", + "citation_key": "daniell1872lettersreferringexperiments", + "fields": { + "author": "Daniell, W.C.", + "year": "1872", + "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River", + "journal": "Comm. Rept. U.S. Comm. Fish & Fish.", + "volume": "2", + "pages": "387-390", + }, + } + ], + }, + ): + candidates = repository.get_species_citation_candidates("test-shad", citation["id"]) + + self.assertIsNotNone(candidates) + self.assertEqual(candidates["candidate_count"], 1) + + result = repository.apply_species_citation_candidate_selection( + slug="test-shad", + citation_id=citation["id"], + candidate={ + "source_label": "crossref:search:1:daniell-good", + "entry_type": "article", + "fields": { + "author": "Daniell, W.C.", + "year": "1872", + "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River", + "journal": "Comm. Rept. U.S. Comm. 
Fish & Fish.", + "volume": "2", + "pages": "387-390", + }, + }, + username="edith", + ) + + self.assertIsNotNone(result) + self.assertEqual(result["citation"]["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good") + self.assertEqual(result["citation"]["source_type"], "editor_selected_candidate") + self.assertEqual(result["citation"]["review_status"], "accepted") + audit = repository.list_species_audit("test-shad") + self.assertIsNotNone(audit) + self.assertEqual(audit[0]["action"], "citation_candidate_selection") + + def test_editor_can_add_candidate_as_additional_citation_and_preserve_it(self) -> None: + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 4242 + label: FLELMR +--- + +## References + +- [7] Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390. +""", + username="frank", + ) + citations = repository.get_editor_species_citations("test-shad") + self.assertIsNotNone(citations) + source_citation = citations["citations"][0] + + result = repository.add_species_citation_from_candidate( + slug="test-shad", + citation_id=source_citation["id"], + candidate={ + "source_label": "crossref:search:1:daniell-related", + "entry_type": "article", + "fields": { + "author": "Jordan, F.", + "year": "2009", + "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA", + "journal": "Transactions of the American Fisheries Society", + "volume": "19", + "number": "1", + "pages": "107-115", + "doi": "10.1111/j.1600-0633.2009.00395.x", + "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x", + }, + }, + username="edith", + ) + + self.assertIsNotNone(result) + self.assertEqual(result["citation"]["source_type"], "editor_added_candidate") + self.assertEqual(result["citation"]["review_status"], "accepted") + + citations = repository.get_editor_species_citations("test-shad") + self.assertIsNotNone(citations) + self.assertEqual(citations["citation_count"], 2) + self.assertEqual(citations["citations"][1]["section_heading"], "References") + document = repository.get_species_document("test-shad") + self.assertIsNotNone(document) + self.assertIn("Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA", document["markdown"]) + + repository.update_species_document_markdown( + slug="test-shad", + markdown=document["markdown"], + username="frank", + ) + + citations = repository.get_editor_species_citations("test-shad") + self.assertIsNotNone(citations) + self.assertEqual(citations["citation_count"], 2) + self.assertEqual(citations["citations"][1]["source_type"], "editor_added_candidate") + audit = repository.list_species_audit("test-shad") + self.assertIsNotNone(audit) + self.assertEqual(audit[0]["action"], "document_update") + self.assertEqual(audit[1]["action"], "citation_candidate_addition") + + def test_contributor_can_view_only_owned_citations(self) -> None: + created = repository.create_contributor_species( + "writer@example.org", + """--- +title: Contributor Draft +common_name: Contributor Fish +scientific_name: Pisces contributoris +species_code: +--- + +## References + +- [12] Example, A. 2025. Draft reference. 
+""", + ) + + owned = repository.get_contributor_species_citations(created["slug"], "writer@example.org") + other = repository.get_contributor_species_citations(created["slug"], "other@example.org") + + self.assertIsNotNone(owned) + self.assertEqual(owned["citation_count"], 1) + self.assertEqual(owned["citations"][0]["legacy_reference_number"], "12") + self.assertIsNone(other) + + def test_public_bibliography_aggregates_species_citations(self) -> None: + repository.update_species_document_markdown( + slug="test-shad", + markdown="""--- +title: Test Shad Markdown +common_name: Test Shad +scientific_name: Alosa markdownus +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 4242 + label: FLELMR +--- + +## References + +- [7] Jones, A. 2022. Fisheries review. +""", + username="frank", + ) + + citations = repository.get_editor_species_citations("test-shad") + self.assertIsNotNone(citations) + citation = citations["citations"][0] + repository.update_species_citation_review( + slug="test-shad", + citation_id=citation["id"], + review_status="accepted", + normalized_text="Jones, A. (2022). Fisheries review.", + doi="10.1000/review-doi", + citation_key="jones2022review", + entry_type="article", + draft_bibtex="@article{jones2022review,\n doi = {10.1000/review-doi}\n}", + username="edith", + abstract_text="A short abstract about fisheries review.", + ) + + bibliography = repository.list_public_bibliography() + + self.assertEqual(len(bibliography), 1) + self.assertEqual(bibliography[0]["citation_key"], "jones2022review") + self.assertEqual(bibliography[0]["abstract_text"], "A short abstract about fisheries review.") + self.assertEqual(bibliography[0]["legacy_reference_numbers"], ["7"]) + self.assertEqual(bibliography[0]["species_count"], 1) + self.assertEqual(bibliography[0]["species_refs"][0]["slug"], "test-shad") + + def test_register_contributor_creates_token_and_enforces_age_gate(self) -> None: + with self.assertRaisesRegex(ValueError, "at least 13 years old"): + repository.register_contributor("person@example.org", False) + + result = repository.register_contributor("Person@Example.org", True) + + self.assertEqual(result["username"], "person@example.org") + self.assertEqual(result["role"], "contributor") + self.assertEqual(result["minimum_age"], 13) + self.assertTrue(result["token"]) + + def test_contributor_can_create_and_edit_only_owned_species(self) -> None: + created = repository.create_contributor_species( + "writer@example.org", + """--- +title: Contributor Draft +common_name: Contributor Fish +scientific_name: Pisces contributoris +species_code: +--- + +## Summary +Draft summary. + +## Habitat +Mangroves. +""", + ) + + detail = repository.get_contributor_species_detail(created["slug"], "writer@example.org") + public_detail = repository.get_species_by_slug(created["slug"]) + + self.assertIsNotNone(detail) + self.assertIsNone(public_detail) + self.assertEqual(detail["publication_status"], "draft") + self.assertEqual(detail["common_name"], "Contributor Fish") + + updated = repository.update_contributor_species_document_markdown( + created["slug"], + """--- +title: Contributor Draft Revised +common_name: Contributor Fish +scientific_name: Pisces contributoris +species_code: +--- + +## Summary +Revised summary. + +## Habitat +Seagrass. + +### Depth +Shallow bays. 
+""", + "writer@example.org", + ) + + self.assertIsNotNone(updated) + detail = repository.get_contributor_species_detail(created["slug"], "writer@example.org") + other_user_detail = repository.get_contributor_species_detail(created["slug"], "other@example.org") + audit = repository.list_species_audit(created["slug"]) + + self.assertIsNotNone(detail) + self.assertEqual(detail["summary"], "Revised summary.") + self.assertEqual( + [section["heading"] for section in detail["sections"]], + ["Habitat", "Habitat / Depth"], + ) + self.assertIsNone(other_user_detail) + self.assertIsNotNone(audit) + self.assertEqual(audit[0]["action"], "contributor_document_update") + if __name__ == "__main__": unittest.main() diff --git a/apps/web/app.js b/apps/web/app.js index 454eae8..d94d50b 100644 --- a/apps/web/app.js +++ b/apps/web/app.js @@ -1,4 +1,20 @@ -const apiBase = ""; +function getAppBase() { + const { pathname } = window.location; + if (pathname === "/" || pathname === "/index.html") { + return ""; + } + if (pathname.endsWith("/index.html")) { + return pathname.slice(0, -"/index.html".length); + } + return pathname.endsWith("/") ? pathname.slice(0, -1) : pathname; +} + +function getInitialSpeciesSlug() { + const hash = window.location.hash.replace(/^#/, "").trim(); + return hash || ""; +} + +const apiBase = getAppBase(); const speciesList = document.querySelector("#species-list"); const searchInput = document.querySelector("#search"); @@ -12,26 +28,373 @@ const detailArchiveNote = document.querySelector("#detail-archive-note"); const detailScientificName = document.querySelector("#detail-scientific-name"); const detailSummary = document.querySelector("#detail-summary"); const detailSections = document.querySelector("#detail-sections"); +const legacyPanel = document.querySelector("#legacy-panel"); +const legacySourceMeta = document.querySelector("#legacy-source-meta"); +const legacySourceText = document.querySelector("#legacy-source-text"); const speciesCount = document.querySelector("#species-count"); const sectionCount = document.querySelector("#section-count"); const authTokenInput = document.querySelector("#auth-token"); const authSaveButton = document.querySelector("#auth-save"); const authClearButton = document.querySelector("#auth-clear"); const authStatus = document.querySelector("#auth-status"); +const contributorEmailInput = document.querySelector("#contributor-email"); +const contributorAgeGate = document.querySelector("#contributor-age-gate"); +const contributorAgeLabel = document.querySelector("#contributor-age-label"); +const contributorRegisterButton = document.querySelector("#contributor-register"); +const contributorStatus = document.querySelector("#contributor-status"); +const contributorCreateButton = document.querySelector("#contributor-create"); +const accessPanel = document.querySelector("#access-panel"); const editorPanel = document.querySelector("#editor-panel"); const editorPublicationStatus = document.querySelector("#editor-publication-status"); -const editorSummary = document.querySelector("#editor-summary"); const editorNotes = document.querySelector("#editor-notes"); const editorIsArchived = document.querySelector("#editor-is-archived"); const editorSaveButton = document.querySelector("#editor-save"); const editorStatus = document.querySelector("#editor-status"); +const documentPanel = document.querySelector("#document-panel"); +const documentMarkdown = document.querySelector("#document-markdown"); +const documentPreview = document.querySelector("#document-preview"); 
+const documentSaveButton = document.querySelector("#document-save"); +const documentStatus = document.querySelector("#document-status"); +const citationPanel = document.querySelector("#citation-panel"); +const citationStatus = document.querySelector("#citation-status"); +const citationList = document.querySelector("#citation-list"); +const citationBackfillSpeciesButton = document.querySelector("#citation-backfill-species"); +const citationEnrichAllButton = document.querySelector("#citation-enrich-all"); +const citationMatchDialog = document.querySelector("#citation-match-dialog"); +const citationMatchSeed = document.querySelector("#citation-match-seed"); +const citationMatchCandidates = document.querySelector("#citation-match-candidates"); +const citationMatchStatus = document.querySelector("#citation-match-status"); +const citationMatchCloseButton = document.querySelector("#citation-match-close"); const auditPanel = document.querySelector("#audit-panel"); const auditList = document.querySelector("#audit-list"); +const collapsibleToggles = document.querySelectorAll(".collapsible-toggle"); let currentItems = []; let currentSlug = null; let currentSession = null; let currentArchiveFilter = "active"; +let currentCitationMatch = null; +let currentSpeciesCitations = []; +let workflowPanelState = { + "legacy-panel": false, + "access-panel": false, + "editor-panel": false, + "document-panel": false, + "citation-panel": false, + "audit-panel": false, +}; + +function setCollapsibleState(panel, expanded) { + if (!panel) { + return; + } + panel.classList.toggle("collapsed", !expanded); + const toggle = panel.querySelector(".collapsible-toggle"); + if (!toggle) { + return; + } + const label = toggle.dataset.label || panel.dataset.label || "Section"; + toggle.textContent = `${expanded ? "Hide" : "Show"} ${label}`; + toggle.setAttribute("aria-expanded", expanded ? 
"true" : "false"); + workflowPanelState[panel.id] = expanded; +} + +function collapseWorkflowPanels() { + [legacyPanel, accessPanel, editorPanel, documentPanel, citationPanel, auditPanel].forEach((panel) => { + setCollapsibleState(panel, false); + }); +} + +function expandCitationPanel() { + setCollapsibleState(citationPanel, true); +} + +function restoreWorkflowPanels() { + [legacyPanel, accessPanel, editorPanel, documentPanel, citationPanel, auditPanel].forEach((panel) => { + if (!panel || panel.classList.contains("hidden")) { + return; + } + setCollapsibleState(panel, Boolean(workflowPanelState[panel.id])); + }); +} + +function renderStructuredBody(body) { + const trimmed = String(body || "").trim(); + if (!trimmed) { + return ""; + } + + const paragraphs = trimmed + .split(/\n\s*\n/) + .map((paragraph) => paragraph.trim()) + .filter(Boolean); + + return paragraphs + .map((paragraph) => { + const html = escapeHtml(paragraph).replace(/\n/g, "<br>"); + return `<p class="structured-node-body">${html}</p>`; + }) + .join(""); +} + +function isCitationHeading(title) { + const normalized = String(title || "").trim().replace(/:$/, "").toLowerCase(); + return [ + "reference numbers", + "references", + "reference", + "citations", + "citation", + "bibliography", + "related references", + "related citations", + ].includes(normalized); +} + +function parseBibtexFields(draftBibtex) { + const fields = {}; + const text = String(draftBibtex || ""); + const pattern = /([a-zA-Z_]+)\s*=\s*\{([^}]*)\}/g; + let match = pattern.exec(text); + while (match) { + fields[match[1].toLowerCase()] = match[2].trim(); + match = pattern.exec(text); + } + return fields; +} + +function collectBibtexRecords(items) { + const seen = new Set(); + const records = []; + for (const item of items || []) { + const draftBibtex = String(item && item.draft_bibtex ? item.draft_bibtex : "").trim(); + if (!draftBibtex || seen.has(draftBibtex)) { + continue; + } + seen.add(draftBibtex); + records.push(draftBibtex); + } + return records; +} + +function sanitizeFilenamePart(value, fallback = "records") { + const cleaned = String(value || "") + .trim() + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); + return cleaned || fallback; +} + +function downloadBibtexRecords(items, filenameStem) { + const records = collectBibtexRecords(items); + if (!records.length) { + return false; + } + const blob = new Blob([`${records.join("\n\n")}\n`], { type: "application/x-bibtex;charset=utf-8" }); + const url = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = url; + link.download = `${sanitizeFilenamePart(filenameStem)}.bib`; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + window.setTimeout(() => URL.revokeObjectURL(url), 0); + return true; +} + +function buildPublicCitationText(item) { + const fields = parseBibtexFields(item.draft_bibtex || ""); + if (item.normalized_text) { + return escapeHtml(String(item.normalized_text)); + } + + const author = fields.author || ""; + const year = fields.year || ""; + const title = fields.title || ""; + const venue = fields.journal || fields.booktitle || fields.publisher || ""; + const volume = fields.volume || ""; + const issue = fields.number || ""; + const pages = fields.pages || ""; + + const parts = []; + const lead = [author, year ? `(${year})` : ""].filter(Boolean).join(" "); + if (lead) { + parts.push(lead); + } + if (title) { + parts.push(title); + } + const venueBits = [venue, volume ? `${volume}${issue ? 
`(${issue})` : ""}` : issue ? `(${issue})` : "", pages] + .filter(Boolean) + .join(", "); + if (venueBits) { + parts.push(venueBits); + } + + return escapeHtml(parts.join(". ").trim() || String(item.raw_text || "")); +} + +function renderPublicCitationEntry(item) { + const fields = parseBibtexFields(item.draft_bibtex || ""); + const meta = [ + item.legacy_reference_number ? `Imported reference ${escapeHtml(item.legacy_reference_number)}` : "", + item.source_type === "editor_added_candidate" ? "Added citation" : "", + item.source_type === "editor_selected_candidate" ? "Reviewed citation" : "", + ] + .filter(Boolean) + .join(" • "); + + const links = [ + item.doi ? `<a href="https://doi.org/${encodeURIComponent(String(item.doi).replace(/^https?:\/\/doi\.org\//, ""))}" target="_blank" rel="noopener noreferrer">DOI</a>` : "", + item.source_url ? `<a href="${escapeHtml(item.source_url)}" target="_blank" rel="noopener noreferrer">Source</a>` : "", + item.openalex_id ? `<a href="https://openalex.org/${escapeHtml(String(item.openalex_id).replace(/^https?:\/\/openalex\.org\//, ""))}" target="_blank" rel="noopener noreferrer">OpenAlex</a>` : "", + ] + .filter(Boolean) + .join(" · "); + + return ` + <article class="public-citation-entry"> + <p class="public-citation-text">${buildPublicCitationText(item)}</p> + ${meta ? `<p class="public-citation-meta">${meta}</p>` : ""} + ${links ? `<p class="public-citation-links">${links}</p>` : ""} + ${renderCitationAbstractBlock(item.abstract_text || fields.abstract || "", false)} + </article> + `; +} + +function buildPublicBibliographyMarkup(citations, filenameStem) { + const records = collectBibtexRecords(citations); + const downloadButton = ` + <div class="public-bibliography-actions"> + <button + type="button" + class="secondary-button bibliography-download-button" + data-filename-stem="${escapeHtml(filenameStem)}" + ${records.length ? "" : "disabled"} + > + Download BibTeX + </button> + <p class="public-bibliography-note"> + ${records.length ? `${records.length} BibTeX record${records.length === 1 ? "" : "s"} available for download.` : "No BibTeX records are available for download yet."} + </p> + </div> + `; + + return Array.isArray(citations) && citations.length + ? `${downloadButton}<div class="public-citation-list">${citations.map((item) => renderPublicCitationEntry(item)).join("")}</div>` + : `${downloadButton}<p class="structured-node-body">No extracted bibliography entries are available yet.</p>`; +} + +function renderStructuredNodes(nodes, container, citations, renderState = { renderedBibliography: false }) { + for (const node of nodes || []) { + const rawTitle = String(node.title || "").trim() || "Untitled section"; + const isCitationSection = isCitationHeading(rawTitle); + if (isCitationSection && renderState.renderedBibliography) { + continue; + } + + const sectionEl = document.createElement("section"); + sectionEl.className = "detail-section structured-node"; + + const depth = Number(node.depth || 2); + const headingLevel = Math.min(6, Math.max(3, depth + 1)); + const title = escapeHtml(isCitationHeading(rawTitle) ? "Bibliography" : rawTitle); + const body = String(node.body || "").trim(); + const children = Array.isArray(node.children) ? node.children : []; + const citationMarkup = isCitationSection + ? buildPublicBibliographyMarkup(citations, `${currentSlug || "ecospecies"}-bibliography`) + : ""; + + sectionEl.innerHTML = ` + <h${headingLevel}>${title}</h${headingLevel}> + ${isCitationSection ? 
citationMarkup : renderStructuredBody(body)} + ${children.length ? '<div class="structured-node-children"></div>' : ""} + `; + + if (isCitationSection) { + renderState.renderedBibliography = true; + } + + if (children.length) { + renderStructuredNodes(children, sectionEl.querySelector(".structured-node-children"), citations, renderState); + } + + container.appendChild(sectionEl); + } +} + +function renderPrimaryContent(data) { + detailSections.innerHTML = ""; + + if (data.diagnostics.length) { + const diagnosticsEl = document.createElement("section"); + diagnosticsEl.className = "detail-section detail-diagnostics"; + diagnosticsEl.innerHTML = ` + <h3>Ingest Diagnostics</h3> + <ul class="diagnostic-list"> + ${data.diagnostics + .map( + (diagnostic) => + `<li><strong>${escapeHtml(diagnostic.code)}</strong>: ${escapeHtml(diagnostic.message)}</li>`, + ) + .join("")} + </ul> + `; + detailSections.appendChild(diagnosticsEl); + } + + const structuredNodes = + data.structured_document && + data.structured_document.ast && + Array.isArray(data.structured_document.ast.nodes) + ? data.structured_document.ast.nodes.filter( + (node) => String(node.title || "").trim().toLowerCase() !== "summary", + ) + : []; + + if (structuredNodes.length) { + renderStructuredNodes(structuredNodes, detailSections, data.citations || [], { renderedBibliography: false }); + attachCitationToggleControls(detailSections); + const downloadButton = detailSections.querySelector(".bibliography-download-button"); + if (downloadButton) { + downloadButton.addEventListener("click", () => { + const downloaded = downloadBibtexRecords(data.citations || [], `${data.slug || currentSlug || "ecospecies"}-bibliography`); + const note = detailSections.querySelector(".public-bibliography-note"); + if (note && !downloaded) { + note.textContent = "No BibTeX records are available for download yet."; + } + }); + } + return; + } + + for (const section of data.sections) { + const sectionEl = document.createElement("section"); + sectionEl.className = "detail-section"; + sectionEl.innerHTML = ` + <h3>${escapeHtml(section.heading)}</h3> + <pre>${escapeHtml(section.content)}</pre> + `; + detailSections.appendChild(sectionEl); + } +} + +function renderLegacySource(data) { + const legacySource = data.legacy_source; + const hasLegacySource = Boolean(legacySource && String(legacySource.text || "").trim()); + legacyPanel.classList.toggle("hidden", !hasLegacySource); + if (!hasLegacySource) { + legacySourceMeta.textContent = ""; + legacySourceText.textContent = ""; + return; + } + + legacySourceMeta.textContent = legacySource.source_file + ? 
`Original imported file: ${legacySource.source_file}` + : "Original imported legacy material"; + legacySourceText.textContent = String(legacySource.text || ""); +} function getAuthToken() { return window.localStorage.getItem("ecospecies_auth_token") || ""; @@ -45,10 +408,522 @@ function getAuthHeaders() { function escapeHtml(value) { return value .replaceAll("&", "&amp;") + .replaceAll('"', "&quot;") .replaceAll("<", "&lt;") .replaceAll(">", "&gt;"); } +function normalizeAbstractForDisplay(value) { + const raw = String(value || "").trim(); + if (!raw) { + return ""; + } + const temp = document.createElement("div"); + temp.innerHTML = raw; + return temp.textContent + .replace(/^abstract\s*[:.\-]?\s*/i, "") + .replace(/\s+/g, " ") + .trim(); +} + +function parseMarkdownFrontMatter(markdown) { + const stripped = markdown.trimStart(); + if (!stripped.startsWith("---\n")) { + return { metadata: {}, body: markdown }; + } + + const remainder = stripped.slice(4); + const separatorIndex = remainder.indexOf("\n---\n"); + if (separatorIndex === -1) { + return { metadata: {}, body: markdown }; + } + + const metadataBlock = remainder.slice(0, separatorIndex); + const body = remainder.slice(separatorIndex + 5); + const metadata = {}; + + for (const line of metadataBlock.split("\n")) { + const separator = line.indexOf(":"); + if (separator === -1) { + continue; + } + const key = line.slice(0, separator).trim(); + const value = line.slice(separator + 1).trim(); + if (key) { + metadata[key] = value; + } + } + + return { metadata, body }; +} + +function renderDocumentPreview(markdown) { + const { metadata, body } = parseMarkdownFrontMatter(markdown); + const headings = body + .split("\n") + .map((line) => { + const match = line.match(/^(#{2,6})\s+(.+?)\s*$/); + if (!match) { + return null; + } + return { + depth: match[1].length, + title: match[2].trim(), + }; + }) + .filter(Boolean); + + if (!headings.length && !Object.keys(metadata).length) { + documentPreview.innerHTML = `<p class="document-preview-empty">No headings detected yet.</p>`; + return; + } + + const metadataItems = Object.entries(metadata) + .map(([key, value]) => `<li><strong>${escapeHtml(key)}</strong>: ${escapeHtml(value)}</li>`) + .join(""); + + const headingItems = headings + .map( + (heading) => + `<li style="margin-left:${Math.max(0, heading.depth - 2) * 18}px">${escapeHtml(heading.title)}</li>`, + ) + .join(""); + + documentPreview.innerHTML = ` + ${metadataItems ? `<ul class="document-preview-metadata">${metadataItems}</ul>` : ""} + ${headingItems ? `<ol class="document-preview-list">${headingItems}</ol>` : '<p class="document-preview-empty">No headings detected yet.</p>'} + `; +} + +function renderCitationList(items, editable) { + citationList.innerHTML = ""; + if (!items.length) { + citationList.innerHTML = `<p class="editor-status">No citations have been extracted yet.</p>`; + return; + } + + for (const item of items) { + const article = document.createElement("article"); + article.className = "citation-entry"; + + const readOnlyMeta = [ + item.section_heading ? `Section: ${escapeHtml(item.section_heading)}` : "", + item.legacy_reference_number + ? `Legacy reference: ${escapeHtml(item.legacy_reference_number)}` + : "", + item.source_type ? `Source: ${escapeHtml(item.source_type)}` : "", + item.enrichment_status ?
`Enrichment: ${escapeHtml(item.enrichment_status)}` : "", + ] + .filter(Boolean) + .join(" • "); + + if (!editable) { + article.innerHTML = ` + <p class="citation-entry-meta">${readOnlyMeta}</p> + <p class="citation-entry-raw">${escapeHtml(item.raw_text || "")}</p> + <p class="citation-entry-meta">Review status: ${escapeHtml(item.review_status || "draft")}</p> + ${item.doi ? `<p class="citation-entry-meta">DOI: ${escapeHtml(item.doi)}</p>` : ""} + ${item.openalex_id ? `<p class="citation-entry-meta">OpenAlex: ${escapeHtml(item.openalex_id)}</p>` : ""} + ${item.resolver_source_label ? `<p class="citation-entry-meta">Resolver source: ${escapeHtml(item.resolver_source_label)}</p>` : ""} + ${renderCitationAbstractBlock(item.abstract_text || "", false)} + ${item.enrichment_error ? `<p class="citation-entry-meta error">${escapeHtml(item.enrichment_error)}</p>` : ""} + ${renderCitationBibtexBlock(item.draft_bibtex || "", false)} + `; + attachCitationToggleControls(article); + citationList.appendChild(article); + continue; + } + + article.innerHTML = ` + <p class="citation-entry-meta">${readOnlyMeta}</p> + <p class="citation-entry-raw">${escapeHtml(item.raw_text || "")}</p> + <label class="editor-label">Review Status</label> + <select class="citation-review-status"> + <option value="draft"${item.review_status === "draft" ? " selected" : ""}>Draft</option> + <option value="reviewed"${item.review_status === "reviewed" ? " selected" : ""}>Reviewed</option> + <option value="accepted"${item.review_status === "accepted" ? " selected" : ""}>Accepted</option> + <option value="rejected"${item.review_status === "rejected" ? " selected" : ""}>Rejected</option> + </select> + <label class="editor-label">DOI</label> + <input class="citation-doi" type="text" value="${escapeHtml(item.doi || "")}"> + <label class="editor-label">OpenAlex ID</label> + <input class="citation-openalex" type="text" value="${escapeHtml(item.openalex_id || "")}" disabled> + <label class="editor-label">Source URL</label> + <input class="citation-source-url" type="text" value="${escapeHtml(item.source_url || "")}" disabled> + <label class="editor-label">Resolver Source</label> + <input class="citation-resolver-source" type="text" value="${escapeHtml(item.resolver_source_label || "")}" disabled> + <label class="editor-label">Citation Key</label> + <input class="citation-key" type="text" value="${escapeHtml(item.citation_key || "")}"> + <label class="editor-label">Entry Type</label> + <input class="citation-entry-type" type="text" value="${escapeHtml(item.entry_type || "misc")}"> + <label class="editor-label">Normalized Citation</label> + <textarea class="citation-normalized" rows="3">${escapeHtml(item.normalized_text || "")}</textarea> + <label class="editor-label">Abstract</label> + <textarea class="citation-abstract" rows="5">${escapeHtml(item.abstract_text || "")}</textarea> + ${renderCitationAbstractBlock(item.abstract_text || "", true)} + <label class="editor-label">Draft BibTeX</label> + <textarea class="citation-bibtex-editor" rows="8">${escapeHtml(item.draft_bibtex || "")}</textarea> + ${renderCitationBibtexBlock(item.draft_bibtex || "", true)} + ${item.enrichment_error ? 
`<p class="citation-entry-meta error">${escapeHtml(item.enrichment_error)}</p>` : ""} + <div class="editor-actions"> + <button type="button" class="secondary-button citation-enrich">Run Enrichment</button> + <button type="button" class="secondary-button citation-review-matches">Review Matches</button> + <button type="button" class="citation-save">Save Citation Review</button> + </div> + `; + + article.querySelector(".citation-enrich").addEventListener("click", async () => { + if (!currentSlug) { + return; + } + citationStatus.textContent = `Running enrichment for citation ${item.position}...`; + const { response, data } = await requestJson(`/api/editor/species/${currentSlug}/citations/${item.id}/enrich`, { + method: "POST", + body: JSON.stringify({}), + }); + if (!response.ok) { + citationStatus.textContent = data.error || "Citation enrichment failed"; + return; + } + citationStatus.textContent = `Citation ${data.citation.position} enrichment ${data.citation.enrichment_status}`; + await Promise.all([loadSpecies(currentSlug), loadSpeciesCitations(currentSlug)]); + }); + + article.querySelector(".citation-save").addEventListener("click", async () => { + if (!currentSlug) { + return; + } + citationStatus.textContent = `Saving citation ${item.position}...`; + const { response, data } = await requestJson(`/api/editor/species/${currentSlug}/citations/${item.id}`, { + method: "POST", + body: JSON.stringify({ + review_status: article.querySelector(".citation-review-status").value, + doi: article.querySelector(".citation-doi").value, + citation_key: article.querySelector(".citation-key").value, + entry_type: article.querySelector(".citation-entry-type").value, + normalized_text: article.querySelector(".citation-normalized").value, + abstract_text: article.querySelector(".citation-abstract").value, + draft_bibtex: article.querySelector(".citation-bibtex-editor").value, + }), + }); + if (!response.ok) { + citationStatus.textContent = data.error || "Citation review save failed"; + return; + } + citationStatus.textContent = `Citation ${data.citation.position} saved by ${data.last_modified_by}`; + await Promise.all([loadSpecies(currentSlug), loadSpeciesCitations(currentSlug)]); + }); + + article.querySelector(".citation-review-matches").addEventListener("click", async () => { + await openCitationMatchDialog(item.id); + }); + + attachCitationToggleControls(article); + citationList.appendChild(article); + } +} + +function renderCitationAbstractBlock(abstractText, editable) { + const text = normalizeAbstractForDisplay(abstractText); + if (!text) { + return ""; + } + const label = editable ? "Stored Abstract" : "Abstract"; + return ` + <div class="citation-abstract-shell"> + <button type="button" class="secondary-button citation-abstract-toggle" aria-expanded="false"> + Show ${label} + </button> + <div class="citation-abstract-display hidden"> + <p class="public-citation-abstract">${escapeHtml(text)}</p> + </div> + </div> + `; +} + +function renderCitationBibtexBlock(draftBibtex, editable) { + const text = String(draftBibtex || "").trim(); + if (!text) { + return ""; + } + const label = editable ? 
"Stored BibTeX" : "BibTeX"; + return ` + <div class="citation-detail-shell"> + <button type="button" class="secondary-button citation-detail-toggle" aria-expanded="false"> + Show ${label} + </button> + <div class="citation-detail-display hidden"> + <pre class="citation-bibtex">${escapeHtml(text)}</pre> + </div> + </div> + `; +} + +function attachCitationToggleControls(root) { + const toggles = root.querySelectorAll(".citation-abstract-toggle, .citation-detail-toggle"); + for (const toggle of toggles) { + const shell = toggle.parentElement; + const display = shell && shell.querySelector(".citation-abstract-display, .citation-detail-display"); + if (!display) { + continue; + } + const showLabel = toggle.textContent.replace(/^Hide /, "Show ").trim(); + const hideLabel = showLabel.replace(/^Show /, "Hide "); + toggle.addEventListener("click", () => { + const hidden = display.classList.toggle("hidden"); + toggle.setAttribute("aria-expanded", hidden ? "false" : "true"); + toggle.textContent = hidden ? showLabel : hideLabel; + }); + } +} + +function renderMetadataTable(fields) { + const rows = [ + ["Author", fields.author || ""], + ["Year", fields.year || ""], + ["Title", fields.title || ""], + ["Venue", fields.journal || fields.booktitle || fields.publisher || fields.howpublished || ""], + ["Volume", fields.volume || ""], + ["Issue", fields.number || ""], + ["Pages", fields.pages || ""], + ["DOI", fields.doi || ""], + ] + .filter(([, value]) => value) + .map( + ([label, value]) => + `<div class="match-row"><span class="match-label">${escapeHtml(label)}</span><span>${escapeHtml(value)}</span></div>`, + ) + .join(""); + return rows || `<p class="editor-status">No structured metadata extracted yet.</p>`; +} + +function renderFieldMatches(fieldMatches) { + return Object.entries(fieldMatches || {}) + .map(([field, detail]) => { + const status = String(detail.status || "unknown"); + return ` + <div class="match-row"> + <span class="match-label">${escapeHtml(field)}</span> + <span class="match-status match-status-${escapeHtml(status)}">${escapeHtml(status)}</span> + <span>${escapeHtml(String(detail.seed || ""))}</span> + <span>${escapeHtml(String(detail.candidate || ""))}</span> + </div> + `; + }) + .join(""); +} + +function normalizeCitationIdentity(value) { + return String(value || "").trim().toLowerCase(); +} + +function candidateAlreadyExists(candidate) { + const candidateFields = candidate && candidate.fields ? 
candidate.fields : {}; + const candidateDoi = normalizeCitationIdentity(candidateFields.doi || candidate.doi || ""); + const candidateOpenAlex = normalizeCitationIdentity(candidateFields.openalex || candidate.openalex_id || ""); + const candidateKey = normalizeCitationIdentity(candidate.citation_key || ""); + const candidateText = normalizeCitationIdentity(candidate.normalized_text || ""); + + return currentSpeciesCitations.some((item) => { + const itemDoi = normalizeCitationIdentity(item.doi || ""); + const itemOpenAlex = normalizeCitationIdentity(item.openalex_id || ""); + const itemKey = normalizeCitationIdentity(item.citation_key || ""); + const itemText = normalizeCitationIdentity(item.normalized_text || ""); + return ( + (candidateDoi && itemDoi && candidateDoi === itemDoi) + || (candidateOpenAlex && itemOpenAlex && candidateOpenAlex === itemOpenAlex) + || (candidateKey && itemKey && candidateKey === itemKey) + || (candidateText && itemText && candidateText === itemText) + ); + }); +} + +function closeCitationMatchDialog() { + currentCitationMatch = null; + citationMatchDialog.classList.add("hidden"); + citationMatchDialog.setAttribute("aria-hidden", "true"); + citationMatchSeed.innerHTML = ""; + citationMatchCandidates.innerHTML = ""; + citationMatchStatus.textContent = "Compare the parsed source citation against candidate metadata."; +} + +async function applyCitationCandidate(candidate) { + if (!currentSlug || !currentCitationMatch) { + return; + } + citationMatchStatus.textContent = `Applying ${candidate.source_label || "candidate"}...`; + const { response, data } = await requestJson( + `/api/editor/species/${currentSlug}/citations/${currentCitationMatch.citationId}/apply-match`, + { + method: "POST", + body: JSON.stringify({ candidate }), + }, + ); + if (!response.ok) { + citationMatchStatus.textContent = data.error || "Candidate application failed"; + return; + } + citationStatus.textContent = `Citation ${data.citation.position} accepted from reviewed candidate`; + closeCitationMatchDialog(); + expandCitationPanel(); + await Promise.all([loadSummary(), loadSpeciesList(searchInput.value), loadSpeciesCitations(currentSlug)]); + expandCitationPanel(); +} + +async function addCitationCandidate(candidate) { + if (!currentSlug || !currentCitationMatch) { + return; + } + citationMatchStatus.textContent = `Adding ${candidate.source_label || "candidate"} as another citation...`; + const { response, data } = await requestJson( + `/api/editor/species/${currentSlug}/citations/${currentCitationMatch.citationId}/add-match`, + { + method: "POST", + body: JSON.stringify({ candidate }), + }, + ); + if (!response.ok) { + citationMatchStatus.textContent = data.error || "Candidate addition failed"; + return; + } + citationStatus.textContent = `Added reviewed candidate as citation ${data.citation.position}`; + citationMatchStatus.textContent = `Added as citation ${data.citation.position}. 
You can continue reviewing other candidates.`; + expandCitationPanel(); + await Promise.all([loadSummary(), loadSpeciesList(searchInput.value), loadSpeciesCitations(currentSlug)]); + expandCitationPanel(); +} + +async function openCitationMatchDialog(citationId) { + if (!currentSlug || !isEditorSession()) { + return; + } + currentCitationMatch = { citationId }; + citationMatchDialog.classList.remove("hidden"); + citationMatchDialog.setAttribute("aria-hidden", "false"); + citationMatchSeed.innerHTML = ""; + citationMatchCandidates.innerHTML = ""; + citationMatchStatus.textContent = "Loading candidate matches..."; + + const { response, data } = await requestJson(`/api/editor/species/${currentSlug}/citations/${citationId}/candidates`); + if (!response.ok) { + citationMatchStatus.textContent = data.error || "Candidate lookup failed"; + citationMatchCandidates.innerHTML = `<p class="error">${escapeHtml(data.error || "Unable to load candidates.")}</p>`; + return; + } + + citationMatchSeed.innerHTML = ` + <p class="citation-entry-raw">${escapeHtml(data.citation.raw_text || "")}</p> + ${renderMetadataTable((data.seed && data.seed.fields) || {})} + ${renderCitationAbstractBlock((data.seed && (data.seed.abstract_text || (data.seed.fields && data.seed.fields.abstract))) || "", false)} + ${data.seed && data.seed.normalized_text ? `<p class="editor-status">${escapeHtml(data.seed.normalized_text)}</p>` : ""} + `; + attachCitationToggleControls(citationMatchSeed); + + const candidates = Array.isArray(data.candidates) ? data.candidates : []; + citationMatchStatus.textContent = `${candidates.length} candidate${candidates.length === 1 ? "" : "s"} found`; + if (!candidates.length) { + citationMatchCandidates.innerHTML = `<p class="editor-status">No close candidates were returned for this citation.</p>`; + return; + } + + citationMatchCandidates.innerHTML = ""; + for (const candidate of candidates) { + const alreadyExists = candidateAlreadyExists(candidate); + const card = document.createElement("article"); + card.className = "match-candidate-card"; + card.innerHTML = ` + <div class="match-candidate-header"> + <strong>${escapeHtml(candidate.fields?.title || "Untitled candidate")}</strong> + <span class="match-score">Score ${escapeHtml(String(candidate.score || 0))}</span> + </div> + <p class="citation-entry-meta">${escapeHtml(candidate.source_label || "")}</p> + ${alreadyExists ? `<p class="citation-entry-meta">Already present in this species' citation set.</p>` : ""} + ${candidate.conflict_reason ? `<p class="citation-entry-meta error">${escapeHtml(candidate.conflict_reason)}</p>` : ""} + ${renderMetadataTable(candidate.fields || {})} + ${renderCitationAbstractBlock(candidate.abstract_text || (candidate.fields && candidate.fields.abstract) || "", false)} + <div class="match-table"> + <div class="match-row match-row-head"> + <span class="match-label">Field</span> + <span>Status</span> + <span>Source</span> + <span>Candidate</span> + </div> + ${renderFieldMatches(candidate.field_matches || {})} + </div> + ${candidate.normalized_text ? `<p class="editor-status">${escapeHtml(candidate.normalized_text)}</p>` : ""} + <div class="editor-actions"> + <button type="button" class="candidate-apply">Use This Candidate</button> + <button type="button" class="secondary-button candidate-add"${alreadyExists ? 
" disabled" : ""}>Add As Another Citation</button> + </div> + `; + card.querySelector(".candidate-apply").addEventListener("click", async () => { + await applyCitationCandidate(candidate); + }); + if (!alreadyExists) { + card.querySelector(".candidate-add").addEventListener("click", async () => { + await addCitationCandidate(candidate); + }); + } + attachCitationToggleControls(card); + citationMatchCandidates.appendChild(card); + } +} + +async function loadSpeciesDocument(slug) { + if (!isEditorSession() && !isContributorSession()) { + documentPanel.classList.add("hidden"); + return; + } + + documentPanel.classList.remove("hidden"); + documentStatus.textContent = "Loading document..."; + const path = isEditorSession() + ? `/api/editor/species/${slug}/document` + : `/api/contributor/species/${slug}/document`; + const { response, data } = await requestJson(path); + if (!response.ok) { + documentMarkdown.value = ""; + documentPreview.innerHTML = `<p class="error">${escapeHtml(data.error || "Unable to load document.")}</p>`; + documentStatus.textContent = data.error || "Document load failed"; + return; + } + + documentMarkdown.value = data.markdown || ""; + renderDocumentPreview(documentMarkdown.value); + documentStatus.textContent = data.updated_by + ? `Document last updated by ${data.updated_by}` + : "Document loaded"; +} + +async function loadSpeciesCitations(slug, fallbackData = null) { + citationPanel.classList.remove("hidden"); + citationBackfillSpeciesButton.classList.toggle("hidden", !isEditorSession()); + citationEnrichAllButton.classList.toggle("hidden", !isEditorSession()); + + if (!isEditorSession() && !isContributorSession()) { + const items = Array.isArray(fallbackData && fallbackData.citations) ? fallbackData.citations : []; + currentSpeciesCitations = items; + renderCitationList(items, false); + citationStatus.textContent = `${items.length} citation${items.length === 1 ? "" : "s"}`; + return; + } + + citationStatus.textContent = "Loading citations..."; + const path = isEditorSession() + ? `/api/editor/species/${slug}/citations` + : `/api/contributor/species/${slug}/citations`; + const { response, data } = await requestJson(path); + if (!response.ok) { + citationList.innerHTML = `<p class="error">${escapeHtml(data.error || "Unable to load citations.")}</p>`; + citationStatus.textContent = data.error || "Citation load failed"; + return; + } + + currentSpeciesCitations = Array.isArray(data.citations) ? data.citations : []; + renderCitationList(currentSpeciesCitations, isEditorSession()); + citationStatus.textContent = `${data.citation_count || 0} citation${data.citation_count === 1 ? 
"" : "s"} extracted`; +} + async function requestJson(path, options = {}) { const headers = new Headers(options.headers || {}); const authHeaders = getAuthHeaders(); @@ -67,6 +942,10 @@ function isEditorSession() { return Boolean(currentSession && currentSession.user && ["editor", "admin"].includes(currentSession.user.role)); } +function isContributorSession() { + return Boolean(currentSession && currentSession.user && currentSession.user.role === "contributor"); +} + function getVisibleItems(items) { if (!isEditorSession()) { return items; @@ -82,6 +961,7 @@ function getVisibleItems(items) { function syncArchiveFilterUi() { archiveFilterGroup.classList.toggle("hidden", !isEditorSession()); + contributorCreateButton.classList.toggle("hidden", !isContributorSession()); for (const button of archiveFilterGroup.querySelectorAll("[data-archive-filter]")) { button.classList.toggle("is-active", button.dataset.archiveFilter === currentArchiveFilter); } @@ -93,9 +973,11 @@ async function loadSession() { if (!isEditorSession()) { currentArchiveFilter = "active"; } + contributorAgeLabel.textContent = String(data.minimum_contributor_age || 13); authTokenInput.value = getAuthToken(); if (data.authenticated) { authStatus.textContent = `${data.user.username} (${data.user.role})`; + contributorStatus.textContent = isContributorSession() ? "Contributor token stored in this browser." : ""; } else if (data.auth_configured) { authStatus.textContent = "Auth configured, public session"; } else { @@ -124,21 +1006,43 @@ function renderSpecies(items) { button.className = item.is_archived ? "species-card species-card-archived" : "species-card"; button.type = "button"; const archivedMeta = item.is_archived ? `<span class="species-state-badge">Archived</span>` : ""; + const commonName = item.common_name || item.title; + const scientificName = item.scientific_name || "Scientific name missing"; button.innerHTML = ` - <span class="species-name">${escapeHtml(item.common_name || item.title)}</span> - <span class="species-meta">${escapeHtml(item.scientific_name || "Scientific name missing")}</span> + <span class="species-name">${escapeHtml(commonName)}</span> + <span class="species-meta">Common name: ${escapeHtml(commonName)}</span> + <span class="species-meta">Scientific name: ${escapeHtml(scientificName)}</span> <span class="species-meta">${escapeHtml(item.publication_status || "published")}${archivedMeta}</span> <span class="species-meta">${item.diagnostic_count ? `${item.diagnostic_count} ingest flags` : "No ingest flags"}</span> <span class="species-snippet">${escapeHtml((item.summary || "No summary extracted yet.").slice(0, 180))}</span> `; - button.addEventListener("click", () => loadSpecies(item.slug)); + button.addEventListener("click", () => { + window.location.hash = item.slug; + loadSpecies(item.slug); + }); speciesList.appendChild(button); } } +function formatIdentifierBanner(item) { + if (item.primary_taxon_identifier && item.primary_taxon_authority) { + return `${String(item.primary_taxon_authority).toUpperCase()} ${item.primary_taxon_identifier.identifier || ""}`.trim(); + } + const legacyIdentifier = Array.isArray(item.legacy_identifiers) ? item.legacy_identifiers[0] : null; + if (legacyIdentifier && legacyIdentifier.identifier) { + const label = legacyIdentifier.label || "Legacy identifier"; + return `${label} ${legacyIdentifier.identifier}`; + } + return "No external taxon identifier assigned"; +} + async function loadSpeciesList(search = "") { const query = search ? 
`?search=${encodeURIComponent(search)}` : ""; - const path = isEditorSession() ? `/api/editor/species${query}` : `/api/species${query}`; + const path = isEditorSession() + ? `/api/editor/species${query}` + : isContributorSession() + ? `/api/contributor/species${query}` + : `/api/species${query}`; const { data } = await requestJson(path); currentItems = data.items; syncArchiveFilterUi(); @@ -146,8 +1050,14 @@ async function loadSpeciesList(search = "") { } async function loadSpecies(slug) { + const previousSlug = currentSlug; currentSlug = slug; - const path = isEditorSession() ? `/api/editor/species/${slug}` : `/api/species/${slug}`; + closeCitationMatchDialog(); + const path = isEditorSession() + ? `/api/editor/species/${slug}` + : isContributorSession() + ? `/api/contributor/species/${slug}` + : `/api/species/${slug}`; const { response, data } = await requestJson(path); if (!response.ok) { detailEmpty.classList.remove("hidden"); @@ -158,8 +1068,11 @@ async function loadSpecies(slug) { detailEmpty.classList.add("hidden"); detail.classList.remove("hidden"); + if (previousSlug !== slug) { + collapseWorkflowPanels(); + } - detailCode.textContent = data.flelmr_code ? `FLELMR ${data.flelmr_code}` : "Legacy source file"; + detailCode.textContent = formatIdentifierBanner(data); detailCommonName.textContent = data.common_name || data.title; detailArchiveBadge.classList.toggle("hidden", !data.is_archived); detailArchiveNote.classList.toggle("hidden", !data.is_archived); @@ -169,61 +1082,23 @@ async function loadSpecies(slug) { auditPanel.classList.toggle("hidden", !isEditorSession()); if (isEditorSession()) { editorPublicationStatus.value = data.publication_status || "published"; - editorSummary.value = data.summary || ""; editorNotes.value = data.editor_notes || ""; editorIsArchived.checked = Boolean(data.is_archived); editorStatus.textContent = data.last_modified_by ? 
`Last modified by ${data.last_modified_by}` : "Editor session active"; - await loadAudit(slug); + await Promise.all([loadAudit(slug), loadSpeciesDocument(slug), loadSpeciesCitations(slug)]); + } else if (isContributorSession()) { + editorStatus.textContent = ""; + await Promise.all([loadSpeciesDocument(slug), loadSpeciesCitations(slug)]); + } else { + documentPanel.classList.add("hidden"); + await loadSpeciesCitations(slug, data); } - detailSections.innerHTML = ""; - if (data.diagnostics.length) { - const diagnosticsEl = document.createElement("section"); - diagnosticsEl.className = "detail-section detail-diagnostics"; - diagnosticsEl.innerHTML = ` - <h3>Ingest Diagnostics</h3> - <ul class="diagnostic-list"> - ${data.diagnostics - .map( - (diagnostic) => - `<li><strong>${escapeHtml(diagnostic.code)}</strong>: ${escapeHtml(diagnostic.message)}</li>`, - ) - .join("")} - </ul> - `; - detailSections.appendChild(diagnosticsEl); - } - for (const section of data.sections) { - const sectionEl = document.createElement("section"); - sectionEl.className = "detail-section"; - if (isEditorSession()) { - sectionEl.innerHTML = ` - <h3>${escapeHtml(section.heading)}</h3> - <textarea class="section-editor" data-section-position="${section.position}" rows="10">${escapeHtml(section.content)}</textarea> - <div class="editor-actions"> - <button type="button" class="section-save" data-section-position="${section.position}">Save Section</button> - </div> - `; - } else { - sectionEl.innerHTML = ` - <h3>${escapeHtml(section.heading)}</h3> - <pre>${escapeHtml(section.content)}</pre> - `; - } - detailSections.appendChild(sectionEl); - } - - if (isEditorSession()) { - for (const button of detailSections.querySelectorAll(".section-save")) { - button.addEventListener("click", async (event) => { - const position = event.currentTarget.dataset.sectionPosition; - const textarea = detailSections.querySelector(`textarea[data-section-position="${position}"]`); - await saveSectionContent(Number(position), textarea.value); - }); - } - } + renderLegacySource(data); + restoreWorkflowPanels(); + renderPrimaryContent(data); } function renderAudit(items) { @@ -273,7 +1148,6 @@ async function saveEditorialChanges() { method: "POST", body: JSON.stringify({ publication_status: editorPublicationStatus.value, - summary: editorSummary.value, editor_notes: editorNotes.value, is_archived: editorIsArchived.checked, }), @@ -286,21 +1160,63 @@ async function saveEditorialChanges() { await Promise.all([loadSummary(), loadSpeciesList(searchInput.value), loadSpecies(currentSlug)]); } -async function saveSectionContent(sectionPosition, content) { - if (!currentSlug || !isEditorSession()) { +async function saveDocumentMarkdown() { + if (!currentSlug || (!isEditorSession() && !isContributorSession())) { return; } - editorStatus.textContent = `Saving section ${sectionPosition}...`; - const { response, data } = await requestJson(`/api/editor/species/${currentSlug}/sections/${sectionPosition}`, { + documentStatus.textContent = "Saving document..."; + const path = isEditorSession() + ? 
`/api/editor/species/${currentSlug}/document` + : `/api/contributor/species/${currentSlug}/document`; + const { response, data } = await requestJson(path, { method: "POST", - body: JSON.stringify({ content }), + body: JSON.stringify({ markdown: documentMarkdown.value }), }); if (!response.ok) { - editorStatus.textContent = data.error || "Section save failed"; + documentStatus.textContent = data.error || "Document save failed"; return; } - editorStatus.textContent = `Section ${sectionPosition} saved by ${data.last_modified_by}`; - await loadSpecies(currentSlug); + renderDocumentPreview(documentMarkdown.value); + documentStatus.textContent = `Document saved by ${data.updated_by}`; + await Promise.all([loadSummary(), loadSpeciesList(searchInput.value), loadSpecies(currentSlug)]); +} + +async function registerContributor() { + contributorStatus.textContent = "Registering contributor..."; + const { response, data } = await requestJson("/api/contributor/register", { + method: "POST", + body: JSON.stringify({ + email: contributorEmailInput.value.trim(), + age_gate_confirmed: contributorAgeGate.checked, + }), + }); + if (!response.ok) { + contributorStatus.textContent = data.error || "Contributor registration failed"; + return; + } + window.localStorage.setItem("ecospecies_auth_token", data.token); + authTokenInput.value = data.token; + contributorStatus.textContent = data.warning; + await loadSession(); + await loadSpeciesList(searchInput.value); +} + +async function createContributorDraft() { + if (!isContributorSession()) { + return; + } + contributorStatus.textContent = "Creating new contributor draft..."; + const { response, data } = await requestJson("/api/contributor/species", { + method: "POST", + body: JSON.stringify({}), + }); + if (!response.ok) { + contributorStatus.textContent = data.error || "Draft creation failed"; + return; + } + contributorStatus.textContent = "Draft created. 
Store your token carefully."; + await loadSpeciesList(searchInput.value); + await loadSpecies(data.slug); } searchInput.addEventListener("input", async (event) => { @@ -315,6 +1231,17 @@ for (const button of archiveFilterGroup.querySelectorAll("[data-archive-filter]" }); } +for (const button of collapsibleToggles) { + button.addEventListener("click", () => { + const panel = document.getElementById(button.dataset.target || ""); + if (!panel || panel.classList.contains("hidden")) { + return; + } + const expanded = panel.classList.contains("collapsed"); + setCollapsibleState(panel, expanded); + }); +} + authSaveButton.addEventListener("click", async () => { const token = authTokenInput.value.trim(); if (token) { @@ -330,6 +1257,7 @@ authSaveButton.addEventListener("click", async () => { authClearButton.addEventListener("click", async () => { window.localStorage.removeItem("ecospecies_auth_token"); authTokenInput.value = ""; + contributorStatus.textContent = ""; await loadSession(); await loadSpeciesList(searchInput.value); if (currentSlug) { @@ -338,12 +1266,67 @@ authClearButton.addEventListener("click", async () => { }); editorSaveButton.addEventListener("click", saveEditorialChanges); +documentSaveButton.addEventListener("click", saveDocumentMarkdown); +contributorRegisterButton.addEventListener("click", registerContributor); +contributorCreateButton.addEventListener("click", createContributorDraft); +citationMatchCloseButton.addEventListener("click", closeCitationMatchDialog); +citationMatchDialog.querySelector(".match-dialog-backdrop").addEventListener("click", closeCitationMatchDialog); +citationBackfillSpeciesButton.addEventListener("click", async () => { + if (!currentSlug || !isEditorSession()) { + return; + } + expandCitationPanel(); + citationStatus.textContent = "Running citation backfill for this species..."; + const { response, data } = await requestJson(`/api/editor/species/${currentSlug}/citations/backfill`, { + method: "POST", + body: JSON.stringify({}), + }); + if (!response.ok) { + citationStatus.textContent = data.error || "Species citation backfill failed"; + return; + } + citationStatus.textContent = + `Species backfill complete: ${data.backfilled_count || 0} checked, ${data.changed_count || 0} changed, ${data.resolved_count || 0} resolved, ${data.unresolved_count || 0} unresolved, ${data.error_count || 0} errors`; + await Promise.all([loadSpecies(currentSlug), loadSpeciesCitations(currentSlug)]); +}); +citationEnrichAllButton.addEventListener("click", async () => { + if (!currentSlug || !isEditorSession()) { + return; + } + expandCitationPanel(); + citationStatus.textContent = "Running enrichment for all citations..."; + const { response, data } = await requestJson(`/api/editor/species/${currentSlug}/citations/enrich`, { + method: "POST", + body: JSON.stringify({}), + }); + if (!response.ok) { + citationStatus.textContent = data.error || "Batch citation enrichment failed"; + return; + } + citationStatus.textContent = + `Batch enrichment complete: ${data.resolved_count || 0} resolved, ${data.unresolved_count || 0} unresolved, ${data.error_count || 0} errors`; + await Promise.all([loadSpecies(currentSlug), loadSpeciesCitations(currentSlug)]); +}); +documentMarkdown.addEventListener("input", () => { + renderDocumentPreview(documentMarkdown.value); +}); async function bootstrap() { await loadSession(); await Promise.all([loadSummary(), loadSpeciesList()]); + const initialSlug = getInitialSpeciesSlug(); + if (initialSlug) { + await loadSpecies(initialSlug); + } } 
bootstrap().catch((error) => { speciesList.innerHTML = `<p class="error">Failed to load data: ${escapeHtml(String(error))}</p>`; }); + +window.addEventListener("hashchange", async () => { + const slug = getInitialSpeciesSlug(); + if (slug && slug !== currentSlug) { + await loadSpecies(slug); + } +}); diff --git a/apps/web/bibliography.html b/apps/web/bibliography.html new file mode 100644 index 0000000..0a1e9ad --- /dev/null +++ b/apps/web/bibliography.html @@ -0,0 +1,43 @@ +<!doctype html> +<html lang="en"> + <head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>EcoSpecies Bibliography + + + + +
+
+

EcoSpecies Atlas

+

Bibliography

+

+ A site-wide bibliography for the EcoSpecies atlas, including imported references and citations added during review. +

+
+ + +

Loading bibliography...

+
+
+ +
+
+
+
+
+
+
diff --git a/apps/web/bibliography.js b/apps/web/bibliography.js
new file mode 100644
index 0000000..6935ec5
--- /dev/null
+++ b/apps/web/bibliography.js
@@ -0,0 +1,230 @@
+function getAppBase() {
+  const { pathname } = window.location;
+  if (pathname === "/" || pathname === "/index.html") {
+    return "";
+  }
+  if (pathname.endsWith("/index.html")) {
+    return pathname.slice(0, -"/index.html".length);
+  }
+  return pathname.endsWith("/") ? pathname.slice(0, -1) : pathname;
+}
+
+const apiBase = getAppBase().replace(/\/bibliography\.html$/, "");
+const bibliographyList = document.querySelector("#bibliography-list");
+const bibliographySearch = document.querySelector("#bibliography-search");
+const bibliographyStatus = document.querySelector("#bibliography-status");
+const bibliographyDownload = document.querySelector("#bibliography-download");
+let currentBibliographyItems = [];
+
+function escapeHtml(value) {
+  return String(value)
+    .replaceAll("&", "&amp;")
+    .replaceAll('"', "&quot;")
+    .replaceAll("<", "&lt;")
+    .replaceAll(">", "&gt;");
+}
+
+function normalizeAbstractForDisplay(value) {
+  const raw = String(value || "").trim();
+  if (!raw) {
+    return "";
+  }
+  const temp = document.createElement("div");
+  temp.innerHTML = raw;
+  return temp.textContent
+    .replace(/^abstract\s*[:.\-]?\s*/i, "")
+    .replace(/\s+/g, " ")
+    .trim();
+}
+
+function parseBibtexFields(draftBibtex) {
+  const fields = {};
+  const text = String(draftBibtex || "");
+  const pattern = /([a-zA-Z_]+)\s*=\s*\{([^}]*)\}/g;
+  let match = pattern.exec(text);
+  while (match) {
+    fields[match[1].toLowerCase()] = match[2].trim();
+    match = pattern.exec(text);
+  }
+  return fields;
+}
+
+function collectBibtexRecords(items) {
+  const seen = new Set();
+  const records = [];
+  for (const item of items || []) {
+    const draftBibtex = String(item && item.draft_bibtex ? item.draft_bibtex : "").trim();
+    if (!draftBibtex || seen.has(draftBibtex)) {
+      continue;
+    }
+    seen.add(draftBibtex);
+    records.push(draftBibtex);
+  }
+  return records;
+}
+
+function downloadBibtexRecords(items, filenameStem) {
+  const records = collectBibtexRecords(items);
+  if (!records.length) {
+    return false;
+  }
+  const blob = new Blob([`${records.join("\n\n")}\n`], { type: "application/x-bibtex;charset=utf-8" });
+  const url = URL.createObjectURL(blob);
+  const link = document.createElement("a");
+  link.href = url;
+  link.download = `${filenameStem}.bib`;
+  document.body.appendChild(link);
+  link.click();
+  document.body.removeChild(link);
+  window.setTimeout(() => URL.revokeObjectURL(url), 0);
+  return true;
+}
+
+function syncDownloadButton(items) {
+  if (!bibliographyDownload) {
+    return;
+  }
+  const recordCount = collectBibtexRecords(items).length;
+  bibliographyDownload.disabled = !recordCount;
+  bibliographyDownload.textContent = recordCount
+    ? `Download BibTeX (${recordCount})`
+    : "Download BibTeX";
+}
+
+function buildCitationText(item) {
+  const fields = parseBibtexFields(item.draft_bibtex || "");
+  if (item.normalized_text) {
+    return escapeHtml(item.normalized_text);
+  }
+  const author = fields.author || "";
+  const year = fields.year || "";
+  const title = fields.title || "";
+  const venue = fields.journal || fields.booktitle || fields.publisher || "";
+  const volume = fields.volume || "";
+  const issue = fields.number || "";
+  const pages = fields.pages || "";
+  const parts = [];
+  const lead = [author, year ?
`(${year})` : ""].filter(Boolean).join(" "); + if (lead) { + parts.push(lead); + } + if (title) { + parts.push(title); + } + const venueBits = [venue, volume ? `${volume}${issue ? `(${issue})` : ""}` : issue ? `(${issue})` : "", pages] + .filter(Boolean) + .join(", "); + if (venueBits) { + parts.push(venueBits); + } + return escapeHtml(parts.join(". ").trim() || item.raw_text || ""); +} + +function renderSpeciesRefs(refs) { + return refs + .map( + (ref) => + `${escapeHtml(ref.common_name || ref.slug)}`, + ) + .join(", "); +} + +function renderAbstractBlock(text) { + const abstract = normalizeAbstractForDisplay(text); + if (!abstract) { + return ""; + } + return ` +
+ + +
+ `; +} + +function attachCitationAbstractToggles(root) { + for (const toggle of root.querySelectorAll(".citation-abstract-toggle")) { + const shell = toggle.parentElement; + const display = shell && shell.querySelector(".citation-abstract-display"); + if (!display) { + continue; + } + toggle.addEventListener("click", () => { + const hidden = display.classList.toggle("hidden"); + toggle.setAttribute("aria-expanded", hidden ? "false" : "true"); + toggle.textContent = hidden ? "Show Abstract" : "Hide Abstract"; + }); + } +} + +function renderBibliography(items) { + bibliographyList.innerHTML = ""; + if (!items.length) { + bibliographyList.innerHTML = `

No bibliography entries match the current search.

`; + return; + } + + for (const item of items) { + const links = [ + item.doi ? `DOI` : "", + item.source_url ? `Source` : "", + item.openalex_id ? `OpenAlex` : "", + ] + .filter(Boolean) + .join(" · "); + + const article = document.createElement("article"); + article.className = "public-citation-entry"; + article.innerHTML = ` +

${buildCitationText(item)}

+ ${renderAbstractBlock(item.abstract_text || "")} +

+ Appears in ${item.species_count} species record${item.species_count === 1 ? "" : "s"} + ${item.legacy_reference_numbers && item.legacy_reference_numbers.length ? ` • Imported references: ${item.legacy_reference_numbers.map((value) => escapeHtml(value)).join(", ")}` : ""} +

+

Species: ${renderSpeciesRefs(item.species_refs || [])}

+ ${links ? `` : ""} + `; + attachCitationAbstractToggles(article); + bibliographyList.appendChild(article); + } +} + +async function loadBibliography(search = "") { + bibliographyStatus.textContent = "Loading bibliography..."; + const query = search ? `?search=${encodeURIComponent(search)}` : ""; + const response = await fetch(`${apiBase}/api/bibliography${query}`); + const data = await response.json(); + if (!response.ok) { + bibliographyList.innerHTML = `

${escapeHtml(data.error || "Unable to load bibliography.")}

`; + bibliographyStatus.textContent = data.error || "Bibliography load failed"; + return; + } + + currentBibliographyItems = data.items || []; + renderBibliography(currentBibliographyItems); + syncDownloadButton(currentBibliographyItems); + bibliographyStatus.textContent = `${data.count || 0} bibliography entr${data.count === 1 ? "y" : "ies"}`; +} + +bibliographySearch.addEventListener("input", async (event) => { + await loadBibliography(event.target.value); +}); + +loadBibliography().catch((error) => { + bibliographyList.innerHTML = `

Failed to load bibliography: ${escapeHtml(String(error))}

`; + bibliographyStatus.textContent = "Bibliography load failed"; +}); + +if (bibliographyDownload) { + bibliographyDownload.addEventListener("click", () => { + const downloaded = downloadBibtexRecords(currentBibliographyItems, "ecospecies-bibliography"); + if (!downloaded) { + bibliographyStatus.textContent = "No BibTeX records are available for download yet."; + } + }); +} diff --git a/apps/web/index.html b/apps/web/index.html index e1f969a..b93b972 100644 --- a/apps/web/index.html +++ b/apps/web/index.html @@ -7,20 +7,31 @@ +
-

Marine Species Knowledge System

-

EcoSpecies

+

Open Biodiversity Reference

+

EcoSpecies Atlas

- A modern follow-on for the legacy EcoSpecies archive, starting with direct ingestion - of historical Species Life History text files. + A modern follow-on for the legacy EcoSpecies archive, built as an open ecology and + biodiversity reference workspace. +

+

+ Use EcoSpecies Atlas for species profiles, habitat evidence, ecological reading, and + citation-aware exploration grounded in the migrated legacy corpus.

-
- - - -

Public access

-
0 @@ -38,6 +49,7 @@

Species

+

- This migration path preserves attribution for Dr. Peter Rubec, Dr. Diane Blackwood, + EcoSpecies Atlas preserves attribution for Dr. Peter Rubec, Dr. Diane Blackwood, Dr. Welsbery R. Elsberry, and the Florida Fish and Wildlife Research Institute context documented in the legacy project materials.

+ diff --git a/apps/web/nginx.conf b/apps/web/nginx.conf index 29d2394..88408f5 100644 --- a/apps/web/nginx.conf +++ b/apps/web/nginx.conf @@ -5,6 +5,10 @@ server { root /usr/share/nginx/html; index index.html; + location = /apps/ecospecies { + return 301 /apps/ecospecies/; + } + location /api/ { proxy_pass http://api:8000/api/; proxy_http_version 1.1; @@ -14,19 +18,46 @@ server { proxy_set_header X-Forwarded-Proto $scheme; } + location /apps/ecospecies/api/ { + rewrite ^/apps/ecospecies/api/(.*)$ /api/$1 break; + proxy_pass http://api:8000; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + location /healthz { proxy_pass http://api:8000/healthz; proxy_http_version 1.1; proxy_set_header Host $host; } + location /apps/ecospecies/healthz { + proxy_pass http://api:8000/healthz; + proxy_http_version 1.1; + proxy_set_header Host $host; + } + location /readyz { proxy_pass http://api:8000/readyz; proxy_http_version 1.1; proxy_set_header Host $host; } + location /apps/ecospecies/readyz { + proxy_pass http://api:8000/readyz; + proxy_http_version 1.1; + proxy_set_header Host $host; + } + location / { try_files $uri $uri/ /index.html; } + + location /apps/ecospecies/ { + rewrite ^/apps/ecospecies/(.*)$ /$1 break; + try_files $uri $uri/ /index.html; + } } diff --git a/apps/web/styles.css b/apps/web/styles.css index b5aa5cb..1741846 100644 --- a/apps/web/styles.css +++ b/apps/web/styles.css @@ -1,12 +1,12 @@ :root { - --bg: #f4efe6; - --paper: rgba(255, 252, 247, 0.78); - --ink: #16251f; - --muted: #58655f; - --accent: #0f766e; - --accent-2: #bc6c25; - --line: rgba(22, 37, 31, 0.12); - --shadow: 0 24px 70px rgba(24, 35, 30, 0.15); + --bg: #f4f7fb; + --paper: rgba(255, 255, 255, 0.88); + --ink: #182433; + --muted: #5f6b7d; + --accent: #2457a6; + --accent-2: #1f7a5a; + --line: rgba(24, 36, 51, 0.11); + --shadow: 0 24px 70px rgba(33, 52, 84, 0.14); } * { @@ -15,12 +15,83 @@ body { margin: 0; - font-family: Georgia, "Times New Roman", serif; + font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif; color: var(--ink); background: - radial-gradient(circle at top left, rgba(15, 118, 110, 0.14), transparent 28%), - radial-gradient(circle at top right, rgba(188, 108, 37, 0.16), transparent 24%), - linear-gradient(180deg, #f8f4ec, #efe6d7 70%, #e7dcc9); + radial-gradient(circle at top left, rgba(36, 87, 166, 0.14), transparent 26%), + radial-gradient(circle at top right, rgba(31, 122, 90, 0.12), transparent 24%), + linear-gradient(180deg, #f4f7fb, #e4edf6 72%, #d9e6ef); +} + +.site-header { + width: min(1320px, calc(100vw - 32px)); + margin: 0 auto; + padding-top: 24px; +} + +.site-header-inner { + display: flex; + gap: 18px; + align-items: center; + justify-content: space-between; + padding: 18px 22px; + border-radius: 24px; + backdrop-filter: blur(10px); + background: var(--paper); + border: 1px solid var(--line); + box-shadow: var(--shadow); +} + +.site-brand { + display: flex; + flex-direction: column; + gap: 4px; +} + +.site-brand-mark { + margin: 0; + color: var(--accent); + text-transform: uppercase; + letter-spacing: 0.18em; + font-size: 0.76rem; +} + +.site-brand-link { + color: var(--ink); + font-size: 1.5rem; + font-weight: 700; + text-decoration: none; +} + +.site-brand-summary { + margin: 0; + color: var(--muted); + font-size: 0.94rem; +} + +.site-nav { + display: flex; + flex-wrap: wrap; + gap: 10px; + justify-content: flex-end; +} 
+ +.site-nav a { + display: inline-flex; + align-items: center; + justify-content: center; + border-radius: 999px; + padding: 11px 16px; + text-decoration: none; + color: var(--ink); + border: 1px solid var(--line); + background: rgba(255, 255, 255, 0.72); + transition: transform 160ms ease, border-color 160ms ease; +} + +.site-nav a:hover { + transform: translateY(-1px); + border-color: rgba(15, 118, 110, 0.45); } .page { @@ -42,6 +113,9 @@ body { .hero { padding: 28px; margin-bottom: 20px; + background: + linear-gradient(135deg, rgba(255, 255, 255, 0.95), rgba(234, 244, 240, 0.92)), + var(--paper); } .eyebrow { @@ -56,6 +130,7 @@ h1 { margin: 0; font-size: clamp(2.8rem, 7vw, 5.6rem); line-height: 0.92; + letter-spacing: -0.03em; } .lede { @@ -64,6 +139,12 @@ h1 { font-size: 1.08rem; } +.hero-context { + max-width: 68ch; + color: var(--muted); + line-height: 1.58; +} + .hero-stats { display: flex; gap: 16px; @@ -79,6 +160,15 @@ h1 { margin-top: 18px; } +.auth-panel-row { + margin-top: 0; +} + +.contributor-signup { + padding-top: 14px; + border-top: 1px solid var(--line); +} + .auth-bar input { min-width: min(360px, 100%); flex: 1; @@ -93,7 +183,7 @@ h1 { min-width: 180px; padding: 14px 16px; border-radius: 18px; - background: rgba(255, 255, 255, 0.6); + background: linear-gradient(180deg, rgba(255, 255, 255, 0.95), rgba(232, 242, 239, 0.92)); border: 1px solid var(--line); } @@ -158,6 +248,16 @@ input[type="search"] { background: rgba(255, 255, 255, 0.9); } +input[type="text"], +input[type="email"], +input[type="password"] { + border: 1px solid var(--line); + border-radius: 18px; + padding: 12px 14px; + font: inherit; + background: rgba(255, 255, 255, 0.92); +} + select, textarea, button { @@ -201,7 +301,7 @@ button { padding: 14px; border-radius: 18px; border: 1px solid var(--line); - background: linear-gradient(180deg, rgba(255, 255, 255, 0.95), rgba(241, 237, 230, 0.95)); + background: linear-gradient(180deg, rgba(255, 255, 255, 0.97), rgba(239, 246, 244, 0.94)); cursor: pointer; transition: transform 160ms ease, border-color 160ms ease; } @@ -213,7 +313,7 @@ button { .species-card-archived { border-style: dashed; - background: linear-gradient(180deg, rgba(247, 241, 231, 0.98), rgba(233, 226, 214, 0.98)); + background: linear-gradient(180deg, rgba(243, 247, 249, 0.98), rgba(227, 236, 242, 0.98)); } .species-name, @@ -273,6 +373,32 @@ button { display: none; } +.match-dialog-shell { + position: fixed; + inset: 0; + z-index: 50; +} + +.match-dialog-backdrop { + position: absolute; + inset: 0; + background: rgba(12, 20, 18, 0.46); +} + +.match-dialog-card { + position: relative; + z-index: 1; + width: min(1180px, calc(100vw - 32px)); + max-height: calc(100vh - 40px); + overflow: auto; + margin: 20px auto; + padding: 18px; + border-radius: 24px; + background: #fbf8f1; + border: 1px solid var(--line); + box-shadow: var(--shadow); +} + .detail-header { padding-bottom: 16px; border-bottom: 1px solid var(--line); @@ -313,6 +439,12 @@ button { margin-top: 18px; } +.workflow-panels { + display: grid; + gap: 16px; + margin-top: 20px; +} + .detail-section { padding: 16px; border-radius: 18px; @@ -329,6 +461,44 @@ button { margin-top: 18px; } +.workflow-panels .editor-panel, +.workflow-panels .detail-section { + margin-top: 0; +} + +.collapsible-panel { + padding-top: 14px; +} + +.collapsible-header { + display: flex; + gap: 12px; + align-items: center; + justify-content: space-between; + flex-wrap: wrap; +} + +.collapsible-header h3 { + margin-bottom: 0; +} + +.collapsible-body { + margin-top: 
16px; +} + +.collapsible-panel.collapsed .collapsible-body { + display: none; +} + +.document-panel-header { + display: flex; + gap: 16px; + align-items: flex-start; + justify-content: space-between; + flex-wrap: wrap; + margin-bottom: 14px; +} + .editor-label { display: block; margin: 0 0 8px; @@ -349,6 +519,11 @@ button { font-weight: 700; } +.contributor-age-gate { + margin: 0; + font-weight: 400; +} + .archive-toggle input { width: 18px; height: 18px; @@ -372,6 +547,149 @@ button { gap: 12px; } +.citation-list { + display: grid; + gap: 14px; +} + +.citation-entry { + padding: 14px; + border-radius: 16px; + border: 1px solid var(--line); + background: rgba(255, 255, 255, 0.76); +} + +.citation-entry-meta { + margin: 0 0 10px; + color: var(--muted); + font-size: 0.92rem; +} + +.citation-entry-raw { + margin: 0 0 12px; + line-height: 1.5; +} + +.citation-bibtex, +.citation-bibtex-editor { + font-family: "Courier New", monospace; + font-size: 0.9rem; + line-height: 1.45; +} + +.citation-abstract-shell { + display: grid; + gap: 8px; + margin: 4px 0 10px; +} + +.citation-detail-shell { + display: grid; + gap: 8px; + margin: 4px 0 10px; +} + +.citation-abstract-display { + padding: 10px 12px; + border-radius: 12px; + border: 1px solid var(--line); + background: rgba(15, 118, 110, 0.05); +} + +.citation-detail-display { + padding: 10px 12px; + border-radius: 12px; + border: 1px solid var(--line); + background: rgba(255, 255, 255, 0.78); +} + +.match-dialog-header, +.match-dialog-grid, +.match-candidate-header, +.match-candidates, +.match-candidate-card, +.match-seed, +.match-table { + display: grid; + gap: 12px; +} + +.match-dialog-header { + grid-template-columns: minmax(0, 1fr) auto; + align-items: start; +} + +.match-dialog-grid { + grid-template-columns: minmax(260px, 0.9fr) minmax(0, 1.6fr); + margin-top: 16px; +} + +.match-candidate-card { + padding: 14px; + border-radius: 16px; + border: 1px solid var(--line); + background: rgba(255, 255, 255, 0.84); +} + +.match-candidate-header { + grid-template-columns: minmax(0, 1fr) auto; + align-items: baseline; +} + +.match-score { + font-weight: 700; + color: var(--accent); +} + +.match-table { + border: 1px solid var(--line); + border-radius: 14px; + overflow: hidden; +} + +.match-row { + display: grid; + grid-template-columns: 120px 110px minmax(0, 1fr) minmax(0, 1fr); + gap: 10px; + padding: 10px 12px; + border-top: 1px solid var(--line); + font-size: 0.92rem; +} + +.match-row:first-child { + border-top: 0; +} + +.match-row-head { + background: rgba(15, 118, 110, 0.08); + font-weight: 700; +} + +.match-label { + color: var(--muted); + font-weight: 700; +} + +.match-status { + text-transform: uppercase; + letter-spacing: 0.04em; + font-size: 0.78rem; +} + +.match-status-exact { + color: var(--accent); +} + +.match-status-partial, +.match-status-seed-missing, +.match-status-candidate-missing { + color: var(--accent-2); +} + +.match-status-conflict { + color: #a12626; +} + .audit-entry { padding: 14px; border-radius: 16px; @@ -394,6 +712,62 @@ button { line-height: 1.45; } +.document-editor, +.document-preview { + font-family: "Courier New", monospace; + font-size: 0.92rem; + line-height: 1.5; +} + +.document-editor { + min-height: 420px; + margin-bottom: 14px; + white-space: pre; + overflow: auto; +} + +.document-preview-shell { + border: 1px solid var(--line); + border-radius: 18px; + background: rgba(255, 255, 255, 0.72); + overflow: hidden; +} + +.document-preview-shell summary { + cursor: pointer; + padding: 12px 16px; + font-weight: 700; + 
color: var(--accent); +} + +.document-preview { + padding: 0 16px 16px; +} + +.document-preview-empty { + color: var(--muted); +} + +.document-preview-list { + margin: 0; + padding-left: 22px; +} + +.document-preview-list li + li { + margin-top: 8px; +} + +.document-preview-metadata { + margin: 0 0 14px; + padding: 0; + list-style: none; + color: var(--muted); +} + +.document-preview-metadata li + li { + margin-top: 6px; +} + .diagnostic-list { margin: 0; padding-left: 18px; @@ -403,6 +777,100 @@ button { margin-top: 8px; } +.structured-node { + display: grid; + gap: 12px; + background: linear-gradient(180deg, rgba(255, 255, 255, 0.84), rgba(242, 247, 252, 0.88)); +} + +.structured-node + .structured-node { + margin-top: 4px; +} + +.structured-node h3, +.structured-node h4, +.structured-node h5, +.structured-node h6 { + line-height: 1.18; + letter-spacing: -0.01em; +} + +.structured-node-body { + margin: 0; + line-height: 1.58; + color: var(--ink); +} + +.structured-node-children { + display: grid; + gap: 12px; + padding: 4px 0 0 18px; + border-left: 2px solid rgba(36, 87, 166, 0.12); +} + +.public-citation-list { + display: grid; + gap: 14px; +} + +.public-bibliography-actions { + display: flex; + gap: 12px; + align-items: center; + flex-wrap: wrap; +} + +.public-bibliography-note { + margin: 0; + color: var(--muted); + font-size: 0.92rem; +} + +.public-citation-entry { + display: grid; + gap: 8px; + padding: 14px; + border-radius: 16px; + border: 1px solid var(--line); + background: rgba(255, 255, 255, 0.76); +} + +.public-citation-text, +.public-citation-meta, +.public-citation-links, +.public-citation-abstract { + margin: 0; +} + +.public-citation-text { + line-height: 1.56; +} + +.public-citation-meta, +.public-citation-links { + color: var(--muted); + font-size: 0.92rem; +} + +.public-citation-links a { + color: var(--accent); +} + +.public-citation-abstract { + padding-top: 2px; + color: var(--muted); + line-height: 1.58; +} + +.legacy-source { + max-height: 28rem; + overflow: auto; + padding: 14px; + border-radius: 16px; + border: 1px solid var(--line); + background: rgba(255, 255, 255, 0.76); +} + pre { margin: 0; white-space: pre-wrap; @@ -417,6 +885,15 @@ pre { } @media (max-width: 960px) { + .site-header-inner { + flex-direction: column; + align-items: stretch; + } + + .site-nav { + justify-content: flex-start; + } + .workspace { grid-template-columns: 1fr; } @@ -424,4 +901,12 @@ pre { .species-list { max-height: 40vh; } + + .match-dialog-grid { + grid-template-columns: 1fr; + } + + .match-row { + grid-template-columns: 1fr; + } } diff --git a/docker-compose.yml b/docker-compose.yml index 28889c9..52200ff 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,6 @@ services: db: + container_name: ecospecies-db image: postgres:16-alpine environment: POSTGRES_DB: ecospecies @@ -17,6 +18,7 @@ services: - postgres_data:/var/lib/postgresql/data importer: + container_name: ecospecies-importer image: python:3.12-slim depends_on: db: @@ -30,11 +32,12 @@ services: command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"] volumes: - .:/workspace - - ../01-legacy-code-and-data:/legacy-data:ro + - ${ECOSPECIES_LEGACY_DATA_DIR:-../legacy-corpus}:/legacy-data:ro - python_venv:/workspace/.docker/venv - pip_cache:/root/.cache/pip api: + container_name: ecospecies-api image: python:3.12-slim restart: unless-stopped depends_on: @@ -56,11 +59,12 @@ services: - "${ECOSPECIES_API_PORT:-8000}:8000" volumes: - .:/workspace - - 
../01-legacy-code-and-data:/legacy-data:ro
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../legacy-corpus}:/legacy-data:ro
       - python_venv:/workspace/.docker/venv
       - pip_cache:/root/.cache/pip
 
   web:
+    container_name: ecospecies-web
     image: nginx:1.27-alpine
     restart: unless-stopped
     depends_on:
diff --git a/docs/citegeist-review-notes.md b/docs/citegeist-review-notes.md
new file mode 100644
index 0000000..981d5d9
--- /dev/null
+++ b/docs/citegeist-review-notes.md
@@ -0,0 +1,110 @@
+# CiteGeist Review Notes
+
+These notes capture parser issues seen while integrating CiteGeist-style extraction into EcoSpecies.
+
+## Report-style references
+
+Observed failure shape:
+
+- references like `Daniell, W.C. 1872. Letters referring ... Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.`
+- extracted `title` may contain the full raw bibliography string
+- abbreviated venue names such as `Comm. Rept.` are not separated cleanly from the title
+
+Suggested upstream change in `citegeist.extract`:
+
+- add a report-style parser path after year detection
+- prefer sentence-boundary venue detection before naive keyword splits so words like `report` inside a real title do not trigger an early cut
+- support abbreviation-heavy venue starters such as:
+  - `comm. rept.`
+  - `rept.`
+  - `proc.`
+  - `occas. pap.`
+  - `bulletin`
+  - `bull.`
+  - `memoir`
+- strip trailing volume/page blobs like `2: 387-390` from the venue field
+- when a first parse leaves a partial venue stub such as `Occas`, reparse the full raw reference line and prefer the fuller repaired venue/title split
+
+## Placeholder title merge behavior
+
+Observed failure shape:
+
+- a raw bibliography string may survive as `title` even after DOI/title resolution finds a better title
+
+Suggested upstream change in `citegeist.resolve.merge_entries_with_conflicts`:
+
+- treat titles that look like raw bibliography strings as placeholders
+- example heuristic:
+  - starts with `Surname, ... YEAR.`
+  - unusually long for a title
+  - contains a resolved shorter title as a substring after punctuation normalization
+
+## Legacy note deduplication
+
+Observed failure shape:
+
+- note fragments like `ecospecies_reference_number = {160}` can be appended more than once downstream when re-merging enriched metadata
+
+Suggested upstream change:
+
+- when joining note fragments, split on `;`, normalize whitespace, and dedupe per fragment rather than per whole note string
+
+## Unresolved entries should still refresh local parses
+
+Observed failure shape:
+
+- parser improvements may correctly rebuild `title`, venue, `volume`, `number`, and `pages`
+- but if no remote metadata source matches, the stored draft BibTeX can remain unchanged unless unresolved enrichment also writes the refreshed local seed back out
+
+Suggested upstream change:
+
+- unresolved enrichment should still return the rebuilt local draft entry
+- keep `citation_key`, normalized text, and draft BibTeX synchronized with the current local parser even when resolver status remains `unresolved`
+
+## Returned metadata not carried through
+
+Observed concern:
+
+- resolver/source payloads may include bibliographic details such as:
+  - `volume`
+  - `issue` / BibTeX `number`
+  - `page` / BibTeX `pages`
+- these should be preserved into the BibTeX entry whenever available
+
+Current note:
+
+- CiteGeist Crossref mapping already includes `volume`, `number`, and `pages`
+- verify that all resolver paths, storage round-trips, and exports preserve those fields consistently
+- OpenAlex/DataCite mappings should also be checked for analogous bibliographic fields in `biblio` / attribute payloads
+
+## False-positive title-search acceptance
+
+Observed failure shape:
+
+- title search can return a thematically related but bibliographically different work
+- downstream acceptance may keep some seed fields while adopting conflicting DOI/title/volume/pages from the returned match
+- this is especially risky for historical references with sparse or abbreviated venue names
+
+Suggested upstream change in `citegeist.resolve` and any title-search ranking path:
+
+- do not fall back to the first search hit when no strong title match exists
+- prefer exact or near-exact title matches only
+- reject a candidate when structured seed metadata conflicts on strong fields such as:
+  - `year`
+  - venue / journal
+  - `volume`
+  - `number`
+  - `pages`
+- treat those fields as match-validation inputs, not just merge-time metadata
+
+## OpenAlex null-source handling
+
+Observed failure shape:
+
+- some OpenAlex works have `primary_location` present but `source: null`
+- downstream mapping can crash if it assumes `source` is always a dictionary
+
+Suggested upstream change:
+
+- treat null `source` payloads as empty dictionaries
+- continue mapping title, year, DOI, and `biblio` fields even when venue/source is missing
diff --git a/docs/dc-orig.yml b/docs/dc-orig.yml
new file mode 100644
index 0000000..0f0c9b0
--- /dev/null
+++ b/docs/dc-orig.yml
@@ -0,0 +1,89 @@
+services:
+  db:
+    image: postgres:16-alpine
+    restart: unless-stopped
+    environment:
+      POSTGRES_DB: ${ECOSPECIES_DB_NAME:-ecospecies}
+      POSTGRES_USER: ${ECOSPECIES_DB_USER:-ecospecies}
+      POSTGRES_PASSWORD: ${ECOSPECIES_DB_PASSWORD:?set ECOSPECIES_DB_PASSWORD}
+      PGDATA: /var/lib/postgresql/data/pgdata
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${ECOSPECIES_DB_USER:-ecospecies} -d ${ECOSPECIES_DB_NAME:-ecospecies}"]
+      interval: 5s
+      timeout: 5s
+      retries: 10
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+
+  importer:
+    image: python:3.12-slim
+    restart: "no"
+    depends_on:
+      db:
+        condition: service_healthy
+    working_dir: /workspace
+    environment:
+      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/legacy-data/InputFiles - TXT}
+      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
+      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
+      PYTHONPATH: /workspace/apps/api/src
+    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
+    volumes:
+      - ..:/workspace
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
+      - python_venv:/workspace/.docker/venv
+      - pip_cache:/root/.cache/pip
+
+  api:
+    image: python:3.12-slim
+    restart: unless-stopped
+    depends_on:
+      db:
+        condition: service_healthy
+      importer:
+        condition: service_completed_successfully
+    working_dir: /workspace
+    environment:
+      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/legacy-data/InputFiles - TXT}
+      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
+      ECOSPECIES_HOST: 0.0.0.0
+      ECOSPECIES_PORT: "8000"
+      ECOSPECIES_AUTH_TOKENS: ${ECOSPECIES_AUTH_TOKENS:-}
+      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
+      PYTHONPATH: /workspace/apps/api/src
+    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-api.sh"]
+    volumes:
+      - ..:/workspace
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
+      - python_venv:/workspace/.docker/venv
+      - pip_cache:/root/.cache/pip
+
+  web:
+    image: nginx:1.27-alpine
+    restart: unless-stopped
+    depends_on:
+      api:
+        condition: service_started
+    labels:
+      - "traefik.enable=true"
+      - "traefik.docker.network=${TRAEFIK_NETWORK:-traefik-network}"
+      - "traefik.http.routers.ecospecies-atlas.rule=Host(`${ECOSPECIES_HOSTNAME:?set ECOSPECIES_HOSTNAME}`)"
+      - "traefik.http.routers.ecospecies-atlas.entrypoints=${TRAEFIK_ENTRYPOINTS:-websecure}"
+      - "traefik.http.routers.ecospecies-atlas.tls.certresolver=${TRAEFIK_CERTRESOLVER:-myresolver}"
+      - "traefik.http.services.ecospecies-atlas.loadbalancer.server.port=80"
+    volumes:
+      - ../apps/web:/usr/share/nginx/html:ro
+      - ../apps/web/nginx.conf:/etc/nginx/conf.d/default.conf:ro
+    networks:
+      - default
+      - traefik-network
+
+volumes:
+  postgres_data:
+  python_venv:
+  pip_cache:
+
+networks:
+  traefik-network:
+    external: true
+    name: ${TRAEFIK_NETWORK:-traefik-network}
diff --git a/docs/docker-compose-traefik.env.example b/docs/docker-compose-traefik.env.example
new file mode 100644
index 0000000..c9c13b7
--- /dev/null
+++ b/docs/docker-compose-traefik.env.example
@@ -0,0 +1,20 @@
+# Required
+ECOSPECIES_HOSTNAME=example.org
+ECOSPECIES_BASE_PATH=/apps/ecospecies
+ECOSPECIES_DB_PASSWORD=replace-with-strong-password
+
+# Optional database settings
+ECOSPECIES_DB_NAME=ecospecies
+ECOSPECIES_DB_USER=ecospecies
+
+# Optional application settings
+ECOSPECIES_AUTH_TOKENS=
+ECOSPECIES_DATA_DIR=/workspace/input-data/InputFiles
+
+# Optional host path to the legacy corpus if it is not at ../path-to-legacy-corpus
+ECOSPECIES_LEGACY_DATA_DIR=../path-to-legacy-corpus
+
+# Optional Traefik settings
+TRAEFIK_NETWORK=traefik-network
+TRAEFIK_ENTRYPOINTS=websecure
+TRAEFIK_CERTRESOLVER=myresolver
diff --git a/docs/docker-compose-traefik.yml b/docs/docker-compose-traefik.yml
new file mode 100644
index 0000000..70f425e
--- /dev/null
+++ b/docs/docker-compose-traefik.yml
@@ -0,0 +1,93 @@
+services:
+  db:
+    container_name: ecospecies-db
+    image: postgres:16-alpine
+    restart: unless-stopped
+    environment:
+      POSTGRES_DB: ${ECOSPECIES_DB_NAME:-ecospecies}
+      POSTGRES_USER: ${ECOSPECIES_DB_USER:-ecospecies}
+      POSTGRES_PASSWORD: ${ECOSPECIES_DB_PASSWORD:?set ECOSPECIES_DB_PASSWORD}
+      PGDATA: /var/lib/postgresql/data/pgdata
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${ECOSPECIES_DB_USER:-ecospecies} -d ${ECOSPECIES_DB_NAME:-ecospecies}"]
+      interval: 5s
+      timeout: 5s
+      retries: 10
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+
+  importer:
+    container_name: ecospecies-importer
+    image: python:3.12-slim
+    restart: "no"
+    depends_on:
+      db:
+        condition: service_healthy
+    working_dir: /workspace
+    environment:
+      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/workspace/input-data/InputFiles}
+      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
+      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
+      PYTHONPATH: /workspace/apps/api/src
+    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
+    volumes:
+      - ..:/workspace
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
+      - python_venv:/workspace/.docker/venv
+      - pip_cache:/root/.cache/pip
+
+  api:
+    container_name: ecospecies-api
+    image: python:3.12-slim
+    restart: unless-stopped
+    depends_on:
+      db:
+        condition: service_healthy
+      importer:
+        condition: service_completed_successfully
+    working_dir: /workspace
+    environment:
+      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/workspace/input-data/InputFiles}
+      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
+      ECOSPECIES_HOST: 0.0.0.0
+      ECOSPECIES_PORT: "8000"
+      ECOSPECIES_AUTH_TOKENS: ${ECOSPECIES_AUTH_TOKENS:-}
+      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
+      PYTHONPATH: /workspace/apps/api/src
+    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-api.sh"]
+    volumes:
+      - ..:/workspace
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
+      - python_venv:/workspace/.docker/venv
+      - pip_cache:/root/.cache/pip
+
+  web:
+    container_name: ecospecies-web
+    image: nginx:1.27-alpine
+    restart: unless-stopped
+    depends_on:
+      api:
+        condition: service_started
+    labels:
+      - "traefik.enable=true"
+      - "traefik.docker.network=${TRAEFIK_NETWORK:-traefik-network}"
+      - "traefik.http.routers.ecospecies-atlas.rule=Host(`${ECOSPECIES_HOSTNAME:?set ECOSPECIES_HOSTNAME}`) && PathPrefix(`${ECOSPECIES_BASE_PATH:-/}`)"
+      - "traefik.http.routers.ecospecies-atlas.entrypoints=${TRAEFIK_ENTRYPOINTS:-websecure}"
+      - "traefik.http.routers.ecospecies-atlas.tls.certresolver=${TRAEFIK_CERTRESOLVER:-myresolver}"
+      - "traefik.http.services.ecospecies-atlas.loadbalancer.server.port=80"
+    volumes:
+      - ../apps/web:/usr/share/nginx/html:ro
+      - ../apps/web/nginx.conf:/etc/nginx/conf.d/default.conf:ro
+    networks:
+      - default
+      - traefik-network
+
+volumes:
+  postgres_data:
+  python_venv:
+  pip_cache:
+
+networks:
+  traefik-network:
+    external: true
+    name: ${TRAEFIK_NETWORK:-traefik-network}
diff --git a/docs/postgres-backup.md b/docs/postgres-backup.md
new file mode 100644
index 0000000..2187ee9
--- /dev/null
+++ b/docs/postgres-backup.md
@@ -0,0 +1,48 @@
+# PostgreSQL Backup Notes
+
+This note applies to deployments that use the PostgreSQL volume defined by the Compose stack, including the Traefik deployment variant.
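+
+The sections below list what to protect and show the logical dump workflow. As a scheduling illustration only, a nightly cron entry could invoke the wrapper script; the 02:30 time, the `/opt/ecospecies` checkout path, and the backup/log destinations are assumptions, not repo conventions:
+
+```bash
+# Hypothetical crontab entry: nightly logical dump with a dated filename.
+# Percent signs are escaped because cron treats a bare % as a line separator.
+30 2 * * * cd /opt/ecospecies && ./scripts/backup-postgres.sh /var/backups/ecospecies/ecospecies-$(date +\%F).sql >> /var/log/ecospecies-backup.log 2>&1
+```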
+ +## What Needs Backup + +At minimum, back up: + +- the PostgreSQL data volume +- the deployment env file that contains the database credentials + +For the Traefik deployment variant, that usually means: + +- the Docker volume `postgres_data` +- `docs/docker-compose-traefik.env` + +## Logical Backup + +From the repository root, create a SQL dump with: + +```bash +./scripts/backup-postgres.sh +``` + +To write to a specific file: + +```bash +./scripts/backup-postgres.sh /path/to/ecospecies-backup.sql +``` + +## Restore From Logical Backup + +Restore a SQL dump with: + +```bash +./scripts/restore-postgres.sh /path/to/ecospecies-backup.sql +``` + +## Volume-Level Backup + +If the host backup system can snapshot Docker volumes safely, include the PostgreSQL volume in that schedule. A volume snapshot is useful for full recovery, but a logical dump is still recommended for portability and validation. + +## Operational Guidance + +- Run backups on a schedule instead of relying on ad hoc dumps. +- Test restore procedures before relying on the backup policy. +- Keep backup artifacts outside the live Docker host when possible. +- The backup and restore scripts default to `docs/docker-compose-traefik.env` and `docs/docker-compose-traefik.yml`, but both can be overridden with `ECOSPECIES_ENV_FILE` and `ECOSPECIES_COMPOSE_FILE`. diff --git a/docs/roadmap.md b/docs/roadmap.md index 7d25664..cbdaa49 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -1,5 +1,22 @@ # EcoSpecies Modernization Roadmap +## Current Status + +As of 2026-03-27, the repo is no longer at the pure planning stage. The following pieces are already implemented and working in the live stack: + +- Docker Compose deployment with explicit `ecospecies-...` container names +- path-based hosting support for `/apps/ecospecies` +- in-repo-only source directory resolution with safe path validation +- legacy SLH ingest into PostgreSQL-backed species, sections, citations, audit, and document records +- editor/admin workflows for draft, review, publish, archive, and audit history +- contributor registration and draft-authoring workflow with token-based access +- structured Markdown document storage and editor/API round-trip +- persisted taxon identifier scaffolding with legacy identifiers separated from future-facing external identifiers +- citation extraction, review, enrichment, batch enrichment, candidate matching, and reviewed-candidate selection/addition +- citation persistence back into the structured Markdown source of truth + +The roadmap below has been updated to reflect that actual state. + ## Target Product Create a Docker Compose-based, open-source EcoSpecies successor that: @@ -31,48 +48,91 @@ Create a Docker Compose-based, open-source EcoSpecies successor that: ### Phase 0: Discovery and migration planning +Status: completed + - Inventory legacy assets and user-facing capabilities. - Capture the replacement architecture and ingestion strategy. - Define acknowledgements, provenance, and licensing boundaries. ### Phase 1: Ingestion foundation +Status: substantially complete, with parser refinement ongoing + - Parse legacy `.txt` SLH inputs into structured JSON records. -- Normalize common metadata: title, scientific name, common name, FLELMR code, headings, references. +- Normalize common metadata: title, scientific name, common name, FLELMR/EcoSpecies code, headings, references. - Create ingest diagnostics to flag malformed files and missing metadata. 
+- Continue parser refinement for legacy edge cases in headings, citations, and historical bibliography formats. ### Phase 2: Public read experience +Status: implemented baseline + - Species listing and search. - Species detail view with section navigation. - Provenance and acknowledgement display. - Summary metrics on corpus coverage. +- Path-based deployment under `/apps/ecospecies`. -### Phase 3: Structured persistence +### Phase 3: Structured persistence and editorial workflow -- Move parsed content into PostgreSQL. -- Add editor-safe import jobs and audit metadata. -- Preserve raw source alongside normalized records. -- Establish authentication and role-based access for editor and admin workflows. -- Add persisted editorial workflow state for draft, review, and published records. -- Make document sections individually addressable for editor review and revision, with audit history for section-level changes. +Status: implemented baseline, with editor UX still maturing -### Phase 4: Linkages and visualization +- PostgreSQL-backed persistence for species, sections, citations, documents, taxon identifiers, and audit history. +- Editor-safe import jobs and audit metadata. +- Raw-source preservation alongside normalized records. +- Authentication and role-based access for admin/editor/contributor workflows. +- Persisted editorial workflow state for draft, review, published, and archived records. +- Structured Markdown document storage and round-trip editing. +- Citation review, enrichment, candidate selection, and reviewed-candidate addition. +- Contributor draft creation and owner-scoped editing. + +### Phase 4: Standards-aware identity and bibliography + +Status: partially implemented + +- Preserve legacy local identifiers as provenance. +- Persist taxon identifiers separately from legacy identifiers. +- Expose `legacy_identifiers`, `taxon_identifiers`, and `primary_taxon_*` API fields. +- Persist structured citation records with DOI/OpenAlex/DataCite-style enrichment fields. +- Continue toward multi-authority identifier review, richer citation entities, and CiteGeist-backed bibliography expansion. + +### Phase 5: Editor ergonomics and advanced review + +Status: in progress + +- Structured Markdown editor is live. +- Citation match-review dialog is live. +- Remaining work: + - CodeMirror-based Markdown editor with folding + - inline parser diagnostics in the editor + - richer citation diff/review affordances + - clearer document-node and citation provenance in the UI + +### Phase 6: Linkages and visualization + +Status: not started - Model predator/prey, habitat, and ecological association edges. - Add graph endpoints and species-relationship views. - Support public-friendly visual explanations and expert filters. -### Phase 5: Reports and export +### Phase 7: Reports and export -- Recreate legacy-like text/RTF export. -- Add machine-readable export formats such as JSON and Markdown. -- Support FLELMR-oriented authoring/export profiles. +Status: partially implemented -### Phase 6: Assisted research workflows +- JSON and Markdown exports exist through the API/document model. +- Structured Markdown is now the primary human-readable editor/export format. +- Remaining work: + - recreate legacy-like text/RTF export + - support export profiles for legacy compatibility and standards-forward outputs + - improve citation/bibliography export fidelity + +### Phase 8: Assisted research workflows + +Status: planned - Add local-LLM-assisted extraction and drafting in a human-review loop. 
-- Integrate bibliography tooling for citation consolidation. +- Integrate bibliography tooling for citation consolidation and topic expansion. - Support candidate-species intake for records not yet in the historical corpus. - Restrict assisted drafting and publication actions to authenticated editorial roles. @@ -84,6 +144,9 @@ Initial core entities: - `source_document` - `document_section` - `citation` +- `taxon_identifier` +- `citation_identifier` +- `bibliography_topic` - `taxon` - `linkage` - `media_asset` @@ -95,6 +158,7 @@ Key design rules: - retain provenance and import timestamps - separate public published records from draft/editor states - make sections addressable for citation and graph linking +- prefer a canonical document AST over direct projection from free-form source text ## LLM Extension Strategy @@ -103,6 +167,8 @@ Use local models only for assistive tasks, never silent publication: - extracting candidate structured fields from new SLH text - suggesting missing headings or linkage labels - clustering similar citations +- resolving bibliography entries toward DOI/OpenAlex/DataCite where available +- treating local legacy codes as provenance, not canonical identifiers - drafting summaries for editor review Guardrails: @@ -111,16 +177,19 @@ Guardrails: - all generated content is marked as draft - every automated extraction stores source spans where possible -## Development Roadmap +## Near-Term Priorities -1. Implement a thin ingestion API over the legacy text corpus. -2. Build a responsive browser UI for listing and viewing species. -3. Add a persistent PostgreSQL-backed ingest store. -4. Introduce export and visualization services. -5. Add editorial workflows and local-LLM assistance. +1. Add CodeMirror-based folding and structure-aware editing to the Markdown document editor. +2. Expand taxon identifier review workflows for WoRMS, GBIF, Catalogue of Life, and related authorities. +3. Deepen citation quality controls, including better parsed-field visibility and stricter/manual review loops where resolver confidence is weak. +4. Add CiteGeist-style topic expansion and bibliography-suggestion review for under-cited species. +5. Improve document export fidelity so reviewed citations and standards-based identifiers are clearly represented in Markdown and downstream exports. +6. Begin the first ecological-linkage data model and API endpoints once citation/identifier workflows stabilize. ## Definition Of Done For The Initial Milestone - `docker compose up` starts a working API and frontend. -- The system can enumerate the legacy corpus and show parsed species detail for at least one real SLH file. -- Project docs describe the migration approach, target architecture, and next phases. +- The system can enumerate the legacy corpus and show parsed species detail for real SLH files. +- Editors can curate structured Markdown documents and citations through authenticated workflows. +- Contributors can register, create drafts, and edit only their own submissions. +- Project docs describe both the implemented modernization state and the next phases. diff --git a/docs/standards-migration-plan.md b/docs/standards-migration-plan.md new file mode 100644 index 0000000..99dbe6a --- /dev/null +++ b/docs/standards-migration-plan.md @@ -0,0 +1,315 @@ +# EcoSpecies Standards Migration Plan + +## Problem + +The current EcoSpecies ingest and document model still treats legacy local fields such as `FLELMR code` / `species_code` as if they were primary identifiers. 
That is useful for historical provenance, but it is the wrong long-term center of gravity for a broader, modern biodiversity knowledge system. + +The same problem exists for citations: + +- legacy plaintext reference blocks are treated as local document text, +- citation identity is weak or missing, +- bibliography growth is tied to what happened to appear in the historical SLH file. + +The new system should preserve legacy local identifiers and references, but it should not be structurally bound to them. + +## Direction + +Treat legacy local codes and freeform references as import-era artifacts, not canonical future-facing identifiers. + +Going forward, EcoSpecies should prefer broadly recognized identifiers and registries: + +- taxonomic name authority and taxon identifiers: + - Catalogue of Life IDs and release DOIs + - GBIF taxon keys + - WoRMS AphiaIDs for marine taxa + - ITIS TSNs where relevant + - optional NCBI Taxonomy IDs for research interoperability +- literature and dataset identifiers: + - DOI as the primary publication/dataset identifier + - ISBN/ISSN where DOI is absent + - OpenAlex IDs and DataCite metadata as enrichment layers +- contributor identity: + - email-based local contributor accounts now + - optional ORCID linkage later for editor and contributor identity + +The system should be marine-forward because that matches the historical corpus, but not marine-exclusive. Identifier strategy should therefore be authority-aware rather than tied to a single domain-specific registry. + +## Authority Selection Strategy + +Choose the primary taxon authority by best-fit coverage, not by a single global rule. + +- marine taxa: + - prefer WoRMS AphiaID as primary when confidently matched + - retain GBIF and Catalogue of Life as crosswalks +- non-marine or mixed-domain taxa: + - prefer Catalogue of Life or GBIF as primary, depending on match quality and coverage + - retain ITIS and other relevant identifiers as crosswalks +- unresolved or conflicting cases: + - store all candidate identifiers + - require editorial review before a primary identifier is asserted + +This keeps the project ready for terrestrial expansion without discarding the value of WoRMS for the present corpus. + +## Important Taxonomic Note + +PhyloCode is relevant for clade naming, not as a general-purpose replacement for species-level registry IDs. It should not become the primary EcoSpecies species identifier layer. It may be useful later for clade-aware ontology and higher-level phylogenetic naming, but not as the main substitute for local `species_code` values. + +## Core Design Rules + +1. Legacy local identifiers remain preserved exactly as imported. +2. Canonical taxon identity becomes multi-authority, not single-local-code. +3. Citations become first-class structured entities, not just text inside a section. +4. Bibliographies can be extended by topic and citation graph, not only by source-document inheritance. +5. Exports keep provenance visible so readers can distinguish legacy source metadata from normalized external identifiers. + +## Schema Changes + +### Species metadata + +Retain `flelmr_code` for provenance, but demote it to a legacy metadata field. 
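+
+One way to realize that demotion is a one-time backfill that copies each imported code into the identifier layer as provenance. A minimal sketch; the table and column names are assumptions for discussion, not the settled schema:
+
+```python
+# Hypothetical backfill: mirror each legacy FLELMR code into the
+# provenance-oriented identifier layer. Table and column names are
+# illustrative, not the settled schema.
+COPY_LEGACY_CODES = """
+    INSERT INTO taxon_identifier
+        (species_id, authority, identifier, label, is_primary)
+    SELECT id, 'legacy-ecospecies', flelmr_code, 'FLELMR', FALSE
+    FROM species
+    WHERE flelmr_code IS NOT NULL
+"""
+
+
+def backfill_legacy_identifiers(conn) -> None:
+    """Run once during migration; pair with a uniqueness guard to stay re-runnable."""
+    with conn.cursor() as cur:
+        cur.execute(COPY_LEGACY_CODES)
+    conn.commit()
+```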
+ +Add a taxon-identity layer: + +- `taxon_name_usage` +- `taxon_identifier` +- `taxon_authority` +- `taxon_match_review` + +Suggested fields: + +- `taxon_identifier.authority` +- `taxon_identifier.identifier` +- `taxon_identifier.rank` +- `taxon_identifier.label` +- `taxon_identifier.is_primary` +- `taxon_identifier.source_url` +- `taxon_identifier.asserted_by` +- `taxon_identifier.match_confidence` +- `taxon_identifier.review_status` + +Examples: + +- `authority = "worms", identifier = "159059", label = "AphiaID"` +- `authority = "gbif", identifier = "2290910", label = "taxonKey"` +- `authority = "col", identifier = "5T7L7", label = "taxonID"` +- `authority = "itis", identifier = "161989", label = "TSN"` +- `authority = "legacy-ecospecies", identifier = "5192", label = "FLELMR"` + +### Citation model + +Move from section text to structured bibliography entities: + +- `citation` +- `citation_identifier` +- `citation_relation` +- `species_citation` +- `document_node_citation` +- `bibliography_topic` + +Suggested citation identifier types: + +- DOI +- ISBN +- ISSN +- PMID +- arXiv +- OpenAlex +- URL + +## Markdown / AST Changes + +Update the constrained Markdown profile so metadata stops implying that `species_code` is canonical. + +Replace the current front matter recommendation: + +```md +species_code: 5192 +``` + +with a provenance-oriented shape: + +```md +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 5192 + label: FLELMR +taxon_identifiers: + - authority: worms + identifier: 159059 + label: AphiaID + primary: true + - authority: gbif + identifier: 2290910 + label: taxonKey +``` + +Also add explicit bibliography sections: + +```md +## References + +- id: doi:10.1000/example + text: Smith, J. 2024. Example paper... + relation: cites + +## Suggested Reading + +- topic: estuarine ecology +``` + +The AST should preserve: + +- legacy identifiers +- normalized taxon identifiers +- structured references +- topic links used for bibliography expansion + +## Import Pipeline Changes + +### Species identity + +Import should produce: + +1. raw imported name fields, +2. legacy local identifiers, +3. unresolved candidate taxon identifiers, +4. optional matched external identifiers, +5. a review state for unresolved or conflicting authority matches. + +Do not block ingest if no external authority match exists. Store the unresolved state explicitly. + +Primary identifier assignment should be determined by: + +1. domain fit of the authority +2. confidence of the match +3. editorial review status +4. future ability to crosswalk to other authorities + +### Citations + +Split citation processing into stages: + +1. detect bibliography/reference sections in the imported SLH text, +2. extract plaintext reference strings, +3. convert plaintext references into draft structured entries, +4. enrich identifiers and metadata, +5. assign accepted citations back to species and document nodes, +6. optionally expand bibliography by topic and citation graph. + +## CiteGeist Integration + +`../CiteGeist` is a strong fit for this migration. + +Observed capabilities in that repo already cover much of what EcoSpecies needs: + +- extracting references from plaintext, +- converting rough references into draft structured entries, +- DOI/Crossref/DataCite/OpenAlex enrichment, +- citation graph expansion, +- topic-based bibliography expansion, +- duplicate clustering and canonicalization. + +### Recommended integration boundary + +Do not embed CiteGeist logic directly into the EcoSpecies parser. 
+
+Instead:
+
+1. EcoSpecies exports candidate plaintext references and topic phrases.
+2. CiteGeist processes and enriches them into structured bibliography data.
+3. EcoSpecies imports reviewed citation outputs into its own `citation` tables.
+
+### First integration targets
+
+- species-level bibliography cleanup from `References` sections
+- DOI resolution and identifier assignment
+- duplicate detection across species bibliographies
+- topic expansion for subject areas such as habitat, trophic ecology, reproduction, invasive biology, and fisheries context
+
+### Later integration targets
+
+- node-level citation attachment
+- bibliography review UI
+- suggested-reading generation per species
+- topic-seeded bibliography augmentation for under-cited species drafts
+
+## API Changes
+
+Add standards-aware endpoints:
+
+- `/api/species/<slug>/identifiers`
+- `/api/species/<slug>/citations`
+- `/api/species/<slug>/bibliography/topics`
+- `/api/editor/species/<slug>/identifier-review`
+- `/api/editor/species/<slug>/citation-review`
+
+Do not remove legacy fields immediately. Keep `flelmr_code` in payloads for compatibility while introducing:
+
+- `legacy_identifiers`
+- `taxon_identifiers`
+- `primary_taxon_identifier`
+
+## UI Changes
+
+The species detail page should distinguish:
+
+- scientific name
+- primary external taxon identifier
+- legacy local identifiers
+- bibliography
+- suggested reading
+
+Editors should see:
+
+- unresolved authority matches
+- conflicting taxon IDs
+- citation enrichment candidates
+- duplicate-reference clusters
+
+Contributors should only author content and draft references; identifier normalization and bibliography publication remain editorial functions.
+
+## Migration Phases
+
+### Phase A: Demote legacy code
+
+- Rename internal presentation from “species code” to “legacy identifier”.
+- Keep `flelmr_code` only as legacy provenance.
+- Add `legacy_identifiers` to Markdown export and AST.
+
+### Phase B: Add external taxon identifiers
+
+- Create taxon-identifier tables and API payloads.
+- Add editor review workflows for selecting a primary authority identifier.
+- Default marine taxa review toward WoRMS where available.
+- Default broader cross-domain review toward Catalogue of Life and GBIF where WoRMS is not the right authority.
+- Keep the model open to terrestrial species from the beginning rather than treating them as out-of-scope exceptions.
+
+### Phase C: Structured bibliography
+
+- Create citation tables.
+- Extract plaintext references from imported documents.
+- Store draft citations separately from accepted citations.
+
+### Phase D: CiteGeist bridge
+
+- Define import/export format between EcoSpecies and CiteGeist.
+- Run draft-reference normalization and DOI enrichment.
+- Import reviewed structured citations back into EcoSpecies.
+
+### Phase E: Topic-aware bibliography growth
+
+- Store species topic phrases.
+- Use CiteGeist topic expansion for bibliography augmentation.
+- Keep added citations flagged by source type:
+  - imported
+  - resolved
+  - topic-expanded
+  - editor-added
+
+## Immediate Next Steps
+
+1. Update the Markdown profile to replace `species_code` with `legacy_identifiers` plus `taxon_identifiers`.
+2. Add `legacy_identifiers` and `taxon_identifiers` to the AST/document model.
+3. Introduce taxon identifier tables in the PostgreSQL schema.
+4. Define a minimal EcoSpecies-to-CiteGeist interchange format for plaintext references and topic phrases, as sketched below.
+5. Add editor-facing citation review before attempting automatic bibliography publication.
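+
+A first sketch of that interchange record, assuming one JSON object per species so the export stays streamable and easy to review. The field names are placeholders for discussion, not a settled contract:
+
+```python
+import json
+
+# Hypothetical EcoSpecies -> CiteGeist export record. Field names are
+# placeholders; the reference text reuses an example from this plan.
+export_record = {
+    "species_slug": "american-oyster",  # assumed slug form
+    "plaintext_references": [
+        "Ahmed, M. 1975. Speciation in living oysters. "
+        "Advances in Marine Biology 13:357-397.",
+    ],
+    "topic_phrases": ["estuarine ecology", "trophic ecology"],
+}
+print(json.dumps(export_record))
+```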
diff --git a/docs/structured-markdown-plan.md b/docs/structured-markdown-plan.md new file mode 100644 index 0000000..40ccaa3 --- /dev/null +++ b/docs/structured-markdown-plan.md @@ -0,0 +1,338 @@ +# Structured Markdown Document Plan + +## Goal + +Replace the current flat, parser-heavy free-form text handling with a document model that is: + +- human-readable in plaintext +- editable in the browser with hierarchy folding +- permissive-license friendly +- suitable for first-pass conversion from legacy SLH text files +- suitable as the primary export format for a species life history +- able to project cleanly into a flexible database model with greater hierarchical depth + +## Recommendation + +Adopt a constrained Markdown-based authoring format as the primary human-facing document format, backed by an internal hierarchical document AST and a relational projection layer in PostgreSQL. + +Use this three-layer model: + +1. Source and export format: constrained EcoSpecies Markdown +2. Canonical application representation: hierarchical AST +3. Database representation: relational projection for querying, indexing, publishing, and editorial workflows + +This avoids treating raw free-form text as both the storage format and the parser input. + +## Why Markdown Instead Of Org + +Markdown is the better fit for this codebase and licensing requirement because: + +- it is familiar to most users +- it is easier to constrain than Org +- it maps naturally to hierarchical headings +- it works well with CodeMirror folding +- it does not require adopting GPL or AGPL editor code + +Org-style authoring remains conceptually attractive, but embedding Org-specific tooling such as organice would introduce copyleft code, which is not aligned with a permissive-only implementation strategy. + +## EcoSpecies Markdown Profile + +The format should be Markdown-like, but intentionally narrower than unrestricted Markdown. + +### Metadata + +Use YAML front matter for canonical metadata fields: + +```md +--- +title: American Oyster +common_name: American Oyster +scientific_name: Crassostrea virginica +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 5192 + label: FLELMR +taxon_identifiers: + - authority: worms + identifier: 159059 + label: AphiaID + primary: true +source_file: American Oyster SLH NOAA SEA.txt +publication_status: published +--- +``` + +Recommended canonical fields: + +- `title` +- `common_name` +- `scientific_name` +- `legacy_identifiers` +- `taxon_identifiers` +- `primary_taxon_authority` +- `source_file` +- `publication_status` +- `source_format` +- `legacy_import_id` + +### Hierarchy + +Use headings as the sole structure-bearing primitive. + +Example: + +```md +--- +title: American Oyster +common_name: American Oyster +scientific_name: Crassostrea virginica +legacy_identifiers: + - authority: legacy-ecospecies + identifier: 5192 + label: FLELMR +--- + +## Summary +Short editor-reviewed abstract. + +## Habitat + +### Type +Estuarine. + +### Substrate +Hard bottom, shell, mud flats, and other suitable settlement surfaces. + +## Reproduction + +### Season +Spawning occurs from spring through fall in much of the Gulf. +``` + +Rules: + +- Heading depth is meaningful. +- Skip-level headings should be rejected or normalized. +- Body text belongs to the nearest preceding heading. +- `#` level is optional if the document title already exists in front matter. +- Tables, lists, and citations are allowed only where explicitly supported. +- Arbitrary embedded HTML should be disallowed. 
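+
+The skip-level rule above is cheap to enforce mechanically. A minimal sketch, assuming headings are already extracted as `(depth, title)` pairs in document order; the function name and message format are illustrative:
+
+```python
+def check_heading_depths(headings: list[tuple[int, str]]) -> list[str]:
+    """Flag skip-level headings, given (depth, title) pairs in document order."""
+    problems: list[str] = []
+    previous_depth = 1  # the front-matter title acts as the implicit level-1 node
+    for depth, title in headings:
+        if depth > previous_depth + 1:
+            problems.append(
+                f"heading '{title}' jumps from depth {previous_depth} to {depth}"
+            )
+        previous_depth = depth
+    return problems
+
+
+# '## Habitat' followed directly by '#### Substrate' skips depth 3 and is flagged.
+print(check_heading_depths([(2, "Habitat"), (4, "Substrate")]))
+```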
+ +### Citations + +Keep citations readable in Markdown but structured enough to parse. + +Preferred first-pass shape: + +```md +## Citations + +- [7] Ahmed, M. 1975. Speciation in living oysters. Advances in Marine Biology 13:357-397. +- [15] Andrews, J.D. 1979. Pelecypoda: Ostreidae. Reproduction of Marine Invertebrates... +``` + +This is intentionally simpler than trying to infer citations from arbitrary prose. + +## Canonical AST + +Markdown should not be the sole internal representation. Parse it into an AST that preserves hierarchy explicitly. + +Example conceptual shape: + +```json +{ + "metadata": { + "title": "American Oyster", + "common_name": "American Oyster", + "scientific_name": "Crassostrea virginica", + "legacy_identifiers": [ + { + "authority": "legacy-ecospecies", + "identifier": "5192", + "label": "FLELMR" + } + ] + }, + "nodes": [ + { + "id": "n1", + "type": "section", + "depth": 2, + "title": "Summary", + "body": "Short editor-reviewed abstract.", + "children": [] + }, + { + "id": "n2", + "type": "section", + "depth": 2, + "title": "Habitat", + "body": "", + "children": [ + { + "id": "n3", + "type": "section", + "depth": 3, + "title": "Type", + "body": "Estuarine.", + "children": [] + } + ] + } + ] +} +``` + +Required AST properties: + +- arbitrary hierarchical depth +- stable node identifiers +- separate metadata from body structure +- support for editor audit and provenance +- support for extracting source spans from imported legacy text when available + +## Database Direction + +The current flat `document_section` model should evolve into a general document tree. + +Suggested core tables: + +- `species_document` +- `species_document_node` +- `species_document_node_revision` +- `species_document_metadata` +- `citation` +- `species_document_export` + +Suggested `species_document_node` fields: + +- `id` +- `document_id` +- `parent_id` +- `position` +- `depth` +- `node_type` +- `title` +- `body_markdown` +- `body_plaintext` +- `source_heading` +- `source_span_start` +- `source_span_end` + +This enables: + +- greater hierarchical depth +- stable editor operations on subtrees +- future insertion of machine-extracted nested content +- simplified export back to Markdown + +## Import Flow + +The legacy text parser should no longer attempt to infer the final database structure directly. + +Instead: + +1. Parse raw legacy text into a best-effort intermediate tree. +2. Normalize extracted metadata. +3. Emit constrained Markdown. +4. Parse constrained Markdown into AST. +5. Persist AST and project relationally. +6. Record diagnostics on uncertain conversions. + +This changes the parser’s role from “infer final structure perfectly” to “produce a reviewable first draft”. + +## Editor Flow + +The web editor should operate primarily on the Markdown representation, with a structured parse running on save or preview. + +Recommended behavior: + +- fold by heading depth in CodeMirror +- validate front matter and heading structure +- preview rendered sections +- show parser diagnostics inline +- save both Markdown source and parsed AST + +The editor should reject or flag: + +- invalid front matter +- duplicate canonical metadata keys +- heading depth jumps +- malformed citation entries in structured sections + +## Export Policy + +Markdown should be the primary export format for a species life history. 
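+
+A minimal sketch of the projection from that node shape back to constrained Markdown, assuming only section nodes; metadata emission, citations, and error handling are omitted:
+
+```python
+def nodes_to_markdown(nodes: list[dict]) -> list[str]:
+    """Flatten section nodes from the AST sketch above into Markdown blocks."""
+    blocks: list[str] = []
+    for node in nodes:
+        blocks.append("#" * node["depth"] + " " + node["title"])
+        if node.get("body"):
+            blocks.append(node["body"])
+        blocks.extend(nodes_to_markdown(node.get("children", [])))
+    return blocks
+
+
+# Joining with blank lines yields the constrained Markdown body:
+# markdown = "\n\n".join(nodes_to_markdown(ast["nodes"]))
+```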
+ +Export targets: + +- constrained Markdown for editorial interchange +- JSON AST for machine workflows +- derived relational/API payloads for the application +- optional report-oriented exports later + +The export path should be: + +- database document tree -> canonical AST -> constrained Markdown + +This ensures the exported plaintext remains stable and human-readable. + +## Migration Strategy + +### Stage 1: Introduce the document model + +- add AST schema and persistence tables +- keep existing section-based reads working +- build Markdown import/export helpers + +### Stage 2: Convert current parser output + +- map current parsed sections into Markdown drafts +- preserve existing metadata and diagnostics +- store generated Markdown alongside current records + +### Stage 3: Introduce Markdown editor + +- add CodeMirror-based editor with heading folding +- add validation for front matter and heading structure +- add round-trip save through AST + +### Stage 4: Move public reads to the new document model + +- generate current API responses from the hierarchical document tree +- keep compatibility shims for legacy flat sections where needed + +### Stage 5: Expand structured extraction + +- add deeper parsing for habitat, reproduction, citations, and linkages +- add richer projections from AST to relational tables + +## Immediate Implementation Tasks + +Recommended first engineering tasks: + +1. Define the constrained Markdown grammar and validation rules. +2. Design the AST schema and PostgreSQL tables. +3. Add Markdown import/export utilities in the API service. +4. Prototype a CodeMirror editor with heading folding. +5. Add a migration command that converts current species records into Markdown drafts. +6. Preserve current endpoints while introducing the document-tree backing model. + +## Non-Goals For The First Pass + +- full unrestricted Markdown feature support +- WYSIWYG editing +- arbitrary embedded HTML +- perfect citation parsing from all legacy free text +- replacing every existing API shape immediately + +## Decision Summary + +The planned direction is: + +- constrained Markdown as the editable and exportable document format +- internal AST as the canonical application representation +- relational projection for queryable application state +- CodeMirror-based browser editing with heading folding + +This is the most practical path toward human-editable hierarchy, permissive-only implementation, cleaner parsing, and deeper long-term document structure. diff --git a/docs/traefik-deploy.md b/docs/traefik-deploy.md new file mode 100644 index 0000000..647bf4f --- /dev/null +++ b/docs/traefik-deploy.md @@ -0,0 +1,79 @@ +# Traefik Deployment Notes + +This note applies to the reverse-proxy deployment variant in `docs/docker-compose-traefik.yml`. 
+ +## Start The Stack + +From the repository root: + +```bash +cp docs/docker-compose-traefik.env.example docs/docker-compose-traefik.env +# edit docs/docker-compose-traefik.env +docker compose \ + --env-file docs/docker-compose-traefik.env \ + -f docs/docker-compose-traefik.yml \ + up -d +``` + +## Common Failure Modes + +### Traefik cannot reach the web container + +Check: + +- the external Docker network named by `TRAEFIK_NETWORK` exists +- the Traefik instance is attached to that same Docker network +- the hostname in `ECOSPECIES_HOSTNAME` matches the Traefik router rule you expect +- the path in `ECOSPECIES_BASE_PATH` matches the published application prefix, for example `/apps/ecospecies` + +### The site opens but the API fails + +Check: + +- the `api` service is healthy and running +- the `web` service is using the repo's `apps/web/nginx.conf` +- the `api` service finished waiting for `importer` +- the request path is under `ECOSPECIES_BASE_PATH` if you are publishing the app below a domain root + +### Importer fails on startup + +Check: + +- `ECOSPECIES_LEGACY_DATA_DIR` points to a real host path +- that path contains `InputFiles - TXT` +- the mount is readable by Docker on the target host + +### Database does not initialize + +Check: + +- `ECOSPECIES_DB_PASSWORD` is set +- the PostgreSQL volume is writable +- an old incompatible volume is not being reused unintentionally + +### Editor login works but no editor state is available + +Check: + +- `ECOSPECIES_AUTH_TOKENS` is set on the `api` service +- the token you entered matches the configured value exactly + +## Operational Notes + +- This deployment variant intentionally exposes only the `web` container to Traefik. +- The `api`, `db`, and `importer` services stay on the internal Compose network. +- The `importer` runs before the API starts and seeds or synchronizes the dataset. +- The web container serves both the domain root and `/apps/ecospecies/`, but the Traefik router should target the intended public path. + +## Apache Front Door + +If Apache is the public front door for the hostname in `ECOSPECIES_HOSTNAME`, it must proxy the configured `ECOSPECIES_BASE_PATH` onward. Otherwise Apache can return its own `Not Found` page before the EcoSpecies stack sees the request. + +Example Apache directives: + +```apache +ProxyPass /apps/ecospecies http://127.0.0.1:80/apps/ecospecies +ProxyPassReverse /apps/ecospecies http://127.0.0.1:80/apps/ecospecies +``` + +Point the backend address at the actual Traefik listener on the host if it is not `127.0.0.1:80`, and adjust the published path if `ECOSPECIES_BASE_PATH` is different. 
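+
+When a request dies somewhere in the Apache, Traefik, nginx chain, probing both the UI and the API from outside narrows it down quickly. A minimal sketch; the hostname, prefix, and the public species-list path are deployment-specific assumptions:
+
+```python
+import urllib.request
+
+BASE = "https://example.org/apps/ecospecies"  # assumed host and prefix
+
+# Both paths should return 200 once every hop agrees on the prefix; a 404
+# from Apache's default page usually means the ProxyPass above is missing
+# or pointing at the wrong listener.
+for path in ("/", "/api/species"):
+    with urllib.request.urlopen(BASE + path, timeout=10) as response:
+        print(path, response.status)
+```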
diff --git a/scripts/backfill-citations.py b/scripts/backfill-citations.py new file mode 100644 index 0000000..9b29cd3 --- /dev/null +++ b/scripts/backfill-citations.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +from pathlib import Path + +from ecospecies_api.repository import ( + get_editor_species_citations, + get_editor_species_list, + update_species_citation_enrichment, +) + + +def should_backfill(citation: dict[str, object], include_accepted: bool) -> bool: + review_status = str(citation.get("review_status", "")).strip().lower() + source_type = str(citation.get("source_type", "")).strip().lower() + enrichment_status = str(citation.get("enrichment_status", "")).strip().lower() + normalized_text = str(citation.get("normalized_text", "")).strip() + abstract_text = str(citation.get("abstract_text", "")).strip() + + if not include_accepted and review_status == "accepted": + return False + if source_type in {"editor_selected_candidate", "editor_added_candidate"} and not include_accepted: + return False + + return ( + source_type in {"document_extract", "editor_review", ""} + or enrichment_status in {"pending", "unresolved", "error", ""} + or not normalized_text + or not abstract_text + ) + + +def reorder_species_with_cursor( + species_items: list[dict[str, object]], + state_file: Path | None, +) -> list[dict[str, object]]: + if not state_file or not species_items: + return species_items + + try: + last_slug = state_file.read_text(encoding="utf-8").strip() + except FileNotFoundError: + return species_items + + if not last_slug: + return species_items + + for index, item in enumerate(species_items): + if str(item.get("slug", "")).strip() == last_slug: + return species_items[index + 1 :] + species_items[: index + 1] + return species_items + + +def write_cursor(state_file: Path | None, slug: str) -> None: + if not state_file or not slug: + return + state_file.parent.mkdir(parents=True, exist_ok=True) + state_file.write_text(f"{slug}\n", encoding="utf-8") + + +def main() -> int: + parser = argparse.ArgumentParser(description="Backfill EcoSpecies citation enrichment.") + parser.add_argument("--slug", help="Limit the backfill to a single species slug.") + parser.add_argument("--username", default="citation-backfill", help="Audit username to record.") + parser.add_argument( + "--include-accepted", + action="store_true", + help="Also rerun accepted/editor-curated citations.", + ) + parser.add_argument( + "--max-species", + type=int, + default=0, + help="Stop after this many species with eligible citations. 0 means no limit.", + ) + parser.add_argument( + "--max-citations", + type=int, + default=0, + help="Stop after this many citations overall. 
0 means no limit.", + ) + parser.add_argument( + "--state-file", + help="Optional cursor file used to rotate scheduled runs through the species list.", + ) + args = parser.parse_args() + + state_file = Path(args.state_file).expanduser() if args.state_file else None + species_items = ( + [item for item in get_editor_species_list() if item["slug"] == args.slug] + if args.slug + else get_editor_species_list() + ) + if not args.slug: + species_items = reorder_species_with_cursor(species_items, state_file) + + if args.slug and not species_items: + print(f"Species not found: {args.slug}") + return 1 + + species_count = 0 + citation_count = 0 + changed_count = 0 + resolved_count = 0 + unresolved_count = 0 + error_count = 0 + last_seen_slug = "" + + for species in species_items: + if args.max_species and species_count >= args.max_species: + break + slug = str(species["slug"]) + last_seen_slug = slug + citation_payload = get_editor_species_citations(slug) + if citation_payload is None: + continue + + eligible = [ + citation + for citation in citation_payload["citations"] + if should_backfill(citation, include_accepted=args.include_accepted) + ] + if not eligible: + continue + + species_count += 1 + print(f"[{slug}] backfilling {len(eligible)} citation(s)", flush=True) + + for citation in eligible: + if args.max_citations and citation_count >= args.max_citations: + write_cursor(state_file, last_seen_slug) + print("citation limit reached; stopping early", flush=True) + print( + "summary:" + f" species={species_count}" + f" citations={citation_count}" + f" changed={changed_count}" + f" resolved={resolved_count}" + f" unresolved={unresolved_count}" + f" errors={error_count}", + flush=True, + ) + return 0 + citation_count += 1 + result = update_species_citation_enrichment( + slug=slug, + citation_id=int(citation["id"]), + username=args.username, + ) + if result is None: + print(f" - citation {citation['id']}: skipped (not found)", flush=True) + continue + + changed_fields = result.get("changed_fields", {}) + status = str(result["citation"].get("enrichment_status", "")).strip().lower() + if changed_fields: + changed_count += 1 + if status == "resolved": + resolved_count += 1 + elif status == "unresolved": + unresolved_count += 1 + elif status == "error": + error_count += 1 + print( + f" - citation {citation['id']}: {status or 'unknown'}" + + (f" ({len(changed_fields)} field changes)" if changed_fields else "") + , flush=True) + + write_cursor(state_file, last_seen_slug) + print( + "summary:" + f" species={species_count}" + f" citations={citation_count}" + f" changed={changed_count}" + f" resolved={resolved_count}" + f" unresolved={unresolved_count}" + f" errors={error_count}", + flush=True, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/backup-postgres.sh b/scripts/backup-postgres.sh new file mode 100644 index 0000000..77fac31 --- /dev/null +++ b/scripts/backup-postgres.sh @@ -0,0 +1,28 @@ +#!/bin/sh +set -eu + +ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)" +ENV_FILE="${ECOSPECIES_ENV_FILE:-$ROOT_DIR/docs/docker-compose-traefik.env}" +COMPOSE_FILE="${ECOSPECIES_COMPOSE_FILE:-$ROOT_DIR/docs/docker-compose-traefik.yml}" +OUTPUT_FILE="${1:-$ROOT_DIR/ecospecies-backup.sql}" + +if [ ! -f "$ENV_FILE" ]; then + echo "Missing env file: $ENV_FILE" >&2 + exit 1 +fi + +set -a +. 
"$ENV_FILE" +set +a + +DB_USER="${ECOSPECIES_DB_USER:-ecospecies}" +DB_NAME="${ECOSPECIES_DB_NAME:-ecospecies}" + +docker compose \ + --env-file "$ENV_FILE" \ + -f "$COMPOSE_FILE" \ + exec -T db \ + pg_dump -U "$DB_USER" "$DB_NAME" \ + > "$OUTPUT_FILE" + +printf 'Backup written to %s\n' "$OUTPUT_FILE" diff --git a/scripts/restore-postgres.sh b/scripts/restore-postgres.sh new file mode 100644 index 0000000..1f86814 --- /dev/null +++ b/scripts/restore-postgres.sh @@ -0,0 +1,37 @@ +#!/bin/sh +set -eu + +ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)" +ENV_FILE="${ECOSPECIES_ENV_FILE:-$ROOT_DIR/docs/docker-compose-traefik.env}" +COMPOSE_FILE="${ECOSPECIES_COMPOSE_FILE:-$ROOT_DIR/docs/docker-compose-traefik.yml}" +INPUT_FILE="${1:-}" + +if [ -z "$INPUT_FILE" ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +if [ ! -f "$ENV_FILE" ]; then + echo "Missing env file: $ENV_FILE" >&2 + exit 1 +fi + +if [ ! -f "$INPUT_FILE" ]; then + echo "Missing backup file: $INPUT_FILE" >&2 + exit 1 +fi + +set -a +. "$ENV_FILE" +set +a + +DB_USER="${ECOSPECIES_DB_USER:-ecospecies}" +DB_NAME="${ECOSPECIES_DB_NAME:-ecospecies}" + +cat "$INPUT_FILE" | docker compose \ + --env-file "$ENV_FILE" \ + -f "$COMPOSE_FILE" \ + exec -T db \ + psql -U "$DB_USER" "$DB_NAME" + +printf 'Restore completed from %s\n' "$INPUT_FILE" diff --git a/scripts/run-citation-backfill.sh b/scripts/run-citation-backfill.sh new file mode 100644 index 0000000..dc21570 --- /dev/null +++ b/scripts/run-citation-backfill.sh @@ -0,0 +1,21 @@ +#!/bin/sh +set -eu + +ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)" +LOG_DIR="${ECOSPECIES_BACKFILL_LOG_DIR:-$ROOT_DIR/var/logs}" +STATE_FILE="${ECOSPECIES_BACKFILL_STATE_FILE:-$ROOT_DIR/var/citation-backfill.cursor}" +LOCK_DIR="${ECOSPECIES_BACKFILL_LOCK_DIR:-$ROOT_DIR/var/citation-backfill.lock}" +MAX_SPECIES="${ECOSPECIES_BACKFILL_MAX_SPECIES:-3}" + +mkdir -p "$LOG_DIR" +mkdir -p "$ROOT_DIR/var" + +if ! mkdir "$LOCK_DIR" 2>/dev/null; then + echo "citation backfill already running; skipping" + exit 0 +fi + +trap 'rmdir "$LOCK_DIR"' EXIT INT TERM + +exec docker exec ecospecies-api /bin/sh -lc \ + "PYTHONPATH=/workspace/apps/api/src /workspace/.docker/venv/bin/python -u /workspace/scripts/backfill-citations.py --username citation-backfill --max-species ${MAX_SPECIES} --state-file ${STATE_FILE}"