Prepare public-safe repo update

welsberr 2026-04-10 04:44:45 +00:00
parent a6b04a995a
commit 1143f9bfcc
40 changed files with 9099 additions and 175 deletions

.gitignore

@@ -7,3 +7,12 @@ __pycache__/
node_modules/
test-results/
playwright-report/
*~
*.env
secrets*
codex*
restart.sh
*lock.json
input-data/
legacy-data
var/logs/


@@ -21,7 +21,7 @@ Docker Compose owns all runtime dependencies:
- Python services run in `python:3.12-slim`
- the Python virtual environment is created in a Docker-managed volume mounted at `/workspace/.docker/venv`
- dependencies are installed from `apps/api/requirements.txt` inside that virtual environment
- the legacy corpus is mounted read-only from `../01-legacy-code-and-data`
- the legacy corpus is mounted read-only from a sibling directory, defaulting to `../legacy-corpus`
No host Python packages are required for the Compose workflow.
@@ -48,6 +48,13 @@ Endpoints:
- editor section detail/update: `/api/editor/species/<slug>/sections/<position>` (requires `editor` or `admin`)
- editor audit history: `/api/editor/species/<slug>/audit` (requires `editor` or `admin`)
The app can also be published under a URL prefix. A reverse-proxy deployment can serve it at a host and path such as:
- `ECOSPECIES_HOSTNAME=example.org`
- `ECOSPECIES_BASE_PATH=/apps/ecospecies`
When the site is served below a path prefix, the frontend derives its API base from the current page URL and nginx serves both the UI and proxied API under that same prefix.
If those host ports are already in use, override them when starting Compose, for example:
```bash
@@ -87,6 +94,14 @@ Run the browser-level smoke test against the real Compose stack with:
./scripts/check-ui-stack-smoke.sh
```
Run a bounded citation backfill pass with:
```bash
./scripts/run-citation-backfill.sh
```
The wrapper runs inside `ecospecies-api`, keeps a rotating cursor in `var/citation-backfill.cursor`, and skips a run if another backfill is already active.
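As a rough illustration of the rotating-cursor idea (this is not the script itself; the batching logic and batch size are assumptions):
```python
from pathlib import Path

CURSOR = Path("var/citation-backfill.cursor")

def next_slice(slugs: list[str], batch_size: int = 50) -> list[str]:
    # Resume after the slug recorded by the previous run, wrapping at the end.
    last = CURSOR.read_text().strip() if CURSOR.exists() else ""
    start = slugs.index(last) + 1 if last in slugs else 0
    batch = (slugs + slugs)[start : start + batch_size][: len(slugs)]
    if batch:
        CURSOR.write_text(batch[-1])
    return batch
```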
## Notes
- The importer seeds PostgreSQL from the legacy text corpus before the API starts and now synchronizes by slug instead of truncating the full dataset.
@@ -98,6 +113,8 @@ Run the browser-level smoke test against the real Compose stack with:
- Initial editor auth uses `ECOSPECIES_AUTH_TOKENS` in the format `token:username:role[,token2:username2:role2]`, where `role` is `viewer`, `editor`, or `admin`.
- Editorial workflow state is persisted per species with `draft`, `review`, and `published` statuses. Public endpoints return only `published` records; editor endpoints can inspect and update all records.
- Editors can curate top-level metadata and section content from the web UI, and every editorial or section change is recorded in per-species audit history.
- Citation backfill can be scheduled externally, such as with a nightly cron job that runs `./scripts/run-citation-backfill.sh`. Use `ECOSPECIES_BACKFILL_LOG_DIR` if logs should go somewhere other than `var/logs`.
- Citation enrichment now refreshes the locally parsed BibTeX and normalized citation text even when a citation remains unresolved, so parser improvements propagate without a remote metadata match.
- Summary authoring guidance for future FLELMR-compatible records is in `docs/flelmr-authoring.md`.
- Legacy survey and roadmap artifacts are in `docs/`.


@@ -15,17 +15,36 @@ from ecospecies_api.auth import (
)
from ecospecies_api.parser import get_default_data_dir, load_species_records
from ecospecies_api.repository import (
add_species_citation_from_candidate,
apply_species_citation_candidate_selection,
create_contributor_species,
get_contributor_species_citations,
get_contributor_species_detail,
get_contributor_species_document,
get_contributor_species_list,
get_species_citation_candidates,
get_editor_species_citations,
get_editor_species_detail,
get_species_document,
get_editor_species_list,
get_editor_species_workflow,
get_minimum_contributor_age,
get_species_by_slug,
list_species_audit,
list_public_bibliography,
get_readiness_status,
get_summary_metrics,
has_species_data,
import_species_payload,
list_diagnostics,
list_species,
register_contributor,
update_species_citation_enrichment,
backfill_species_citations,
update_species_citations_enrichment_batch,
update_species_citation_review,
update_contributor_species_document_markdown,
update_species_document_markdown,
update_species_section,
update_species_editorial,
)
@@ -99,6 +118,7 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
{
"authenticated": session is not None,
"auth_configured": auth_is_configured(),
"minimum_contributor_age": get_minimum_contributor_age(),
"user": (
{"username": session.username, "role": session.role}
if session is not None
@@ -108,6 +128,23 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
)
return
if path == "/api/contributor/status":
if not self.require_role(session, "contributor"):
return
self.write_json(
{
"status": "ok",
"contributor_access": True,
"user": {"username": session.username, "role": session.role},
"minimum_age": get_minimum_contributor_age(),
"capabilities": [
"create_species_draft",
"edit_owned_drafts",
],
}
)
return
if path == "/api/editor/status":
if not self.require_role(session, "editor"):
return
@@ -135,10 +172,42 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
"slug": item["slug"],
"title": item["title"],
"common_name": item["common_name"],
"scientific_name": item["scientific_name"],
"legacy_identifiers": item["legacy_identifiers"],
"taxon_identifiers": item["taxon_identifiers"],
"primary_taxon_authority": item["primary_taxon_authority"],
"primary_taxon_identifier": item["primary_taxon_identifier"],
"publication_status": item["publication_status"],
"is_archived": item["is_archived"],
"last_modified_by": item["last_modified_by"],
"diagnostic_count": len(item["diagnostics"]),
"summary": item["summary"],
}
for item in items
]
self.write_json({"items": compact, "count": len(compact)})
return
if path == "/api/contributor/species":
if not self.require_role(session, "contributor"):
return
search = query.get("search", [""])[0].strip().lower()
items = get_contributor_species_list(session.username, search)
compact = [
{
"slug": item["slug"],
"title": item["title"],
"common_name": item["common_name"],
"scientific_name": item["scientific_name"],
"legacy_identifiers": item["legacy_identifiers"],
"taxon_identifiers": item["taxon_identifiers"],
"primary_taxon_authority": item["primary_taxon_authority"],
"primary_taxon_identifier": item["primary_taxon_identifier"],
"publication_status": item["publication_status"],
"is_archived": item["is_archived"],
"last_modified_by": item["last_modified_by"],
"diagnostic_count": len(item["diagnostics"]),
"summary": item["summary"],
}
for item in items
]
@@ -176,7 +245,68 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
if path.startswith("/api/editor/species/") and not path.endswith("/workflow") and not path.endswith("/editorial") and not path.endswith("/audit"):
if path.startswith("/api/editor/species/") and path.endswith("/document"):
if not self.require_role(session, "editor"):
return
slug = path[len("/api/editor/species/") : -len("/document")].strip("/")
item = get_species_document(slug)
if item is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json(item)
return
if path.startswith("/api/editor/species/") and path.endswith("/citations"):
if not self.require_role(session, "editor"):
return
slug = path[len("/api/editor/species/") : -len("/citations")].strip("/")
item = get_editor_species_citations(slug)
if item is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json(item)
return
if path.startswith("/api/editor/species/") and "/citations/" in path and path.endswith("/candidates"):
if not self.require_role(session, "editor"):
return
slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/")
citation_tail = tail[: -len("/candidates")].strip("/")
try:
citation_id = int(citation_tail)
except ValueError:
self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
return
item = get_species_citation_candidates(slug.strip("/"), citation_id)
if item is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json(item)
return
if path.startswith("/api/contributor/species/") and path.endswith("/document"):
if not self.require_role(session, "contributor"):
return
slug = path[len("/api/contributor/species/") : -len("/document")].strip("/")
item = get_contributor_species_document(slug, session.username)
if item is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json(item)
return
if path.startswith("/api/contributor/species/") and path.endswith("/citations"):
if not self.require_role(session, "contributor"):
return
slug = path[len("/api/contributor/species/") : -len("/citations")].strip("/")
item = get_contributor_species_citations(slug, session.username)
if item is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json(item)
return
if path.startswith("/api/editor/species/") and not path.endswith("/workflow") and not path.endswith("/editorial") and not path.endswith("/audit") and not path.endswith("/document"):
if not self.require_role(session, "editor"):
return
slug = path[len("/api/editor/species/") :].strip("/")
@@ -187,6 +317,17 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
self.write_json(item)
return
if path.startswith("/api/contributor/species/") and not path.endswith("/document"):
if not self.require_role(session, "contributor"):
return
slug = path[len("/api/contributor/species/") :].strip("/")
item = get_contributor_species_detail(slug, session.username)
if item is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json(item)
return
if path.startswith("/api/editor/species/") and path.endswith("/workflow"):
if not self.require_role(session, "editor"):
return
@@ -215,6 +356,12 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
self.write_json({"items": flagged, "count": len(flagged)})
return
if path == "/api/bibliography":
search = query.get("search", [""])[0].strip()
items = list_public_bibliography(search=search)
self.write_json({"items": items, "count": len(items)})
return
if path == "/api/species":
search = query.get("search", [""])[0].strip().lower()
species = list_species(search)
@@ -225,6 +372,10 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
"common_name": item["common_name"],
"scientific_name": item["scientific_name"],
"flelmr_code": item["flelmr_code"],
"legacy_identifiers": item["legacy_identifiers"],
"taxon_identifiers": item["taxon_identifiers"],
"primary_taxon_authority": item["primary_taxon_authority"],
"primary_taxon_identifier": item["primary_taxon_identifier"],
"summary": item["summary"],
"section_count": item["section_count"],
"diagnostic_count": len(item["diagnostics"]),
@@ -250,6 +401,47 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
path = parsed.path
session = resolve_auth_session(self.headers)
if path == "/api/contributor/register":
payload = self.read_json_body()
if payload is None:
return
email = payload.get("email")
age_gate_confirmed = payload.get("age_gate_confirmed")
if not isinstance(email, str):
self.write_json({"error": "email must be a string"}, status=HTTPStatus.BAD_REQUEST)
return
if not isinstance(age_gate_confirmed, bool):
self.write_json(
{"error": "age_gate_confirmed must be a boolean"},
status=HTTPStatus.BAD_REQUEST,
)
return
try:
result = register_contributor(email=email, age_gate_confirmed=age_gate_confirmed)
except ValueError as exc:
self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
return
self.write_json({"status": "ok", **result}, status=HTTPStatus.CREATED)
return
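# Illustration (not part of this diff): a hypothetical client call against the
# registration endpoint above, assuming the API listens on localhost:8000.
#
#     import json, urllib.request
#     req = urllib.request.Request(
#         "http://localhost:8000/api/contributor/register",
#         data=json.dumps({"email": "contributor@example.org",
#                          "age_gate_confirmed": True}).encode("utf-8"),
#         headers={"Content-Type": "application/json"},
#         method="POST",
#     )
#     with urllib.request.urlopen(req) as resp:
#         print(resp.status)  # 201 Created on success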
if path == "/api/contributor/species":
if not self.require_role(session, "contributor"):
return
payload = self.read_json_body()
if payload is None:
return
markdown = payload.get("markdown")
if markdown is not None and not isinstance(markdown, str):
self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
return
try:
result = create_contributor_species(session.username, markdown)
except ValueError as exc:
self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
return
self.write_json({"status": "ok", **result}, status=HTTPStatus.CREATED)
return
if path.startswith("/api/editor/species/") and path.endswith("/workflow"):
if not self.require_role(session, "editor"):
return
@@ -341,6 +533,229 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
self.write_json({"status": "ok", **result})
return
if path.startswith("/api/editor/species/") and path.endswith("/document"):
if not self.require_role(session, "editor"):
return
payload = self.read_json_body()
if payload is None:
return
markdown = payload.get("markdown")
if not isinstance(markdown, str):
self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
return
slug = path[len("/api/editor/species/") : -len("/document")].strip("/")
try:
result = update_species_document_markdown(
slug=slug,
markdown=markdown,
username=session.username,
)
except ValueError as exc:
self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
return
if result is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json({"status": "ok", **result})
return
if (
path.startswith("/api/editor/species/")
and "/citations/" in path
and not path.endswith("/citations/enrich")
and not path.endswith("/citations/backfill")
):
if not self.require_role(session, "editor"):
return
payload = self.read_json_body()
if payload is None:
return
slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/")
if tail.endswith("/enrich"):
citation_tail = tail[: -len("/enrich")].strip("/")
try:
citation_id = int(citation_tail)
except ValueError:
self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
return
result = update_species_citation_enrichment(
slug=slug.strip("/"),
citation_id=citation_id,
username=session.username,
)
if result is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json({"status": "ok", **result})
return
if tail.endswith("/apply-match"):
citation_tail = tail[: -len("/apply-match")].strip("/")
try:
citation_id = int(citation_tail)
except ValueError:
self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
return
candidate = payload.get("candidate")
if not isinstance(candidate, dict):
self.write_json({"error": "candidate must be an object"}, status=HTTPStatus.BAD_REQUEST)
return
result = apply_species_citation_candidate_selection(
slug=slug.strip("/"),
citation_id=citation_id,
candidate=candidate,
username=session.username,
)
if result is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json({"status": "ok", **result})
return
if tail.endswith("/add-match"):
citation_tail = tail[: -len("/add-match")].strip("/")
try:
citation_id = int(citation_tail)
except ValueError:
self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
return
candidate = payload.get("candidate")
if not isinstance(candidate, dict):
self.write_json({"error": "candidate must be an object"}, status=HTTPStatus.BAD_REQUEST)
return
result = add_species_citation_from_candidate(
slug=slug.strip("/"),
citation_id=citation_id,
candidate=candidate,
username=session.username,
)
if result is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json({"status": "ok", **result})
return
try:
citation_id = int(tail.strip("/"))
except ValueError:
self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
return
for field in ("review_status", "normalized_text", "abstract_text", "doi", "citation_key", "entry_type", "draft_bibtex"):
value = payload.get(field)
if value is not None and not isinstance(value, str):
self.write_json(
{"error": f"{field} must be a string"},
status=HTTPStatus.BAD_REQUEST,
)
return
try:
result = update_species_citation_review(
slug=slug.strip("/"),
citation_id=citation_id,
review_status=payload.get("review_status"),
normalized_text=payload.get("normalized_text"),
doi=payload.get("doi"),
citation_key=payload.get("citation_key"),
entry_type=payload.get("entry_type"),
draft_bibtex=payload.get("draft_bibtex"),
abstract_text=payload.get("abstract_text"),
username=session.username,
)
except ValueError as exc:
self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
return
if result is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json({"status": "ok", **result})
return
if path.startswith("/api/editor/species/") and path.endswith("/citations/enrich"):
if not self.require_role(session, "editor"):
return
payload = self.read_json_body()
if payload is None:
return
slug = path[len("/api/editor/species/") : -len("/citations/enrich")].strip("/")
result = update_species_citations_enrichment_batch(
slug=slug,
username=session.username,
)
if result is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json({"status": "ok", **result})
return
if path.startswith("/api/editor/species/") and path.endswith("/citations/backfill"):
if not self.require_role(session, "editor"):
return
payload = self.read_json_body()
if payload is None:
return
slug = path[len("/api/editor/species/") : -len("/citations/backfill")].strip("/")
include_accepted = bool(payload.get("include_accepted", False))
result = backfill_species_citations(
slug=slug,
username=session.username,
include_accepted=include_accepted,
)
if result is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json({"status": "ok", **result})
return
if path.startswith("/api/contributor/species/") and path.endswith("/document"):
if not self.require_role(session, "contributor"):
return
payload = self.read_json_body()
if payload is None:
return
markdown = payload.get("markdown")
if not isinstance(markdown, str):
self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
return
slug = path[len("/api/contributor/species/") : -len("/document")].strip("/")
try:
result = update_contributor_species_document_markdown(
slug=slug,
markdown=markdown,
username=session.username,
)
except ValueError as exc:
self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
return
if result is None:
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
return
self.write_json({"status": "ok", **result})
return
self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
def log_message(self, format: str, *args: object) -> None:


@@ -1,14 +1,21 @@
from __future__ import annotations
import hashlib
import os
from dataclasses import dataclass
from typing import Mapping
from sqlalchemy import select
from ecospecies_api.db import SessionLocal, create_db_engine
from ecospecies_api.models import Base, ContributorAccount
ROLE_ORDER = {
"viewer": 1,
"editor": 2,
"admin": 3,
"contributor": 2,
"editor": 3,
"admin": 4,
}
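# Illustration (not part of this diff): the handler's require_role checks can be
# read as a rank comparison on ROLE_ORDER; the actual implementation is not shown
# in this hunk, so this equivalent is an assumption.
#
#     def role_satisfies(session_role: str, required_role: str) -> bool:
#         return ROLE_ORDER.get(session_role, 0) >= ROLE_ORDER.get(required_role, 0)
#
#     # role_satisfies("admin", "editor") -> True
#     # role_satisfies("viewer", "contributor") -> False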
@@ -41,17 +48,27 @@ def _parse_token_entry(entry: str) -> tuple[str, AuthSession]:
def get_token_registry() -> dict[str, AuthSession]:
configured = os.environ.get("ECOSPECIES_AUTH_TOKENS", "").strip()
if not configured:
return {}
registry: dict[str, AuthSession] = {}
for raw_entry in configured.split(","):
entry = raw_entry.strip()
if not entry:
continue
token, session = _parse_token_entry(entry)
registry[token] = session
configured = os.environ.get("ECOSPECIES_AUTH_TOKENS", "").strip()
if configured:
for raw_entry in configured.split(","):
entry = raw_entry.strip()
if not entry:
continue
token, session = _parse_token_entry(entry)
registry[token] = session
engine = create_db_engine()
Base.metadata.create_all(engine)
with SessionLocal() as session:
for account in session.scalars(
select(ContributorAccount).where(ContributorAccount.is_active.is_(True))
):
registry[account.token_hash] = AuthSession(
token=account.token_hash,
username=account.email,
role="contributor",
)
return registry
@@ -70,7 +87,11 @@ def resolve_auth_session(headers: Mapping[str, str]) -> AuthSession | None:
token = get_bearer_token(headers)
if not token:
return None
return registry.get(token)
direct = registry.get(token)
if direct is not None:
return direct
token_hash = hashlib.sha256(token.encode("utf-8")).hexdigest()
return registry.get(token_hash)
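# Illustration (not part of this diff): contributor accounts are keyed by the
# SHA-256 digest of their token, so a raw bearer token resolves like this
# (the token value is hypothetical):
#
#     import hashlib
#     raw_token = "example-contributor-token"
#     token_hash = hashlib.sha256(raw_token.encode("utf-8")).hexdigest()
#     # registry[token_hash] holds the contributor's AuthSession, so the raw
#     # token never needs to be stored server-side.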
def auth_is_configured() -> bool:

File diff suppressed because it is too large.


@@ -0,0 +1,387 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import re
import sys
def _load_citegeist_extract():
citegeist_src = Path(__file__).resolve().parents[5] / "CiteGeist" / "src"
if citegeist_src.exists() and str(citegeist_src) not in sys.path:
sys.path.insert(0, str(citegeist_src))
try:
from citegeist.extract import extract_references # type: ignore
except ImportError:
return None
return extract_references
@dataclass
class DraftCitation:
citation_key: str
entry_type: str
fields: dict[str, str]
draft_bibtex: str
STOPWORD_TOKENS = {
"a",
"an",
"and",
"for",
"from",
"in",
"of",
"on",
"the",
"to",
"with",
}
HISTORICAL_YEAR_PATTERN = r"(1\d{3}|20\d{2})"
def build_standard_citation_key(
authors: str = "",
year: str = "",
title: str = "",
fallback_text: str = "",
) -> str:
family_name = _family_name_stem(authors or fallback_text)
year_stem = re.sub(r"[^0-9]+", "", year)[:4]
topic_stem = _topic_stem(title or fallback_text)
key = f"{family_name}{year_stem}{topic_stem}"
return key or "reference"
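# For example (illustration, not part of this diff):
#     build_standard_citation_key(
#         authors="Smith, J.", year="1998", title="On the ecology of manatees"
#     )
#     # -> "smith1998ecologymanatees"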
def extract_draft_citation(raw_text: str, legacy_reference_number: str = "") -> DraftCitation | None:
extractor = _load_citegeist_extract()
if extractor is None:
return _fallback_citation(raw_text, legacy_reference_number)
entries = extractor(raw_text)
if not entries:
return _fallback_citation(raw_text, legacy_reference_number)
entry = entries[0]
fields = dict(entry.fields)
fields = _repair_reference_fields(raw_text, fields)
citation_key = build_standard_citation_key(
authors=str(fields.get("author", "")),
year=str(fields.get("year", "")),
title=str(fields.get("title", "")),
fallback_text=raw_text,
)
note_parts = [fields.get("note", "").strip()] if fields.get("note") else []
if legacy_reference_number:
note_parts.append(f"ecospecies_reference_number = {{{legacy_reference_number}}}")
fields["note"] = "; ".join(part for part in note_parts if part)
draft_bibtex = render_single_bibtex(entry.entry_type, citation_key, fields)
return DraftCitation(
citation_key=citation_key,
entry_type=entry.entry_type,
fields=fields,
draft_bibtex=draft_bibtex,
)
def _fallback_citation(raw_text: str, legacy_reference_number: str) -> DraftCitation:
year_match = re.search(rf"\b{HISTORICAL_YEAR_PATTERN}\b", raw_text)
year = year_match.group(0) if year_match else ""
fields = _repair_reference_fields(
raw_text,
{
"title": raw_text.strip(),
"year": year,
},
)
title = str(fields.get("title", "")).strip() or raw_text.strip()
citation_key = build_standard_citation_key(year=year, title=title, fallback_text=raw_text)
fields["note"] = f"raw_reference = {{{raw_text}}}"
if legacy_reference_number:
fields["note"] += f"; ecospecies_reference_number = {{{legacy_reference_number}}}"
draft_bibtex = render_single_bibtex("misc", citation_key, fields)
return DraftCitation(
citation_key=citation_key,
entry_type="misc",
fields=fields,
draft_bibtex=draft_bibtex,
)
def _family_name_stem(raw_text: str) -> str:
compact = raw_text.strip()
if not compact:
return "ref"
if "," in compact:
compact = compact.split(",", 1)[0]
else:
compact = compact.split()[0]
compact = re.sub(r"[^A-Za-z0-9]+", "", compact).lower()
return compact or "ref"
def _topic_stem(raw_text: str) -> str:
tokens = [
token
for token in re.findall(r"[A-Za-z0-9]+", raw_text.lower())
if token not in STOPWORD_TOKENS and not token.isdigit()
]
topic_tokens = tokens[:3] or ["topic"]
return "".join(topic_tokens)
def _repair_reference_fields(raw_text: str, fields: dict[str, str]) -> dict[str, str]:
repaired = dict(fields)
title = str(repaired.get("title", "")).strip()
raw = raw_text.strip()
if not raw:
return repaired
parsed = _parse_report_style_reference(raw)
if parsed is None:
return repaired
current_venue = (
str(repaired.get("journal", "")).strip()
or str(repaired.get("howpublished", "")).strip()
or str(repaired.get("booktitle", "")).strip()
or str(repaired.get("publisher", "")).strip()
)
parsed_venue = str(parsed.get("venue", "")).strip()
needs_structural_repair = bool(
parsed_venue
and (
not current_venue
or len(current_venue) < max(8, len(parsed_venue) // 2)
or current_venue.lower() not in parsed_venue.lower()
or (parsed.get("volume") and not str(repaired.get("volume", "")).strip())
or (parsed.get("number") and not str(repaired.get("number", "")).strip())
or (parsed.get("pages") and not str(repaired.get("pages", "")).strip())
)
)
if title and not _title_looks_like_raw_reference(title) and not needs_structural_repair:
return repaired
if parsed.get("author"):
repaired["author"] = parsed["author"]
if parsed.get("year"):
repaired["year"] = parsed["year"]
if parsed.get("title"):
repaired["title"] = parsed["title"]
venue = parsed.get("venue", "")
if venue:
repaired.pop("howpublished", None)
if _venue_looks_journal_like(venue):
repaired["journal"] = venue
else:
repaired["howpublished"] = venue
if parsed.get("volume"):
repaired["volume"] = parsed["volume"]
if parsed.get("number"):
repaired["number"] = parsed["number"]
if parsed.get("pages"):
repaired["pages"] = parsed["pages"]
return repaired
def _title_looks_like_raw_reference(title: str) -> bool:
compact = " ".join(title.split()).strip()
if not compact:
return True
if len(compact) > 120:
return True
return bool(re.match(rf"^[^,]+,\s+.+\b{HISTORICAL_YEAR_PATTERN}\.\s+", compact))
def _parse_report_style_reference(raw_text: str) -> dict[str, str] | None:
match = re.match(
rf"^(?P<author>.+?)\s+(?P<year>{HISTORICAL_YEAR_PATTERN})\.\s+(?P<remainder>.+)$",
raw_text.strip(),
)
if match is None:
return None
author = match.group("author").strip(" .")
year = match.group("year").strip()
remainder = match.group("remainder").strip()
if not author or not remainder:
return None
venue_start = _find_venue_start(remainder)
if venue_start is None:
return {
"author": author,
"year": year,
"title": remainder.strip(" ."),
"venue": "",
}
title = remainder[:venue_start].strip(" .")
venue_part = remainder[venue_start:].strip(" .")
venue, volume, number, pages = _split_venue_and_locator(venue_part)
return {
"author": author,
"year": year,
"title": title,
"venue": venue,
"volume": volume,
"number": number,
"pages": pages,
}
def _split_venue_and_locator(venue_part: str) -> tuple[str, str, str, str]:
compact = venue_part.strip(" .")
if not compact:
return "", "", "", ""
match = re.search(
r"(?P<venue>.+?)\.\s+(?P<volume>\d+)(?:\((?P<number>[^)]+)\))?\s*:\s*(?P<pages>\d+(?:-\d+)?)\.?$",
compact,
)
if match is None:
match = re.search(
r"(?P<venue>.+?)\s+(?P<volume>\d+)(?:\((?P<number>[^)]+)\))?\s*:\s*(?P<pages>\d+(?:-\d+)?)\.?$",
compact,
)
if match is None:
return compact, "", "", ""
return (
match.group("venue").strip(" ."),
(match.group("volume") or "").strip(),
(match.group("number") or "").strip(),
(match.group("pages") or "").strip(),
)
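# For example (illustration, not part of this diff):
#     _split_venue_and_locator("Journal of Mammalogy. 79(2):251-262.")
#     # -> ("Journal of Mammalogy", "79", "2", "251-262")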
def _find_venue_start(remainder: str) -> int | None:
for match in re.finditer(r"\.\s+", remainder):
candidate_start = match.end()
candidate = remainder[candidate_start:].strip()
if _looks_like_publication_segment(candidate):
return candidate_start
lowered = remainder.lower()
markers = (
"comm. rept.",
"rept.",
"proc.",
"procs.",
"journal",
"transactions",
"proceedings",
"bulletin",
"bull.",
"occas. pap.",
"pap.",
"memoir",
"memorandum",
"memo.",
"tech. memo.",
"tech memo",
"technical memorandum",
"technical report",
"noaa",
)
positions = [lowered.find(marker) for marker in markers if lowered.find(marker) > 0]
if positions:
return min(positions)
return None
def _looks_like_publication_segment(candidate: str) -> bool:
compact = candidate.strip(" .")
if not compact:
return False
venue, volume, number, pages = _split_venue_and_locator(compact)
if venue and (volume or number or pages) and _starts_with_publication_marker(compact):
return True
return _starts_with_publication_marker(compact)
def _starts_with_publication_marker(text: str) -> bool:
lowered = text.lower()
publication_starts = (
"comm. rept.",
"rept.",
"proc.",
"procs.",
"journal",
"transactions",
"proceedings",
"bulletin",
"bull.",
"occas. pap.",
"pap.",
"memoir",
"memorandum",
"memo.",
"tech. memo.",
"tech memo",
"technical memorandum",
"technical report",
"noaa",
"u.s.",
)
return lowered.startswith(publication_starts)
def _venue_looks_journal_like(venue: str) -> bool:
lowered = venue.lower()
return any(
token in lowered
for token in (
"journal",
"transactions",
"review",
"letters",
"comm. rept.",
"rept.",
"proc.",
"proceedings",
"occas. pap.",
"pap.",
)
)
def render_single_bibtex(entry_type: str, citation_key: str, fields: dict[str, str]) -> str:
lines = [f"@{entry_type}{{{citation_key},"]
for key in sorted(fields):
value = _sanitize_bibtex_value(fields[key])
lines.append(f" {key} = {{{value}}},")
lines.append("}")
return "\n".join(lines)
def _sanitize_bibtex_value(value: str) -> str:
depth = 0
parts: list[str] = []
for char in value:
if char == "{":
depth += 1
parts.append(char)
continue
if char == "}":
if depth == 0:
parts.append(")")
else:
depth -= 1
parts.append(char)
continue
parts.append(char)
if depth > 0:
open_count = depth
normalized: list[str] = []
for char in parts:
if char == "{" and open_count > 0:
normalized.append("(")
open_count -= 1
else:
normalized.append(char)
return "".join(normalized)
return "".join(parts)


@@ -0,0 +1,480 @@
from __future__ import annotations
import json
import re
from dataclasses import asdict, dataclass, field
HEADING_PATTERN = re.compile(r"^(#{2,6})\s+(?P<title>.+?)\s*$")
INDENTED_ITEM_PATTERN = re.compile(r"^\s*-\s*(?P<body>.+?)\s*$")
DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Za-z0-9]+\b")
@dataclass
class DocumentNode:
node_type: str
title: str
body: str
depth: int
children: list["DocumentNode"] = field(default_factory=list)
@dataclass
class StructuredDocument:
metadata: dict[str, object]
nodes: list[DocumentNode]
def _parse_scalar_value(value: str) -> object:
stripped = value.strip()
if not stripped:
return ""
if stripped.lower() == "true":
return True
if stripped.lower() == "false":
return False
if stripped.startswith("{") or stripped.startswith("["):
try:
return json.loads(stripped)
except json.JSONDecodeError:
return stripped
return stripped
def _normalize_whitespace(value: str) -> str:
return re.sub(r"\s+", " ", value).strip()
def _parse_front_matter(front_matter: str) -> dict[str, object]:
metadata: dict[str, object] = {}
lines = front_matter.splitlines()
index = 0
while index < len(lines):
raw_line = lines[index]
if not raw_line.strip() or raw_line.lstrip().startswith("#"):
index += 1
continue
if ":" not in raw_line:
index += 1
continue
key, value = raw_line.split(":", 1)
normalized_key = key.strip()
stripped_value = value.strip()
if stripped_value:
metadata[normalized_key] = _parse_scalar_value(stripped_value)
index += 1
continue
items: list[dict[str, object]] = []
index += 1
while index < len(lines):
item_line = lines[index]
if not item_line.strip():
index += 1
continue
if not item_line.startswith(" - "):
break
match = INDENTED_ITEM_PATTERN.match(item_line)
if not match:
break
item: dict[str, object] = {}
first_body = match.group("body")
if ":" in first_body:
item_key, item_value = first_body.split(":", 1)
item[item_key.strip()] = _parse_scalar_value(item_value.strip())
index += 1
while index < len(lines):
nested_line = lines[index]
if nested_line.startswith(" ") and ":" in nested_line.strip():
nested_key, nested_value = nested_line.strip().split(":", 1)
item[nested_key.strip()] = _parse_scalar_value(nested_value.strip())
index += 1
continue
break
items.append(item)
metadata[normalized_key] = items
return metadata
def _split_front_matter(text: str) -> tuple[dict[str, object], str]:
stripped = text.lstrip()
if not stripped.startswith("---\n"):
return {}, text
_, _, remainder = stripped.partition("---\n")
front_matter, separator, body = remainder.partition("\n---\n")
if not separator:
return {}, text
return _parse_front_matter(front_matter), body
def parse_markdown_document(text: str) -> StructuredDocument:
metadata, body = _split_front_matter(text)
root_nodes: list[DocumentNode] = []
stack: list[DocumentNode] = []
body_lines: list[str] = []
def flush_body() -> None:
if not stack:
body_lines.clear()
return
stack[-1].body = "\n".join(body_lines).strip()
body_lines.clear()
for raw_line in body.splitlines():
match = HEADING_PATTERN.match(raw_line)
if not match:
body_lines.append(raw_line)
continue
flush_body()
depth = len(match.group(1))
node = DocumentNode(
node_type="section",
title=match.group("title").strip(),
body="",
depth=depth,
)
while stack and stack[-1].depth >= depth:
stack.pop()
if stack:
stack[-1].children.append(node)
else:
root_nodes.append(node)
stack.append(node)
flush_body()
return StructuredDocument(metadata=metadata, nodes=root_nodes)
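# For example (illustration, not part of this diff):
#     doc = parse_markdown_document(
#         "---\n"
#         "title: Example Species\n"
#         "---\n"
#         "## Summary\n"
#         "A short summary paragraph.\n"
#     )
#     # doc.metadata == {"title": "Example Species"}
#     # doc.nodes[0].title == "Summary", doc.nodes[0].depth == 2,
#     # doc.nodes[0].body == "A short summary paragraph."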
def validate_markdown_document(text: str) -> list[str]:
errors: list[str] = []
metadata, body = _split_front_matter(text)
if not metadata:
errors.append("Markdown document must include YAML front matter.")
last_depth: int | None = None
for raw_line in body.splitlines():
match = HEADING_PATTERN.match(raw_line)
if not match:
continue
depth = len(match.group(1))
if last_depth is not None and depth > last_depth + 1:
errors.append(
f"Heading depth jumps from level {last_depth} to level {depth}: {match.group('title').strip()}"
)
last_depth = depth
return errors
def _append_metadata_lines(lines: list[str], key: str, value: object) -> None:
if isinstance(value, list):
lines.append(f"{key}:")
for item in value:
if isinstance(item, dict) and item:
first = True
for item_key, item_value in item.items():
rendered = "true" if item_value is True else "false" if item_value is False else str(item_value)
prefix = " - " if first else " "
lines.append(f"{prefix}{item_key}: {rendered}")
first = False
else:
lines.append(f" - {item}")
return
rendered = "true" if value is True else "false" if value is False else str(value)
lines.append(f"{key}: {rendered}")
def export_markdown_document(document: StructuredDocument) -> str:
lines: list[str] = ["---"]
for key, value in document.metadata.items():
_append_metadata_lines(lines, key, value)
lines.append("---")
lines.append("")
def append_nodes(nodes: list[DocumentNode]) -> None:
for node in nodes:
lines.append(f"{'#' * node.depth} {node.title}")
if node.body:
lines.append(node.body)
lines.append("")
append_nodes(node.children)
append_nodes(document.nodes)
return "\n".join(lines).rstrip() + "\n"
def flatten_document_nodes(document: StructuredDocument) -> list[dict[str, object]]:
flattened: list[dict[str, object]] = []
def visit(nodes: list[DocumentNode], parent_id: str | None) -> None:
for index, node in enumerate(nodes, start=1):
node_id = f"node-{len(flattened) + 1}"
flattened.append(
{
"node_id": node_id,
"parent_id": parent_id,
"position": index,
"depth": node.depth,
"node_type": node.node_type,
"title": node.title,
"body_markdown": node.body,
"body_plaintext": node.body,
}
)
visit(node.children, node_id)
visit(document.nodes, None)
return flattened
def document_to_json(document: StructuredDocument) -> str:
return json.dumps(asdict(document), ensure_ascii=True)
def build_document_from_species_payload(item: dict[str, object]) -> StructuredDocument:
legacy_identifiers: list[dict[str, object]] = []
if item.get("flelmr_code"):
legacy_identifiers.append(
{
"authority": "legacy-ecospecies",
"identifier": str(item.get("flelmr_code", "")),
"label": "FLELMR",
}
)
metadata = {
"title": str(item.get("title", "")),
"common_name": str(item.get("common_name", "")),
"scientific_name": str(item.get("scientific_name", "")),
"legacy_identifiers": legacy_identifiers,
"taxon_identifiers": list(item.get("taxon_identifiers", [])),
"primary_taxon_authority": str(item.get("primary_taxon_authority", "")),
"source_file": str(item.get("source_file", "")),
"publication_status": str(item.get("publication_status", "published")),
"source_format": "ecospecies-markdown-v1",
}
nodes: list[DocumentNode] = []
summary = str(item.get("summary", "")).strip()
if summary:
nodes.append(
DocumentNode(
node_type="section",
title="Summary",
body=summary,
depth=2,
)
)
for section in item.get("sections", []):
heading = str(section.get("heading", "")).strip()
if not heading or heading == "HEADER":
continue
nodes.append(
DocumentNode(
node_type="section",
title=heading,
body=str(section.get("content", "")).strip(),
depth=2,
)
)
return StructuredDocument(metadata=metadata, nodes=nodes)
def extract_species_projection(document: StructuredDocument) -> dict[str, object]:
metadata = document.metadata
summary = ""
sections: list[dict[str, object]] = []
legacy_identifiers = metadata.get("legacy_identifiers", [])
taxon_identifiers = metadata.get("taxon_identifiers", [])
flelmr_code = ""
if isinstance(legacy_identifiers, list):
for item in legacy_identifiers:
if not isinstance(item, dict):
continue
authority = str(item.get("authority", "")).strip().lower()
label = str(item.get("label", "")).strip().lower()
if authority == "legacy-ecospecies" or label == "flelmr":
flelmr_code = str(item.get("identifier", "")).strip()
if flelmr_code:
break
if not flelmr_code:
flelmr_code = str(metadata.get("species_code", "")).strip()
def visit(nodes: list[DocumentNode], path: list[str]) -> None:
nonlocal summary
for node in nodes:
current_path = [*path, node.title]
if node.title.lower() == "summary" and not summary:
summary = node.body.strip()
else:
sections.append(
{
"heading": " / ".join(current_path),
"content": node.body.strip(),
}
)
visit(node.children, current_path)
visit(document.nodes, [])
return {
"title": metadata.get("title", ""),
"common_name": metadata.get("common_name", ""),
"scientific_name": metadata.get("scientific_name", ""),
"flelmr_code": flelmr_code,
"legacy_identifiers": legacy_identifiers if isinstance(legacy_identifiers, list) else [],
"taxon_identifiers": taxon_identifiers if isinstance(taxon_identifiers, list) else [],
"primary_taxon_authority": str(metadata.get("primary_taxon_authority", "")),
"summary": summary,
"sections": sections,
}
def _is_citation_heading(title: str) -> bool:
lowered = title.strip().rstrip(":").lower()
return lowered in {
"references",
"reference",
"citations",
"citation",
"bibliography",
"related references",
"related citations",
}
def _split_citation_lines(body: str) -> list[str]:
entries: list[dict[str, str]] = []
current: list[str] = []
current_number = ""
def flush() -> None:
nonlocal current_number
if not current:
return
compact = " ".join(part.strip() for part in current if part.strip()).strip()
if compact:
entries.append(
{
"legacy_reference_number": current_number,
"raw_text": compact,
}
)
current.clear()
current_number = ""
for raw_line in body.splitlines():
stripped = raw_line.strip()
if not stripped:
flush()
continue
leading_number_match = re.match(r"^(?P<num>\d+)\s*,\s*(?P<text>.+)$", stripped)
if leading_number_match:
flush()
current_number = leading_number_match.group("num")
current.append(leading_number_match.group("text"))
continue
bare_number_match = re.match(r"^(?P<num>\d+)\s+(?P<text>[A-Z].+)$", stripped)
if bare_number_match:
flush()
current_number = bare_number_match.group("num")
current.append(bare_number_match.group("text"))
continue
bullet_match = re.match(
r"^(?:[-*]|\[(?P<bracket_num>\d+)\]|(?P<plain_num>\d+)[\.,])\s+(?P<text>.+)$",
stripped,
)
if bullet_match:
flush()
current_number = bullet_match.group("bracket_num") or bullet_match.group("plain_num") or ""
bullet_text = bullet_match.group("text")
if not current_number:
nested_number_match = re.match(r"^\[(?P<num>\d+)\]\s+(?P<text>.+)$", bullet_text)
if nested_number_match:
current_number = nested_number_match.group("num")
bullet_text = nested_number_match.group("text")
else:
nested_comma_match = re.match(r"^(?P<num>\d+)\s*,\s*(?P<text>.+)$", bullet_text)
if nested_comma_match:
current_number = nested_comma_match.group("num")
bullet_text = nested_comma_match.group("text")
current.append(bullet_text)
continue
current.append(stripped)
flush()
return entries
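# For example (illustration, not part of this diff), both legacy numbering
# styles are recognized:
#     _split_citation_lines(
#         "1, Smith, J. 1998. Manatee ecology. J. Mammal. 79(2):251-262.\n"
#         "[2] Doe, A. 2001. Seagrass surveys.\n"
#     )
#     # -> [{"legacy_reference_number": "1",
#     #      "raw_text": "Smith, J. 1998. Manatee ecology. J. Mammal. 79(2):251-262."},
#     #     {"legacy_reference_number": "2",
#     #      "raw_text": "Doe, A. 2001. Seagrass surveys."}]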
def extract_citation_entries(document: StructuredDocument) -> list[dict[str, object]]:
entries: list[dict[str, object]] = []
def visit(nodes: list[DocumentNode], path: list[str]) -> None:
for node in nodes:
current_path = [*path, node.title]
if _is_citation_heading(node.title):
section_heading = " / ".join(current_path)
for item in _split_citation_lines(node.body):
raw_text = item["raw_text"]
doi_match = DOI_PATTERN.search(raw_text)
entries.append(
{
"section_heading": section_heading,
"legacy_reference_number": item["legacy_reference_number"],
"raw_text": raw_text,
"normalized_text": _normalize_whitespace(raw_text),
"doi": doi_match.group(0) if doi_match else "",
}
)
visit(node.children, current_path)
visit(document.nodes, [])
return entries
def add_citation_to_document(
document: StructuredDocument,
citation_text: str,
heading_title: str = "Related References",
) -> bool:
normalized_citation = _normalize_whitespace(citation_text)
if not normalized_citation:
return False
for node in document.nodes:
if _is_citation_heading(node.title):
existing = {_normalize_whitespace(item["raw_text"]) for item in _split_citation_lines(node.body)}
if normalized_citation in existing:
return False
body = node.body.rstrip()
node.body = f"{body}\n- {citation_text}".strip() if body else f"- {citation_text}"
return True
document.nodes.append(
DocumentNode(
node_type="section",
title=heading_title,
body=f"- {citation_text}",
depth=2,
)
)
return True


@@ -0,0 +1,267 @@
from __future__ import annotations
from sqlalchemy import select
from ecospecies_api.citegeist_bridge import extract_draft_citation
from ecospecies_api.document_format import (
build_document_from_species_payload,
document_to_json,
extract_citation_entries,
extract_species_projection,
export_markdown_document,
flatten_document_nodes,
parse_markdown_document,
validate_markdown_document,
)
from ecospecies_api.models import (
DocumentSection,
Species,
SpeciesCitation,
SpeciesDocument,
SpeciesDocumentNode,
SpeciesTaxonIdentifier,
)
def _persist_taxon_identifiers(session, species: Species, taxon_identifiers: list[dict[str, object]]) -> None:
for identifier in list(species.taxon_identifiers):
session.delete(identifier)
session.flush()
for position, item in enumerate(taxon_identifiers, start=1):
authority = str(item.get("authority", "")).strip()
identifier = str(item.get("identifier", "")).strip()
if not authority or not identifier:
continue
session.add(
SpeciesTaxonIdentifier(
species_id=species.id,
position=position,
authority=authority,
identifier=identifier,
label=str(item.get("label", "")).strip(),
is_primary=bool(item.get("primary") or item.get("is_primary")),
source_url=str(item.get("source_url", "")).strip(),
)
)
def _existing_taxon_identifier_payload(species: Species) -> list[dict[str, object]]:
return [
{
"authority": item.authority,
"identifier": item.identifier,
"label": item.label,
"primary": item.is_primary,
"source_url": item.source_url,
}
for item in species.taxon_identifiers
]
def _citation_match_key(item: dict[str, object]) -> tuple[str, str, str]:
return (
str(item.get("section_heading", "")).strip(),
str(item.get("legacy_reference_number", "")).strip(),
str(item.get("raw_text", "")).strip(),
)
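# Illustration (not part of this diff): citations are matched across re-imports
# by this composite key, e.g.
#     _citation_match_key({"section_heading": "References",
#                          "legacy_reference_number": "3",
#                          "raw_text": "Smith, J. 1998. Manatee ecology."})
#     # -> ("References", "3", "Smith, J. 1998. Manatee ecology.")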
def _persist_citations(session, species: Species, citations: list[dict[str, object]]) -> None:
existing_by_key = {
_citation_match_key(
{
"section_heading": citation.section_heading,
"legacy_reference_number": citation.legacy_reference_number,
"raw_text": citation.raw_text,
}
): citation
for citation in species.citations
}
retained_ids: set[int] = set()
for position, item in enumerate(citations, start=1):
raw_text = str(item.get("raw_text", "")).strip()
if not raw_text:
continue
key = _citation_match_key(item)
legacy_reference_number = str(item.get("legacy_reference_number", "")).strip()
existing = existing_by_key.get(key)
extracted_normalized = str(item.get("normalized_text", "")).strip()
extracted_doi = str(item.get("doi", "")).strip()
draft = extract_draft_citation(raw_text, legacy_reference_number)
if existing is None:
session.add(
SpeciesCitation(
species_id=species.id,
position=position,
section_heading=str(item.get("section_heading", "")).strip(),
legacy_reference_number=legacy_reference_number,
citation_key=draft.citation_key if draft is not None else "",
entry_type=draft.entry_type if draft is not None else "misc",
raw_text=raw_text,
normalized_text=extracted_normalized,
abstract_text="",
draft_bibtex=draft.draft_bibtex if draft is not None else "",
doi=extracted_doi,
source_url="",
openalex_id="",
resolver_source_label="",
enrichment_status="pending",
enrichment_error="",
source_type="document_extract",
review_status="draft",
)
)
continue
existing.position = position
existing.section_heading = str(item.get("section_heading", "")).strip()
existing.legacy_reference_number = legacy_reference_number
existing.raw_text = raw_text
if existing.review_status == "draft":
existing.normalized_text = extracted_normalized
existing.abstract_text = ""
existing.doi = extracted_doi
existing.citation_key = draft.citation_key if draft is not None else ""
existing.entry_type = draft.entry_type if draft is not None else "misc"
existing.draft_bibtex = draft.draft_bibtex if draft is not None else ""
existing.source_type = "document_extract"
existing.enrichment_status = "pending"
existing.enrichment_error = ""
existing.resolver_source_label = ""
existing.source_url = ""
existing.openalex_id = ""
retained_ids.add(existing.id)
session.add(existing)
for citation in list(species.citations):
if citation.id not in retained_ids and citation.source_type in {"document_extract", "editor_review"}:
session.delete(citation)
def _persist_document_model(session, species: Species, document_model, markdown_content: str, updated_by: str) -> None:
ast_json = document_to_json(document_model)
document = session.scalar(
select(SpeciesDocument).where(SpeciesDocument.species_id == species.id)
)
if document is None:
document = SpeciesDocument(
species_id=species.id,
source_format="ecospecies-markdown-v1",
markdown_content=markdown_content,
ast_json=ast_json,
updated_by=updated_by,
)
session.add(document)
session.flush()
else:
document.source_format = "ecospecies-markdown-v1"
document.markdown_content = markdown_content
document.ast_json = ast_json
document.updated_by = updated_by
session.add(document)
for node in list(document.nodes):
session.delete(node)
session.flush()
for node in flatten_document_nodes(document_model):
session.add(
SpeciesDocumentNode(
document_id=document.id,
parent_node_ref=node["parent_id"],
node_ref=node["node_id"],
position=node["position"],
depth=node["depth"],
node_type=node["node_type"],
title=node["title"],
body_markdown=node["body_markdown"],
body_plaintext=node["body_plaintext"],
)
)
def sync_species_document(session, species: Species, item: dict[str, object]) -> None:
payload = dict(item)
if "taxon_identifiers" not in payload or not payload.get("taxon_identifiers"):
payload["taxon_identifiers"] = _existing_taxon_identifier_payload(species)
if "primary_taxon_authority" not in payload or not payload.get("primary_taxon_authority"):
for identifier in payload["taxon_identifiers"]:
if bool(identifier.get("primary")):
payload["primary_taxon_authority"] = str(identifier.get("authority", "")).strip()
break
document_model = build_document_from_species_payload(payload)
markdown_content = export_markdown_document(document_model)
_persist_document_model(
session,
species,
document_model,
markdown_content,
str(item.get("last_modified_by", "system-import")),
)
_persist_citations(session, species, extract_citation_entries(document_model))
def get_species_document_payload(session, slug: str) -> dict[str, object] | None:
species = session.scalar(select(Species).where(Species.slug == slug))
if species is None or species.document is None:
return None
document = species.document
return {
"slug": species.slug,
"source_format": document.source_format,
"markdown": document.markdown_content,
"ast_json": document.ast_json,
"node_count": len(document.nodes),
"updated_by": document.updated_by,
}
def save_species_document(session, species: Species, markdown: str, username: str) -> dict[str, object]:
errors = validate_markdown_document(markdown)
if errors:
raise ValueError("; ".join(errors))
document_model = parse_markdown_document(markdown)
projection = extract_species_projection(document_model)
_persist_document_model(session, species, document_model, markdown, username)
_persist_citations(session, species, extract_citation_entries(document_model))
if projection["title"]:
species.title = str(projection["title"])
if projection["common_name"]:
species.common_name = str(projection["common_name"])
if projection["scientific_name"]:
species.scientific_name = str(projection["scientific_name"])
if projection["flelmr_code"]:
species.flelmr_code = str(projection["flelmr_code"])
_persist_taxon_identifiers(session, species, list(projection["taxon_identifiers"]))
species.summary = str(projection["summary"])
species.section_count = len(projection["sections"])
species.last_modified_by = username
for section in list(species.sections):
session.delete(section)
session.flush()
for position, section in enumerate(projection["sections"], start=1):
session.add(
DocumentSection(
species_id=species.id,
position=position,
heading=str(section["heading"]),
content=str(section["content"]),
)
)
return {
"slug": species.slug,
"summary": species.summary,
"section_count": species.section_count,
"markdown": markdown,
"updated_by": username,
}


@@ -23,6 +23,9 @@ class Species(Base):
publication_status: Mapped[str] = mapped_column(String(32), default="published", index=True)
is_archived: Mapped[bool] = mapped_column(Boolean, default=False, index=True)
editor_notes: Mapped[str] = mapped_column(Text, default="")
created_by: Mapped[str] = mapped_column(String(255), default="system-import")
owner_username: Mapped[str] = mapped_column(String(255), default="")
owner_role: Mapped[str] = mapped_column(String(32), default="")
last_modified_by: Mapped[str] = mapped_column(String(255), default="system-import")
sections: Mapped[list["DocumentSection"]] = relationship(
@@ -40,6 +43,21 @@ class Species(Base):
cascade="all, delete-orphan",
order_by="SpeciesAuditLog.id.desc()",
)
document: Mapped["SpeciesDocument | None"] = relationship(
back_populates="species",
cascade="all, delete-orphan",
uselist=False,
)
taxon_identifiers: Mapped[list["SpeciesTaxonIdentifier"]] = relationship(
back_populates="species",
cascade="all, delete-orphan",
order_by="SpeciesTaxonIdentifier.position",
)
citations: Mapped[list["SpeciesCitation"]] = relationship(
back_populates="species",
cascade="all, delete-orphan",
order_by="SpeciesCitation.position",
)
class DocumentSection(Base):
@@ -77,3 +95,93 @@ class SpeciesAuditLog(Base):
details_json: Mapped[str] = mapped_column(Text)
species: Mapped[Species] = relationship(back_populates="audit_entries")
class SpeciesDocument(Base):
__tablename__ = "species_document"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), unique=True, index=True)
source_format: Mapped[str] = mapped_column(String(64), default="ecospecies-markdown-v1")
markdown_content: Mapped[str] = mapped_column(Text, default="")
ast_json: Mapped[str] = mapped_column(Text, default="")
updated_by: Mapped[str] = mapped_column(String(255), default="system-import")
species: Mapped[Species] = relationship(back_populates="document")
nodes: Mapped[list["SpeciesDocumentNode"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
order_by="SpeciesDocumentNode.position",
)
class SpeciesDocumentNode(Base):
__tablename__ = "species_document_node"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
document_id: Mapped[int] = mapped_column(ForeignKey("species_document.id", ondelete="CASCADE"), index=True)
parent_node_ref: Mapped[str | None] = mapped_column(String(64), nullable=True, default=None)
node_ref: Mapped[str] = mapped_column(String(64), index=True)
position: Mapped[int] = mapped_column(Integer, default=1)
depth: Mapped[int] = mapped_column(Integer, default=2)
node_type: Mapped[str] = mapped_column(String(32), default="section")
title: Mapped[str] = mapped_column(String(255), default="")
body_markdown: Mapped[str] = mapped_column(Text, default="")
body_plaintext: Mapped[str] = mapped_column(Text, default="")
source_heading: Mapped[str] = mapped_column(String(255), default="")
source_span_start: Mapped[int | None] = mapped_column(Integer, nullable=True, default=None)
source_span_end: Mapped[int | None] = mapped_column(Integer, nullable=True, default=None)
document: Mapped[SpeciesDocument] = relationship(back_populates="nodes")
class ContributorAccount(Base):
__tablename__ = "contributor_account"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
email: Mapped[str] = mapped_column(String(255), unique=True, index=True)
token_hash: Mapped[str] = mapped_column(String(128), unique=True, index=True)
age_gate_confirmed: Mapped[bool] = mapped_column(Boolean, default=False)
created_at: Mapped[str] = mapped_column(String(64), index=True)
is_active: Mapped[bool] = mapped_column(Boolean, default=True, index=True)
class SpeciesTaxonIdentifier(Base):
__tablename__ = "species_taxon_identifier"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), index=True)
position: Mapped[int] = mapped_column(Integer, default=1)
authority: Mapped[str] = mapped_column(String(64), default="")
identifier: Mapped[str] = mapped_column(String(255), default="")
label: Mapped[str] = mapped_column(String(128), default="")
is_primary: Mapped[bool] = mapped_column(Boolean, default=False, index=True)
source_url: Mapped[str] = mapped_column(String(500), default="")
species: Mapped[Species] = relationship(back_populates="taxon_identifiers")
class SpeciesCitation(Base):
__tablename__ = "species_citation"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), index=True)
position: Mapped[int] = mapped_column(Integer, default=1)
section_heading: Mapped[str] = mapped_column(String(255), default="")
legacy_reference_number: Mapped[str] = mapped_column(String(64), default="", index=True)
citation_key: Mapped[str] = mapped_column(String(255), default="", index=True)
entry_type: Mapped[str] = mapped_column(String(64), default="misc")
raw_text: Mapped[str] = mapped_column(Text, default="")
normalized_text: Mapped[str] = mapped_column(Text, default="")
abstract_text: Mapped[str] = mapped_column(Text, default="")
draft_bibtex: Mapped[str] = mapped_column(Text, default="")
doi: Mapped[str] = mapped_column(String(255), default="", index=True)
source_url: Mapped[str] = mapped_column(String(500), default="")
openalex_id: Mapped[str] = mapped_column(String(64), default="", index=True)
resolver_source_label: Mapped[str] = mapped_column(String(255), default="")
enrichment_status: Mapped[str] = mapped_column(String(32), default="pending", index=True)
enrichment_error: Mapped[str] = mapped_column(Text, default="")
source_type: Mapped[str] = mapped_column(String(64), default="document_extract")
review_status: Mapped[str] = mapped_column(String(32), default="draft", index=True)
species: Mapped[Species] = relationship(back_populates="citations")


@@ -1,14 +1,18 @@
from __future__ import annotations
import hashlib
import os
import re
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
SECTION_PATTERN = re.compile(r"^[A-Z][A-Z\s/&()-]{2,}$")
TITLE_SECTION_PATTERN = re.compile(r"^[A-Z][A-Za-z\s/&()-]{2,}$")
FIELD_PATTERN = re.compile(r"^(?P<key>[A-Za-z/ _-]+):\s*(?P<value>.*)$")
SUMMARY_MARKER_PATTERN = re.compile(r"^(summary(?:/abstract)?|abstract|executive summary):?\s*$", re.IGNORECASE)
SAFE_DIRECTORY_NAME_PATTERN = re.compile(r"^[A-Za-z0-9_-]+$")
@dataclass
@@ -38,6 +42,10 @@ class SpeciesRecord:
diagnostics: list[IngestDiagnostic]
def get_repo_root() -> Path:
return Path(__file__).resolve().parents[4]
def slugify(value: str) -> str:
cleaned = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
return cleaned or "unknown-species"
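# For example (illustration, not part of this diff):
#     slugify("West Indian Manatee (Trichechus manatus)")
#     # -> "west-indian-manatee-trichechus-manatus"
#     slugify("???")  # -> "unknown-species"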
@@ -53,6 +61,33 @@ def normalize_whitespace(value: str) -> str:
return re.sub(r"\s+", " ", value).strip()
def is_section_heading(line: str) -> bool:
stripped = line.strip()
if not stripped:
return False
normalized = stripped[:-1].strip() if stripped.endswith(":") else stripped
if not normalized:
return False
if ":" in normalized:
return False
if SECTION_PATTERN.fullmatch(normalized):
return True
if not TITLE_SECTION_PATTERN.fullmatch(normalized):
return False
words = normalized.split()
if len(words) > 4:
return False
return all(word[0].isupper() for word in words if word and word[0].isalpha())
def normalize_heading(line: str) -> str:
stripped = line.strip()
if stripped.endswith(":"):
return stripped[:-1].strip()
return stripped
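Taken together, the two patterns and the four-word guard classify lines like this (a sketch of expected behavior, not part of the commit):

```python
# Expected behavior per the rules above; example inputs are hypothetical.
assert is_section_heading("HABITAT")                # all-caps section heading
assert is_section_heading("Ecological Notes:")      # trailing colon is stripped first
assert not is_section_heading("Phylum: Mollusca")   # interior colon means key/value
assert not is_section_heading("A Very Long Title Case Heading Here")  # more than 4 words
assert normalize_heading("Citations:") == "Citations"
```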
def split_sections(lines: list[str]) -> list[Section]:
sections: list[Section] = []
current_heading = "HEADER"
@ -61,7 +96,7 @@ def split_sections(lines: list[str]) -> list[Section]:
for raw_line in lines:
line = raw_line.rstrip()
stripped = line.strip()
- if SECTION_PATTERN.fullmatch(stripped):
+ if is_section_heading(stripped):
if current_lines:
sections.append(
Section(
@ -69,7 +104,7 @@ def split_sections(lines: list[str]) -> list[Section]:
content="\n".join(current_lines).strip(),
)
)
- current_heading = stripped
+ current_heading = normalize_heading(stripped)
current_lines = []
continue
current_lines.append(line)
@ -96,8 +131,9 @@ def extract_metadata(lines: list[str]) -> dict[str, str]:
value = match.group("value").strip()
metadata[key] = value
- # Legacy files vary between "FLELMR", "FLELMR Code", and similar labels.
- if key.startswith("flelmr"):
+ # Legacy files vary between "FLELMR", "FLELMR Code", "EcoSpecies Code",
+ # and similar labels.
+ if key.startswith("flelmr") or key == "ecospecies code":
metadata["flelmr"] = value
return metadata
@ -127,7 +163,7 @@ def extract_summary(lines: list[str], sections: list[Section]) -> str:
if summary_lines:
summary_lines.append("")
continue
- if SECTION_PATTERN.fullmatch(stripped):
+ if is_section_heading(stripped):
break
if stripped.startswith("[") and not summary_lines:
break
@ -202,23 +238,76 @@ def parse_species_file(path: Path) -> SpeciesRecord:
)
def ensure_unique_record_slugs(records: list[SpeciesRecord]) -> list[SpeciesRecord]:
slug_counts = Counter(record.slug for record in records)
used_slugs: set[str] = set()
for record in records:
base_slug = record.slug
if slug_counts[base_slug] == 1 and base_slug not in used_slugs:
used_slugs.add(base_slug)
continue
disambiguator = slugify(Path(record.source_file).stem)
if disambiguator == base_slug:
disambiguator = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:8]
candidate = f"{base_slug}-{disambiguator}"
if candidate in used_slugs:
source_hash = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:8]
candidate = f"{candidate}-{source_hash}"
suffix = 2
while candidate in used_slugs:
candidate = f"{base_slug}-{disambiguator}-{suffix}"
suffix += 1
record.slug = candidate
used_slugs.add(candidate)
return records
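The function only reads each record's `slug` and `source_file`, so a toy stand-in (hypothetical, not the real `SpeciesRecord`) shows the disambiguation:

```python
# Toy stand-in with the two attributes the function reads; assumes
# ensure_unique_record_slugs and slugify are in scope.
from dataclasses import dataclass

@dataclass
class _Rec:
    slug: str
    source_file: str

a = _Rec("red-snapper", "Red Snapper_SLH_Outline2012_0722.txt")
b = _Rec("red-snapper", "RedSnapper_SLH_2012_0830_combined.txt")
ensure_unique_record_slugs([a, b])
print(a.slug)  # red-snapper-red-snapper-slh-outline2012-0722
print(b.slug)  # red-snapper-redsnapper-slh-2012-0830-combined
```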
def load_species_records(data_dir: str) -> list[SpeciesRecord]:
- base = Path(data_dir)
+ base = resolve_data_dir(data_dir)
if not base.exists():
return []
records: list[SpeciesRecord] = []
for path in sorted(base.glob("*.txt")):
records.append(parse_species_file(path))
- return records
+ return ensure_unique_record_slugs(records)
def resolve_data_dir(data_dir: str) -> Path:
repo_root = get_repo_root().resolve()
raw_value = data_dir.strip()
if not raw_value:
raise ValueError("Species data directory cannot be empty.")
candidate = Path(raw_value)
if candidate.is_absolute():
resolved = candidate.resolve()
else:
resolved = (repo_root / candidate).resolve()
try:
relative = resolved.relative_to(repo_root)
except ValueError as exc:
raise ValueError("Species data directory must stay within the codebase directory.") from exc
if not relative.parts:
raise ValueError("Species data directory must be a subdirectory of the codebase.")
for part in relative.parts:
if not SAFE_DIRECTORY_NAME_PATTERN.fullmatch(part):
raise ValueError(
f"Species data directory contains an unsafe directory name: {part!r}."
)
return resolved
def get_default_data_dir() -> str:
- return os.environ.get(
-     "ECOSPECIES_DATA_DIR",
-     str(
-         Path(__file__).resolve().parents[4].parent
-         / "01-legacy-code-and-data"
-         / "InputFiles - TXT"
-     ),
- )
+ configured = os.environ.get("ECOSPECIES_DATA_DIR", "input-data/InputFiles")
+ return str(resolve_data_dir(configured))

File diff suppressed because it is too large

apps/api/test_auth.py (new file, 21 lines)

@ -0,0 +1,21 @@
from __future__ import annotations
import importlib.util
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
TEST_PATH = ROOT / "tests" / "test_auth.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_auth", TEST_PATH)
MODULE = importlib.util.module_from_spec(SPEC)
assert SPEC is not None and SPEC.loader is not None
SPEC.loader.exec_module(MODULE)
for name in dir(MODULE):
if name.startswith("Test") or name.endswith("Tests"):
globals()[name] = getattr(MODULE, name)

apps/api/test_citation_enrichment.py (new file, 21 lines)

@ -0,0 +1,21 @@
from __future__ import annotations
import importlib.util
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
TEST_PATH = ROOT / "tests" / "test_citation_enrichment.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_citation_enrichment", TEST_PATH)
MODULE = importlib.util.module_from_spec(SPEC)
assert SPEC is not None and SPEC.loader is not None
SPEC.loader.exec_module(MODULE)
for name in dir(MODULE):
if name.startswith("Test") or name.endswith("Tests"):
globals()[name] = getattr(MODULE, name)

apps/api/test_document_format.py (new file, 21 lines)

@ -0,0 +1,21 @@
from __future__ import annotations
import importlib.util
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
TEST_PATH = ROOT / "tests" / "test_document_format.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_document_format", TEST_PATH)
MODULE = importlib.util.module_from_spec(SPEC)
assert SPEC is not None and SPEC.loader is not None
SPEC.loader.exec_module(MODULE)
for name in dir(MODULE):
if name.startswith("Test") or name.endswith("Tests"):
globals()[name] = getattr(MODULE, name)

apps/api/test_parser.py (new file, 21 lines)

@ -0,0 +1,21 @@
from __future__ import annotations
import importlib.util
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
TEST_PATH = ROOT / "tests" / "test_parser.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_parser", TEST_PATH)
MODULE = importlib.util.module_from_spec(SPEC)
assert SPEC is not None and SPEC.loader is not None
SPEC.loader.exec_module(MODULE)
for name in dir(MODULE):
if name.startswith("Test") or name.endswith("Tests"):
globals()[name] = getattr(MODULE, name)

apps/api/tests/test_auth.py (new file, 58 lines)

@ -0,0 +1,58 @@
from __future__ import annotations
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from ecospecies_api import auth, repository
class ContributorAuthTests(unittest.TestCase):
def setUp(self) -> None:
self.tempdir = tempfile.TemporaryDirectory()
db_path = Path(self.tempdir.name) / "test.db"
self.engine = create_engine(f"sqlite:///{db_path}", future=True)
self.session_local = sessionmaker(
bind=self.engine,
autoflush=False,
autocommit=False,
future=True,
)
self.repository_engine_patch = patch.object(repository, "create_db_engine", return_value=self.engine)
self.repository_session_patch = patch.object(repository, "SessionLocal", self.session_local)
self.auth_engine_patch = patch.object(auth, "create_db_engine", return_value=self.engine)
self.auth_session_patch = patch.object(auth, "SessionLocal", self.session_local)
self.repository_engine_patch.start()
self.repository_session_patch.start()
self.auth_engine_patch.start()
self.auth_session_patch.start()
def tearDown(self) -> None:
self.auth_session_patch.stop()
self.auth_engine_patch.stop()
self.repository_session_patch.stop()
self.repository_engine_patch.stop()
self.engine.dispose()
self.tempdir.cleanup()
def test_contributor_token_resolves_to_contributor_session(self) -> None:
registration = repository.register_contributor("author@example.org", True)
session = auth.resolve_auth_session({"Authorization": f"Bearer {registration['token']}"})
self.assertIsNotNone(session)
assert session is not None
self.assertEqual(session.username, "author@example.org")
self.assertEqual(session.role, "contributor")
def test_contributor_role_does_not_satisfy_editor(self) -> None:
self.assertTrue(auth.role_satisfies("editor", "contributor"))
self.assertFalse(auth.role_satisfies("contributor", "editor"))
if __name__ == "__main__":
unittest.main()

apps/api/tests/test_citation_enrichment.py (new file, 527 lines)

@ -0,0 +1,527 @@
from __future__ import annotations
import unittest
from unittest.mock import patch
from ecospecies_api.citation_enrichment import (
_crossref_message_to_entry,
_datacite_item_to_entry,
_openalex_work_to_entry,
_render_normalized_text,
apply_citation_candidate_selection,
discover_citation_candidates,
enrich_citation_payload,
LocalBibEntry,
LocalMetadataResolver,
LocalResolution,
)
from ecospecies_api.citegeist_bridge import extract_draft_citation, render_single_bibtex
class CitationEnrichmentTests(unittest.TestCase):
def test_render_normalized_text_includes_volume_number_and_pages(self) -> None:
rendered = _render_normalized_text(
"article",
{
"author": "Daniell, W.C.",
"year": "1872",
"title": "Letters referring to experiments of W.C. Daniell",
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
"volume": "2",
"number": "4",
"pages": "387-390",
"doi": "10.1000/example",
},
)
self.assertEqual(
rendered,
"Daniell, W.C. (1872). Letters referring to experiments of W.C. Daniell. Comm. Rept. U.S. Comm. Fish & Fish., 2(4): 387-390. DOI:10.1000/example",
)
def test_crossref_mapping_keeps_volume_issue_and_pages(self) -> None:
entry = _crossref_message_to_entry(
{
"type": "journal-article",
"title": ["Example Work"],
"issued": {"date-parts": [[1872]]},
"author": [{"family": "Daniell", "given": "W.C."}],
"container-title": ["Comm. Rept. U.S. Comm. Fish & Fish."],
"DOI": "10.1000/example",
"URL": "https://doi.org/10.1000/example",
"volume": "2",
"issue": "4",
"page": "387-390",
}
)
self.assertEqual(entry.fields["volume"], "2")
self.assertEqual(entry.fields["number"], "4")
self.assertEqual(entry.fields["pages"], "387-390")
def test_openalex_mapping_keeps_biblio_fields(self) -> None:
entry = _openalex_work_to_entry(
{
"id": "https://openalex.org/W12345",
"display_name": "OpenAlex Discovered Work",
"publication_year": 2022,
"type": "article",
"doi": "https://doi.org/10.1000/example-openalex",
"authorships": [{"author": {"display_name": "J S, Smith"}}],
"primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
"biblio": {"volume": "12", "issue": "3", "first_page": "101", "last_page": "118"},
"abstract_inverted_index": {"Graphs": [0], "support": [1], "learning": [2]},
}
)
self.assertEqual(entry.fields["author"], "Smith, J. S.")
self.assertEqual(entry.fields["volume"], "12")
self.assertEqual(entry.fields["number"], "3")
self.assertEqual(entry.fields["pages"], "101-118")
self.assertEqual(entry.fields["abstract"], "Graphs support learning")
def test_openalex_mapping_handles_null_source(self) -> None:
entry = _openalex_work_to_entry(
{
"id": "https://openalex.org/W54321",
"display_name": "OpenAlex Work Without Source",
"publication_year": 2021,
"type": "article",
"doi": "https://doi.org/10.1000/example-null-source",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
"primary_location": {"source": None},
"biblio": {"volume": "5", "issue": "1", "first_page": "10", "last_page": "20"},
}
)
self.assertEqual(entry.fields["title"], "OpenAlex Work Without Source")
self.assertNotIn("journal", entry.fields)
self.assertEqual(entry.fields["volume"], "5")
self.assertEqual(entry.fields["number"], "1")
self.assertEqual(entry.fields["pages"], "10-20")
def test_datacite_mapping_keeps_container_and_pages(self) -> None:
entry = _datacite_item_to_entry(
{
"attributes": {
"titles": [{"title": "DataCite Work"}],
"creators": [{"name": "J R, Rivera"}],
"publicationYear": "2021",
"doi": "10.1000/datacite-work",
"url": "https://doi.org/10.1000/datacite-work",
"container": "Journal of Metadata",
"volume": "7",
"issue": "2",
"firstPage": "44",
"lastPage": "59",
"descriptions": [
{"descriptionType": "Abstract", "description": "Abstract: Metadata makes reuse easier."}
],
}
}
)
self.assertEqual(entry.fields["author"], "Rivera, J. R.")
self.assertEqual(entry.fields["journal"], "Journal of Metadata")
self.assertEqual(entry.fields["volume"], "7")
self.assertEqual(entry.fields["number"], "2")
self.assertEqual(entry.fields["pages"], "44-59")
self.assertEqual(entry.fields["abstract"], "Metadata makes reuse easier.")
def test_render_single_bibtex_preserves_balanced_braces_and_repairs_unmatched_ones(self) -> None:
rendered = render_single_bibtex(
"misc",
"example",
{
"title": "Alpha_beta {Gamma}",
"note": "raw_reference = {Alpha } beta}",
},
)
self.assertIn("title = {Alpha_beta {Gamma}}", rendered)
self.assertIn("note = {raw_reference = {Alpha } beta)}", rendered)
def test_extract_draft_citation_repairs_report_style_reference_fields(self) -> None:
class MockEntry:
entry_type = "misc"
citation_key = "badkey"
fields = {
"title": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
"year": "1872",
"note": "extracted_reference = {true}",
}
with patch(
"ecospecies_api.citegeist_bridge._load_citegeist_extract",
return_value=lambda text: [MockEntry()],
):
draft = extract_draft_citation(
"Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
legacy_reference_number="160",
)
self.assertIsNotNone(draft)
assert draft is not None
self.assertEqual(draft.fields["author"], "Daniell, W.C")
self.assertEqual(
draft.fields["title"],
"Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
)
self.assertEqual(draft.fields["journal"], "Comm. Rept. U.S. Comm. Fish & Fish")
self.assertEqual(draft.fields["volume"], "2")
self.assertEqual(draft.fields["pages"], "387-390")
self.assertEqual(draft.citation_key, "daniell1872lettersreferringexperiments")
def test_extract_draft_citation_does_not_split_title_on_report_word(self) -> None:
class MockEntry:
entry_type = "misc"
citation_key = "badkey"
fields = {
"title": "Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200.",
"year": "1999",
"note": "extracted_reference = {true}",
}
with patch(
"ecospecies_api.citegeist_bridge._load_citegeist_extract",
return_value=lambda text: [MockEntry()],
):
draft = extract_draft_citation(
"Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200.",
legacy_reference_number="42",
)
self.assertIsNotNone(draft)
assert draft is not None
self.assertEqual(draft.fields["author"], "Smith, J")
self.assertEqual(draft.fields["title"], "Habitat report synthesis for Alabama shad")
self.assertEqual(draft.fields["howpublished"], "NOAA Tech. Memo. NMFS-SEFSC-200")
self.assertNotIn("journal", draft.fields)
def test_extract_draft_citation_repairs_proc_abbreviation_venue(self) -> None:
class MockEntry:
entry_type = "misc"
citation_key = "badkey"
fields = {
"title": "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134.",
"year": "1954",
"note": "extracted_reference = {true}",
}
with patch(
"ecospecies_api.citegeist_bridge._load_citegeist_extract",
return_value=lambda text: [MockEntry()],
):
draft = extract_draft_citation(
"Bailey, R.M., H.E. Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134.",
legacy_reference_number="26",
)
self.assertIsNotNone(draft)
assert draft is not None
self.assertEqual(
draft.fields["title"],
"Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes",
)
self.assertEqual(draft.fields["journal"], "Proc. Acad. Sci. Philad")
self.assertEqual(draft.fields["volume"], "106")
self.assertEqual(draft.fields["pages"], "109-134")
def test_extract_draft_citation_repairs_occasional_paper_venue(self) -> None:
class MockEntry:
entry_type = "misc"
citation_key = "badkey"
fields = {
"title": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
"year": "1950",
"note": "extracted_reference = {true}",
}
with patch(
"ecospecies_api.citegeist_bridge._load_citegeist_extract",
return_value=lambda text: [MockEntry()],
):
draft = extract_draft_citation(
"Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
legacy_reference_number="41",
)
self.assertIsNotNone(draft)
assert draft is not None
self.assertEqual(
draft.fields["title"],
"Annotated list of the fauna of the Grand Isle region, 1928-1946",
)
self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
self.assertEqual(draft.fields["volume"], "6")
self.assertEqual(draft.fields["number"], "6")
self.assertEqual(draft.fields["pages"], "1-66")
def test_extract_draft_citation_repairs_partial_existing_venue_stub(self) -> None:
class MockEntry:
entry_type = "misc"
citation_key = "badkey"
fields = {
"title": "Annotated list of the fauna of the Grand Isle region, 1928-1946",
"year": "1950",
"howpublished": "Occas",
"note": "extracted_reference = {true}",
}
with patch(
"ecospecies_api.citegeist_bridge._load_citegeist_extract",
return_value=lambda text: [MockEntry()],
):
draft = extract_draft_citation(
"Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
legacy_reference_number="41",
)
self.assertIsNotNone(draft)
assert draft is not None
self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
self.assertEqual(draft.fields["volume"], "6")
self.assertEqual(draft.fields["number"], "6")
self.assertEqual(draft.fields["pages"], "1-66")
def test_falls_back_to_internal_resolver_when_citegeist_repo_is_unavailable(self) -> None:
class MockResolver:
def resolve_entry(self, entry):
class Resolution:
source_label = "crossref:doi:10.1000/example"
class Entry:
entry_type = "article"
citation_key = "doi101000example"
fields = {
"author": "Smith, Jane",
"year": "2024",
"title": "Example Work",
"journal": "Journal of Examples",
"doi": "10.1000/example",
"url": "https://doi.org/10.1000/example",
}
entry = Entry()
return Resolution()
with patch(
"ecospecies_api.citation_enrichment._load_citegeist_resolution_components",
return_value=(None, None, None, None),
):
result = enrich_citation_payload(
{
"raw_text": "Smith, Jane. 2024. Example Work.",
"legacy_reference_number": "7",
},
resolver=MockResolver(),
)
self.assertEqual(result["enrichment_status"], "resolved")
self.assertEqual(result["doi"], "10.1000/example")
self.assertEqual(result["source_url"], "https://doi.org/10.1000/example")
self.assertEqual(result["resolver_source_label"], "crossref:doi:10.1000/example")
self.assertIn("ecospecies_reference_number = {7}", result["draft_bibtex"])
def test_enrichment_replaces_raw_reference_title_and_dedupes_legacy_note(self) -> None:
class MockResolver:
def resolve_entry(self, entry):
class Resolution:
source_label = "crossref:search:Letters referring to experiments"
class Entry:
entry_type = "article"
citation_key = "daniell1872lettersshadalabama"
fields = {
"author": "Daniell, W.C.",
"year": "1872",
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
"url": "",
}
entry = Entry()
return Resolution()
result = enrich_citation_payload(
{
"raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
"legacy_reference_number": "160",
"citation_key": "daniell1948daniellwc",
},
resolver=MockResolver(),
)
self.assertEqual(result["enrichment_status"], "resolved")
self.assertEqual(result["citation_key"], "daniell1872lettersreferringexperiments")
self.assertIn(
"title = {Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River}",
result["draft_bibtex"],
)
self.assertIn("year = {1872}", result["draft_bibtex"])
self.assertEqual(result["draft_bibtex"].count("ecospecies_reference_number = {160}"), 1)
def test_enrichment_rejects_conflicting_resolved_metadata(self) -> None:
class MockResolver:
def resolve_entry(self, entry):
class Resolution:
source_label = "crossref:search:alabama-shad-false-positive"
class Entry:
entry_type = "article"
citation_key = "daniell2009habitatuseage"
fields = {
"author": "Daniell, W.C.",
"year": "2009",
"title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
"journal": "Transactions of the American Fisheries Society",
"doi": "10.1111/j.1600-0633.2009.00395.x",
"url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
"volume": "19",
"number": "1",
"pages": "107-115",
}
entry = Entry()
return Resolution()
result = enrich_citation_payload(
{
"raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
"legacy_reference_number": "160",
},
resolver=MockResolver(),
)
self.assertEqual(result["enrichment_status"], "unresolved")
self.assertIn("conflicts with citation seed fields", result["enrichment_error"])
def test_unresolved_enrichment_still_returns_refreshed_seed_fields(self) -> None:
class MockResolver:
def resolve_entry(self, entry):
return None
result = enrich_citation_payload(
{
"raw_text": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
"legacy_reference_number": "41",
"citation_key": "oldbadkey",
"entry_type": "misc",
},
resolver=MockResolver(),
)
self.assertEqual(result["enrichment_status"], "unresolved")
self.assertEqual(result["citation_key"], "behre1950annotatedlistfauna")
self.assertIn("title = {Annotated list of the fauna of the Grand Isle region, 1928-1946}", result["draft_bibtex"])
self.assertIn("Occas. Pap. Mar. Lab., LSU", result["draft_bibtex"])
def test_discover_citation_candidates_returns_scored_candidates(self) -> None:
class MockResolver:
def search_crossref_candidates(self, title):
return [
LocalResolution(
LocalBibEntry(
"article",
"daniell1872lettersreferringexperiments",
{
"author": "Daniell, W.C.",
"year": "1872",
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
"volume": "2",
"pages": "387-390",
},
),
"crossref:search:1:daniell-good",
),
LocalResolution(
LocalBibEntry(
"article",
"daniell2009habitatuseage",
{
"author": "Daniell, W.C.",
"year": "2009",
"title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
"journal": "Transactions of the American Fisheries Society",
"volume": "19",
"number": "1",
"pages": "107-115",
},
),
"crossref:search:2:daniell-bad",
),
]
def search_datacite_candidates(self, title):
return []
def search_openalex_candidates(self, title):
return []
result = discover_citation_candidates(
{
"raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
"legacy_reference_number": "160",
},
resolver=MockResolver(),
)
self.assertEqual(result["candidate_count"], 2)
self.assertGreater(result["candidates"][0]["score"], result["candidates"][1]["score"])
self.assertEqual(result["candidates"][0]["field_matches"]["year"]["status"], "exact")
self.assertEqual(result["candidates"][1]["field_matches"]["year"]["status"], "conflict")
def test_local_crossref_candidate_search_filters_placeholder_stub_entries(self) -> None:
resolver = LocalMetadataResolver()
resolver._safe_get_json = lambda url: {
"message": {
"items": [
{
"type": "journal-article",
"title": ["Referenced work 1"],
"issued": {"date-parts": [[2020]]},
},
{
"type": "journal-article",
"title": ["Useful Paper"],
"issued": {"date-parts": [[2020]]},
"author": [{"family": "Smith", "given": "J S"}],
"container-title": ["Journal of Examples"],
"DOI": "10.1000/useful",
},
]
}
}
results = resolver.search_crossref_candidates("Useful Paper")
self.assertEqual(len(results), 1)
self.assertEqual(results[0].entry.fields["title"], "Useful Paper")
def test_apply_citation_candidate_selection_uses_selected_candidate_fields(self) -> None:
result = apply_citation_candidate_selection(
{
"raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
"legacy_reference_number": "160",
},
{
"source_label": "crossref:search:1:daniell-good",
"entry_type": "article",
"fields": {
"author": "Daniell, W.C.",
"year": "1872",
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
"volume": "2",
"pages": "387-390",
},
},
)
self.assertEqual(result["enrichment_status"], "resolved")
self.assertEqual(result["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good")
self.assertIn("Comm. Rept. U.S. Comm. Fish & Fish., 2: 387-390", result["normalized_text"])

apps/api/tests/test_document_format.py (new file, 195 lines)

@ -0,0 +1,195 @@
from __future__ import annotations
import json
import unittest
from ecospecies_api.document_format import (
DocumentNode,
StructuredDocument,
build_document_from_species_payload,
extract_citation_entries,
extract_species_projection,
export_markdown_document,
parse_markdown_document,
validate_markdown_document,
)
class StructuredMarkdownTests(unittest.TestCase):
def test_round_trip_markdown_preserves_metadata_and_hierarchy(self) -> None:
source = """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 5192
label: FLELMR
taxon_identifiers:
- authority: worms
identifier: 159059
label: AphiaID
primary: true
primary_taxon_authority: worms
---
## Summary
Short abstract.
## Habitat
### Type
Estuarine.
"""
document = parse_markdown_document(source)
self.assertEqual(document.metadata["title"], "American Oyster")
self.assertEqual(document.metadata["primary_taxon_authority"], "worms")
self.assertEqual(document.metadata["legacy_identifiers"][0]["identifier"], "5192")
self.assertEqual(document.metadata["taxon_identifiers"][0]["authority"], "worms")
self.assertEqual(document.nodes[0].title, "Summary")
self.assertEqual(document.nodes[1].children[0].title, "Type")
self.assertIn("## Habitat", export_markdown_document(document))
def test_build_document_from_species_payload_creates_markdown_sections(self) -> None:
document = build_document_from_species_payload(
{
"title": "American Oyster",
"common_name": "American Oyster",
"scientific_name": "Crassostrea virginica",
"flelmr_code": "5192",
"source_file": "American Oyster.txt",
"summary": "Short abstract.",
"sections": [
{"heading": "HEADER", "content": "Ignored header"},
{"heading": "Habitat", "content": "Estuarine."},
{"heading": "Reproduction", "content": "Broadcast spawner."},
],
}
)
self.assertEqual(document.metadata["legacy_identifiers"][0]["identifier"], "5192")
self.assertEqual(document.metadata["legacy_identifiers"][0]["authority"], "legacy-ecospecies")
self.assertEqual([node.title for node in document.nodes], ["Summary", "Habitat", "Reproduction"])
self.assertEqual(document.nodes[1].body, "Estuarine.")
def test_extract_species_projection_flattens_nested_headings(self) -> None:
document = parse_markdown_document(
"""---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 5192
label: FLELMR
---
## Summary
Short abstract.
## Habitat
General habitat.
### Type
Estuarine.
"""
)
projection = extract_species_projection(document)
self.assertEqual(projection["summary"], "Short abstract.")
self.assertEqual(projection["flelmr_code"], "5192")
self.assertEqual(
[section["heading"] for section in projection["sections"]],
["Habitat", "Habitat / Type"],
)
def test_extract_species_projection_accepts_legacy_species_code_front_matter(self) -> None:
document = parse_markdown_document(
"""---
title: Legacy Fish
common_name: Legacy Fish
scientific_name: Pisces historicus
species_code: 4242
---
## Habitat
Estuarine.
"""
)
projection = extract_species_projection(document)
self.assertEqual(projection["flelmr_code"], "4242")
def test_validate_markdown_document_rejects_missing_front_matter_and_depth_jump(self) -> None:
errors = validate_markdown_document(
"""## Habitat
Text
#### Type
Nested too deeply.
"""
)
self.assertTrue(any("front matter" in error for error in errors))
self.assertTrue(any("Heading depth jumps" in error for error in errors))
def test_extract_citation_entries_strips_legacy_comma_number_prefix(self) -> None:
document = parse_markdown_document(
"""---
title: Alabama Shad
common_name: Alabama Shad
scientific_name: Alosa alabamae
---
## References
160, Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
"""
)
citations = extract_citation_entries(document)
self.assertEqual(len(citations), 1)
self.assertEqual(citations[0]["legacy_reference_number"], "160")
self.assertTrue(citations[0]["raw_text"].startswith("Daniell, W.C. 1872."))
self.assertFalse(citations[0]["raw_text"].startswith("160,"))
def test_extract_citation_entries_accepts_colon_terminated_citation_heading(self) -> None:
citations = extract_citation_entries(
StructuredDocument(
metadata={},
nodes=[
DocumentNode(
node_type="section",
title="Citations:",
body="7, Ahmed, M. 1975. Speciation in living oysters.",
depth=2,
)
],
)
)
self.assertEqual(len(citations), 1)
self.assertEqual(citations[0]["legacy_reference_number"], "7")
def test_extract_citation_entries_accepts_bare_number_prefix(self) -> None:
document = parse_markdown_document(
"""---
title: Eastern Mosquitofish
common_name: Eastern Mosquitofish
scientific_name: Gambusia holbrooki
---
## Citations
848 Gilmore, R.G. 1977. Fishes of the Indian River Lagoon and adjacent waters, Florida.
"""
)
citations = extract_citation_entries(document)
self.assertEqual(len(citations), 1)
self.assertEqual(citations[0]["legacy_reference_number"], "848")
self.assertTrue(citations[0]["raw_text"].startswith("Gilmore, R.G. 1977."))

apps/api/tests/test_parser.py (new file, 109 lines)

@ -0,0 +1,109 @@
from __future__ import annotations
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from ecospecies_api import parser
class ParserPathResolutionTests(unittest.TestCase):
def test_ecospecies_code_is_treated_as_flelmr_code(self) -> None:
metadata = parser.extract_metadata(
[
"Title: Test Fish",
"EcoSpecies Code: 4242",
]
)
self.assertEqual(metadata["ecospecies code"], "4242")
self.assertEqual(metadata["flelmr"], "4242")
def test_title_case_headings_are_split_into_sections(self) -> None:
sections = parser.split_sections(
[
"Species profile: American oyster (Crassostrea virginica)",
"",
"Classification",
" Phylum: Mollusca",
"Value",
"Commercial: Important fishery.",
"Habitat",
"Type: Estuarine.",
]
)
self.assertEqual(
[section.heading for section in sections],
["HEADER", "Classification", "Value", "Habitat"],
)
def test_colon_terminated_title_case_headings_are_split_into_sections(self) -> None:
sections = parser.split_sections(
[
"Ecological Interactions and Notes",
"Predator text.",
"",
"Reference Numbers:",
"",
"Citations:",
"7, Ahmed, M. 1975. Speciation in living oysters.",
]
)
self.assertEqual(
[section.heading for section in sections],
["HEADER", "Citations"],
)
def test_default_data_dir_uses_in_repo_path_without_spaces(self) -> None:
with patch.dict("os.environ", {}, clear=True):
resolved = Path(parser.get_default_data_dir())
self.assertEqual(resolved, parser.get_repo_root() / "input-data" / "InputFiles")
def test_relative_override_must_stay_within_repo(self) -> None:
with self.assertRaisesRegex(ValueError, "within the codebase directory"):
parser.resolve_data_dir("../input-data/InputFiles")
def test_absolute_override_outside_repo_is_rejected(self) -> None:
with tempfile.TemporaryDirectory() as tempdir:
with self.assertRaisesRegex(ValueError, "within the codebase directory"):
parser.resolve_data_dir(tempdir)
def test_directory_names_with_spaces_are_rejected(self) -> None:
with self.assertRaisesRegex(ValueError, "unsafe directory name"):
parser.resolve_data_dir("input-data/Bad Name")
def test_directory_names_with_special_characters_are_rejected(self) -> None:
with self.assertRaisesRegex(ValueError, "unsafe directory name"):
parser.resolve_data_dir("input-data/bad@name")
def test_load_species_records_resolves_repo_relative_paths(self) -> None:
records = parser.load_species_records("input-data/InputFiles")
self.assertGreater(len(records), 0)
def test_duplicate_source_records_receive_unique_stable_slugs(self) -> None:
records = parser.load_species_records("input-data/InputFiles")
slug_by_source = {record.source_file: record.slug for record in records}
self.assertEqual(len(records), len(set(record.slug for record in records)))
self.assertEqual(
slug_by_source["Red Snapper_SLH_Outline2012_0722.txt"],
"red-snapper-red-snapper-slh-outline2012-0722",
)
self.assertEqual(
slug_by_source["RedSnapper_SLH_2012_0830_combined.txt"],
"red-snapper-redsnapper-slh-2012-0830-combined",
)
self.assertEqual(
slug_by_source["Sailfin Molly SLH RGG.txt"],
"sailfin-molly-sailfin-molly-slh-rgg",
)
self.assertTrue(
slug_by_source["Sailfin_Molly SLH RGG.txt"].startswith(
"sailfin-molly-sailfin-molly-slh-rgg-"
)
)

apps/api/tests/test_repository.py

@ -112,6 +112,35 @@ class RepositoryWorkflowTests(unittest.TestCase):
self.assertEqual(detail["section_count"], 2)
self.assertEqual([section["position"] for section in detail["sections"]], [1, 2])
self.assertEqual([item["code"] for item in detail["diagnostics"]], ["missing_citations"])
self.assertEqual(
detail["legacy_identifiers"],
[
{
"authority": "legacy-ecospecies",
"identifier": "9999",
"label": "FLELMR",
}
],
)
def test_species_detail_includes_structured_document_and_legacy_source(self) -> None:
input_dir = Path(self.tempdir.name) / "input-data" / "InputFiles"
input_dir.mkdir(parents=True, exist_ok=True)
(input_dir / "Test Shad.txt").write_text("HEADER\nLegacy header content\n", encoding="utf-8")
with patch.object(repository, "get_default_data_dir", return_value=str(input_dir)):
detail = repository.get_species_by_slug("test-shad")
self.assertIsNotNone(detail)
assert detail is not None
self.assertEqual(detail["structured_document"]["source_format"], "ecospecies-markdown-v1")
self.assertIn(
"HABITAT",
[node["title"] for node in detail["structured_document"]["ast"]["nodes"]],
)
self.assertEqual(detail["legacy_source"]["source_file"], "Test Shad.txt")
self.assertIn("Legacy header content", detail["legacy_source"]["text"])
self.assertEqual(detail["taxon_identifiers"], [])
def test_editorial_update_changes_publication_visibility_and_creates_audit(self) -> None:
result = repository.update_species_editorial(
@ -207,6 +236,60 @@ class RepositoryWorkflowTests(unittest.TestCase):
self.assertEqual(len(audit), 2)
self.assertEqual([entry["action"] for entry in audit], ["section_update", "editorial_update"])
def test_reimport_preserves_persisted_taxon_identifiers(self) -> None:
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad
common_name: Test Shad
scientific_name: Alosa testus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 9999
label: FLELMR
taxon_identifiers:
- authority: gbif
identifier: 12345
label: taxonKey
primary: true
primary_taxon_authority: gbif
---
## Summary
Taxon-reviewed summary.
""",
username="edith",
)
repository.import_species_payload(UPDATED_PAYLOAD)
detail = repository.get_editor_species_detail("test-shad")
self.assertIsNotNone(detail)
self.assertEqual(detail["primary_taxon_authority"], "gbif")
self.assertEqual(
detail["primary_taxon_identifier"],
{
"authority": "gbif",
"identifier": "12345",
"label": "taxonKey",
"primary": True,
"source_url": "",
},
)
self.assertEqual(
detail["taxon_identifiers"],
[
{
"authority": "gbif",
"identifier": "12345",
"label": "taxonKey",
"primary": True,
"source_url": "",
}
],
)
def test_reimport_updates_summary_when_no_editorial_override_exists(self) -> None:
repository.import_species_payload(UPDATED_PAYLOAD)
@ -302,6 +385,583 @@ class RepositoryWorkflowTests(unittest.TestCase):
self.assertEqual(audit[0]["action"], "import_restore")
self.assertEqual(audit[0]["details"]["is_archived"], {"from": True, "to": False})
def test_document_markdown_update_refreshes_flat_projection(self) -> None:
result = repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
species_code: 4242
---
## Summary
Markdown summary.
## Habitat
Open water.
### Type
Pelagic.
""",
username="frank",
)
detail = repository.get_editor_species_detail("test-shad")
document = repository.get_species_document("test-shad")
audit = repository.list_species_audit("test-shad")
self.assertIsNotNone(result)
self.assertIsNotNone(detail)
self.assertIsNotNone(document)
self.assertEqual(detail["title"], "Test Shad Markdown")
self.assertEqual(detail["scientific_name"], "Alosa markdownus")
self.assertEqual(detail["flelmr_code"], "4242")
self.assertEqual(detail["summary"], "Markdown summary.")
self.assertEqual(
[section["heading"] for section in detail["sections"]],
["Habitat", "Habitat / Type"],
)
self.assertEqual(document["updated_by"], "frank")
self.assertIsNotNone(audit)
self.assertEqual(audit[0]["action"], "document_update")
def test_document_markdown_update_extracts_citations(self) -> None:
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 4242
label: FLELMR
---
## Summary
Markdown summary.
## References
- Smith, J. 2024. Example paper. doi:10.1000/example-doi
- [7] Jones, A. 2022. Fisheries review.
""",
username="frank",
)
detail = repository.get_editor_species_detail("test-shad")
self.assertIsNotNone(detail)
self.assertEqual(detail["citation_count"], 2)
self.assertEqual(detail["citations"][0]["section_heading"], "References")
self.assertEqual(detail["citations"][0]["legacy_reference_number"], "")
self.assertEqual(detail["citations"][0]["doi"], "10.1000/example-doi")
self.assertTrue(detail["citations"][0]["citation_key"])
self.assertIn("@", detail["citations"][0]["draft_bibtex"])
self.assertEqual(detail["citations"][0]["review_status"], "draft")
self.assertEqual(detail["citations"][1]["legacy_reference_number"], "7")
self.assertEqual(detail["citations"][1]["doi"], "")
self.assertIn("ecospecies_reference_number = \\{7\\}", detail["citations"][1]["draft_bibtex"])
def test_editor_can_review_citations_and_reviews_survive_document_save(self) -> None:
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 4242
label: FLELMR
---
## References
- [7] Jones, A. 2022. Fisheries review.
""",
username="frank",
)
citations = repository.get_editor_species_citations("test-shad")
self.assertIsNotNone(citations)
citation = citations["citations"][0]
result = repository.update_species_citation_review(
slug="test-shad",
citation_id=citation["id"],
review_status="accepted",
normalized_text="Jones, A. (2022). Fisheries review.",
doi="10.1000/review-doi",
citation_key="jones2022review",
entry_type="article",
draft_bibtex="@article{jones2022review,\n doi = {10.1000/review-doi}\n}",
username="edith",
)
self.assertIsNotNone(result)
self.assertEqual(result["citation"]["review_status"], "accepted")
self.assertEqual(result["citation"]["source_type"], "editor_review")
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 4242
label: FLELMR
---
## References
- [7] Jones, A. 2022. Fisheries review.
""",
username="frank",
)
citations = repository.get_editor_species_citations("test-shad")
audit = repository.list_species_audit("test-shad")
self.assertIsNotNone(citations)
self.assertEqual(citations["citation_count"], 1)
self.assertEqual(citations["citations"][0]["review_status"], "accepted")
self.assertEqual(citations["citations"][0]["doi"], "10.1000/review-doi")
self.assertEqual(citations["citations"][0]["citation_key"], "jones2022review")
self.assertEqual(citations["citations"][0]["entry_type"], "article")
self.assertIn("10.1000/review-doi", citations["citations"][0]["draft_bibtex"])
self.assertIsNotNone(audit)
self.assertEqual(audit[1]["action"], "citation_review_update")
def test_editor_can_run_citation_enrichment(self) -> None:
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 4242
label: FLELMR
---
## References
- [7] Jones, A. 2022. Fisheries review.
""",
username="frank",
)
citations = repository.get_editor_species_citations("test-shad")
self.assertIsNotNone(citations)
citation = citations["citations"][0]
with patch.object(
repository,
"enrich_citation_payload",
return_value={
"citation_key": "jones2022review",
"entry_type": "article",
"normalized_text": "Jones, A. (2022). Fisheries review. Journal of Tests. DOI:10.1000/review-doi",
"draft_bibtex": "@article{jones2022review,\n doi = {10.1000/review-doi},\n}",
"doi": "10.1000/review-doi",
"source_url": "https://doi.org/10.1000/review-doi",
"openalex_id": "W12345",
"resolver_source_label": "crossref:doi:10.1000/review-doi",
"enrichment_status": "resolved",
"enrichment_error": "",
"conflicts": [],
},
):
result = repository.update_species_citation_enrichment(
slug="test-shad",
citation_id=citation["id"],
username="edith",
)
self.assertIsNotNone(result)
self.assertEqual(result["citation"]["enrichment_status"], "resolved")
self.assertEqual(result["citation"]["doi"], "10.1000/review-doi")
self.assertEqual(result["citation"]["openalex_id"], "W12345")
self.assertEqual(result["citation"]["resolver_source_label"], "crossref:doi:10.1000/review-doi")
self.assertEqual(result["citation"]["source_url"], "https://doi.org/10.1000/review-doi")
citations = repository.get_editor_species_citations("test-shad")
audit = repository.list_species_audit("test-shad")
self.assertIsNotNone(citations)
self.assertEqual(citations["citations"][0]["citation_key"], "jones2022review")
self.assertEqual(citations["citations"][0]["entry_type"], "article")
self.assertEqual(citations["citations"][0]["enrichment_status"], "resolved")
self.assertIsNotNone(audit)
self.assertEqual(audit[0]["action"], "citation_enrichment")
def test_editor_can_run_batch_citation_enrichment(self) -> None:
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 4242
label: FLELMR
---
## References
- [7] Jones, A. 2022. Fisheries review.
- [8] Smith, B. 2021. Estuarine habitat paper.
""",
username="frank",
)
payloads = [
{
"citation_key": "jones2022review",
"entry_type": "article",
"normalized_text": "Jones, A. (2022). Fisheries review.",
"draft_bibtex": "@article{jones2022review,\n}",
"doi": "10.1000/review-doi",
"source_url": "https://doi.org/10.1000/review-doi",
"openalex_id": "W12345",
"resolver_source_label": "crossref:doi:10.1000/review-doi",
"enrichment_status": "resolved",
"enrichment_error": "",
"conflicts": [],
},
{
"citation_key": "smith2021estuarine",
"entry_type": "misc",
"normalized_text": "",
"draft_bibtex": "",
"doi": "",
"source_url": "",
"openalex_id": "",
"resolver_source_label": "",
"enrichment_status": "unresolved",
"enrichment_error": "No metadata match found from DOI, title, or authority identifiers.",
"conflicts": [],
},
]
with patch.object(repository, "enrich_citation_payload", side_effect=payloads):
result = repository.update_species_citations_enrichment_batch(
slug="test-shad",
username="edith",
)
self.assertIsNotNone(result)
self.assertEqual(result["citation_count"], 2)
self.assertEqual(result["changed_count"], 2)
self.assertEqual(result["resolved_count"], 1)
self.assertEqual(result["unresolved_count"], 1)
self.assertEqual(result["error_count"], 0)
def test_editor_can_review_and_apply_citation_candidates(self) -> None:
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 4242
label: FLELMR
---
## References
- [7] Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
""",
username="frank",
)
citations = repository.get_editor_species_citations("test-shad")
self.assertIsNotNone(citations)
citation = citations["citations"][0]
with patch.object(
repository,
"discover_citation_candidates",
return_value={
"seed": {
"fields": {
"author": "Daniell, W.C.",
"year": "1872",
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
"volume": "2",
"pages": "387-390",
}
},
"candidate_count": 1,
"candidates": [
{
"candidate_id": "crossref-search-1-daniell-good",
"source_label": "crossref:search:1:daniell-good",
"entry_type": "article",
"citation_key": "daniell1872lettersreferringexperiments",
"fields": {
"author": "Daniell, W.C.",
"year": "1872",
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
"volume": "2",
"pages": "387-390",
},
}
],
},
):
candidates = repository.get_species_citation_candidates("test-shad", citation["id"])
self.assertIsNotNone(candidates)
self.assertEqual(candidates["candidate_count"], 1)
result = repository.apply_species_citation_candidate_selection(
slug="test-shad",
citation_id=citation["id"],
candidate={
"source_label": "crossref:search:1:daniell-good",
"entry_type": "article",
"fields": {
"author": "Daniell, W.C.",
"year": "1872",
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
"volume": "2",
"pages": "387-390",
},
},
username="edith",
)
self.assertIsNotNone(result)
self.assertEqual(result["citation"]["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good")
self.assertEqual(result["citation"]["source_type"], "editor_selected_candidate")
self.assertEqual(result["citation"]["review_status"], "accepted")
audit = repository.list_species_audit("test-shad")
self.assertIsNotNone(audit)
self.assertEqual(audit[0]["action"], "citation_candidate_selection")
def test_editor_can_add_candidate_as_additional_citation_and_preserve_it(self) -> None:
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 4242
label: FLELMR
---
## References
- [7] Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
""",
username="frank",
)
citations = repository.get_editor_species_citations("test-shad")
self.assertIsNotNone(citations)
source_citation = citations["citations"][0]
result = repository.add_species_citation_from_candidate(
slug="test-shad",
citation_id=source_citation["id"],
candidate={
"source_label": "crossref:search:1:daniell-related",
"entry_type": "article",
"fields": {
"author": "Jordan, F.",
"year": "2009",
"title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
"journal": "Transactions of the American Fisheries Society",
"volume": "19",
"number": "1",
"pages": "107-115",
"doi": "10.1111/j.1600-0633.2009.00395.x",
"url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
},
},
username="edith",
)
self.assertIsNotNone(result)
self.assertEqual(result["citation"]["source_type"], "editor_added_candidate")
self.assertEqual(result["citation"]["review_status"], "accepted")
citations = repository.get_editor_species_citations("test-shad")
self.assertIsNotNone(citations)
self.assertEqual(citations["citation_count"], 2)
self.assertEqual(citations["citations"][1]["section_heading"], "References")
document = repository.get_species_document("test-shad")
self.assertIsNotNone(document)
self.assertIn("Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA", document["markdown"])
repository.update_species_document_markdown(
slug="test-shad",
markdown=document["markdown"],
username="frank",
)
citations = repository.get_editor_species_citations("test-shad")
self.assertIsNotNone(citations)
self.assertEqual(citations["citation_count"], 2)
self.assertEqual(citations["citations"][1]["source_type"], "editor_added_candidate")
audit = repository.list_species_audit("test-shad")
self.assertIsNotNone(audit)
self.assertEqual(audit[0]["action"], "document_update")
self.assertEqual(audit[1]["action"], "citation_candidate_addition")
def test_contributor_can_view_only_owned_citations(self) -> None:
created = repository.create_contributor_species(
"writer@example.org",
"""---
title: Contributor Draft
common_name: Contributor Fish
scientific_name: Pisces contributoris
species_code:
---
## References
- [12] Example, A. 2025. Draft reference.
""",
)
owned = repository.get_contributor_species_citations(created["slug"], "writer@example.org")
other = repository.get_contributor_species_citations(created["slug"], "other@example.org")
self.assertIsNotNone(owned)
self.assertEqual(owned["citation_count"], 1)
self.assertEqual(owned["citations"][0]["legacy_reference_number"], "12")
self.assertIsNone(other)
def test_public_bibliography_aggregates_species_citations(self) -> None:
repository.update_species_document_markdown(
slug="test-shad",
markdown="""---
title: Test Shad Markdown
common_name: Test Shad
scientific_name: Alosa markdownus
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 4242
label: FLELMR
---
## References
- [7] Jones, A. 2022. Fisheries review.
""",
username="frank",
)
citations = repository.get_editor_species_citations("test-shad")
self.assertIsNotNone(citations)
citation = citations["citations"][0]
repository.update_species_citation_review(
slug="test-shad",
citation_id=citation["id"],
review_status="accepted",
normalized_text="Jones, A. (2022). Fisheries review.",
doi="10.1000/review-doi",
citation_key="jones2022review",
entry_type="article",
draft_bibtex="@article{jones2022review,\n doi = {10.1000/review-doi}\n}",
username="edith",
abstract_text="A short abstract about fisheries review.",
)
bibliography = repository.list_public_bibliography()
self.assertEqual(len(bibliography), 1)
self.assertEqual(bibliography[0]["citation_key"], "jones2022review")
self.assertEqual(bibliography[0]["abstract_text"], "A short abstract about fisheries review.")
self.assertEqual(bibliography[0]["legacy_reference_numbers"], ["7"])
self.assertEqual(bibliography[0]["species_count"], 1)
self.assertEqual(bibliography[0]["species_refs"][0]["slug"], "test-shad")
def test_register_contributor_creates_token_and_enforces_age_gate(self) -> None:
with self.assertRaisesRegex(ValueError, "at least 13 years old"):
repository.register_contributor("person@example.org", False)
result = repository.register_contributor("Person@Example.org", True)
self.assertEqual(result["username"], "person@example.org")
self.assertEqual(result["role"], "contributor")
self.assertEqual(result["minimum_age"], 13)
self.assertTrue(result["token"])
def test_contributor_can_create_and_edit_only_owned_species(self) -> None:
created = repository.create_contributor_species(
"writer@example.org",
"""---
title: Contributor Draft
common_name: Contributor Fish
scientific_name: Pisces contributoris
species_code:
---
## Summary
Draft summary.
## Habitat
Mangroves.
""",
)
detail = repository.get_contributor_species_detail(created["slug"], "writer@example.org")
public_detail = repository.get_species_by_slug(created["slug"])
self.assertIsNotNone(detail)
self.assertIsNone(public_detail)
self.assertEqual(detail["publication_status"], "draft")
self.assertEqual(detail["common_name"], "Contributor Fish")
updated = repository.update_contributor_species_document_markdown(
created["slug"],
"""---
title: Contributor Draft Revised
common_name: Contributor Fish
scientific_name: Pisces contributoris
species_code:
---
## Summary
Revised summary.
## Habitat
Seagrass.
### Depth
Shallow bays.
""",
"writer@example.org",
)
self.assertIsNotNone(updated)
detail = repository.get_contributor_species_detail(created["slug"], "writer@example.org")
other_user_detail = repository.get_contributor_species_detail(created["slug"], "other@example.org")
audit = repository.list_species_audit(created["slug"])
self.assertIsNotNone(detail)
self.assertEqual(detail["summary"], "Revised summary.")
self.assertEqual(
[section["heading"] for section in detail["sections"]],
["Habitat", "Habitat / Depth"],
)
self.assertIsNone(other_user_detail)
self.assertIsNotNone(audit)
self.assertEqual(audit[0]["action"], "contributor_document_update")
if __name__ == "__main__":
unittest.main()

File diff suppressed because it is too large

apps/web/bibliography.html (new file, 43 lines)

@ -0,0 +1,43 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>EcoSpecies Bibliography</title>
<link rel="stylesheet" href="./styles.css">
</head>
<body>
<header class="site-header">
<div class="site-header-inner">
<div class="site-brand">
<p class="site-brand-mark">Open Species Archive</p>
<a href="./index.html" class="site-brand-link">EcoSpecies Atlas</a>
<p class="site-brand-summary">Public field atlas for historical species life-history materials.</p>
</div>
<nav class="site-nav" aria-label="Primary">
<a href="./index.html">Atlas</a>
<a href="./bibliography.html">Bibliography</a>
</nav>
</div>
</header>
<main class="page">
<section class="hero">
<p class="eyebrow">EcoSpecies Atlas</p>
<h1>Bibliography</h1>
<p class="lede">
A site-wide bibliography for the EcoSpecies atlas, including imported references and citations added during review.
</p>
<div class="auth-bar auth-panel-row">
<input id="bibliography-search" type="search" placeholder="Search title, author, DOI, or abstract">
<button id="bibliography-download" type="button" class="secondary-button">Download BibTeX</button>
<p id="bibliography-status" class="auth-status">Loading bibliography...</p>
</div>
</section>
<section class="panel">
<div id="bibliography-list" class="public-citation-list"></div>
</section>
</main>
<script src="./bibliography.js" defer></script>
</body>
</html>

apps/web/bibliography.js (new file, 230 lines)

@ -0,0 +1,230 @@
function getAppBase() {
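// Derive the base path this page is served under from the current URL,
// so asset links and API calls keep working behind a reverse-proxy path prefix.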
const { pathname } = window.location;
if (pathname === "/" || pathname === "/index.html") {
return "";
}
if (pathname.endsWith("/index.html")) {
return pathname.slice(0, -"/index.html".length);
}
return pathname.endsWith("/") ? pathname.slice(0, -1) : pathname;
}
const apiBase = getAppBase().replace(/\/bibliography\.html$/, "");
const bibliographyList = document.querySelector("#bibliography-list");
const bibliographySearch = document.querySelector("#bibliography-search");
const bibliographyStatus = document.querySelector("#bibliography-status");
const bibliographyDownload = document.querySelector("#bibliography-download");
let currentBibliographyItems = [];
function escapeHtml(value) {
return String(value)
.replaceAll("&", "&amp;")
.replaceAll('"', "&quot;")
.replaceAll("<", "&lt;")
.replaceAll(">", "&gt;");
}
function normalizeAbstractForDisplay(value) {
const raw = String(value || "").trim();
if (!raw) {
return "";
}
const temp = document.createElement("div");
temp.innerHTML = raw;
return temp.textContent
.replace(/^abstract\s*[:.\-]?\s*/i, "")
.replace(/\s+/g, " ")
.trim();
}
function parseBibtexFields(draftBibtex) {
const fields = {};
const text = String(draftBibtex || "");
const pattern = /([a-zA-Z_]+)\s*=\s*\{([^}]*)\}/g;
let match = pattern.exec(text);
while (match) {
fields[match[1].toLowerCase()] = match[2].trim();
match = pattern.exec(text);
}
return fields;
}
function collectBibtexRecords(items) {
const seen = new Set();
const records = [];
for (const item of items || []) {
const draftBibtex = String(item && item.draft_bibtex ? item.draft_bibtex : "").trim();
if (!draftBibtex || seen.has(draftBibtex)) {
continue;
}
seen.add(draftBibtex);
records.push(draftBibtex);
}
return records;
}
function downloadBibtexRecords(items, filenameStem) {
const records = collectBibtexRecords(items);
if (!records.length) {
return false;
}
const blob = new Blob([`${records.join("\n\n")}\n`], { type: "application/x-bibtex;charset=utf-8" });
const url = URL.createObjectURL(blob);
const link = document.createElement("a");
link.href = url;
link.download = `${filenameStem}.bib`;
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
window.setTimeout(() => URL.revokeObjectURL(url), 0);
return true;
}
function syncDownloadButton(items) {
if (!bibliographyDownload) {
return;
}
const recordCount = collectBibtexRecords(items).length;
bibliographyDownload.disabled = !recordCount;
bibliographyDownload.textContent = recordCount
? `Download BibTeX (${recordCount})`
: "Download BibTeX";
}
function buildCitationText(item) {
if (item.normalized_text) {
return escapeHtml(item.normalized_text);
}
const fields = parseBibtexFields(item.draft_bibtex || "");
const author = fields.author || "";
const year = fields.year || "";
const title = fields.title || "";
const venue = fields.journal || fields.booktitle || fields.publisher || "";
const volume = fields.volume || "";
const issue = fields.number || "";
const pages = fields.pages || "";
const parts = [];
const lead = [author, year ? `(${year})` : ""].filter(Boolean).join(" ");
if (lead) {
parts.push(lead);
}
if (title) {
parts.push(title);
}
const venueBits = [venue, volume ? `${volume}${issue ? `(${issue})` : ""}` : issue ? `(${issue})` : "", pages]
.filter(Boolean)
.join(", ");
if (venueBits) {
parts.push(venueBits);
}
return escapeHtml(parts.join(". ").trim() || item.raw_text || "");
}
function renderSpeciesRefs(refs) {
return refs
.map(
(ref) =>
`<a href="./index.html#${escapeHtml(ref.slug)}">${escapeHtml(ref.common_name || ref.slug)}</a>`,
)
.join(", ");
}
function renderAbstractBlock(text) {
const abstract = normalizeAbstractForDisplay(text);
if (!abstract) {
return "";
}
return `
<div class="citation-abstract-shell">
<button type="button" class="secondary-button citation-abstract-toggle" aria-expanded="false">
Show Abstract
</button>
<div class="citation-abstract-display hidden">
<p class="public-citation-abstract">${escapeHtml(abstract)}</p>
</div>
</div>
`;
}
function attachCitationAbstractToggles(root) {
for (const toggle of root.querySelectorAll(".citation-abstract-toggle")) {
const shell = toggle.parentElement;
const display = shell && shell.querySelector(".citation-abstract-display");
if (!display) {
continue;
}
toggle.addEventListener("click", () => {
const hidden = display.classList.toggle("hidden");
toggle.setAttribute("aria-expanded", hidden ? "false" : "true");
toggle.textContent = hidden ? "Show Abstract" : "Hide Abstract";
});
}
}
function renderBibliography(items) {
bibliographyList.innerHTML = "";
if (!items.length) {
bibliographyList.innerHTML = `<p class="editor-status">No bibliography entries match the current search.</p>`;
return;
}
for (const item of items) {
const links = [
item.doi ? `<a href="https://doi.org/${encodeURIComponent(String(item.doi).replace(/^https?:\/\/doi\.org\//, ""))}" target="_blank" rel="noopener noreferrer">DOI</a>` : "",
item.source_url ? `<a href="${escapeHtml(item.source_url)}" target="_blank" rel="noopener noreferrer">Source</a>` : "",
item.openalex_id ? `<a href="https://openalex.org/${escapeHtml(String(item.openalex_id).replace(/^https?:\/\/openalex\.org\//, ""))}" target="_blank" rel="noopener noreferrer">OpenAlex</a>` : "",
]
.filter(Boolean)
.join(" · ");
const article = document.createElement("article");
article.className = "public-citation-entry";
article.innerHTML = `
<p class="public-citation-text">${buildCitationText(item)}</p>
${renderAbstractBlock(item.abstract_text || "")}
<p class="public-citation-meta">
Appears in ${item.species_count} species record${item.species_count === 1 ? "" : "s"}
${item.legacy_reference_numbers && item.legacy_reference_numbers.length ? ` • Imported references: ${item.legacy_reference_numbers.map((value) => escapeHtml(value)).join(", ")}` : ""}
</p>
<p class="public-citation-meta">Species: ${renderSpeciesRefs(item.species_refs || [])}</p>
${links ? `<p class="public-citation-links">${links}</p>` : ""}
`;
attachCitationAbstractToggles(article);
bibliographyList.appendChild(article);
}
}
async function loadBibliography(search = "") {
bibliographyStatus.textContent = "Loading bibliography...";
const query = search ? `?search=${encodeURIComponent(search)}` : "";
const response = await fetch(`${apiBase}/api/bibliography${query}`);
const data = await response.json();
if (!response.ok) {
bibliographyList.innerHTML = `<p class="error">${escapeHtml(data.error || "Unable to load bibliography.")}</p>`;
bibliographyStatus.textContent = data.error || "Bibliography load failed";
return;
}
currentBibliographyItems = data.items || [];
renderBibliography(currentBibliographyItems);
syncDownloadButton(currentBibliographyItems);
bibliographyStatus.textContent = `${data.count || 0} bibliography entr${data.count === 1 ? "y" : "ies"}`;
}
bibliographySearch.addEventListener("input", (event) => {
loadBibliography(event.target.value).catch(() => {
bibliographyStatus.textContent = "Bibliography load failed";
});
});
loadBibliography().catch((error) => {
bibliographyList.innerHTML = `<p class="error">Failed to load bibliography: ${escapeHtml(String(error))}</p>`;
bibliographyStatus.textContent = "Bibliography load failed";
});
if (bibliographyDownload) {
bibliographyDownload.addEventListener("click", () => {
const downloaded = downloadBibtexRecords(currentBibliographyItems, "ecospecies-bibliography");
if (!downloaded) {
bibliographyStatus.textContent = "No BibTeX records are available for download yet.";
}
});
}


@ -7,20 +7,31 @@
<link rel="stylesheet" href="./styles.css">
</head>
<body>
<header class="site-header">
<div class="site-header-inner">
<div class="site-brand">
<p class="site-brand-mark">Open Species Archive</p>
<a href="./index.html" class="site-brand-link">EcoSpecies Atlas</a>
<p class="site-brand-summary">Public field atlas for historical species life-history materials.</p>
</div>
<nav class="site-nav" aria-label="Primary">
<a href="./index.html">Atlas</a>
<a href="./bibliography.html">Bibliography</a>
</nav>
</div>
</header>
<main class="page">
<section class="hero">
<p class="eyebrow">Marine Species Knowledge System</p>
<h1>EcoSpecies</h1>
<p class="eyebrow">Open Biodiversity Reference</p>
<h1>EcoSpecies Atlas</h1>
<p class="lede">
A modern follow-on for the legacy EcoSpecies archive, starting with direct ingestion
of historical Species Life History text files.
A modern follow-on for the legacy EcoSpecies archive, built as an open ecology and
biodiversity reference workspace.
</p>
<p class="hero-context">
Use EcoSpecies Atlas for species profiles, habitat evidence, ecological reading, and
citation-aware exploration grounded in the migrated legacy corpus.
</p>
<div class="auth-bar">
<input id="auth-token" type="password" placeholder="Bearer token for editor access">
<button id="auth-save" type="button">Use Token</button>
<button id="auth-clear" type="button" class="secondary-button">Clear</button>
<p id="auth-status" class="auth-status">Public access</p>
</div>
<div class="hero-stats">
<div class="stat">
<span id="species-count">0</span>
@ -38,6 +49,7 @@
<div class="panel-header">
<h2>Species</h2>
<input id="search" type="search" placeholder="Search common or scientific name">
<button id="contributor-create" type="button" class="secondary-button hidden">Create New Draft</button>
<div id="archive-filter-group" class="archive-filter-group hidden">
<button type="button" class="archive-filter-button is-active" data-archive-filter="active">Active</button>
<button type="button" class="archive-filter-button" data-archive-filter="all">All</button>
@ -66,44 +78,166 @@
This record is archived. It is hidden from public endpoints but remains available to editors for audit and recovery.
</p>
</header>
<section id="editor-panel" class="detail-section editor-panel hidden">
<h3>Editor Controls</h3>
<label class="editor-label" for="editor-publication-status">Publication Status</label>
<select id="editor-publication-status">
<option value="draft">Draft</option>
<option value="review">Review</option>
<option value="published">Published</option>
</select>
<label class="editor-label" for="editor-summary">Summary</label>
<textarea id="editor-summary" rows="5" placeholder="Write a concise executive summary."></textarea>
<label class="editor-label" for="editor-notes">Editor Notes</label>
<textarea id="editor-notes" rows="4" placeholder="Internal editorial notes"></textarea>
<label class="archive-toggle">
<input id="editor-is-archived" type="checkbox">
<span>Archive this species</span>
</label>
<div class="editor-actions">
<button id="editor-save" type="button">Save Editorial Changes</button>
<p id="editor-status" class="editor-status"></p>
</div>
</section>
<section id="audit-panel" class="detail-section hidden">
<h3>Audit History</h3>
<div id="audit-list" class="audit-list"></div>
</section>
<div id="detail-sections" class="detail-sections"></div>
<div class="workflow-panels">
<section id="legacy-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Legacy Materials Under Review">
<div class="collapsible-header">
<h3>Legacy Materials Under Review</h3>
<button type="button" class="secondary-button collapsible-toggle" data-target="legacy-panel" data-label="Legacy Materials Under Review" aria-expanded="false">
Show Legacy Materials Under Review
</button>
</div>
<div class="collapsible-body">
<p id="legacy-source-meta" class="editor-status"></p>
<pre id="legacy-source-text" class="legacy-source"></pre>
</div>
</section>
<section id="access-panel" class="detail-section collapsible-panel collapsed" data-label="Access and Contribution">
<div class="collapsible-header">
<h3>Access and Contribution</h3>
<button type="button" class="secondary-button collapsible-toggle" data-target="access-panel" data-label="Access and Contribution" aria-expanded="false">
Show Access and Contribution
</button>
</div>
<div class="collapsible-body">
<div class="auth-bar auth-panel-row">
<input id="auth-token" type="password" placeholder="Bearer token for editor access">
<button id="auth-save" type="button">Use Token</button>
<button id="auth-clear" type="button" class="secondary-button">Clear</button>
<p id="auth-status" class="auth-status">Public access</p>
</div>
<div class="auth-bar contributor-signup auth-panel-row">
<input id="contributor-email" type="email" placeholder="Email for contributor access">
<label class="archive-toggle contributor-age-gate">
<input id="contributor-age-gate" type="checkbox">
<span>I confirm I am at least <span id="contributor-age-label">13</span> years old</span>
</label>
<button id="contributor-register" type="button" class="secondary-button">Become Contributor</button>
<p id="contributor-status" class="auth-status"></p>
</div>
</div>
</section>
<section id="editor-panel" class="detail-section collapsible-panel editor-panel collapsed hidden" data-label="Editing Workflow">
<div class="collapsible-header">
<h3>Editing Workflow</h3>
<button type="button" class="secondary-button collapsible-toggle" data-target="editor-panel" data-label="Editing Workflow" aria-expanded="false">
Show Editing Workflow
</button>
</div>
<div class="collapsible-body">
<label class="editor-label" for="editor-publication-status">Publication Status</label>
<select id="editor-publication-status">
<option value="draft">Draft</option>
<option value="review">Review</option>
<option value="published">Published</option>
</select>
<label class="editor-label" for="editor-notes">Editor Notes</label>
<textarea id="editor-notes" rows="4" placeholder="Internal editorial notes"></textarea>
<label class="archive-toggle">
<input id="editor-is-archived" type="checkbox">
<span>Archive this species</span>
</label>
<div class="editor-actions">
<button id="editor-save" type="button">Save Editorial Changes</button>
<p id="editor-status" class="editor-status"></p>
</div>
</div>
</section>
<section id="document-panel" class="detail-section collapsible-panel editor-panel collapsed hidden" data-label="Metadata and Document Workflow">
<div class="collapsible-header">
<h3>Metadata and Document Workflow</h3>
<button type="button" class="secondary-button collapsible-toggle" data-target="document-panel" data-label="Metadata and Document Workflow" aria-expanded="false">
Show Metadata and Document Workflow
</button>
</div>
<div class="collapsible-body">
<div class="document-panel-header">
<div>
<p class="editor-status">
Markdown is the editable source of truth for hierarchy. Front matter and headings are validated on save.
</p>
</div>
<div class="editor-actions">
<button id="document-save" type="button">Save Document</button>
<p id="document-status" class="editor-status"></p>
</div>
</div>
<label class="editor-label" for="document-markdown">Markdown Source</label>
<textarea id="document-markdown" class="document-editor" rows="18" spellcheck="false"></textarea>
<details class="document-preview-shell" open>
<summary>Outline Preview</summary>
<div id="document-preview" class="document-preview"></div>
</details>
</div>
</section>
<section id="citation-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Review Workflow">
<div class="collapsible-header">
<h3>Review Workflow</h3>
<button type="button" class="secondary-button collapsible-toggle" data-target="citation-panel" data-label="Review Workflow" aria-expanded="false">
Show Review Workflow
</button>
</div>
<div class="collapsible-body">
<div class="document-panel-header">
<div>
<p id="citation-status" class="editor-status">
Extracted bibliography entries and draft BibTeX records.
</p>
</div>
<div class="editor-actions">
<button id="citation-backfill-species" type="button" class="secondary-button hidden">Backfill This Species</button>
<button id="citation-enrich-all" type="button" class="secondary-button hidden">Run Enrichment For All Citations</button>
</div>
</div>
<div id="citation-list" class="citation-list"></div>
</div>
</section>
<section id="audit-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Audit History">
<div class="collapsible-header">
<h3>Audit History</h3>
<button type="button" class="secondary-button collapsible-toggle" data-target="audit-panel" data-label="Audit History" aria-expanded="false">
Show Audit History
</button>
</div>
<div class="collapsible-body">
<div id="audit-list" class="audit-list"></div>
</div>
</section>
</div>
</article>
</section>
</section>
<footer class="footer">
<p>
This migration path preserves attribution for Dr. Peter Rubec, Dr. Diane Blackwood,
EcoSpecies Atlas preserves attribution for Dr. Peter Rubec, Dr. Diane Blackwood,
Dr. Welsbery R. Elsberry, and the Florida Fish and Wildlife Research Institute context
documented in the legacy project materials.
</p>
</footer>
</main>
<section id="citation-match-dialog" class="match-dialog-shell hidden" aria-hidden="true">
<div class="match-dialog-backdrop"></div>
<article class="match-dialog-card" role="dialog" aria-modal="true" aria-labelledby="citation-match-title">
<div class="match-dialog-header">
<div>
<h2 id="citation-match-title">Citation Candidate Review</h2>
<p id="citation-match-status" class="editor-status">Compare the parsed source citation against candidate metadata.</p>
</div>
<button id="citation-match-close" type="button" class="secondary-button">Close</button>
</div>
<div class="match-dialog-grid">
<section class="detail-section">
<h3>Parsed Source Metadata</h3>
<div id="citation-match-seed" class="match-seed"></div>
</section>
<section class="detail-section">
<h3>Candidate Matches</h3>
<div id="citation-match-candidates" class="match-candidates"></div>
</section>
</div>
</article>
</section>
<script src="./app.js" defer></script>
</body>
</html>


@ -5,6 +5,10 @@ server {
root /usr/share/nginx/html;
index index.html;
location = /apps/ecospecies {
return 301 /apps/ecospecies/;
}
location /api/ {
proxy_pass http://api:8000/api/;
proxy_http_version 1.1;
@ -14,19 +18,46 @@ server {
proxy_set_header X-Forwarded-Proto $scheme;
}
location /apps/ecospecies/api/ {
rewrite ^/apps/ecospecies/api/(.*)$ /api/$1 break;
proxy_pass http://api:8000;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
location /healthz {
proxy_pass http://api:8000/healthz;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
location /apps/ecospecies/healthz {
proxy_pass http://api:8000/healthz;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
location /readyz {
proxy_pass http://api:8000/readyz;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
location /apps/ecospecies/readyz {
proxy_pass http://api:8000/readyz;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
location / {
try_files $uri $uri/ /index.html;
}
location /apps/ecospecies/ {
rewrite ^/apps/ecospecies/(.*)$ /$1 break;
try_files $uri $uri/ /index.html;
}
}


@ -1,12 +1,12 @@
:root {
--bg: #f4efe6;
--paper: rgba(255, 252, 247, 0.78);
--ink: #16251f;
--muted: #58655f;
--accent: #0f766e;
--accent-2: #bc6c25;
--line: rgba(22, 37, 31, 0.12);
--shadow: 0 24px 70px rgba(24, 35, 30, 0.15);
--bg: #f4f7fb;
--paper: rgba(255, 255, 255, 0.88);
--ink: #182433;
--muted: #5f6b7d;
--accent: #2457a6;
--accent-2: #1f7a5a;
--line: rgba(24, 36, 51, 0.11);
--shadow: 0 24px 70px rgba(33, 52, 84, 0.14);
}
* {
@ -15,12 +15,83 @@
body {
margin: 0;
font-family: Georgia, "Times New Roman", serif;
font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif;
color: var(--ink);
background:
radial-gradient(circle at top left, rgba(15, 118, 110, 0.14), transparent 28%),
radial-gradient(circle at top right, rgba(188, 108, 37, 0.16), transparent 24%),
linear-gradient(180deg, #f8f4ec, #efe6d7 70%, #e7dcc9);
radial-gradient(circle at top left, rgba(36, 87, 166, 0.14), transparent 26%),
radial-gradient(circle at top right, rgba(31, 122, 90, 0.12), transparent 24%),
linear-gradient(180deg, #f4f7fb, #e4edf6 72%, #d9e6ef);
}
.site-header {
width: min(1320px, calc(100vw - 32px));
margin: 0 auto;
padding-top: 24px;
}
.site-header-inner {
display: flex;
gap: 18px;
align-items: center;
justify-content: space-between;
padding: 18px 22px;
border-radius: 24px;
backdrop-filter: blur(10px);
background: var(--paper);
border: 1px solid var(--line);
box-shadow: var(--shadow);
}
.site-brand {
display: flex;
flex-direction: column;
gap: 4px;
}
.site-brand-mark {
margin: 0;
color: var(--accent);
text-transform: uppercase;
letter-spacing: 0.18em;
font-size: 0.76rem;
}
.site-brand-link {
color: var(--ink);
font-size: 1.5rem;
font-weight: 700;
text-decoration: none;
}
.site-brand-summary {
margin: 0;
color: var(--muted);
font-size: 0.94rem;
}
.site-nav {
display: flex;
flex-wrap: wrap;
gap: 10px;
justify-content: flex-end;
}
.site-nav a {
display: inline-flex;
align-items: center;
justify-content: center;
border-radius: 999px;
padding: 11px 16px;
text-decoration: none;
color: var(--ink);
border: 1px solid var(--line);
background: rgba(255, 255, 255, 0.72);
transition: transform 160ms ease, border-color 160ms ease;
}
.site-nav a:hover {
transform: translateY(-1px);
border-color: rgba(15, 118, 110, 0.45);
}
.page {
@ -42,6 +113,9 @@ body {
.hero {
padding: 28px;
margin-bottom: 20px;
background:
linear-gradient(135deg, rgba(255, 255, 255, 0.95), rgba(234, 244, 240, 0.92)),
var(--paper);
}
.eyebrow {
@ -56,6 +130,7 @@ h1 {
margin: 0;
font-size: clamp(2.8rem, 7vw, 5.6rem);
line-height: 0.92;
letter-spacing: -0.03em;
}
.lede {
@ -64,6 +139,12 @@ h1 {
font-size: 1.08rem;
}
.hero-context {
max-width: 68ch;
color: var(--muted);
line-height: 1.58;
}
.hero-stats {
display: flex;
gap: 16px;
@ -79,6 +160,15 @@ h1 {
margin-top: 18px;
}
.auth-panel-row {
margin-top: 0;
}
.contributor-signup {
padding-top: 14px;
border-top: 1px solid var(--line);
}
.auth-bar input {
min-width: min(360px, 100%);
flex: 1;
@ -93,7 +183,7 @@ h1 {
min-width: 180px;
padding: 14px 16px;
border-radius: 18px;
background: rgba(255, 255, 255, 0.6);
background: linear-gradient(180deg, rgba(255, 255, 255, 0.95), rgba(232, 242, 239, 0.92));
border: 1px solid var(--line);
}
@ -158,6 +248,16 @@ input[type="search"] {
background: rgba(255, 255, 255, 0.9);
}
input[type="text"],
input[type="email"],
input[type="password"] {
border: 1px solid var(--line);
border-radius: 18px;
padding: 12px 14px;
font: inherit;
background: rgba(255, 255, 255, 0.92);
}
select,
textarea,
button {
@ -201,7 +301,7 @@ button {
padding: 14px;
border-radius: 18px;
border: 1px solid var(--line);
background: linear-gradient(180deg, rgba(255, 255, 255, 0.95), rgba(241, 237, 230, 0.95));
background: linear-gradient(180deg, rgba(255, 255, 255, 0.97), rgba(239, 246, 244, 0.94));
cursor: pointer;
transition: transform 160ms ease, border-color 160ms ease;
}
@ -213,7 +313,7 @@ button {
.species-card-archived {
border-style: dashed;
background: linear-gradient(180deg, rgba(247, 241, 231, 0.98), rgba(233, 226, 214, 0.98));
background: linear-gradient(180deg, rgba(243, 247, 249, 0.98), rgba(227, 236, 242, 0.98));
}
.species-name,
@ -273,6 +373,32 @@ button {
display: none;
}
.match-dialog-shell {
position: fixed;
inset: 0;
z-index: 50;
}
.match-dialog-backdrop {
position: absolute;
inset: 0;
background: rgba(12, 20, 18, 0.46);
}
.match-dialog-card {
position: relative;
z-index: 1;
width: min(1180px, calc(100vw - 32px));
max-height: calc(100vh - 40px);
overflow: auto;
margin: 20px auto;
padding: 18px;
border-radius: 24px;
background: #fbf8f1;
border: 1px solid var(--line);
box-shadow: var(--shadow);
}
.detail-header {
padding-bottom: 16px;
border-bottom: 1px solid var(--line);
@ -313,6 +439,12 @@ button {
margin-top: 18px;
}
.workflow-panels {
display: grid;
gap: 16px;
margin-top: 20px;
}
.detail-section {
padding: 16px;
border-radius: 18px;
@ -329,6 +461,44 @@ button {
margin-top: 18px;
}
.workflow-panels .editor-panel,
.workflow-panels .detail-section {
margin-top: 0;
}
.collapsible-panel {
padding-top: 14px;
}
.collapsible-header {
display: flex;
gap: 12px;
align-items: center;
justify-content: space-between;
flex-wrap: wrap;
}
.collapsible-header h3 {
margin-bottom: 0;
}
.collapsible-body {
margin-top: 16px;
}
.collapsible-panel.collapsed .collapsible-body {
display: none;
}
.document-panel-header {
display: flex;
gap: 16px;
align-items: flex-start;
justify-content: space-between;
flex-wrap: wrap;
margin-bottom: 14px;
}
.editor-label {
display: block;
margin: 0 0 8px;
@ -349,6 +519,11 @@ button {
font-weight: 700;
}
.contributor-age-gate {
margin: 0;
font-weight: 400;
}
.archive-toggle input {
width: 18px;
height: 18px;
@ -372,6 +547,149 @@ button {
gap: 12px;
}
.citation-list {
display: grid;
gap: 14px;
}
.citation-entry {
padding: 14px;
border-radius: 16px;
border: 1px solid var(--line);
background: rgba(255, 255, 255, 0.76);
}
.citation-entry-meta {
margin: 0 0 10px;
color: var(--muted);
font-size: 0.92rem;
}
.citation-entry-raw {
margin: 0 0 12px;
line-height: 1.5;
}
.citation-bibtex,
.citation-bibtex-editor {
font-family: "Courier New", monospace;
font-size: 0.9rem;
line-height: 1.45;
}
.citation-abstract-shell {
display: grid;
gap: 8px;
margin: 4px 0 10px;
}
.citation-detail-shell {
display: grid;
gap: 8px;
margin: 4px 0 10px;
}
.citation-abstract-display {
padding: 10px 12px;
border-radius: 12px;
border: 1px solid var(--line);
background: rgba(15, 118, 110, 0.05);
}
.citation-detail-display {
padding: 10px 12px;
border-radius: 12px;
border: 1px solid var(--line);
background: rgba(255, 255, 255, 0.78);
}
.match-dialog-header,
.match-dialog-grid,
.match-candidate-header,
.match-candidates,
.match-candidate-card,
.match-seed,
.match-table {
display: grid;
gap: 12px;
}
.match-dialog-header {
grid-template-columns: minmax(0, 1fr) auto;
align-items: start;
}
.match-dialog-grid {
grid-template-columns: minmax(260px, 0.9fr) minmax(0, 1.6fr);
margin-top: 16px;
}
.match-candidate-card {
padding: 14px;
border-radius: 16px;
border: 1px solid var(--line);
background: rgba(255, 255, 255, 0.84);
}
.match-candidate-header {
grid-template-columns: minmax(0, 1fr) auto;
align-items: baseline;
}
.match-score {
font-weight: 700;
color: var(--accent);
}
.match-table {
border: 1px solid var(--line);
border-radius: 14px;
overflow: hidden;
}
.match-row {
display: grid;
grid-template-columns: 120px 110px minmax(0, 1fr) minmax(0, 1fr);
gap: 10px;
padding: 10px 12px;
border-top: 1px solid var(--line);
font-size: 0.92rem;
}
.match-row:first-child {
border-top: 0;
}
.match-row-head {
background: rgba(15, 118, 110, 0.08);
font-weight: 700;
}
.match-label {
color: var(--muted);
font-weight: 700;
}
.match-status {
text-transform: uppercase;
letter-spacing: 0.04em;
font-size: 0.78rem;
}
.match-status-exact {
color: var(--accent);
}
.match-status-partial,
.match-status-seed-missing,
.match-status-candidate-missing {
color: var(--accent-2);
}
.match-status-conflict {
color: #a12626;
}
.audit-entry {
padding: 14px;
border-radius: 16px;
@ -394,6 +712,62 @@ button {
line-height: 1.45;
}
.document-editor,
.document-preview {
font-family: "Courier New", monospace;
font-size: 0.92rem;
line-height: 1.5;
}
.document-editor {
min-height: 420px;
margin-bottom: 14px;
white-space: pre;
overflow: auto;
}
.document-preview-shell {
border: 1px solid var(--line);
border-radius: 18px;
background: rgba(255, 255, 255, 0.72);
overflow: hidden;
}
.document-preview-shell summary {
cursor: pointer;
padding: 12px 16px;
font-weight: 700;
color: var(--accent);
}
.document-preview {
padding: 0 16px 16px;
}
.document-preview-empty {
color: var(--muted);
}
.document-preview-list {
margin: 0;
padding-left: 22px;
}
.document-preview-list li + li {
margin-top: 8px;
}
.document-preview-metadata {
margin: 0 0 14px;
padding: 0;
list-style: none;
color: var(--muted);
}
.document-preview-metadata li + li {
margin-top: 6px;
}
.diagnostic-list {
margin: 0;
padding-left: 18px;
@ -403,6 +777,100 @@ button {
margin-top: 8px;
}
.structured-node {
display: grid;
gap: 12px;
background: linear-gradient(180deg, rgba(255, 255, 255, 0.84), rgba(242, 247, 252, 0.88));
}
.structured-node + .structured-node {
margin-top: 4px;
}
.structured-node h3,
.structured-node h4,
.structured-node h5,
.structured-node h6 {
line-height: 1.18;
letter-spacing: -0.01em;
}
.structured-node-body {
margin: 0;
line-height: 1.58;
color: var(--ink);
}
.structured-node-children {
display: grid;
gap: 12px;
padding: 4px 0 0 18px;
border-left: 2px solid rgba(36, 87, 166, 0.12);
}
.public-citation-list {
display: grid;
gap: 14px;
}
.public-bibliography-actions {
display: flex;
gap: 12px;
align-items: center;
flex-wrap: wrap;
}
.public-bibliography-note {
margin: 0;
color: var(--muted);
font-size: 0.92rem;
}
.public-citation-entry {
display: grid;
gap: 8px;
padding: 14px;
border-radius: 16px;
border: 1px solid var(--line);
background: rgba(255, 255, 255, 0.76);
}
.public-citation-text,
.public-citation-meta,
.public-citation-links,
.public-citation-abstract {
margin: 0;
}
.public-citation-text {
line-height: 1.56;
}
.public-citation-meta,
.public-citation-links {
color: var(--muted);
font-size: 0.92rem;
}
.public-citation-links a {
color: var(--accent);
}
.public-citation-abstract {
padding-top: 2px;
color: var(--muted);
line-height: 1.58;
}
.legacy-source {
max-height: 28rem;
overflow: auto;
padding: 14px;
border-radius: 16px;
border: 1px solid var(--line);
background: rgba(255, 255, 255, 0.76);
}
pre {
margin: 0;
white-space: pre-wrap;
@ -417,6 +885,15 @@ pre {
}
@media (max-width: 960px) {
.site-header-inner {
flex-direction: column;
align-items: stretch;
}
.site-nav {
justify-content: flex-start;
}
.workspace {
grid-template-columns: 1fr;
}
@ -424,4 +901,12 @@ pre {
.species-list {
max-height: 40vh;
}
.match-dialog-grid {
grid-template-columns: 1fr;
}
.match-row {
grid-template-columns: 1fr;
}
}


@ -1,5 +1,6 @@
services:
db:
container_name: ecospecies-db
image: postgres:16-alpine
environment:
POSTGRES_DB: ecospecies
@ -17,6 +18,7 @@ services:
- postgres_data:/var/lib/postgresql/data
importer:
container_name: ecospecies-importer
image: python:3.12-slim
depends_on:
db:
@ -30,11 +32,12 @@ services:
command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
volumes:
- .:/workspace
- ../01-legacy-code-and-data:/legacy-data:ro
- ${ECOSPECIES_LEGACY_DATA_DIR:-../legacy-corpus}:/legacy-data:ro
- python_venv:/workspace/.docker/venv
- pip_cache:/root/.cache/pip
api:
container_name: ecospecies-api
image: python:3.12-slim
restart: unless-stopped
depends_on:
@ -56,11 +59,12 @@ services:
- "${ECOSPECIES_API_PORT:-8000}:8000"
volumes:
- .:/workspace
- ../01-legacy-code-and-data:/legacy-data:ro
- ${ECOSPECIES_LEGACY_DATA_DIR:-../legacy-corpus}:/legacy-data:ro
- python_venv:/workspace/.docker/venv
- pip_cache:/root/.cache/pip
web:
container_name: ecospecies-web
image: nginx:1.27-alpine
restart: unless-stopped
depends_on:


@ -0,0 +1,110 @@
## CiteGeist Review Notes
These notes capture parser issues seen while integrating CiteGeist-style extraction into EcoSpecies.
### Report-style references
Observed failure shape:
- references like `Daniell, W.C. 1872. Letters referring ... Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.`
- extracted `title` may contain the full raw bibliography string
- abbreviated venue names such as `Comm. Rept.` are not separated cleanly from the title
Suggested upstream change in `citegeist.extract`:
- add a report-style parser path after year detection
- prefer sentence-boundary venue detection before naive keyword splits so words like `report` inside a real title do not trigger an early cut
- support abbreviation-heavy venue starters such as:
- `comm. rept.`
- `rept.`
- `proc.`
- `occas. pap.`
- `bulletin`
- `bull.`
- `memoir`
- strip trailing volume/page blobs like `2: 387-390` from the venue field
- when a first parse leaves a partial venue stub such as `Occas`, reparse the full raw reference line and prefer the fuller repaired venue/title split
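A minimal Python sketch of that split, assuming hypothetical names (`VENUE_STARTERS`, `split_report_reference`) rather than the actual `citegeist.extract` API:

```python
import re

# Abbreviation-heavy venue starters from the list above (lowercase).
VENUE_STARTERS = ("comm. rept.", "rept.", "proc.", "occas. pap.", "bulletin", "bull.", "memoir")

def split_report_reference(rest):
    """Split the text after 'Author(s). YEAR.' into (title, venue, volume, pages)."""
    volume = pages = None
    # Strip a trailing volume/page blob such as "2: 387-390".
    blob = re.search(r"(\d+)\s*:\s*(\d+(?:-\d+)?)\.?\s*$", rest)
    if blob:
        volume, pages = blob.group(1), blob.group(2)
        rest = rest[: blob.start()].rstrip(" .")
    # Sentence-boundary venue detection: split only at ". <venue starter>", so a
    # word like "report" inside a real title does not trigger an early cut.
    lowered = rest.lower()
    cuts = [i for i in (lowered.find(". " + s) for s in VENUE_STARTERS) if i != -1]
    if not cuts:
        return rest.strip(), None, volume, pages
    cut = min(cuts)
    return rest[: cut + 1].strip(), rest[cut + 2 :].strip(" ."), volume, pages
```

On the `Daniell, W.C. 1872.` example above this keeps the letter title intact, recovers the `Comm. Rept.`-style venue, and peels off `volume = "2"`, `pages = "387-390"`.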
### Placeholder title merge behavior
Observed failure shape:
- a raw bibliography string may survive as `title` even after DOI/title resolution finds a better title
Suggested upstream change in `citegeist.resolve.merge_entries_with_conflicts`:
- treat titles that look like raw bibliography strings as placeholders
- example heuristic:
- starts with `Surname, ... YEAR.`
- unusually long for a title
- contains a resolved shorter title as a substring after punctuation normalization
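A sketch of that heuristic; the length threshold is a guess, not a measured value:

```python
import re

def _norm(text):
    return re.sub(r"[^a-z0-9 ]", "", text.lower())

def looks_like_placeholder_title(title, resolved_title=None):
    """Heuristic sketch: does this title look like a raw bibliography string?"""
    t = title.strip()
    # Starts with "Surname, ... YEAR." (e.g. "Daniell, W.C. 1872. ...").
    if re.match(r"^[A-Z][\w'-]+,\s.*?\b(1[5-9]\d{2}|20\d{2})[a-z]?\.", t):
        return True
    if len(t) > 250:  # unusually long for a title
        return True
    # Contains the resolved shorter title after punctuation normalization.
    if resolved_title and _norm(resolved_title) and _norm(resolved_title) in _norm(t):
        return len(t) > len(resolved_title)
    return False
```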
### Legacy note deduplication
Observed failure shape:
- note fragments like `ecospecies_reference_number = {160}` can be appended more than once downstream when re-merging enriched metadata
Suggested upstream change:
- when joining note fragments, split on `;`, normalize whitespace, and dedupe per fragment rather than per whole note string
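A sketch of per-fragment deduplication under those rules:

```python
def merge_note_fragments(existing, incoming):
    """Join note fragments, deduplicating per ';'-separated fragment."""
    seen = set()
    merged = []
    for blob in (existing, incoming):
        for fragment in (blob or "").split(";"):
            fragment = " ".join(fragment.split())  # normalize whitespace
            if fragment and fragment not in seen:
                seen.add(fragment)
                merged.append(fragment)
    return "; ".join(merged)
```

Merging two notes that each carry `ecospecies_reference_number = {160}` then keeps a single copy of that fragment.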
### Unresolved entries should still refresh local parses
Observed failure shape:
- parser improvements may correctly rebuild `title`, venue, `volume`, `number`, and `pages`
- but if no remote metadata source matches, the stored draft BibTeX can remain unchanged unless unresolved enrichment also writes the refreshed local seed back out
Suggested upstream change:
- unresolved enrichment should still return the rebuilt local draft entry
- keep `citation_key`, normalized text, and draft BibTeX synchronized with the current local parser even when resolver status remains `unresolved`
### Returned metadata not carried through
Observed concern:
- resolver/source payloads may include bibliographic details such as:
- `volume`
- `issue` / BibTeX `number`
- `page` / BibTeX `pages`
- these should be preserved into the BibTeX entry whenever available
Current note:
- CiteGeist Crossref mapping already includes `volume`, `number`, and `pages`
- verify that all resolver paths, storage round-trips, and exports preserve those fields consistently
- OpenAlex/DataCite mappings should also be checked for analogous bibliographic fields in `biblio` / attribute payloads
### False-positive title-search acceptance
Observed failure shape:
- title search can return a thematically related but bibliographically different work
- downstream acceptance may keep some seed fields while adopting conflicting DOI/title/volume/pages from the returned match
- this is especially risky for historical references with sparse or abbreviated venue names
Suggested upstream change in `citegeist.resolve` and any title-search ranking path:
- do not fall back to the first search hit when no strong title match exists
- prefer exact or near-exact title matches only
- reject a candidate when structured seed metadata conflicts on strong fields such as:
- `year`
- venue / journal
- `volume`
- `number`
- `pages`
- treat those fields as match-validation inputs, not just merge-time metadata
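One way that validation could look, treating missing values as non-conflicting:

```python
STRONG_FIELDS = ("year", "journal", "volume", "number", "pages")

def rejects_candidate(seed, candidate):
    """Reject a title-search hit when seed metadata disagrees on a strong field."""
    for field in STRONG_FIELDS:
        a = str(seed.get(field) or "").strip().lower()
        b = str(candidate.get(field) or "").strip().lower()
        if a and b and a != b:
            return True  # populated-but-different values disqualify the match
    return False
```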
### OpenAlex null-source handling
Observed failure shape:
- some OpenAlex works have `primary_location` present but `source: null`
- downstream mapping can crash if it assumes `source` is always a dictionary
Suggested upstream change:
- treat null `source` payloads as empty dictionaries
- continue mapping title, year, DOI, and `biblio` fields even when venue/source is missing
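A defensive mapping sketch; the field names (`primary_location`, `biblio`, `display_name`) are from the public OpenAlex works payload:

```python
def map_openalex_work(work):
    """Tolerate OpenAlex works where primary_location.source is null."""
    location = work.get("primary_location") or {}
    source = location.get("source") or {}  # null source -> empty dict
    biblio = work.get("biblio") or {}
    pages = "-".join(p for p in (biblio.get("first_page"), biblio.get("last_page")) if p)
    return {
        "title": work.get("display_name"),
        "year": work.get("publication_year"),
        "doi": work.get("doi"),
        "venue": source.get("display_name"),
        "volume": biblio.get("volume"),
        "number": biblio.get("issue"),
        "pages": pages or None,
    }
```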

docs/dc-orig.yml (new file)

@ -0,0 +1,89 @@
services:
db:
image: postgres:16-alpine
restart: unless-stopped
environment:
POSTGRES_DB: ${ECOSPECIES_DB_NAME:-ecospecies}
POSTGRES_USER: ${ECOSPECIES_DB_USER:-ecospecies}
POSTGRES_PASSWORD: ${ECOSPECIES_DB_PASSWORD:?set ECOSPECIES_DB_PASSWORD}
PGDATA: /var/lib/postgresql/data/pgdata
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${ECOSPECIES_DB_USER:-ecospecies} -d ${ECOSPECIES_DB_NAME:-ecospecies}"]
interval: 5s
timeout: 5s
retries: 10
volumes:
- postgres_data:/var/lib/postgresql/data
importer:
image: python:3.12-slim
restart: "no"
depends_on:
db:
condition: service_healthy
working_dir: /workspace
environment:
ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/legacy-data/InputFiles - TXT}
ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
ECOSPECIES_VENV_DIR: /workspace/.docker/venv
PYTHONPATH: /workspace/apps/api/src
command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
volumes:
- ..:/workspace
- ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
- python_venv:/workspace/.docker/venv
- pip_cache:/root/.cache/pip
api:
image: python:3.12-slim
restart: unless-stopped
depends_on:
db:
condition: service_healthy
importer:
condition: service_completed_successfully
working_dir: /workspace
environment:
ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/legacy-data/InputFiles - TXT}
ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
ECOSPECIES_HOST: 0.0.0.0
ECOSPECIES_PORT: "8000"
ECOSPECIES_AUTH_TOKENS: ${ECOSPECIES_AUTH_TOKENS:-}
ECOSPECIES_VENV_DIR: /workspace/.docker/venv
PYTHONPATH: /workspace/apps/api/src
command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-api.sh"]
volumes:
- ..:/workspace
- ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
- python_venv:/workspace/.docker/venv
- pip_cache:/root/.cache/pip
web:
image: nginx:1.27-alpine
restart: unless-stopped
depends_on:
api:
condition: service_started
labels:
- "traefik.enable=true"
- "traefik.docker.network=${TRAEFIK_NETWORK:-traefik-network}"
- "traefik.http.routers.ecospecies-atlas.rule=Host(`${ECOSPECIES_HOSTNAME:?set ECOSPECIES_HOSTNAME}`)"
- "traefik.http.routers.ecospecies-atlas.entrypoints=${TRAEFIK_ENTRYPOINTS:-websecure}"
- "traefik.http.routers.ecospecies-atlas.tls.certresolver=${TRAEFIK_CERTRESOLVER:-myresolver}"
- "traefik.http.services.ecospecies-atlas.loadbalancer.server.port=80"
volumes:
- ../apps/web:/usr/share/nginx/html:ro
- ../apps/web/nginx.conf:/etc/nginx/conf.d/default.conf:ro
networks:
- default
- traefik-network
volumes:
postgres_data:
python_venv:
pip_cache:
networks:
traefik-network:
external: true
name: ${TRAEFIK_NETWORK:-traefik-network}


@ -0,0 +1,20 @@
# Required
ECOSPECIES_HOSTNAME=example.org
ECOSPECIES_BASE_PATH=/apps/ecospecies
ECOSPECIES_DB_PASSWORD=replace-with-strong-password
# Optional database settings
ECOSPECIES_DB_NAME=ecospecies
ECOSPECIES_DB_USER=ecospecies
# Optional application settings
ECOSPECIES_AUTH_TOKENS=
ECOSPECIES_DATA_DIR=/workspace/input-data/InputFiles
# Optional host path to the legacy corpus if it is not at ../path-to-legacy-corpus
ECOSPECIES_LEGACY_DATA_DIR=../path-to-legacy-corpus
# Optional Traefik settings
TRAEFIK_NETWORK=traefik-network
TRAEFIK_ENTRYPOINTS=websecure
TRAEFIK_CERTRESOLVER=myresolver


@ -0,0 +1,93 @@
services:
db:
container_name: ecospecies-db
image: postgres:16-alpine
restart: unless-stopped
environment:
POSTGRES_DB: ${ECOSPECIES_DB_NAME:-ecospecies}
POSTGRES_USER: ${ECOSPECIES_DB_USER:-ecospecies}
POSTGRES_PASSWORD: ${ECOSPECIES_DB_PASSWORD:?set ECOSPECIES_DB_PASSWORD}
PGDATA: /var/lib/postgresql/data/pgdata
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${ECOSPECIES_DB_USER:-ecospecies} -d ${ECOSPECIES_DB_NAME:-ecospecies}"]
interval: 5s
timeout: 5s
retries: 10
volumes:
- postgres_data:/var/lib/postgresql/data
importer:
container_name: ecospecies-importer
image: python:3.12-slim
restart: "no"
depends_on:
db:
condition: service_healthy
working_dir: /workspace
environment:
ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/workspace/input-data/InputFiles}
ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
ECOSPECIES_VENV_DIR: /workspace/.docker/venv
PYTHONPATH: /workspace/apps/api/src
command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
volumes:
- ..:/workspace
- ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
- python_venv:/workspace/.docker/venv
- pip_cache:/root/.cache/pip
api:
container_name: ecospecies-api
image: python:3.12-slim
restart: unless-stopped
depends_on:
db:
condition: service_healthy
importer:
condition: service_completed_successfully
working_dir: /workspace
environment:
ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/workspace/input-data/InputFiles}
ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
ECOSPECIES_HOST: 0.0.0.0
ECOSPECIES_PORT: "8000"
ECOSPECIES_AUTH_TOKENS: ${ECOSPECIES_AUTH_TOKENS:-}
ECOSPECIES_VENV_DIR: /workspace/.docker/venv
PYTHONPATH: /workspace/apps/api/src
command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-api.sh"]
volumes:
- ..:/workspace
- ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
- python_venv:/workspace/.docker/venv
- pip_cache:/root/.cache/pip
web:
container_name: ecospecies-web
image: nginx:1.27-alpine
restart: unless-stopped
depends_on:
api:
condition: service_started
labels:
- "traefik.enable=true"
- "traefik.docker.network=${TRAEFIK_NETWORK:-traefik-network}"
- "traefik.http.routers.ecospecies-atlas.rule=Host(`${ECOSPECIES_HOSTNAME:?set ECOSPECIES_HOSTNAME}`) && PathPrefix(`${ECOSPECIES_BASE_PATH:-/}`)"
- "traefik.http.routers.ecospecies-atlas.entrypoints=${TRAEFIK_ENTRYPOINTS:-websecure}"
- "traefik.http.routers.ecospecies-atlas.tls.certresolver=${TRAEFIK_CERTRESOLVER:-myresolver}"
- "traefik.http.services.ecospecies-atlas.loadbalancer.server.port=80"
volumes:
- ../apps/web:/usr/share/nginx/html:ro
- ../apps/web/nginx.conf:/etc/nginx/conf.d/default.conf:ro
networks:
- default
- traefik-network
volumes:
postgres_data:
python_venv:
pip_cache:
networks:
traefik-network:
external: true
name: ${TRAEFIK_NETWORK:-traefik-network}

docs/postgres-backup.md (new file)

@ -0,0 +1,48 @@
# PostgreSQL Backup Notes
This note applies to deployments that use the PostgreSQL volume defined by the Compose stack, including the Traefik deployment variant.
## What Needs Backup
At minimum, back up:
- the PostgreSQL data volume
- the deployment env file that contains the database credentials
For the Traefik deployment variant, that usually means:
- the Docker volume `postgres_data`
- `docs/docker-compose-traefik.env`
## Logical Backup
From the repository root, create a SQL dump with:
```bash
./scripts/backup-postgres.sh
```
To write to a specific file:
```bash
./scripts/backup-postgres.sh /path/to/ecospecies-backup.sql
```
## Restore From Logical Backup
Restore a SQL dump with:
```bash
./scripts/restore-postgres.sh /path/to/ecospecies-backup.sql
```
## Volume-Level Backup
If the host backup system can snapshot Docker volumes safely, include the PostgreSQL volume in that schedule. A volume snapshot is useful for full recovery, but a logical dump is still recommended for portability and validation.
## Operational Guidance
- Run backups on a schedule instead of relying on ad hoc dumps.
- Test restore procedures before relying on the backup policy.
- Keep backup artifacts outside the live Docker host when possible.
- The backup and restore scripts default to `docs/docker-compose-traefik.env` and `docs/docker-compose-traefik.yml`, but both can be overridden with `ECOSPECIES_ENV_FILE` and `ECOSPECIES_COMPOSE_FILE`.


@ -1,5 +1,22 @@
# EcoSpecies Modernization Roadmap
## Current Status
As of 2026-03-27, the repo is no longer at the pure planning stage. The following pieces are already implemented and working in the live stack:
- Docker Compose deployment with explicit `ecospecies-...` container names
- path-based hosting support for `/apps/ecospecies`
- in-repo-only source directory resolution with safe path validation
- legacy SLH ingest into PostgreSQL-backed species, sections, citations, audit, and document records
- editor/admin workflows for draft, review, publish, archive, and audit history
- contributor registration and draft-authoring workflow with token-based access
- structured Markdown document storage and editor/API round-trip
- persisted taxon identifier scaffolding with legacy identifiers separated from future-facing external identifiers
- citation extraction, review, enrichment, batch enrichment, candidate matching, and reviewed-candidate selection/addition
- citation persistence back into the structured Markdown source of truth
The roadmap below has been updated to reflect that actual state.
## Target Product
Create a Docker Compose-based, open-source EcoSpecies successor that:
@ -31,48 +48,91 @@ Create a Docker Compose-based, open-source EcoSpecies successor that:
### Phase 0: Discovery and migration planning
Status: completed
- Inventory legacy assets and user-facing capabilities.
- Capture the replacement architecture and ingestion strategy.
- Define acknowledgements, provenance, and licensing boundaries.
### Phase 1: Ingestion foundation
Status: substantially complete, with parser refinement ongoing
- Parse legacy `.txt` SLH inputs into structured JSON records.
- Normalize common metadata: title, scientific name, common name, FLELMR code, headings, references.
- Normalize common metadata: title, scientific name, common name, FLELMR/EcoSpecies code, headings, references.
- Create ingest diagnostics to flag malformed files and missing metadata.
- Continue parser refinement for legacy edge cases in headings, citations, and historical bibliography formats.
### Phase 2: Public read experience
Status: implemented baseline
- Species listing and search.
- Species detail view with section navigation.
- Provenance and acknowledgement display.
- Summary metrics on corpus coverage.
- Path-based deployment under `/apps/ecospecies`.
### Phase 3: Structured persistence
### Phase 3: Structured persistence and editorial workflow
- Move parsed content into PostgreSQL.
- Add editor-safe import jobs and audit metadata.
- Preserve raw source alongside normalized records.
- Establish authentication and role-based access for editor and admin workflows.
- Add persisted editorial workflow state for draft, review, and published records.
- Make document sections individually addressable for editor review and revision, with audit history for section-level changes.
Status: implemented baseline, with editor UX still maturing
### Phase 4: Linkages and visualization
- PostgreSQL-backed persistence for species, sections, citations, documents, taxon identifiers, and audit history.
- Editor-safe import jobs and audit metadata.
- Raw-source preservation alongside normalized records.
- Authentication and role-based access for admin/editor/contributor workflows.
- Persisted editorial workflow state for draft, review, published, and archived records.
- Structured Markdown document storage and round-trip editing.
- Citation review, enrichment, candidate selection, and reviewed-candidate addition.
- Contributor draft creation and owner-scoped editing.
### Phase 4: Standards-aware identity and bibliography
Status: partially implemented
- Preserve legacy local identifiers as provenance.
- Persist taxon identifiers separately from legacy identifiers.
- Expose `legacy_identifiers`, `taxon_identifiers`, and `primary_taxon_*` API fields.
- Persist structured citation records with DOI/OpenAlex/DataCite-style enrichment fields.
- Continue toward multi-authority identifier review, richer citation entities, and CiteGeist-backed bibliography expansion.
### Phase 5: Editor ergonomics and advanced review
Status: in progress
- Structured Markdown editor is live.
- Citation match-review dialog is live.
- Remaining work:
- CodeMirror-based Markdown editor with folding
- inline parser diagnostics in the editor
- richer citation diff/review affordances
- clearer document-node and citation provenance in the UI
### Phase 6: Linkages and visualization
Status: not started
- Model predator/prey, habitat, and ecological association edges.
- Add graph endpoints and species-relationship views.
- Support public-friendly visual explanations and expert filters.
### Phase 5: Reports and export
### Phase 7: Reports and export
- Recreate legacy-like text/RTF export.
- Add machine-readable export formats such as JSON and Markdown.
- Support FLELMR-oriented authoring/export profiles.
Status: partially implemented
### Phase 6: Assisted research workflows
- JSON and Markdown exports exist through the API/document model.
- Structured Markdown is now the primary human-readable editor/export format.
- Remaining work:
- recreate legacy-like text/RTF export
- support export profiles for legacy compatibility and standards-forward outputs
- improve citation/bibliography export fidelity
### Phase 8: Assisted research workflows
Status: planned
- Add local-LLM-assisted extraction and drafting in a human-review loop.
- Integrate bibliography tooling for citation consolidation.
- Integrate bibliography tooling for citation consolidation and topic expansion.
- Support candidate-species intake for records not yet in the historical corpus.
- Restrict assisted drafting and publication actions to authenticated editorial roles.
@ -84,6 +144,9 @@ Initial core entities:
- `source_document`
- `document_section`
- `citation`
- `taxon_identifier`
- `citation_identifier`
- `bibliography_topic`
- `taxon`
- `linkage`
- `media_asset`
@ -95,6 +158,7 @@ Key design rules:
- retain provenance and import timestamps
- separate public published records from draft/editor states
- make sections addressable for citation and graph linking
- prefer a canonical document AST over direct projection from free-form source text
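As an illustration of that last rule, a canonical AST node might look like the following sketch; the field names are assumptions, not the live schema:

```python
from dataclasses import dataclass, field

@dataclass
class DocumentNode:
    """Illustrative canonical AST node for a document section."""
    heading: str
    level: int
    body_markdown: str = ""
    children: list["DocumentNode"] = field(default_factory=list)
```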
## LLM Extension Strategy
@ -103,6 +167,8 @@ Use local models only for assistive tasks, never silent publication:
- extracting candidate structured fields from new SLH text
- suggesting missing headings or linkage labels
- clustering similar citations
- resolving bibliography entries toward DOI/OpenAlex/DataCite where available
- treating local legacy codes as provenance, not canonical identifiers
- drafting summaries for editor review
Guardrails:
@ -111,16 +177,19 @@ Guardrails:
- all generated content is marked as draft
- every automated extraction stores source spans where possible
## Development Roadmap
## Near-Term Priorities
1. Implement a thin ingestion API over the legacy text corpus.
2. Build a responsive browser UI for listing and viewing species.
3. Add a persistent PostgreSQL-backed ingest store.
4. Introduce export and visualization services.
5. Add editorial workflows and local-LLM assistance.
1. Add CodeMirror-based folding and structure-aware editing to the Markdown document editor.
2. Expand taxon identifier review workflows for WoRMS, GBIF, Catalogue of Life, and related authorities.
3. Deepen citation quality controls, including better parsed-field visibility and stricter/manual review loops where resolver confidence is weak.
4. Add CiteGeist-style topic expansion and bibliography-suggestion review for under-cited species.
5. Improve document export fidelity so reviewed citations and standards-based identifiers are clearly represented in Markdown and downstream exports.
6. Begin the first ecological-linkage data model and API endpoints once citation/identifier workflows stabilize.
## Definition Of Done For The Initial Milestone
- `docker compose up` starts a working API and frontend.
- The system can enumerate the legacy corpus and show parsed species detail for at least one real SLH file.
- Project docs describe the migration approach, target architecture, and next phases.
- The system can enumerate the legacy corpus and show parsed species detail for real SLH files.
- Editors can curate structured Markdown documents and citations through authenticated workflows.
- Contributors can register, create drafts, and edit only their own submissions.
- Project docs describe both the implemented modernization state and the next phases.


@ -0,0 +1,315 @@
# EcoSpecies Standards Migration Plan
## Problem
The current EcoSpecies ingest and document model still treats legacy local fields such as `FLELMR code` / `species_code` as if they were primary identifiers. That is useful for historical provenance, but it is the wrong long-term center of gravity for a broader, modern biodiversity knowledge system.
The same problem exists for citations:
- legacy plaintext reference blocks are treated as local document text,
- citation identity is weak or missing,
- bibliography growth is tied to what happened to appear in the historical SLH file.
The new system should preserve legacy local identifiers and references, but it should not be structurally bound to them.
## Direction
Treat legacy local codes and freeform references as import-era artifacts, not canonical future-facing identifiers.
Going forward, EcoSpecies should prefer broadly recognized identifiers and registries:
- taxonomic name authority and taxon identifiers:
- Catalogue of Life IDs and release DOIs
- GBIF taxon keys
- WoRMS AphiaIDs for marine taxa
- ITIS TSNs where relevant
- optional NCBI Taxonomy IDs for research interoperability
- literature and dataset identifiers:
- DOI as the primary publication/dataset identifier
- ISBN/ISSN where DOI is absent
- OpenAlex IDs and DataCite metadata as enrichment layers
- contributor identity:
- email-based local contributor accounts now
- optional ORCID linkage later for editor and contributor identity
The system should be marine-forward because that matches the historical corpus, but not marine-exclusive. Identifier strategy should therefore be authority-aware rather than tied to a single domain-specific registry.
## Authority Selection Strategy
Choose the primary taxon authority by best-fit coverage, not by a single global rule.
- marine taxa:
- prefer WoRMS AphiaID as primary when confidently matched
- retain GBIF and Catalogue of Life as crosswalks
- non-marine or mixed-domain taxa:
- prefer Catalogue of Life or GBIF as primary, depending on match quality and coverage
- retain ITIS and other relevant identifiers as crosswalks
- unresolved or conflicting cases:
- store all candidate identifiers
- require editorial review before a primary identifier is asserted
This keeps the project ready for terrestrial expansion without discarding the value of WoRMS for the present corpus.
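A minimal sketch of that best-fit rule, assuming candidate identifiers carry `authority` and `match_confidence` fields as described under Schema Changes below; the confidence threshold and authority orderings are assumptions:

```python
def choose_primary_identifier(candidates, marine, threshold=0.9):
    """Pick a primary taxon identifier by authority fit, or defer to review."""
    confident = [c for c in candidates if (c.get("match_confidence") or 0) >= threshold]
    preferred = ("worms", "col", "gbif") if marine else ("col", "gbif", "itis")
    for authority in preferred:
        for candidate in confident:
            if candidate.get("authority") == authority:
                return candidate
    return None  # unresolved or conflicting: hold for editorial review
```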
## Important Taxonomic Note
PhyloCode is relevant for clade naming, not as a general-purpose replacement for species-level registry IDs. It should not become the primary EcoSpecies species identifier layer. It may be useful later for clade-aware ontology and higher-level phylogenetic naming, but not as the main substitute for local `species_code` values.
## Core Design Rules
1. Legacy local identifiers remain preserved exactly as imported.
2. Canonical taxon identity becomes multi-authority, not single-local-code.
3. Citations become first-class structured entities, not just text inside a section.
4. Bibliographies can be extended by topic and citation graph, not only by source-document inheritance.
5. Exports keep provenance visible so readers can distinguish legacy source metadata from normalized external identifiers.
## Schema Changes
### Species metadata
Retain `flelmr_code` for provenance, but demote it to a legacy metadata field.
Add a taxon-identity layer:
- `taxon_name_usage`
- `taxon_identifier`
- `taxon_authority`
- `taxon_match_review`
Suggested fields:
- `taxon_identifier.authority`
- `taxon_identifier.identifier`
- `taxon_identifier.rank`
- `taxon_identifier.label`
- `taxon_identifier.is_primary`
- `taxon_identifier.source_url`
- `taxon_identifier.asserted_by`
- `taxon_identifier.match_confidence`
- `taxon_identifier.review_status`
Examples:
- `authority = "worms", identifier = "159059", label = "AphiaID"`
- `authority = "gbif", identifier = "2290910", label = "taxonKey"`
- `authority = "col", identifier = "5T7L7", label = "taxonID"`
- `authority = "itis", identifier = "161989", label = "TSN"`
- `authority = "legacy-ecospecies", identifier = "5192", label = "FLELMR"`
### Citation model
Move from section text to structured bibliography entities:
- `citation`
- `citation_identifier`
- `citation_relation`
- `species_citation`
- `document_node_citation`
- `bibliography_topic`
Suggested citation identifier types:
- DOI
- ISBN
- ISSN
- PMID
- arXiv
- OpenAlex
- URL
## Markdown / AST Changes
Update the constrained Markdown profile so metadata stops implying that `species_code` is canonical.
Replace the current front matter recommendation:
```md
species_code: 5192
```
with a provenance-oriented shape:
```md
legacy_identifiers:
- authority: legacy-ecospecies
identifier: 5192
label: FLELMR
taxon_identifiers:
- authority: worms
identifier: 159059
label: AphiaID
primary: true
- authority: gbif
identifier: 2290910
label: taxonKey
```
Also add explicit bibliography sections:
```md
## References
- id: doi:10.1000/example
text: Smith, J. 2024. Example paper...
relation: cites
## Suggested Reading
- topic: estuarine ecology
```
The AST should preserve:
- legacy identifiers
- normalized taxon identifiers
- structured references
- topic links used for bibliography expansion
## Import Pipeline Changes
### Species identity
Import should produce:
1. raw imported name fields,
2. legacy local identifiers,
3. unresolved candidate taxon identifiers,
4. optional matched external identifiers,
5. a review state for unresolved or conflicting authority matches.
Do not block ingest if no external authority match exists. Store the unresolved state explicitly.
Primary identifier assignment should be determined by:
1. domain fit of the authority
2. confidence of the match
3. editorial review status
4. future ability to crosswalk to other authorities
### Citations
Split citation processing into stages:
1. detect bibliography/reference sections in the imported SLH text,
2. extract plaintext reference strings,
3. convert plaintext references into draft structured entries,
4. enrich identifiers and metadata,
5. assign accepted citations back to species and document nodes,
6. optionally expand bibliography by topic and citation graph.
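A deliberately naive Python staging of steps 1–3; the heading regex and line handling are assumptions, and enrichment, assignment, and expansion (stages 4–6) run later and elsewhere:

```python
import re


def detect_reference_sections(document_text: str) -> list[str]:
    # Stage 1: naive split on a References/Citations heading (illustrative).
    parts = re.split(r"(?im)^#{0,3}\s*(?:references|citations)\s*$", document_text)
    return parts[1:]


def extract_reference_strings(section: str) -> list[str]:
    # Stage 2: treat each non-empty line as one plaintext reference.
    return [line.lstrip("- ").strip() for line in section.splitlines() if line.strip()]


def draft_structured_entries(document_text: str) -> list[dict]:
    # Stage 3: wrap plaintext as draft entries awaiting stages 4-5.
    return [
        {"raw_text": raw, "review_status": "draft", "enrichment_status": "pending"}
        for section in detect_reference_sections(document_text)
        for raw in extract_reference_strings(section)
    ]
```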
## CiteGeist Integration
`../CiteGeist` is a strong fit for this migration.
Observed capabilities in that repo already cover much of what EcoSpecies needs:
- extracting references from plaintext,
- converting rough references into draft structured entries,
- DOI/Crossref/DataCite/OpenAlex enrichment,
- citation graph expansion,
- topic-based bibliography expansion,
- duplicate clustering and canonicalization.
### Recommended integration boundary
Do not embed CiteGeist logic directly into the EcoSpecies parser.
Instead:
1. EcoSpecies exports candidate plaintext references and topic phrases.
2. CiteGeist processes and enriches them into structured bibliography data.
3. EcoSpecies imports reviewed citation outputs into its own `citation` tables.
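A sketch of what step 1 could emit, assuming a JSON interchange file; every field name here is a placeholder until the interchange format is actually defined:

```python
import json


def build_citegeist_export(slug: str, references: list[str], topics: list[str]) -> str:
    # Hypothetical payload handed from EcoSpecies to CiteGeist.
    payload = {
        "source": "ecospecies",
        "species_slug": slug,
        "plaintext_references": references,
        "topic_phrases": topics,
    }
    return json.dumps(payload, indent=2)
```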
### First integration targets
- species-level bibliography cleanup from `References` sections
- DOI resolution and identifier assignment
- duplicate detection across species bibliographies
- topic expansion for subject areas such as habitat, trophic ecology, reproduction, invasive biology, and fisheries context
### Later integration targets
- node-level citation attachment
- bibliography review UI
- suggested-reading generation per species
- topic-seeded bibliography augmentation for under-cited species drafts
## API Changes
Add standards-aware endpoints:
- `/api/species/<slug>/identifiers`
- `/api/species/<slug>/citations`
- `/api/species/<slug>/bibliography/topics`
- `/api/editor/species/<slug>/identifier-review`
- `/api/editor/species/<slug>/citation-review`
Do not remove legacy fields immediately. Keep `flelmr_code` in payloads for compatibility while introducing:
- `legacy_identifiers`
- `taxon_identifiers`
- `primary_taxon_identifier`
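For illustration, a compatibility-preserving payload might carry both the legacy field and the new identifier blocks; the slug is hypothetical and the identifier values echo the earlier examples:

```python
species_payload = {
    "slug": "american-oyster",  # hypothetical slug
    "scientific_name": "Crassostrea virginica",
    "flelmr_code": "5192",  # retained for compatibility
    "legacy_identifiers": [
        {"authority": "legacy-ecospecies", "identifier": "5192", "label": "FLELMR"}
    ],
    "taxon_identifiers": [
        {"authority": "worms", "identifier": "159059", "label": "AphiaID", "is_primary": True},
        {"authority": "gbif", "identifier": "2290910", "label": "taxonKey", "is_primary": False},
    ],
    "primary_taxon_identifier": {
        "authority": "worms", "identifier": "159059", "label": "AphiaID"
    },
}
```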
## UI Changes
The species detail page should distinguish:
- scientific name
- primary external taxon identifier
- legacy local identifiers
- bibliography
- suggested reading
Editors should see:
- unresolved authority matches
- conflicting taxon IDs
- citation enrichment candidates
- duplicate-reference clusters
Contributors should only author content and draft references; identifier normalization and bibliography publication remain editorial functions.
## Migration Phases
### Phase A: Demote legacy code
- Rename internal presentation from “species code” to “legacy identifier”.
- Keep `flelmr_code` only as legacy provenance.
- Add `legacy_identifiers` to Markdown export and AST.
### Phase B: Add external taxon identifiers
- Create taxon-identifier tables and API payloads.
- Add editor review workflows for selecting a primary authority identifier.
- Default marine taxa review toward WoRMS where available.
- Default broader cross-domain review toward Catalogue of Life and GBIF where WoRMS is not the right authority.
- Keep the model open to terrestrial species from the beginning rather than treating them as out-of-scope exceptions.
### Phase C: Structured bibliography
- Create citation tables.
- Extract plaintext references from imported documents.
- Store draft citations separately from accepted citations.
### Phase D: CiteGeist bridge
- Define import/export format between EcoSpecies and CiteGeist.
- Run draft-reference normalization and DOI enrichment.
- Import reviewed structured citations back into EcoSpecies.
### Phase E: Topic-aware bibliography growth
- Store species topic phrases.
- Use CiteGeist topic expansion for bibliography augmentation.
- Keep added citations flagged by source type:
- imported
- resolved
- topic-expanded
- editor-added
## Immediate Next Steps
1. Update the Markdown profile to replace `species_code` with `legacy_identifiers` plus `taxon_identifiers`.
2. Add `legacy_identifiers` and `taxon_identifiers` to the AST/document model.
3. Introduce taxon identifier tables in the PostgreSQL schema.
4. Define a minimal EcoSpecies-to-CiteGeist interchange format for plaintext references and topic phrases.
5. Add editor-facing citation review before attempting automatic bibliography publication.

View File

@ -0,0 +1,338 @@
# Structured Markdown Document Plan
## Goal
Replace the current flat, parser-heavy free-form text handling with a document model that is:
- human-readable in plaintext
- editable in the browser with hierarchy folding
- permissive-license friendly
- suitable for first-pass conversion from legacy SLH text files
- suitable as the primary export format for a species life history
- able to project cleanly into a flexible database model with greater hierarchical depth
## Recommendation
Adopt a constrained Markdown-based authoring format as the primary human-facing document format, backed by an internal hierarchical document AST and a relational projection layer in PostgreSQL.
Use this three-layer model:
1. Source and export format: constrained EcoSpecies Markdown
2. Canonical application representation: hierarchical AST
3. Database representation: relational projection for querying, indexing, publishing, and editorial workflows
This avoids treating raw free-form text as both the storage format and the parser input.
## Why Markdown Instead Of Org
Markdown is the better fit for this codebase and licensing requirement because:
- it is familiar to most users
- it is easier to constrain than Org
- it maps naturally to hierarchical headings
- it works well with CodeMirror folding
- it does not require adopting GPL or AGPL editor code
Org-style authoring remains conceptually attractive, but embedding Org-specific tooling such as organice would introduce copyleft code, which is not aligned with a permissive-only implementation strategy.
## EcoSpecies Markdown Profile
The format should be Markdown-like, but intentionally narrower than unrestricted Markdown.
### Metadata
Use YAML front matter for canonical metadata fields:
```md
---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
taxon_identifiers:
  - authority: worms
    identifier: 159059
    label: AphiaID
    primary: true
source_file: American Oyster SLH NOAA SEA.txt
publication_status: published
---
```
Recommended canonical fields:
- `title`
- `common_name`
- `scientific_name`
- `legacy_identifiers`
- `taxon_identifiers`
- `primary_taxon_authority`
- `source_file`
- `publication_status`
- `source_format`
- `legacy_import_id`
### Hierarchy
Use headings as the sole structure-bearing primitive.
Example:
```md
---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
---

## Summary

Short editor-reviewed abstract.

## Habitat

### Type

Estuarine.

### Substrate

Hard bottom, shell, mud flats, and other suitable settlement surfaces.

## Reproduction

### Season

Spawning occurs from spring through fall in much of the Gulf.
```
Rules:
- Heading depth is meaningful.
- Skip-level headings should be rejected or normalized.
- Body text belongs to the nearest preceding heading.
- `#` level is optional if the document title already exists in front matter.
- Tables, lists, and citations are allowed only where explicitly supported.
- Arbitrary embedded HTML should be disallowed.
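A minimal sketch of a parser enforcing those rules; depth 1 is treated as the implicit front-matter title, so documents may begin at `##`:

```python
import re

HEADING = re.compile(r"^(#{1,6})\s+(.*)$")


def parse_sections(markdown: str) -> list[dict]:
    # Depth 1 is the implicit document title carried in front matter.
    root = {"depth": 1, "title": "", "body": "", "children": []}
    stack = [root]
    for line in markdown.splitlines():
        match = HEADING.match(line)
        if not match:
            stack[-1]["body"] += line + "\n"  # body joins nearest preceding heading
            continue
        depth = len(match.group(1))
        if depth > stack[-1]["depth"] + 1:
            raise ValueError(f"skip-level heading: {line!r}")
        while len(stack) > 1 and stack[-1]["depth"] >= depth:
            stack.pop()
        node = {"depth": depth, "title": match.group(2).strip(), "body": "", "children": []}
        stack[-1]["children"].append(node)
        stack.append(node)
    return root["children"]
```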
### Citations
Keep citations readable in Markdown but structured enough to parse.
Preferred first-pass shape:
```md
## Citations
- [7] Ahmed, M. 1975. Speciation in living oysters. Advances in Marine Biology 13:357-397.
- [15] Andrews, J.D. 1979. Pelecypoda: Ostreidae. Reproduction of Marine Invertebrates...
```
This is intentionally simpler than trying to infer citations from arbitrary prose.
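A sketch of the corresponding first-pass line parser; the bracketed number is kept as the legacy in-text marker, and the rest stays plaintext for later enrichment:

```python
import re

CITATION_LINE = re.compile(r"^-\s*\[(\d+)\]\s+(.+)$")


def parse_citation_line(line: str) -> dict | None:
    match = CITATION_LINE.match(line.strip())
    if not match:
        return None
    return {"marker": int(match.group(1)), "text": match.group(2)}


entry = parse_citation_line(
    "- [7] Ahmed, M. 1975. Speciation in living oysters. "
    "Advances in Marine Biology 13:357-397."
)
assert entry is not None and entry["marker"] == 7
```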
## Canonical AST
Markdown should not be the sole internal representation. Parse it into an AST that preserves hierarchy explicitly.
Example conceptual shape:
```json
{
  "metadata": {
    "title": "American Oyster",
    "common_name": "American Oyster",
    "scientific_name": "Crassostrea virginica",
    "legacy_identifiers": [
      {
        "authority": "legacy-ecospecies",
        "identifier": "5192",
        "label": "FLELMR"
      }
    ]
  },
  "nodes": [
    {
      "id": "n1",
      "type": "section",
      "depth": 2,
      "title": "Summary",
      "body": "Short editor-reviewed abstract.",
      "children": []
    },
    {
      "id": "n2",
      "type": "section",
      "depth": 2,
      "title": "Habitat",
      "body": "",
      "children": [
        {
          "id": "n3",
          "type": "section",
          "depth": 3,
          "title": "Type",
          "body": "Estuarine.",
          "children": []
        }
      ]
    }
  ]
}
```
Required AST properties:
- arbitrary hierarchical depth
- stable node identifiers
- separate metadata from body structure
- support for editor audit and provenance
- support for extracting source spans from imported legacy text when available
## Database Direction
The current flat `document_section` model should evolve into a general document tree.
Suggested core tables:
- `species_document`
- `species_document_node`
- `species_document_node_revision`
- `species_document_metadata`
- `citation`
- `species_document_export`
Suggested `species_document_node` fields:
- `id`
- `document_id`
- `parent_id`
- `position`
- `depth`
- `node_type`
- `title`
- `body_markdown`
- `body_plaintext`
- `source_heading`
- `source_span_start`
- `source_span_end`
This enables:
- greater hierarchical depth
- stable editor operations on subtrees
- future insertion of machine-extracted nested content
- simplified export back to Markdown
## Import Flow
The legacy text parser should no longer attempt to infer the final database structure directly.
Instead:
1. Parse raw legacy text into a best-effort intermediate tree.
2. Normalize extracted metadata.
3. Emit constrained Markdown.
4. Parse constrained Markdown into AST.
5. Persist AST and project relationally.
6. Record diagnostics on uncertain conversions.
This changes the parser's role from “infer final structure perfectly” to “produce a reviewable first draft”.
## Editor Flow
The web editor should operate primarily on the Markdown representation, with a structured parse running on save or preview.
Recommended behavior:
- fold by heading depth in CodeMirror
- validate front matter and heading structure
- preview rendered sections
- show parser diagnostics inline
- save both Markdown source and parsed AST
The editor should reject or flag:
- invalid front matter
- duplicate canonical metadata keys
- heading depth jumps
- malformed citation entries in structured sections
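A sketch of the front-matter portion of those checks, assuming PyYAML is available in the API environment; the required-key set is an assumption, and duplicate-key detection would need a stricter loader than `safe_load`:

```python
import yaml  # PyYAML, assumed available

REQUIRED_KEYS = {"title", "scientific_name"}  # assumed minimal set


def validate_front_matter(source: str) -> list[str]:
    if not source.startswith("---\n"):
        return ["missing front matter"]
    parts = source.split("---\n", 2)
    if len(parts) < 3:
        return ["unterminated front matter"]
    try:
        metadata = yaml.safe_load(parts[1]) or {}
    except yaml.YAMLError as exc:
        return [f"invalid front matter: {exc}"]
    if not isinstance(metadata, dict):
        return ["front matter must be a mapping"]
    return [f"missing key: {key}" for key in sorted(REQUIRED_KEYS - metadata.keys())]
```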
## Export Policy
Markdown should be the primary export format for a species life history.
Export targets:
- constrained Markdown for editorial interchange
- JSON AST for machine workflows
- derived relational/API payloads for the application
- optional report-oriented exports later
The export path should be:
- database document tree -> canonical AST -> constrained Markdown
This ensures the exported plaintext remains stable and human-readable.
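A small sketch of the final AST-to-Markdown leg, using the conceptual node shape from the JSON example above (dicts with `depth`, `title`, `body`, `children`):

```python
def emit_markdown(nodes: list[dict]) -> str:
    lines: list[str] = []

    def walk(node: dict) -> None:
        lines.append(f"{'#' * node['depth']} {node['title']}")
        if node.get("body"):
            lines.append(node["body"].strip())
        for child in node.get("children", []):
            walk(child)

    for node in nodes:
        walk(node)
    return "\n\n".join(lines) + "\n"
```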
## Migration Strategy
### Stage 1: Introduce the document model
- add AST schema and persistence tables
- keep existing section-based reads working
- build Markdown import/export helpers
### Stage 2: Convert current parser output
- map current parsed sections into Markdown drafts
- preserve existing metadata and diagnostics
- store generated Markdown alongside current records
### Stage 3: Introduce Markdown editor
- add CodeMirror-based editor with heading folding
- add validation for front matter and heading structure
- add round-trip save through AST
### Stage 4: Move public reads to the new document model
- generate current API responses from the hierarchical document tree
- keep compatibility shims for legacy flat sections where needed
### Stage 5: Expand structured extraction
- add deeper parsing for habitat, reproduction, citations, and linkages
- add richer projections from AST to relational tables
## Immediate Implementation Tasks
Recommended first engineering tasks:
1. Define the constrained Markdown grammar and validation rules.
2. Design the AST schema and PostgreSQL tables.
3. Add Markdown import/export utilities in the API service.
4. Prototype a CodeMirror editor with heading folding.
5. Add a migration command that converts current species records into Markdown drafts.
6. Preserve current endpoints while introducing the document-tree backing model.
## Non-Goals For The First Pass
- full unrestricted Markdown feature support
- WYSIWYG editing
- arbitrary embedded HTML
- perfect citation parsing from all legacy free text
- replacing every existing API shape immediately
## Decision Summary
The planned direction is:
- constrained Markdown as the editable and exportable document format
- internal AST as the canonical application representation
- relational projection for queryable application state
- CodeMirror-based browser editing with heading folding
This is the most practical path toward human-editable hierarchy, permissive-only implementation, cleaner parsing, and deeper long-term document structure.

79
docs/traefik-deploy.md Normal file
View File

@ -0,0 +1,79 @@
# Traefik Deployment Notes
This note applies to the reverse-proxy deployment variant in `docs/docker-compose-traefik.yml`.
## Start The Stack
From the repository root:
```bash
cp docs/docker-compose-traefik.env.example docs/docker-compose-traefik.env
# edit docs/docker-compose-traefik.env
docker compose \
--env-file docs/docker-compose-traefik.env \
-f docs/docker-compose-traefik.yml \
up -d
```
## Common Failure Modes
### Traefik cannot reach the web container
Check:
- the external Docker network named by `TRAEFIK_NETWORK` exists
- the Traefik instance is attached to that same Docker network
- the hostname in `ECOSPECIES_HOSTNAME` matches the Traefik router rule you expect
- the path in `ECOSPECIES_BASE_PATH` matches the published application prefix, for example `/apps/ecospecies`
### The site opens but the API fails
Check:
- the `api` service is healthy and running
- the `web` service is using the repo's `apps/web/nginx.conf`
- the `api` service finished waiting for `importer`
- the request path is under `ECOSPECIES_BASE_PATH` if you are publishing the app below a domain root
### Importer fails on startup
Check:
- `ECOSPECIES_LEGACY_DATA_DIR` points to a real host path
- that path contains `InputFiles - TXT`
- the mount is readable by Docker on the target host
### Database does not initialize
Check:
- `ECOSPECIES_DB_PASSWORD` is set
- the PostgreSQL volume is writable
- an old incompatible volume is not being reused unintentionally
### Editor login works but no editor state is available
Check:
- `ECOSPECIES_AUTH_TOKENS` is set on the `api` service
- the token you entered matches the configured value exactly
## Operational Notes
- This deployment variant intentionally exposes only the `web` container to Traefik.
- The `api`, `db`, and `importer` services stay on the internal Compose network.
- The `importer` runs before the API starts and seeds or synchronizes the dataset.
- The web container serves both the domain root and `/apps/ecospecies/`, but the Traefik router should target the intended public path.
## Apache Front Door
If Apache is the public front door for the hostname in `ECOSPECIES_HOSTNAME`, it must proxy the configured `ECOSPECIES_BASE_PATH` onward. Otherwise Apache can return its own `Not Found` page before the EcoSpecies stack sees the request.
Example Apache directives:
```apache
ProxyPass /apps/ecospecies http://127.0.0.1:80/apps/ecospecies
ProxyPassReverse /apps/ecospecies http://127.0.0.1:80/apps/ecospecies
```
Point the backend address at the actual Traefik listener on the host if it is not `127.0.0.1:80`, and adjust the published path if `ECOSPECIES_BASE_PATH` is different.

View File

@ -0,0 +1,185 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
from pathlib import Path

from ecospecies_api.repository import (
    get_editor_species_citations,
    get_editor_species_list,
    update_species_citation_enrichment,
)


def should_backfill(citation: dict[str, object], include_accepted: bool) -> bool:
    review_status = str(citation.get("review_status", "")).strip().lower()
    source_type = str(citation.get("source_type", "")).strip().lower()
    enrichment_status = str(citation.get("enrichment_status", "")).strip().lower()
    normalized_text = str(citation.get("normalized_text", "")).strip()
    abstract_text = str(citation.get("abstract_text", "")).strip()
    if not include_accepted and review_status == "accepted":
        return False
    if source_type in {"editor_selected_candidate", "editor_added_candidate"} and not include_accepted:
        return False
    return (
        source_type in {"document_extract", "editor_review", ""}
        or enrichment_status in {"pending", "unresolved", "error", ""}
        or not normalized_text
        or not abstract_text
    )


def reorder_species_with_cursor(
    species_items: list[dict[str, object]],
    state_file: Path | None,
) -> list[dict[str, object]]:
    if not state_file or not species_items:
        return species_items
    try:
        last_slug = state_file.read_text(encoding="utf-8").strip()
    except FileNotFoundError:
        return species_items
    if not last_slug:
        return species_items
    for index, item in enumerate(species_items):
        if str(item.get("slug", "")).strip() == last_slug:
            return species_items[index + 1 :] + species_items[: index + 1]
    return species_items


def write_cursor(state_file: Path | None, slug: str) -> None:
    if not state_file or not slug:
        return
    state_file.parent.mkdir(parents=True, exist_ok=True)
    state_file.write_text(f"{slug}\n", encoding="utf-8")


def main() -> int:
    parser = argparse.ArgumentParser(description="Backfill EcoSpecies citation enrichment.")
    parser.add_argument("--slug", help="Limit the backfill to a single species slug.")
    parser.add_argument("--username", default="citation-backfill", help="Audit username to record.")
    parser.add_argument(
        "--include-accepted",
        action="store_true",
        help="Also rerun accepted/editor-curated citations.",
    )
    parser.add_argument(
        "--max-species",
        type=int,
        default=0,
        help="Stop after this many species with eligible citations. 0 means no limit.",
    )
    parser.add_argument(
        "--max-citations",
        type=int,
        default=0,
        help="Stop after this many citations overall. 0 means no limit.",
    )
    parser.add_argument(
        "--state-file",
        help="Optional cursor file used to rotate scheduled runs through the species list.",
    )
    args = parser.parse_args()

    state_file = Path(args.state_file).expanduser() if args.state_file else None
    species_items = (
        [item for item in get_editor_species_list() if item["slug"] == args.slug]
        if args.slug
        else get_editor_species_list()
    )
    if not args.slug:
        species_items = reorder_species_with_cursor(species_items, state_file)
    if args.slug and not species_items:
        print(f"Species not found: {args.slug}")
        return 1

    species_count = 0
    citation_count = 0
    changed_count = 0
    resolved_count = 0
    unresolved_count = 0
    error_count = 0
    last_seen_slug = ""
    for species in species_items:
        if args.max_species and species_count >= args.max_species:
            break
        slug = str(species["slug"])
        last_seen_slug = slug
        citation_payload = get_editor_species_citations(slug)
        if citation_payload is None:
            continue
        eligible = [
            citation
            for citation in citation_payload["citations"]
            if should_backfill(citation, include_accepted=args.include_accepted)
        ]
        if not eligible:
            continue
        species_count += 1
        print(f"[{slug}] backfilling {len(eligible)} citation(s)", flush=True)
        for citation in eligible:
            if args.max_citations and citation_count >= args.max_citations:
                write_cursor(state_file, last_seen_slug)
                print("citation limit reached; stopping early", flush=True)
                print(
                    "summary:"
                    f" species={species_count}"
                    f" citations={citation_count}"
                    f" changed={changed_count}"
                    f" resolved={resolved_count}"
                    f" unresolved={unresolved_count}"
                    f" errors={error_count}",
                    flush=True,
                )
                return 0
            citation_count += 1
            result = update_species_citation_enrichment(
                slug=slug,
                citation_id=int(citation["id"]),
                username=args.username,
            )
            if result is None:
                print(f" - citation {citation['id']}: skipped (not found)", flush=True)
                continue
            changed_fields = result.get("changed_fields", {})
            status = str(result["citation"].get("enrichment_status", "")).strip().lower()
            if changed_fields:
                changed_count += 1
            if status == "resolved":
                resolved_count += 1
            elif status == "unresolved":
                unresolved_count += 1
            elif status == "error":
                error_count += 1
            print(
                f" - citation {citation['id']}: {status or 'unknown'}"
                + (f" ({len(changed_fields)} field changes)" if changed_fields else ""),
                flush=True,
            )

    write_cursor(state_file, last_seen_slug)
    print(
        "summary:"
        f" species={species_count}"
        f" citations={citation_count}"
        f" changed={changed_count}"
        f" resolved={resolved_count}"
        f" unresolved={unresolved_count}"
        f" errors={error_count}",
        flush=True,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -0,0 +1,28 @@
#!/bin/sh
set -eu
ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
ENV_FILE="${ECOSPECIES_ENV_FILE:-$ROOT_DIR/docs/docker-compose-traefik.env}"
COMPOSE_FILE="${ECOSPECIES_COMPOSE_FILE:-$ROOT_DIR/docs/docker-compose-traefik.yml}"
OUTPUT_FILE="${1:-$ROOT_DIR/ecospecies-backup.sql}"
if [ ! -f "$ENV_FILE" ]; then
echo "Missing env file: $ENV_FILE" >&2
exit 1
fi
set -a
. "$ENV_FILE"
set +a
DB_USER="${ECOSPECIES_DB_USER:-ecospecies}"
DB_NAME="${ECOSPECIES_DB_NAME:-ecospecies}"
docker compose \
  --env-file "$ENV_FILE" \
  -f "$COMPOSE_FILE" \
  exec -T db \
  pg_dump -U "$DB_USER" "$DB_NAME" \
  > "$OUTPUT_FILE"
printf 'Backup written to %s\n' "$OUTPUT_FILE"

View File

@ -0,0 +1,37 @@
#!/bin/sh
set -eu
ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
ENV_FILE="${ECOSPECIES_ENV_FILE:-$ROOT_DIR/docs/docker-compose-traefik.env}"
COMPOSE_FILE="${ECOSPECIES_COMPOSE_FILE:-$ROOT_DIR/docs/docker-compose-traefik.yml}"
INPUT_FILE="${1:-}"
if [ -z "$INPUT_FILE" ]; then
echo "Usage: $0 <sql-backup-file>" >&2
exit 1
fi
if [ ! -f "$ENV_FILE" ]; then
echo "Missing env file: $ENV_FILE" >&2
exit 1
fi
if [ ! -f "$INPUT_FILE" ]; then
echo "Missing backup file: $INPUT_FILE" >&2
exit 1
fi
set -a
. "$ENV_FILE"
set +a
DB_USER="${ECOSPECIES_DB_USER:-ecospecies}"
DB_NAME="${ECOSPECIES_DB_NAME:-ecospecies}"
cat "$INPUT_FILE" | docker compose \
--env-file "$ENV_FILE" \
-f "$COMPOSE_FILE" \
exec -T db \
psql -U "$DB_USER" "$DB_NAME"
printf 'Restore completed from %s\n' "$INPUT_FILE"

View File

@ -0,0 +1,21 @@
#!/bin/sh
set -eu
ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
LOG_DIR="${ECOSPECIES_BACKFILL_LOG_DIR:-$ROOT_DIR/var/logs}"
STATE_FILE="${ECOSPECIES_BACKFILL_STATE_FILE:-$ROOT_DIR/var/citation-backfill.cursor}"
LOCK_DIR="${ECOSPECIES_BACKFILL_LOCK_DIR:-$ROOT_DIR/var/citation-backfill.lock}"
MAX_SPECIES="${ECOSPECIES_BACKFILL_MAX_SPECIES:-3}"
mkdir -p "$LOG_DIR"
mkdir -p "$ROOT_DIR/var"
if ! mkdir "$LOCK_DIR" 2>/dev/null; then
echo "citation backfill already running; skipping"
exit 0
fi
trap 'rmdir "$LOCK_DIR"' EXIT INT TERM
exec docker exec ecospecies-api /bin/sh -lc \
"PYTHONPATH=/workspace/apps/api/src /workspace/.docker/venv/bin/python -u /workspace/scripts/backfill-citations.py --username citation-backfill --max-species ${MAX_SPECIES} --state-file ${STATE_FILE}"