Prepare public-safe repo update

parent a6b04a995a
commit 1143f9bfcc
@@ -7,3 +7,12 @@ __pycache__/
 node_modules/
 test-results/
 playwright-report/
+*~
+*.env
+secrets*
+codex*
+restart.sh
+*lock.json
+input-data/
+legacy-data
+var/logs/
README.md (19 changed lines)
@@ -21,7 +21,7 @@ Docker Compose owns all runtime dependencies:
 - Python services run in `python:3.12-slim`
 - the Python virtual environment is created in a Docker-managed volume mounted at `/workspace/.docker/venv`
 - dependencies are installed from `apps/api/requirements.txt` inside that virtual environment
-- the legacy corpus is mounted read-only from `../01-legacy-code-and-data`
+- the legacy corpus is mounted read-only from a sibling directory, defaulting to `../legacy-corpus`
 
 No host Python packages are required for the Compose workflow.
 
@@ -48,6 +48,13 @@ Endpoints:
 - editor section detail/update: `/api/editor/species/<slug>/sections/<position>` (requires `editor` or `admin`)
 - editor audit history: `/api/editor/species/<slug>/audit` (requires `editor` or `admin`)
 
+The app can also be published under a URL prefix. A reverse-proxy deployment can publish the app at a host and path such as:
+
+- `ECOSPECIES_HOSTNAME=example.org`
+- `ECOSPECIES_BASE_PATH=/apps/ecospecies`
+
+When the site is served below a path prefix, the frontend derives its API base from the current page URL and nginx serves both the UI and proxied API under that same prefix.
+
 If those host ports are already in use, override them when starting Compose, for example:
 
 ```bash
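Note (illustrative, not part of this commit): the README hunk above says the frontend derives its API base from the current page URL when served under `ECOSPECIES_BASE_PATH`. A minimal sketch of that derivation, with a hypothetical helper name and made-up URLs:

```python
# Illustrative sketch only: deriving an API base from the page URL when the
# site sits under a path prefix. derive_api_base is a hypothetical name.
from urllib.parse import urlparse


def derive_api_base(page_url: str, known_prefix: str = "/apps/ecospecies") -> str:
    """Return the API base for a page served at or below a known prefix."""
    path = urlparse(page_url).path
    # If the page lives under the prefix, the API is proxied under it too.
    if path == known_prefix or path.startswith(known_prefix + "/"):
        return known_prefix + "/api"
    return "/api"


assert derive_api_base("https://example.org/apps/ecospecies/species/manatee") == "/apps/ecospecies/api"
assert derive_api_base("https://example.org/species/manatee") == "/api"
```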
@@ -87,6 +94,14 @@ Run the browser-level smoke test against the real Compose stack with:
 ./scripts/check-ui-stack-smoke.sh
 ```
 
+Run a bounded citation backfill pass with:
+
+```bash
+./scripts/run-citation-backfill.sh
+```
+
+The wrapper runs inside `ecospecies-api`, keeps a rotating cursor in `var/citation-backfill.cursor`, and skips a run if another backfill is already active.
+
 ## Notes
 
 - The importer seeds PostgreSQL from the legacy text corpus before the API starts and now synchronizes by slug instead of truncating the full dataset.
@@ -98,6 +113,8 @@ Run the browser-level smoke test against the real Compose stack with:
 - Initial editor auth uses `ECOSPECIES_AUTH_TOKENS` in the format `token:username:role[,token2:username2:role2]`, where `role` is `viewer`, `editor`, or `admin`.
 - Editorial workflow state is persisted per species with `draft`, `review`, and `published` statuses. Public endpoints return only `published` records; editor endpoints can inspect and update all records.
 - Editors can curate top-level metadata and section content from the web UI, and every editorial or section change is recorded in per-species audit history.
+- Citation backfill can be scheduled externally, such as with a nightly cron job that runs `./scripts/run-citation-backfill.sh`. Use `ECOSPECIES_BACKFILL_LOG_DIR` if logs should go somewhere other than `var/logs`.
+- Unresolved citation enrichment now still refreshes the locally parsed BibTeX and normalized citation text, so parser improvements propagate even without a remote metadata match.
 - Summary authoring guidance for future FLELMR-compatible records is in `docs/flelmr-authoring.md`.
 - Legacy survey and roadmap artifacts are in `docs/`.
 
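Note (illustrative, not part of this commit): the Notes hunk documents bearer tokens from `ECOSPECIES_AUTH_TOKENS` and role-gated editor endpoints. A minimal sketch of exercising one such endpoint; the token value, host, and port are invented assumptions:

```python
# Illustrative sketch only: calling a role-gated endpoint with a bearer token.
# "s3cret-editor-token" would come from an ECOSPECIES_AUTH_TOKENS entry such as
# "s3cret-editor-token:jane:editor"; localhost:8000 is an assumed bind address.
import json
import urllib.request

token = "s3cret-editor-token"
request = urllib.request.Request(
    "http://localhost:8000/api/editor/status",
    headers={"Authorization": f"Bearer {token}"},
)
with urllib.request.urlopen(request) as response:
    print(json.loads(response.read()))
```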
@@ -15,17 +15,36 @@ from ecospecies_api.auth import (
 )
 from ecospecies_api.parser import get_default_data_dir, load_species_records
 from ecospecies_api.repository import (
+    add_species_citation_from_candidate,
+    apply_species_citation_candidate_selection,
+    create_contributor_species,
+    get_contributor_species_citations,
+    get_contributor_species_detail,
+    get_contributor_species_document,
+    get_contributor_species_list,
+    get_species_citation_candidates,
+    get_editor_species_citations,
     get_editor_species_detail,
+    get_species_document,
     get_editor_species_list,
     get_editor_species_workflow,
+    get_minimum_contributor_age,
     get_species_by_slug,
     list_species_audit,
+    list_public_bibliography,
     get_readiness_status,
     get_summary_metrics,
     has_species_data,
     import_species_payload,
     list_diagnostics,
     list_species,
+    register_contributor,
+    update_species_citation_enrichment,
+    backfill_species_citations,
+    update_species_citations_enrichment_batch,
+    update_species_citation_review,
+    update_contributor_species_document_markdown,
+    update_species_document_markdown,
     update_species_section,
     update_species_editorial,
 )
@@ -99,6 +118,7 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
                 {
                     "authenticated": session is not None,
                     "auth_configured": auth_is_configured(),
+                    "minimum_contributor_age": get_minimum_contributor_age(),
                     "user": (
                         {"username": session.username, "role": session.role}
                         if session is not None
@@ -108,6 +128,23 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             )
             return
 
+        if path == "/api/contributor/status":
+            if not self.require_role(session, "contributor"):
+                return
+            self.write_json(
+                {
+                    "status": "ok",
+                    "contributor_access": True,
+                    "user": {"username": session.username, "role": session.role},
+                    "minimum_age": get_minimum_contributor_age(),
+                    "capabilities": [
+                        "create_species_draft",
+                        "edit_owned_drafts",
+                    ],
+                }
+            )
+            return
+
         if path == "/api/editor/status":
             if not self.require_role(session, "editor"):
                 return
@@ -135,10 +172,42 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
                     "slug": item["slug"],
                     "title": item["title"],
                     "common_name": item["common_name"],
+                    "scientific_name": item["scientific_name"],
+                    "legacy_identifiers": item["legacy_identifiers"],
+                    "taxon_identifiers": item["taxon_identifiers"],
+                    "primary_taxon_authority": item["primary_taxon_authority"],
+                    "primary_taxon_identifier": item["primary_taxon_identifier"],
                     "publication_status": item["publication_status"],
                     "is_archived": item["is_archived"],
                     "last_modified_by": item["last_modified_by"],
                     "diagnostic_count": len(item["diagnostics"]),
+                    "summary": item["summary"],
+                }
+                for item in items
+            ]
+            self.write_json({"items": compact, "count": len(compact)})
+            return
+
+        if path == "/api/contributor/species":
+            if not self.require_role(session, "contributor"):
+                return
+            search = query.get("search", [""])[0].strip().lower()
+            items = get_contributor_species_list(session.username, search)
+            compact = [
+                {
+                    "slug": item["slug"],
+                    "title": item["title"],
+                    "common_name": item["common_name"],
+                    "scientific_name": item["scientific_name"],
+                    "legacy_identifiers": item["legacy_identifiers"],
+                    "taxon_identifiers": item["taxon_identifiers"],
+                    "primary_taxon_authority": item["primary_taxon_authority"],
+                    "primary_taxon_identifier": item["primary_taxon_identifier"],
+                    "publication_status": item["publication_status"],
+                    "is_archived": item["is_archived"],
+                    "last_modified_by": item["last_modified_by"],
+                    "diagnostic_count": len(item["diagnostics"]),
+                    "summary": item["summary"],
                 }
                 for item in items
             ]
@@ -176,7 +245,68 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
             return
 
-        if path.startswith("/api/editor/species/") and not path.endswith("/workflow") and not path.endswith("/editorial") and not path.endswith("/audit"):
+        if path.startswith("/api/editor/species/") and path.endswith("/document"):
+            if not self.require_role(session, "editor"):
+                return
+            slug = path[len("/api/editor/species/") : -len("/document")].strip("/")
+            item = get_species_document(slug)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/editor/species/") and path.endswith("/citations"):
+            if not self.require_role(session, "editor"):
+                return
+            slug = path[len("/api/editor/species/") : -len("/citations")].strip("/")
+            item = get_editor_species_citations(slug)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/editor/species/") and "/citations/" in path and path.endswith("/candidates"):
+            if not self.require_role(session, "editor"):
+                return
+            slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/")
+            citation_tail = tail[: -len("/candidates")].strip("/")
+            try:
+                citation_id = int(citation_tail)
+            except ValueError:
+                self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                return
+            item = get_species_citation_candidates(slug.strip("/"), citation_id)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/contributor/species/") and path.endswith("/document"):
+            if not self.require_role(session, "contributor"):
+                return
+            slug = path[len("/api/contributor/species/") : -len("/document")].strip("/")
+            item = get_contributor_species_document(slug, session.username)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/contributor/species/") and path.endswith("/citations"):
+            if not self.require_role(session, "contributor"):
+                return
+            slug = path[len("/api/contributor/species/") : -len("/citations")].strip("/")
+            item = get_contributor_species_citations(slug, session.username)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
+        if path.startswith("/api/editor/species/") and not path.endswith("/workflow") and not path.endswith("/editorial") and not path.endswith("/audit") and not path.endswith("/document"):
             if not self.require_role(session, "editor"):
                 return
             slug = path[len("/api/editor/species/") :].strip("/")
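Note (illustrative, not part of this commit): the candidate-lookup branch above splits the slug and numeric citation id by partitioning the path on `/citations/`. A minimal sketch of that parsing, with an invented slug:

```python
# Illustrative sketch only: the partition-based routing used by the handler.
# "west-indian-manatee" and 42 are made-up example values.
path = "/api/editor/species/west-indian-manatee/citations/42/candidates"
slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/")
citation_id = int(tail[: -len("/candidates")].strip("/"))
assert (slug.strip("/"), citation_id) == ("west-indian-manatee", 42)
```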
@@ -187,6 +317,17 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             self.write_json(item)
             return
 
+        if path.startswith("/api/contributor/species/") and not path.endswith("/document"):
+            if not self.require_role(session, "contributor"):
+                return
+            slug = path[len("/api/contributor/species/") :].strip("/")
+            item = get_contributor_species_detail(slug, session.username)
+            if item is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+            self.write_json(item)
+            return
+
         if path.startswith("/api/editor/species/") and path.endswith("/workflow"):
             if not self.require_role(session, "editor"):
                 return
@@ -215,6 +356,12 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             self.write_json({"items": flagged, "count": len(flagged)})
             return
 
+        if path == "/api/bibliography":
+            search = query.get("search", [""])[0].strip()
+            items = list_public_bibliography(search=search)
+            self.write_json({"items": items, "count": len(items)})
+            return
+
         if path == "/api/species":
             search = query.get("search", [""])[0].strip().lower()
             species = list_species(search)
@@ -225,6 +372,10 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
                     "common_name": item["common_name"],
                     "scientific_name": item["scientific_name"],
                     "flelmr_code": item["flelmr_code"],
+                    "legacy_identifiers": item["legacy_identifiers"],
+                    "taxon_identifiers": item["taxon_identifiers"],
+                    "primary_taxon_authority": item["primary_taxon_authority"],
+                    "primary_taxon_identifier": item["primary_taxon_identifier"],
                     "summary": item["summary"],
                     "section_count": item["section_count"],
                     "diagnostic_count": len(item["diagnostics"]),
@@ -250,6 +401,47 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
         path = parsed.path
         session = resolve_auth_session(self.headers)
 
+        if path == "/api/contributor/register":
+            payload = self.read_json_body()
+            if payload is None:
+                return
+            email = payload.get("email")
+            age_gate_confirmed = payload.get("age_gate_confirmed")
+            if not isinstance(email, str):
+                self.write_json({"error": "email must be a string"}, status=HTTPStatus.BAD_REQUEST)
+                return
+            if not isinstance(age_gate_confirmed, bool):
+                self.write_json(
+                    {"error": "age_gate_confirmed must be a boolean"},
+                    status=HTTPStatus.BAD_REQUEST,
+                )
+                return
+            try:
+                result = register_contributor(email=email, age_gate_confirmed=age_gate_confirmed)
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+            self.write_json({"status": "ok", **result}, status=HTTPStatus.CREATED)
+            return
+
+        if path == "/api/contributor/species":
+            if not self.require_role(session, "contributor"):
+                return
+            payload = self.read_json_body()
+            if payload is None:
+                return
+            markdown = payload.get("markdown")
+            if markdown is not None and not isinstance(markdown, str):
+                self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
+                return
+            try:
+                result = create_contributor_species(session.username, markdown)
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+            self.write_json({"status": "ok", **result}, status=HTTPStatus.CREATED)
+            return
+
         if path.startswith("/api/editor/species/") and path.endswith("/workflow"):
             if not self.require_role(session, "editor"):
                 return
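Note (illustrative, not part of this commit): the registration branch above validates that `email` is a string and `age_gate_confirmed` is a boolean, returning 201 on success. A minimal client sketch; the email value, host, and port are invented:

```python
# Illustrative sketch only: posting a registration payload to the new
# contributor endpoint. localhost:8000 is an assumed bind address.
import json
import urllib.request

body = json.dumps({"email": "new.contributor@example.org", "age_gate_confirmed": True}).encode("utf-8")
request = urllib.request.Request(
    "http://localhost:8000/api/contributor/register",
    data=body,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(request) as response:
    print(response.status)  # 201 Created on success
    print(json.loads(response.read()))
```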
@@ -341,6 +533,229 @@ class EcoSpeciesHandler(BaseHTTPRequestHandler):
             self.write_json({"status": "ok", **result})
             return
 
+        if path.startswith("/api/editor/species/") and path.endswith("/document"):
+            if not self.require_role(session, "editor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            markdown = payload.get("markdown")
+            if not isinstance(markdown, str):
+                self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            slug = path[len("/api/editor/species/") : -len("/document")].strip("/")
+            try:
+                result = update_species_document_markdown(
+                    slug=slug,
+                    markdown=markdown,
+                    username=session.username,
+                )
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
+        if (
+            path.startswith("/api/editor/species/")
+            and "/citations/" in path
+            and not path.endswith("/citations/enrich")
+            and not path.endswith("/citations/backfill")
+        ):
+            if not self.require_role(session, "editor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            slug, _, tail = path[len("/api/editor/species/") :].partition("/citations/")
+            if tail.endswith("/enrich"):
+                citation_tail = tail[: -len("/enrich")].strip("/")
+                try:
+                    citation_id = int(citation_tail)
+                except ValueError:
+                    self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+
+                result = update_species_citation_enrichment(
+                    slug=slug.strip("/"),
+                    citation_id=citation_id,
+                    username=session.username,
+                )
+                if result is None:
+                    self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                    return
+
+                self.write_json({"status": "ok", **result})
+                return
+
+            if tail.endswith("/apply-match"):
+                citation_tail = tail[: -len("/apply-match")].strip("/")
+                try:
+                    citation_id = int(citation_tail)
+                except ValueError:
+                    self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+                candidate = payload.get("candidate")
+                if not isinstance(candidate, dict):
+                    self.write_json({"error": "candidate must be an object"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+                result = apply_species_citation_candidate_selection(
+                    slug=slug.strip("/"),
+                    citation_id=citation_id,
+                    candidate=candidate,
+                    username=session.username,
+                )
+                if result is None:
+                    self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                    return
+                self.write_json({"status": "ok", **result})
+                return
+
+            if tail.endswith("/add-match"):
+                citation_tail = tail[: -len("/add-match")].strip("/")
+                try:
+                    citation_id = int(citation_tail)
+                except ValueError:
+                    self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+                candidate = payload.get("candidate")
+                if not isinstance(candidate, dict):
+                    self.write_json({"error": "candidate must be an object"}, status=HTTPStatus.BAD_REQUEST)
+                    return
+                result = add_species_citation_from_candidate(
+                    slug=slug.strip("/"),
+                    citation_id=citation_id,
+                    candidate=candidate,
+                    username=session.username,
+                )
+                if result is None:
+                    self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                    return
+                self.write_json({"status": "ok", **result})
+                return
+
+            try:
+                citation_id = int(tail.strip("/"))
+            except ValueError:
+                self.write_json({"error": "Invalid citation id"}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            for field in ("review_status", "normalized_text", "abstract_text", "doi", "citation_key", "entry_type", "draft_bibtex"):
+                value = payload.get(field)
+                if value is not None and not isinstance(value, str):
+                    self.write_json(
+                        {"error": f"{field} must be a string"},
+                        status=HTTPStatus.BAD_REQUEST,
+                    )
+                    return
+
+            try:
+                result = update_species_citation_review(
+                    slug=slug.strip("/"),
+                    citation_id=citation_id,
+                    review_status=payload.get("review_status"),
+                    normalized_text=payload.get("normalized_text"),
+                    doi=payload.get("doi"),
+                    citation_key=payload.get("citation_key"),
+                    entry_type=payload.get("entry_type"),
+                    draft_bibtex=payload.get("draft_bibtex"),
+                    abstract_text=payload.get("abstract_text"),
+                    username=session.username,
+                )
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
+        if path.startswith("/api/editor/species/") and path.endswith("/citations/enrich"):
+            if not self.require_role(session, "editor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            slug = path[len("/api/editor/species/") : -len("/citations/enrich")].strip("/")
+            result = update_species_citations_enrichment_batch(
+                slug=slug,
+                username=session.username,
+            )
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
+        if path.startswith("/api/editor/species/") and path.endswith("/citations/backfill"):
+            if not self.require_role(session, "editor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            slug = path[len("/api/editor/species/") : -len("/citations/backfill")].strip("/")
+            include_accepted = bool(payload.get("include_accepted", False))
+            result = backfill_species_citations(
+                slug=slug,
+                username=session.username,
+                include_accepted=include_accepted,
+            )
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
+        if path.startswith("/api/contributor/species/") and path.endswith("/document"):
+            if not self.require_role(session, "contributor"):
+                return
+
+            payload = self.read_json_body()
+            if payload is None:
+                return
+
+            markdown = payload.get("markdown")
+            if not isinstance(markdown, str):
+                self.write_json({"error": "markdown must be a string"}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            slug = path[len("/api/contributor/species/") : -len("/document")].strip("/")
+            try:
+                result = update_contributor_species_document_markdown(
+                    slug=slug,
+                    markdown=markdown,
+                    username=session.username,
+                )
+            except ValueError as exc:
+                self.write_json({"error": str(exc)}, status=HTTPStatus.BAD_REQUEST)
+                return
+
+            if result is None:
+                self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
+                return
+
+            self.write_json({"status": "ok", **result})
+            return
+
         self.write_json({"error": "Not found"}, status=HTTPStatus.NOT_FOUND)
 
     def log_message(self, format: str, *args: object) -> None:
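Note (illustrative, not part of this commit): the citation-review branch above accepts optional string fields and rejects anything else with 400. A sketch of one such payload; every value is invented for the example:

```python
# Illustrative sketch only: a review payload POSTed to
# /api/editor/species/<slug>/citations/<id>. All fields are optional strings.
review_payload = {
    "review_status": "accepted",
    "normalized_text": "Smith, J. 1994. Seagrass recovery in estuaries. Journal of Coastal Research 10(2): 301-312.",
    "doi": "10.0000/example-doi",
    "citation_key": "smith1994seagrassrecoveryestuaries",
    "entry_type": "article",
}
```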
@@ -1,14 +1,21 @@
 from __future__ import annotations
 
+import hashlib
 import os
 from dataclasses import dataclass
 from typing import Mapping
 
+from sqlalchemy import select
+
+from ecospecies_api.db import SessionLocal, create_db_engine
+from ecospecies_api.models import Base, ContributorAccount
+
 
 ROLE_ORDER = {
     "viewer": 1,
-    "editor": 2,
-    "admin": 3,
+    "contributor": 2,
+    "editor": 3,
+    "admin": 4,
 }
 
 
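Note (illustrative, not part of this commit): `ROLE_ORDER` now ranks viewer < contributor < editor < admin. `require_role` itself is not shown in this diff, so the sketch below is an assumption about how such a rank check is commonly written, not the project's actual code:

```python
# Illustrative sketch only: a rank comparison against ROLE_ORDER.
# role_satisfies is a hypothetical helper name.
ROLE_ORDER = {"viewer": 1, "contributor": 2, "editor": 3, "admin": 4}


def role_satisfies(session_role: str, required_role: str) -> bool:
    """True when the session's role ranks at or above the required role."""
    return ROLE_ORDER.get(session_role, 0) >= ROLE_ORDER.get(required_role, 0)


assert role_satisfies("admin", "editor")        # admins can use editor endpoints
assert role_satisfies("editor", "contributor")  # editors outrank contributors
assert not role_satisfies("contributor", "editor")
```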
@@ -41,17 +48,27 @@ def _parse_token_entry(entry: str) -> tuple[str, AuthSession]:
 
 
 def get_token_registry() -> dict[str, AuthSession]:
-    configured = os.environ.get("ECOSPECIES_AUTH_TOKENS", "").strip()
-    if not configured:
-        return {}
-
     registry: dict[str, AuthSession] = {}
+    configured = os.environ.get("ECOSPECIES_AUTH_TOKENS", "").strip()
+    if configured:
         for raw_entry in configured.split(","):
             entry = raw_entry.strip()
             if not entry:
                 continue
             token, session = _parse_token_entry(entry)
             registry[token] = session
+
+    engine = create_db_engine()
+    Base.metadata.create_all(engine)
+    with SessionLocal() as session:
+        for account in session.scalars(
+            select(ContributorAccount).where(ContributorAccount.is_active.is_(True))
+        ):
+            registry[account.token_hash] = AuthSession(
+                token=account.token_hash,
+                username=account.email,
+                role="contributor",
+            )
     return registry
 
 
@@ -70,7 +87,11 @@ def resolve_auth_session(headers: Mapping[str, str]) -> AuthSession | None:
     token = get_bearer_token(headers)
     if not token:
         return None
-    return registry.get(token)
+    direct = registry.get(token)
+    if direct is not None:
+        return direct
+    token_hash = hashlib.sha256(token.encode("utf-8")).hexdigest()
+    return registry.get(token_hash)
 
 
 def auth_is_configured() -> bool:
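Note (illustrative, not part of this commit): contributor tokens are registered under their SHA-256 hex digest (`account.token_hash`), so `resolve_auth_session` now falls back to a hashed lookup when the raw bearer token is not a direct key. A minimal sketch of that round trip with an invented token:

```python
# Illustrative sketch only: the hashed-token fallback. A contributor row
# stores sha256(raw_token) as token_hash; presenting the raw token still
# resolves once it is hashed. The token value is made up.
import hashlib

raw_token = "contributor-raw-token"
stored_token_hash = hashlib.sha256(raw_token.encode("utf-8")).hexdigest()

registry = {stored_token_hash: ("new.contributor@example.org", "contributor")}

# Direct lookup misses (the raw token is not a key); the hashed lookup hits.
assert registry.get(raw_token) is None
assert registry.get(hashlib.sha256(raw_token.encode("utf-8")).hexdigest()) is not None
```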
(File diff suppressed because it is too large.)
@@ -0,0 +1,387 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+import re
+import sys
+
+
+def _load_citegeist_extract():
+    citegeist_src = Path(__file__).resolve().parents[5] / "CiteGeist" / "src"
+    if citegeist_src.exists() and str(citegeist_src) not in sys.path:
+        sys.path.insert(0, str(citegeist_src))
+    try:
+        from citegeist.extract import extract_references  # type: ignore
+    except ImportError:
+        return None
+    return extract_references
+
+
+@dataclass
+class DraftCitation:
+    citation_key: str
+    entry_type: str
+    fields: dict[str, str]
+    draft_bibtex: str
+
+
+STOPWORD_TOKENS = {
+    "a",
+    "an",
+    "and",
+    "for",
+    "from",
+    "in",
+    "of",
+    "on",
+    "the",
+    "to",
+    "with",
+}
+HISTORICAL_YEAR_PATTERN = r"(1\d{3}|20\d{2})"
+
+
+def build_standard_citation_key(
+    authors: str = "",
+    year: str = "",
+    title: str = "",
+    fallback_text: str = "",
+) -> str:
+    family_name = _family_name_stem(authors or fallback_text)
+    year_stem = re.sub(r"[^0-9]+", "", year)[:4]
+    topic_stem = _topic_stem(title or fallback_text)
+    key = f"{family_name}{year_stem}{topic_stem}"
+    return key or "reference"
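Note (illustrative, not part of this commit): `build_standard_citation_key` concatenates a lowercased family-name stem, a four-digit year stem, and up to three non-stopword title tokens. A quick sketch with an invented reference:

```python
# Illustrative sketch only: what build_standard_citation_key produces for a
# made-up reference, assuming the module's functions are importable.
key = build_standard_citation_key(
    authors="Smith, J.",
    year="1994",
    title="Seagrass recovery in estuaries",
)
assert key == "smith1994seagrassrecoveryestuaries"  # "in" is a stopword and is dropped
```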
+
+
+def extract_draft_citation(raw_text: str, legacy_reference_number: str = "") -> DraftCitation | None:
+    extractor = _load_citegeist_extract()
+    if extractor is None:
+        return _fallback_citation(raw_text, legacy_reference_number)
+
+    entries = extractor(raw_text)
+    if not entries:
+        return _fallback_citation(raw_text, legacy_reference_number)
+
+    entry = entries[0]
+    fields = dict(entry.fields)
+    fields = _repair_reference_fields(raw_text, fields)
+    citation_key = build_standard_citation_key(
+        authors=str(fields.get("author", "")),
+        year=str(fields.get("year", "")),
+        title=str(fields.get("title", "")),
+        fallback_text=raw_text,
+    )
+    note_parts = [fields.get("note", "").strip()] if fields.get("note") else []
+    if legacy_reference_number:
+        note_parts.append(f"ecospecies_reference_number = {{{legacy_reference_number}}}")
+    fields["note"] = "; ".join(part for part in note_parts if part)
+    draft_bibtex = render_single_bibtex(entry.entry_type, citation_key, fields)
+    return DraftCitation(
+        citation_key=citation_key,
+        entry_type=entry.entry_type,
+        fields=fields,
+        draft_bibtex=draft_bibtex,
+    )
+
+
+def _fallback_citation(raw_text: str, legacy_reference_number: str) -> DraftCitation:
+    year_match = re.search(rf"\b{HISTORICAL_YEAR_PATTERN}\b", raw_text)
+    year = year_match.group(0) if year_match else ""
+    fields = _repair_reference_fields(
+        raw_text,
+        {
+            "title": raw_text.strip(),
+            "year": year,
+        },
+    )
+    title = str(fields.get("title", "")).strip() or raw_text.strip()
+    citation_key = build_standard_citation_key(year=year, title=title, fallback_text=raw_text)
+    fields["note"] = f"raw_reference = {{{raw_text}}}"
+    if legacy_reference_number:
+        fields["note"] += f"; ecospecies_reference_number = {{{legacy_reference_number}}}"
+    draft_bibtex = render_single_bibtex("misc", citation_key, fields)
+    return DraftCitation(
+        citation_key=citation_key,
+        entry_type="misc",
+        fields=fields,
+        draft_bibtex=draft_bibtex,
+    )
+
+
+def _family_name_stem(raw_text: str) -> str:
+    compact = raw_text.strip()
+    if not compact:
+        return "ref"
+    if "," in compact:
+        compact = compact.split(",", 1)[0]
+    else:
+        compact = compact.split()[0]
+    compact = re.sub(r"[^A-Za-z0-9]+", "", compact).lower()
+    return compact or "ref"
+
+
+def _topic_stem(raw_text: str) -> str:
+    tokens = [
+        token
+        for token in re.findall(r"[A-Za-z0-9]+", raw_text.lower())
+        if token not in STOPWORD_TOKENS and not token.isdigit()
+    ]
+    topic_tokens = tokens[:3] or ["topic"]
+    return "".join(topic_tokens)
+
+
+def _repair_reference_fields(raw_text: str, fields: dict[str, str]) -> dict[str, str]:
+    repaired = dict(fields)
+    title = str(repaired.get("title", "")).strip()
+    raw = raw_text.strip()
+    if not raw:
+        return repaired
+
+    parsed = _parse_report_style_reference(raw)
+    if parsed is None:
+        return repaired
+
+    current_venue = (
+        str(repaired.get("journal", "")).strip()
+        or str(repaired.get("howpublished", "")).strip()
+        or str(repaired.get("booktitle", "")).strip()
+        or str(repaired.get("publisher", "")).strip()
+    )
+    parsed_venue = str(parsed.get("venue", "")).strip()
+    needs_structural_repair = bool(
+        parsed_venue
+        and (
+            not current_venue
+            or len(current_venue) < max(8, len(parsed_venue) // 2)
+            or current_venue.lower() not in parsed_venue.lower()
+            or (parsed.get("volume") and not str(repaired.get("volume", "")).strip())
+            or (parsed.get("number") and not str(repaired.get("number", "")).strip())
+            or (parsed.get("pages") and not str(repaired.get("pages", "")).strip())
+        )
+    )
+    if title and not _title_looks_like_raw_reference(title) and not needs_structural_repair:
+        return repaired
+
+    if parsed.get("author"):
+        repaired["author"] = parsed["author"]
+    if parsed.get("year"):
+        repaired["year"] = parsed["year"]
+    if parsed.get("title"):
+        repaired["title"] = parsed["title"]
+    venue = parsed.get("venue", "")
+    if venue:
+        repaired.pop("howpublished", None)
+        if _venue_looks_journal_like(venue):
+            repaired["journal"] = venue
+        else:
+            repaired["howpublished"] = venue
+    if parsed.get("volume"):
+        repaired["volume"] = parsed["volume"]
+    if parsed.get("number"):
+        repaired["number"] = parsed["number"]
+    if parsed.get("pages"):
+        repaired["pages"] = parsed["pages"]
+    return repaired
+
+
+def _title_looks_like_raw_reference(title: str) -> bool:
+    compact = " ".join(title.split()).strip()
+    if not compact:
+        return True
+    if len(compact) > 120:
+        return True
+    return bool(re.match(rf"^[^,]+,\s+.+\b{HISTORICAL_YEAR_PATTERN}\.\s+", compact))
+
+
+def _parse_report_style_reference(raw_text: str) -> dict[str, str] | None:
+    match = re.match(
+        rf"^(?P<author>.+?)\s+(?P<year>{HISTORICAL_YEAR_PATTERN})\.\s+(?P<remainder>.+)$",
+        raw_text.strip(),
+    )
+    if match is None:
+        return None
+
+    author = match.group("author").strip(" .")
+    year = match.group("year").strip()
+    remainder = match.group("remainder").strip()
+    if not author or not remainder:
+        return None
+
+    venue_start = _find_venue_start(remainder)
+    if venue_start is None:
+        return {
+            "author": author,
+            "year": year,
+            "title": remainder.strip(" ."),
+            "venue": "",
+        }
+
+    title = remainder[:venue_start].strip(" .")
+    venue_part = remainder[venue_start:].strip(" .")
+    venue, volume, number, pages = _split_venue_and_locator(venue_part)
+    return {
+        "author": author,
+        "year": year,
+        "title": title,
+        "venue": venue,
+        "volume": volume,
+        "number": number,
+        "pages": pages,
+    }
+
+
+def _split_venue_and_locator(venue_part: str) -> tuple[str, str, str, str]:
+    compact = venue_part.strip(" .")
+    if not compact:
+        return "", "", "", ""
+
+    match = re.search(
+        r"(?P<venue>.+?)\.\s+(?P<volume>\d+)(?:\((?P<number>[^)]+)\))?\s*:\s*(?P<pages>\d+(?:-\d+)?)\.?$",
+        compact,
+    )
+    if match is None:
+        match = re.search(
+            r"(?P<venue>.+?)\s+(?P<volume>\d+)(?:\((?P<number>[^)]+)\))?\s*:\s*(?P<pages>\d+(?:-\d+)?)\.?$",
+            compact,
+        )
+    if match is None:
+        return compact, "", "", ""
+
+    return (
+        match.group("venue").strip(" ."),
+        (match.group("volume") or "").strip(),
+        (match.group("number") or "").strip(),
+        (match.group("pages") or "").strip(),
+    )
+
+
+def _find_venue_start(remainder: str) -> int | None:
+    for match in re.finditer(r"\.\s+", remainder):
+        candidate_start = match.end()
+        candidate = remainder[candidate_start:].strip()
+        if _looks_like_publication_segment(candidate):
+            return candidate_start
+
+    lowered = remainder.lower()
+    markers = (
+        "comm. rept.",
+        "rept.",
+        "proc.",
+        "procs.",
+        "journal",
+        "transactions",
+        "proceedings",
+        "bulletin",
+        "bull.",
+        "occas. pap.",
+        "pap.",
+        "memoir",
+        "memorandum",
+        "memo.",
+        "tech. memo.",
+        "tech memo",
+        "technical memorandum",
+        "technical report",
+        "noaa",
+    )
+    positions = [lowered.find(marker) for marker in markers if lowered.find(marker) > 0]
+    if positions:
+        return min(positions)
+    return None
+
+
+def _looks_like_publication_segment(candidate: str) -> bool:
+    compact = candidate.strip(" .")
+    if not compact:
+        return False
+
+    venue, volume, number, pages = _split_venue_and_locator(compact)
+    if venue and (volume or number or pages) and _starts_with_publication_marker(compact):
+        return True
+
+    return _starts_with_publication_marker(compact)
+
+
+def _starts_with_publication_marker(text: str) -> bool:
+    lowered = text.lower()
+    publication_starts = (
+        "comm. rept.",
+        "rept.",
+        "proc.",
+        "procs.",
+        "journal",
+        "transactions",
+        "proceedings",
+        "bulletin",
+        "bull.",
+        "occas. pap.",
+        "pap.",
+        "memoir",
+        "memorandum",
+        "memo.",
+        "tech. memo.",
+        "tech memo",
+        "technical memorandum",
+        "technical report",
+        "noaa",
+        "u.s.",
+    )
+    return lowered.startswith(publication_starts)
+
+
+def _venue_looks_journal_like(venue: str) -> bool:
+    lowered = venue.lower()
+    return any(
+        token in lowered
+        for token in (
+            "journal",
+            "transactions",
+            "review",
+            "letters",
+            "comm. rept.",
+            "rept.",
+            "proc.",
+            "proceedings",
+            "occas. pap.",
+            "pap.",
+        )
+    )
+
+
+def render_single_bibtex(entry_type: str, citation_key: str, fields: dict[str, str]) -> str:
+    lines = [f"@{entry_type}{{{citation_key},"]
+    for key in sorted(fields):
+        value = _sanitize_bibtex_value(fields[key])
+        lines.append(f" {key} = {{{value}}},")
+    lines.append("}")
+    return "\n".join(lines)
+
+
+def _sanitize_bibtex_value(value: str) -> str:
+    depth = 0
+    parts: list[str] = []
+    for char in value:
+        if char == "{":
+            depth += 1
+            parts.append(char)
+            continue
+        if char == "}":
+            if depth == 0:
+                parts.append(")")
+            else:
+                depth -= 1
+                parts.append(char)
+            continue
+        parts.append(char)
+    if depth > 0:
+        open_count = depth
+        normalized: list[str] = []
+        for char in parts:
+            if char == "{" and open_count > 0:
+                normalized.append("(")
+                open_count -= 1
+            else:
+                normalized.append(char)
+        return "".join(normalized)
+    return "".join(parts)
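Note (illustrative, not part of this commit): `render_single_bibtex` emits fields in sorted order, sanitizing unbalanced braces so the entry stays parseable. A quick sketch with invented values:

```python
# Illustrative sketch only: rendering a small entry, assuming the module's
# functions are importable. The key and field values are made up.
draft = render_single_bibtex("misc", "smith1994seagrass", {"year": "1994", "title": "Seagrass recovery"})
print(draft)
# @misc{smith1994seagrass,
#  title = {Seagrass recovery},
#  year = {1994},
# }
```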
@@ -0,0 +1,480 @@
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import asdict, dataclass, field
+
+
+HEADING_PATTERN = re.compile(r"^(#{2,6})\s+(?P<title>.+?)\s*$")
+INDENTED_ITEM_PATTERN = re.compile(r"^\s*-\s*(?P<body>.+?)\s*$")
+DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Za-z0-9]+\b")
+
+
+@dataclass
+class DocumentNode:
+    node_type: str
+    title: str
+    body: str
+    depth: int
+    children: list["DocumentNode"] = field(default_factory=list)
+
+
+@dataclass
+class StructuredDocument:
+    metadata: dict[str, object]
+    nodes: list[DocumentNode]
+
+
+def _parse_scalar_value(value: str) -> object:
+    stripped = value.strip()
+    if not stripped:
+        return ""
+    if stripped.lower() == "true":
+        return True
+    if stripped.lower() == "false":
+        return False
+    if stripped.startswith("{") or stripped.startswith("["):
+        try:
+            return json.loads(stripped)
+        except json.JSONDecodeError:
+            return stripped
+    return stripped
+
+
+def _normalize_whitespace(value: str) -> str:
+    return re.sub(r"\s+", " ", value).strip()
+
+
+def _parse_front_matter(front_matter: str) -> dict[str, object]:
+    metadata: dict[str, object] = {}
+    lines = front_matter.splitlines()
+    index = 0
+
+    while index < len(lines):
+        raw_line = lines[index]
+        if not raw_line.strip() or raw_line.lstrip().startswith("#"):
+            index += 1
+            continue
+        if ":" not in raw_line:
+            index += 1
+            continue
+
+        key, value = raw_line.split(":", 1)
+        normalized_key = key.strip()
+        stripped_value = value.strip()
+        if stripped_value:
+            metadata[normalized_key] = _parse_scalar_value(stripped_value)
+            index += 1
+            continue
+
+        items: list[dict[str, object]] = []
+        index += 1
+        while index < len(lines):
+            item_line = lines[index]
+            if not item_line.strip():
+                index += 1
+                continue
+            if not item_line.startswith("  - "):
+                break
+
+            match = INDENTED_ITEM_PATTERN.match(item_line)
+            if not match:
+                break
+            item: dict[str, object] = {}
+            first_body = match.group("body")
+            if ":" in first_body:
+                item_key, item_value = first_body.split(":", 1)
+                item[item_key.strip()] = _parse_scalar_value(item_value.strip())
+            index += 1
+
+            while index < len(lines):
+                nested_line = lines[index]
+                if nested_line.startswith("    ") and ":" in nested_line.strip():
+                    nested_key, nested_value = nested_line.strip().split(":", 1)
+                    item[nested_key.strip()] = _parse_scalar_value(nested_value.strip())
+                    index += 1
+                    continue
+                break
+
+            items.append(item)
+
+        metadata[normalized_key] = items
+
+    return metadata
+
+
+def _split_front_matter(text: str) -> tuple[dict[str, object], str]:
+    stripped = text.lstrip()
+    if not stripped.startswith("---\n"):
+        return {}, text
+
+    _, _, remainder = stripped.partition("---\n")
+    front_matter, separator, body = remainder.partition("\n---\n")
+    if not separator:
+        return {}, text
+
+    return _parse_front_matter(front_matter), body
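Note (illustrative, not part of this commit): the front-matter parser accepts scalar keys, `true`/`false`, and item lists introduced by an empty-valued key, with indented continuation keys per item. A sketch of a document it would parse; the field values are invented:

```python
# Illustrative sketch only: the front-matter shape accepted by the parser,
# assuming the module's helpers are importable. Values are made up.
doc = """---
title: West Indian Manatee
publication_status: draft
is_archived: false
taxon_identifiers:
  - authority: itis
    identifier: 180684
---

## Summary
A placeholder summary paragraph.
"""
metadata, body = _split_front_matter(doc)
assert metadata["title"] == "West Indian Manatee"
assert metadata["is_archived"] is False
assert metadata["taxon_identifiers"][0]["authority"] == "itis"
```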
+
+
+def parse_markdown_document(text: str) -> StructuredDocument:
+    metadata, body = _split_front_matter(text)
+    root_nodes: list[DocumentNode] = []
+    stack: list[DocumentNode] = []
+    body_lines: list[str] = []
+
+    def flush_body() -> None:
+        if not stack:
+            body_lines.clear()
+            return
+        stack[-1].body = "\n".join(body_lines).strip()
+        body_lines.clear()
+
+    for raw_line in body.splitlines():
+        match = HEADING_PATTERN.match(raw_line)
+        if not match:
+            body_lines.append(raw_line)
+            continue
+
+        flush_body()
+        depth = len(match.group(1))
+        node = DocumentNode(
+            node_type="section",
+            title=match.group("title").strip(),
+            body="",
+            depth=depth,
+        )
+
+        while stack and stack[-1].depth >= depth:
+            stack.pop()
+
+        if stack:
+            stack[-1].children.append(node)
+        else:
+            root_nodes.append(node)
+        stack.append(node)
+
+    flush_body()
+    return StructuredDocument(metadata=metadata, nodes=root_nodes)
+
+
+def validate_markdown_document(text: str) -> list[str]:
+    errors: list[str] = []
+    metadata, body = _split_front_matter(text)
+    if not metadata:
+        errors.append("Markdown document must include YAML front matter.")
+
+    last_depth: int | None = None
+    for raw_line in body.splitlines():
+        match = HEADING_PATTERN.match(raw_line)
+        if not match:
+            continue
+        depth = len(match.group(1))
+        if last_depth is not None and depth > last_depth + 1:
+            errors.append(
+                f"Heading depth jumps from level {last_depth} to level {depth}: {match.group('title').strip()}"
+            )
+        last_depth = depth
+
+    return errors
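Note (illustrative, not part of this commit): the validator flags any heading that deepens by more than one level at a time. A minimal sketch of a document it would reject; the content is invented:

```python
# Illustrative sketch only: the heading-depth check, assuming the module's
# functions are importable. The document content is made up.
bad_doc = """---
title: Example
---

## Summary
Text.

#### Habitat
Text.
"""
errors = validate_markdown_document(bad_doc)
assert errors == ["Heading depth jumps from level 2 to level 4: Habitat"]
```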
+
+
+def _append_metadata_lines(lines: list[str], key: str, value: object) -> None:
+    if isinstance(value, list):
+        lines.append(f"{key}:")
+        for item in value:
+            if isinstance(item, dict) and item:
+                first = True
+                for item_key, item_value in item.items():
+                    rendered = "true" if item_value is True else "false" if item_value is False else str(item_value)
+                    prefix = "  - " if first else "    "
+                    lines.append(f"{prefix}{item_key}: {rendered}")
+                    first = False
+            else:
+                lines.append(f"  - {item}")
+        return
+
+    rendered = "true" if value is True else "false" if value is False else str(value)
+    lines.append(f"{key}: {rendered}")
+
+
+def export_markdown_document(document: StructuredDocument) -> str:
+    lines: list[str] = ["---"]
+    for key, value in document.metadata.items():
+        _append_metadata_lines(lines, key, value)
+    lines.append("---")
+    lines.append("")
+
+    def append_nodes(nodes: list[DocumentNode]) -> None:
+        for node in nodes:
+            lines.append(f"{'#' * node.depth} {node.title}")
+            if node.body:
+                lines.append(node.body)
+            lines.append("")
+            append_nodes(node.children)
+
+    append_nodes(document.nodes)
+    return "\n".join(lines).rstrip() + "\n"
+
+
+def flatten_document_nodes(document: StructuredDocument) -> list[dict[str, object]]:
+    flattened: list[dict[str, object]] = []
+
+    def visit(nodes: list[DocumentNode], parent_id: str | None) -> None:
+        for index, node in enumerate(nodes, start=1):
+            node_id = f"node-{len(flattened) + 1}"
+            flattened.append(
+                {
+                    "node_id": node_id,
+                    "parent_id": parent_id,
+                    "position": index,
+                    "depth": node.depth,
+                    "node_type": node.node_type,
+                    "title": node.title,
+                    "body_markdown": node.body,
+                    "body_plaintext": node.body,
+                }
+            )
+            visit(node.children, node_id)
+
+    visit(document.nodes, None)
+    return flattened
|
def document_to_json(document: StructuredDocument) -> str:
|
||||||
|
return json.dumps(asdict(document), ensure_ascii=True)
|
||||||
|
|
||||||
|
|
||||||
|
def build_document_from_species_payload(item: dict[str, object]) -> StructuredDocument:
|
||||||
|
legacy_identifiers: list[dict[str, object]] = []
|
||||||
|
if item.get("flelmr_code"):
|
||||||
|
legacy_identifiers.append(
|
||||||
|
{
|
||||||
|
"authority": "legacy-ecospecies",
|
||||||
|
"identifier": str(item.get("flelmr_code", "")),
|
||||||
|
"label": "FLELMR",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
metadata = {
|
||||||
|
"title": str(item.get("title", "")),
|
||||||
|
"common_name": str(item.get("common_name", "")),
|
||||||
|
"scientific_name": str(item.get("scientific_name", "")),
|
||||||
|
"legacy_identifiers": legacy_identifiers,
|
||||||
|
"taxon_identifiers": list(item.get("taxon_identifiers", [])),
|
||||||
|
"primary_taxon_authority": str(item.get("primary_taxon_authority", "")),
|
||||||
|
"source_file": str(item.get("source_file", "")),
|
||||||
|
"publication_status": str(item.get("publication_status", "published")),
|
||||||
|
"source_format": "ecospecies-markdown-v1",
|
||||||
|
}
|
||||||
|
|
||||||
|
nodes: list[DocumentNode] = []
|
||||||
|
summary = str(item.get("summary", "")).strip()
|
||||||
|
if summary:
|
||||||
|
nodes.append(
|
||||||
|
DocumentNode(
|
||||||
|
node_type="section",
|
||||||
|
title="Summary",
|
||||||
|
body=summary,
|
||||||
|
depth=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
for section in item.get("sections", []):
|
||||||
|
heading = str(section.get("heading", "")).strip()
|
||||||
|
if not heading or heading == "HEADER":
|
||||||
|
continue
|
||||||
|
nodes.append(
|
||||||
|
DocumentNode(
|
||||||
|
node_type="section",
|
||||||
|
title=heading,
|
||||||
|
body=str(section.get("content", "")).strip(),
|
||||||
|
depth=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return StructuredDocument(metadata=metadata, nodes=nodes)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_species_projection(document: StructuredDocument) -> dict[str, object]:
    metadata = document.metadata
    summary = ""
    sections: list[dict[str, object]] = []
    legacy_identifiers = metadata.get("legacy_identifiers", [])
    taxon_identifiers = metadata.get("taxon_identifiers", [])

    flelmr_code = ""
    if isinstance(legacy_identifiers, list):
        for item in legacy_identifiers:
            if not isinstance(item, dict):
                continue
            authority = str(item.get("authority", "")).strip().lower()
            label = str(item.get("label", "")).strip().lower()
            if authority == "legacy-ecospecies" or label == "flelmr":
                flelmr_code = str(item.get("identifier", "")).strip()
                if flelmr_code:
                    break
    if not flelmr_code:
        flelmr_code = str(metadata.get("species_code", "")).strip()

    def visit(nodes: list[DocumentNode], path: list[str]) -> None:
        nonlocal summary
        for node in nodes:
            current_path = [*path, node.title]
            if node.title.lower() == "summary" and not summary:
                summary = node.body.strip()
            else:
                sections.append(
                    {
                        "heading": " / ".join(current_path),
                        "content": node.body.strip(),
                    }
                )
            visit(node.children, current_path)

    visit(document.nodes, [])
    return {
        "title": metadata.get("title", ""),
        "common_name": metadata.get("common_name", ""),
        "scientific_name": metadata.get("scientific_name", ""),
        "flelmr_code": flelmr_code,
        "legacy_identifiers": legacy_identifiers if isinstance(legacy_identifiers, list) else [],
        "taxon_identifiers": taxon_identifiers if isinstance(taxon_identifiers, list) else [],
        "primary_taxon_authority": str(metadata.get("primary_taxon_authority", "")),
        "summary": summary,
        "sections": sections,
    }


def _is_citation_heading(title: str) -> bool:
    lowered = title.strip().rstrip(":").lower()
    return lowered in {
        "references",
        "reference",
        "citations",
        "citation",
        "bibliography",
        "related references",
        "related citations",
    }


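# Worked examples (editorial note): headings such as "References:",
# "Bibliography", or "Related Citations" all land in the set above once the
# trailing colon is stripped and case is folded, while a heading like
# "Reference List" or "Literature" does not and is treated as an ordinary section.

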
def _split_citation_lines(body: str) -> list[dict[str, str]]:
    entries: list[dict[str, str]] = []
    current: list[str] = []
    current_number = ""

    def flush() -> None:
        nonlocal current_number
        if not current:
            return
        compact = " ".join(part.strip() for part in current if part.strip()).strip()
        if compact:
            entries.append(
                {
                    "legacy_reference_number": current_number,
                    "raw_text": compact,
                }
            )
        current.clear()
        current_number = ""

    for raw_line in body.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            flush()
            continue

        leading_number_match = re.match(r"^(?P<num>\d+)\s*,\s*(?P<text>.+)$", stripped)
        if leading_number_match:
            flush()
            current_number = leading_number_match.group("num")
            current.append(leading_number_match.group("text"))
            continue

        bare_number_match = re.match(r"^(?P<num>\d+)\s+(?P<text>[A-Z].+)$", stripped)
        if bare_number_match:
            flush()
            current_number = bare_number_match.group("num")
            current.append(bare_number_match.group("text"))
            continue

        bullet_match = re.match(
            r"^(?:[-*]|\[(?P<bracket_num>\d+)\]|(?P<plain_num>\d+)[\.,])\s+(?P<text>.+)$",
            stripped,
        )
        if bullet_match:
            flush()
            current_number = bullet_match.group("bracket_num") or bullet_match.group("plain_num") or ""
            bullet_text = bullet_match.group("text")
            if not current_number:
                nested_number_match = re.match(r"^\[(?P<num>\d+)\]\s+(?P<text>.+)$", bullet_text)
                if nested_number_match:
                    current_number = nested_number_match.group("num")
                    bullet_text = nested_number_match.group("text")
                else:
                    nested_comma_match = re.match(r"^(?P<num>\d+)\s*,\s*(?P<text>.+)$", bullet_text)
                    if nested_comma_match:
                        current_number = nested_comma_match.group("num")
                        bullet_text = nested_comma_match.group("text")
            current.append(bullet_text)
            continue

        current.append(stripped)

    flush()
    return entries


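# Worked example (editorial sketch of the shapes matched above): an entry may
# begin as "12, Smith, J. ..." (comma-numbered), "12 Smith, J. ..." (a bare
# number followed by a capitalized author), or "- ...", "[12] ...", "12. ..."
# (bulleted / bracket-numbered / dot-numbered). Continuation lines are folded
# into the current entry with single spaces, and a blank line flushes it.

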
def extract_citation_entries(document: StructuredDocument) -> list[dict[str, object]]:
    entries: list[dict[str, object]] = []

    def visit(nodes: list[DocumentNode], path: list[str]) -> None:
        for node in nodes:
            current_path = [*path, node.title]
            if _is_citation_heading(node.title):
                section_heading = " / ".join(current_path)
                for item in _split_citation_lines(node.body):
                    raw_text = item["raw_text"]
                    doi_match = DOI_PATTERN.search(raw_text)
                    entries.append(
                        {
                            "section_heading": section_heading,
                            "legacy_reference_number": item["legacy_reference_number"],
                            "raw_text": raw_text,
                            "normalized_text": _normalize_whitespace(raw_text),
                            "doi": doi_match.group(0) if doi_match else "",
                        }
                    )
            visit(node.children, current_path)

    visit(document.nodes, [])
    return entries


def add_citation_to_document(
    document: StructuredDocument,
    citation_text: str,
    heading_title: str = "Related References",
) -> bool:
    normalized_citation = _normalize_whitespace(citation_text)
    if not normalized_citation:
        return False

    for node in document.nodes:
        if _is_citation_heading(node.title):
            existing = {_normalize_whitespace(item["raw_text"]) for item in _split_citation_lines(node.body)}
            if normalized_citation in existing:
                return False
            body = node.body.rstrip()
            node.body = f"{body}\n- {citation_text}".strip() if body else f"- {citation_text}"
            return True

    document.nodes.append(
        DocumentNode(
            node_type="section",
            title=heading_title,
            body=f"- {citation_text}",
            depth=2,
        )
    )
    return True

@@ -0,0 +1,267 @@
from __future__ import annotations

from sqlalchemy import select

from ecospecies_api.citegeist_bridge import extract_draft_citation
from ecospecies_api.document_format import (
    build_document_from_species_payload,
    document_to_json,
    extract_citation_entries,
    extract_species_projection,
    export_markdown_document,
    flatten_document_nodes,
    parse_markdown_document,
    validate_markdown_document,
)
from ecospecies_api.models import (
    DocumentSection,
    Species,
    SpeciesCitation,
    SpeciesDocument,
    SpeciesDocumentNode,
    SpeciesTaxonIdentifier,
)


def _persist_taxon_identifiers(session, species: Species, taxon_identifiers: list[dict[str, object]]) -> None:
    for identifier in list(species.taxon_identifiers):
        session.delete(identifier)
    session.flush()

    for position, item in enumerate(taxon_identifiers, start=1):
        authority = str(item.get("authority", "")).strip()
        identifier = str(item.get("identifier", "")).strip()
        if not authority or not identifier:
            continue
        session.add(
            SpeciesTaxonIdentifier(
                species_id=species.id,
                position=position,
                authority=authority,
                identifier=identifier,
                label=str(item.get("label", "")).strip(),
                is_primary=bool(item.get("primary") or item.get("is_primary")),
                source_url=str(item.get("source_url", "")).strip(),
            )
        )


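# Editorial note: identifiers are rebuilt from scratch on every sync; the
# flush() after the delete loop pushes the deletes to the database before the
# re-inserts, which avoids collisions if (species_id, position) is constrained
# unique (an assumption -- no such constraint is visible in this diff).

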
def _existing_taxon_identifier_payload(species: Species) -> list[dict[str, object]]:
    return [
        {
            "authority": item.authority,
            "identifier": item.identifier,
            "label": item.label,
            "primary": item.is_primary,
            "source_url": item.source_url,
        }
        for item in species.taxon_identifiers
    ]


def _citation_match_key(item: dict[str, object]) -> tuple[str, str, str]:
    return (
        str(item.get("section_heading", "")).strip(),
        str(item.get("legacy_reference_number", "")).strip(),
        str(item.get("raw_text", "")).strip(),
    )


def _persist_citations(session, species: Species, citations: list[dict[str, object]]) -> None:
    existing_by_key = {
        _citation_match_key(
            {
                "section_heading": citation.section_heading,
                "legacy_reference_number": citation.legacy_reference_number,
                "raw_text": citation.raw_text,
            }
        ): citation
        for citation in species.citations
    }
    retained_ids: set[int] = set()

    for position, item in enumerate(citations, start=1):
        raw_text = str(item.get("raw_text", "")).strip()
        if not raw_text:
            continue
        key = _citation_match_key(item)
        legacy_reference_number = str(item.get("legacy_reference_number", "")).strip()
        existing = existing_by_key.get(key)
        extracted_normalized = str(item.get("normalized_text", "")).strip()
        extracted_doi = str(item.get("doi", "")).strip()
        draft = extract_draft_citation(raw_text, legacy_reference_number)

        if existing is None:
            session.add(
                SpeciesCitation(
                    species_id=species.id,
                    position=position,
                    section_heading=str(item.get("section_heading", "")).strip(),
                    legacy_reference_number=legacy_reference_number,
                    citation_key=draft.citation_key if draft is not None else "",
                    entry_type=draft.entry_type if draft is not None else "misc",
                    raw_text=raw_text,
                    normalized_text=extracted_normalized,
                    abstract_text="",
                    draft_bibtex=draft.draft_bibtex if draft is not None else "",
                    doi=extracted_doi,
                    source_url="",
                    openalex_id="",
                    resolver_source_label="",
                    enrichment_status="pending",
                    enrichment_error="",
                    source_type="document_extract",
                    review_status="draft",
                )
            )
            continue

        existing.position = position
        existing.section_heading = str(item.get("section_heading", "")).strip()
        existing.legacy_reference_number = legacy_reference_number
        existing.raw_text = raw_text
        if existing.review_status == "draft":
            existing.normalized_text = extracted_normalized
            existing.abstract_text = ""
            existing.doi = extracted_doi
            existing.citation_key = draft.citation_key if draft is not None else ""
            existing.entry_type = draft.entry_type if draft is not None else "misc"
            existing.draft_bibtex = draft.draft_bibtex if draft is not None else ""
            existing.source_type = "document_extract"
            existing.enrichment_status = "pending"
            existing.enrichment_error = ""
            existing.resolver_source_label = ""
            existing.source_url = ""
            existing.openalex_id = ""
        retained_ids.add(existing.id)
        session.add(existing)

    for citation in list(species.citations):
        if citation.id not in retained_ids and citation.source_type in {"document_extract", "editor_review"}:
            session.delete(citation)


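# Editorial note on the reconciliation above: rows are matched on the
# (section_heading, legacy_reference_number, raw_text) key; matched rows keep
# their identity (and any non-draft review state), while unmatched rows are
# deleted only when their source_type is "document_extract" or
# "editor_review", so citations from other sources survive re-imports.

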
def _persist_document_model(session, species: Species, document_model, markdown_content: str, updated_by: str) -> None:
    ast_json = document_to_json(document_model)
    document = session.scalar(
        select(SpeciesDocument).where(SpeciesDocument.species_id == species.id)
    )
    if document is None:
        document = SpeciesDocument(
            species_id=species.id,
            source_format="ecospecies-markdown-v1",
            markdown_content=markdown_content,
            ast_json=ast_json,
            updated_by=updated_by,
        )
        session.add(document)
        session.flush()
    else:
        document.source_format = "ecospecies-markdown-v1"
        document.markdown_content = markdown_content
        document.ast_json = ast_json
        document.updated_by = updated_by
        session.add(document)

    for node in list(document.nodes):
        session.delete(node)
    session.flush()

    for node in flatten_document_nodes(document_model):
        session.add(
            SpeciesDocumentNode(
                document_id=document.id,
                parent_node_ref=node["parent_id"],
                node_ref=node["node_id"],
                position=node["position"],
                depth=node["depth"],
                node_type=node["node_type"],
                title=node["title"],
                body_markdown=node["body_markdown"],
                body_plaintext=node["body_plaintext"],
            )
        )


def sync_species_document(session, species: Species, item: dict[str, object]) -> None:
    payload = dict(item)
    if "taxon_identifiers" not in payload or not payload.get("taxon_identifiers"):
        payload["taxon_identifiers"] = _existing_taxon_identifier_payload(species)
    if "primary_taxon_authority" not in payload or not payload.get("primary_taxon_authority"):
        for identifier in payload["taxon_identifiers"]:
            if bool(identifier.get("primary")):
                payload["primary_taxon_authority"] = str(identifier.get("authority", "")).strip()
                break

    document_model = build_document_from_species_payload(payload)
    markdown_content = export_markdown_document(document_model)
    _persist_document_model(
        session,
        species,
        document_model,
        markdown_content,
        str(item.get("last_modified_by", "system-import")),
    )
    _persist_citations(session, species, extract_citation_entries(document_model))


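# Minimal usage sketch (editorial; assumes an open SQLAlchemy session and a
# persisted Species row -- the importer is the expected caller):
#
#     sync_species_document(session, species, record_payload)
#     session.commit()
#
# When the incoming payload carries no taxon identifiers, the identifiers
# already stored on the species are reused, and the primary authority is
# backfilled from whichever stored identifier is flagged primary.

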
def get_species_document_payload(session, slug: str) -> dict[str, object] | None:
    species = session.scalar(select(Species).where(Species.slug == slug))
    if species is None or species.document is None:
        return None

    document = species.document
    return {
        "slug": species.slug,
        "source_format": document.source_format,
        "markdown": document.markdown_content,
        "ast_json": document.ast_json,
        "node_count": len(document.nodes),
        "updated_by": document.updated_by,
    }


def save_species_document(session, species: Species, markdown: str, username: str) -> dict[str, object]:
    errors = validate_markdown_document(markdown)
    if errors:
        raise ValueError("; ".join(errors))

    document_model = parse_markdown_document(markdown)
    projection = extract_species_projection(document_model)
    _persist_document_model(session, species, document_model, markdown, username)
    _persist_citations(session, species, extract_citation_entries(document_model))

    if projection["title"]:
        species.title = str(projection["title"])
    if projection["common_name"]:
        species.common_name = str(projection["common_name"])
    if projection["scientific_name"]:
        species.scientific_name = str(projection["scientific_name"])
    if projection["flelmr_code"]:
        species.flelmr_code = str(projection["flelmr_code"])
    _persist_taxon_identifiers(session, species, list(projection["taxon_identifiers"]))
    species.summary = str(projection["summary"])
    species.section_count = len(projection["sections"])
    species.last_modified_by = username

    for section in list(species.sections):
        session.delete(section)
    session.flush()

    for position, section in enumerate(projection["sections"], start=1):
        session.add(
            DocumentSection(
                species_id=species.id,
                position=position,
                heading=str(section["heading"]),
                content=str(section["content"]),
            )
        )

    return {
        "slug": species.slug,
        "summary": species.summary,
        "section_count": species.section_count,
        "markdown": markdown,
        "updated_by": username,
    }

@@ -23,6 +23,9 @@ class Species(Base):
     publication_status: Mapped[str] = mapped_column(String(32), default="published", index=True)
     is_archived: Mapped[bool] = mapped_column(Boolean, default=False, index=True)
     editor_notes: Mapped[str] = mapped_column(Text, default="")
+    created_by: Mapped[str] = mapped_column(String(255), default="system-import")
+    owner_username: Mapped[str] = mapped_column(String(255), default="")
+    owner_role: Mapped[str] = mapped_column(String(32), default="")
     last_modified_by: Mapped[str] = mapped_column(String(255), default="system-import")
 
     sections: Mapped[list["DocumentSection"]] = relationship(
@@ -40,6 +43,21 @@ class Species(Base):
         cascade="all, delete-orphan",
         order_by="SpeciesAuditLog.id.desc()",
     )
+    document: Mapped["SpeciesDocument | None"] = relationship(
+        back_populates="species",
+        cascade="all, delete-orphan",
+        uselist=False,
+    )
+    taxon_identifiers: Mapped[list["SpeciesTaxonIdentifier"]] = relationship(
+        back_populates="species",
+        cascade="all, delete-orphan",
+        order_by="SpeciesTaxonIdentifier.position",
+    )
+    citations: Mapped[list["SpeciesCitation"]] = relationship(
+        back_populates="species",
+        cascade="all, delete-orphan",
+        order_by="SpeciesCitation.position",
+    )
 
 
 class DocumentSection(Base):
@@ -77,3 +95,93 @@ class SpeciesAuditLog(Base):
     details_json: Mapped[str] = mapped_column(Text)
 
     species: Mapped[Species] = relationship(back_populates="audit_entries")
+
+
+class SpeciesDocument(Base):
+    __tablename__ = "species_document"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), unique=True, index=True)
+    source_format: Mapped[str] = mapped_column(String(64), default="ecospecies-markdown-v1")
+    markdown_content: Mapped[str] = mapped_column(Text, default="")
+    ast_json: Mapped[str] = mapped_column(Text, default="")
+    updated_by: Mapped[str] = mapped_column(String(255), default="system-import")
+
+    species: Mapped[Species] = relationship(back_populates="document")
+    nodes: Mapped[list["SpeciesDocumentNode"]] = relationship(
+        back_populates="document",
+        cascade="all, delete-orphan",
+        order_by="SpeciesDocumentNode.position",
+    )
+
+
+class SpeciesDocumentNode(Base):
+    __tablename__ = "species_document_node"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    document_id: Mapped[int] = mapped_column(ForeignKey("species_document.id", ondelete="CASCADE"), index=True)
+    parent_node_ref: Mapped[str | None] = mapped_column(String(64), nullable=True, default=None)
+    node_ref: Mapped[str] = mapped_column(String(64), index=True)
+    position: Mapped[int] = mapped_column(Integer, default=1)
+    depth: Mapped[int] = mapped_column(Integer, default=2)
+    node_type: Mapped[str] = mapped_column(String(32), default="section")
+    title: Mapped[str] = mapped_column(String(255), default="")
+    body_markdown: Mapped[str] = mapped_column(Text, default="")
+    body_plaintext: Mapped[str] = mapped_column(Text, default="")
+    source_heading: Mapped[str] = mapped_column(String(255), default="")
+    source_span_start: Mapped[int | None] = mapped_column(Integer, nullable=True, default=None)
+    source_span_end: Mapped[int | None] = mapped_column(Integer, nullable=True, default=None)
+
+    document: Mapped[SpeciesDocument] = relationship(back_populates="nodes")
+
+
+class ContributorAccount(Base):
+    __tablename__ = "contributor_account"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    email: Mapped[str] = mapped_column(String(255), unique=True, index=True)
+    token_hash: Mapped[str] = mapped_column(String(128), unique=True, index=True)
+    age_gate_confirmed: Mapped[bool] = mapped_column(Boolean, default=False)
+    created_at: Mapped[str] = mapped_column(String(64), index=True)
+    is_active: Mapped[bool] = mapped_column(Boolean, default=True, index=True)
+
+
+class SpeciesTaxonIdentifier(Base):
+    __tablename__ = "species_taxon_identifier"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), index=True)
+    position: Mapped[int] = mapped_column(Integer, default=1)
+    authority: Mapped[str] = mapped_column(String(64), default="")
+    identifier: Mapped[str] = mapped_column(String(255), default="")
+    label: Mapped[str] = mapped_column(String(128), default="")
+    is_primary: Mapped[bool] = mapped_column(Boolean, default=False, index=True)
+    source_url: Mapped[str] = mapped_column(String(500), default="")
+
+    species: Mapped[Species] = relationship(back_populates="taxon_identifiers")
+
+
+class SpeciesCitation(Base):
+    __tablename__ = "species_citation"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    species_id: Mapped[int] = mapped_column(ForeignKey("species.id", ondelete="CASCADE"), index=True)
+    position: Mapped[int] = mapped_column(Integer, default=1)
+    section_heading: Mapped[str] = mapped_column(String(255), default="")
+    legacy_reference_number: Mapped[str] = mapped_column(String(64), default="", index=True)
+    citation_key: Mapped[str] = mapped_column(String(255), default="", index=True)
+    entry_type: Mapped[str] = mapped_column(String(64), default="misc")
+    raw_text: Mapped[str] = mapped_column(Text, default="")
+    normalized_text: Mapped[str] = mapped_column(Text, default="")
+    abstract_text: Mapped[str] = mapped_column(Text, default="")
+    draft_bibtex: Mapped[str] = mapped_column(Text, default="")
+    doi: Mapped[str] = mapped_column(String(255), default="", index=True)
+    source_url: Mapped[str] = mapped_column(String(500), default="")
+    openalex_id: Mapped[str] = mapped_column(String(64), default="", index=True)
+    resolver_source_label: Mapped[str] = mapped_column(String(255), default="")
+    enrichment_status: Mapped[str] = mapped_column(String(32), default="pending", index=True)
+    enrichment_error: Mapped[str] = mapped_column(Text, default="")
+    source_type: Mapped[str] = mapped_column(String(64), default="document_extract")
+    review_status: Mapped[str] = mapped_column(String(32), default="draft", index=True)
+
+    species: Mapped[Species] = relationship(back_populates="citations")
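For orientation, a minimal sketch of navigating the relationships added above from a loaded `Species` row (the slug and session setup are illustrative assumptions, not part of this diff):

```python
from sqlalchemy import select

# Assumes an open SQLAlchemy session; attribute names match the models above.
species = session.scalar(select(Species).where(Species.slug == "example-species"))
if species is not None and species.document is not None:
    print(species.document.source_format)   # "ecospecies-markdown-v1"
    print(len(species.document.nodes))      # nodes ordered by position
    for citation in species.citations:      # ordered by SpeciesCitation.position
        print(citation.review_status, citation.doi)
```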
@@ -1,14 +1,18 @@
 from __future__ import annotations
 
+import hashlib
 import os
 import re
+from collections import Counter
 from dataclasses import dataclass
 from pathlib import Path
 
 
 SECTION_PATTERN = re.compile(r"^[A-Z][A-Z\s/&()-]{2,}$")
+TITLE_SECTION_PATTERN = re.compile(r"^[A-Z][A-Za-z\s/&()-]{2,}$")
 FIELD_PATTERN = re.compile(r"^(?P<key>[A-Za-z/ _-]+):\s*(?P<value>.*)$")
 SUMMARY_MARKER_PATTERN = re.compile(r"^(summary(?:/abstract)?|abstract|executive summary):?\s*$", re.IGNORECASE)
+SAFE_DIRECTORY_NAME_PATTERN = re.compile(r"^[A-Za-z0-9_-]+$")
 
 
 @dataclass
@@ -38,6 +42,10 @@ class SpeciesRecord:
     diagnostics: list[IngestDiagnostic]
 
 
+def get_repo_root() -> Path:
+    return Path(__file__).resolve().parents[4]
+
+
 def slugify(value: str) -> str:
     cleaned = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
     return cleaned or "unknown-species"
@@ -53,6 +61,33 @@ def normalize_whitespace(value: str) -> str:
     return re.sub(r"\s+", " ", value).strip()
 
 
+def is_section_heading(line: str) -> bool:
+    stripped = line.strip()
+    if not stripped:
+        return False
+    normalized = stripped[:-1].strip() if stripped.endswith(":") else stripped
+    if not normalized:
+        return False
+    if ":" in normalized:
+        return False
+    if SECTION_PATTERN.fullmatch(normalized):
+        return True
+    if not TITLE_SECTION_PATTERN.fullmatch(normalized):
+        return False
+
+    words = normalized.split()
+    if len(words) > 4:
+        return False
+    return all(word[0].isupper() for word in words if word and word[0].isalpha())
+
+
+def normalize_heading(line: str) -> str:
+    stripped = line.strip()
+    if stripped.endswith(":"):
+        return stripped[:-1].strip()
+    return stripped
+
+
 def split_sections(lines: list[str]) -> list[Section]:
     sections: list[Section] = []
     current_heading = "HEADER"
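A few concrete cases of `is_section_heading` as defined above may help (the strings are illustrative, not drawn from the corpus):

```python
assert is_section_heading("LIFE HISTORY")            # all-caps legacy heading
assert is_section_heading("Habitat Requirements:")   # short title case; colon dropped
assert not is_section_heading("Depth: 3-10 m")       # interior colon -> field line
assert not is_section_heading("A long descriptive sentence about habitat")  # > 4 words
```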
@@ -61,7 +96,7 @@ def split_sections(lines: list[str]) -> list[Section]:
     for raw_line in lines:
         line = raw_line.rstrip()
         stripped = line.strip()
-        if SECTION_PATTERN.fullmatch(stripped):
+        if is_section_heading(stripped):
             if current_lines:
                 sections.append(
                     Section(
@@ -69,7 +104,7 @@
                         content="\n".join(current_lines).strip(),
                     )
                 )
-            current_heading = stripped
+            current_heading = normalize_heading(stripped)
             current_lines = []
             continue
         current_lines.append(line)
@@ -96,8 +131,9 @@ def extract_metadata(lines: list[str]) -> dict[str, str]:
             value = match.group("value").strip()
             metadata[key] = value
 
-            # Legacy files vary between "FLELMR", "FLELMR Code", and similar labels.
-            if key.startswith("flelmr"):
+            # Legacy files vary between "FLELMR", "FLELMR Code", "EcoSpecies Code",
+            # and similar labels.
+            if key.startswith("flelmr") or key == "ecospecies code":
                 metadata["flelmr"] = value
 
     return metadata
@@ -127,7 +163,7 @@ def extract_summary(lines: list[str], sections: list[Section]) -> str:
             if summary_lines:
                 summary_lines.append("")
             continue
-        if SECTION_PATTERN.fullmatch(stripped):
+        if is_section_heading(stripped):
             break
         if stripped.startswith("[") and not summary_lines:
             break
@@ -202,23 +238,76 @@ def parse_species_file(path: Path) -> SpeciesRecord:
     )
 
 
+def ensure_unique_record_slugs(records: list[SpeciesRecord]) -> list[SpeciesRecord]:
+    slug_counts = Counter(record.slug for record in records)
+    used_slugs: set[str] = set()
+
+    for record in records:
+        base_slug = record.slug
+        if slug_counts[base_slug] == 1 and base_slug not in used_slugs:
+            used_slugs.add(base_slug)
+            continue
+
+        disambiguator = slugify(Path(record.source_file).stem)
+        if disambiguator == base_slug:
+            disambiguator = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:8]
+
+        candidate = f"{base_slug}-{disambiguator}"
+        if candidate in used_slugs:
+            source_hash = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:8]
+            candidate = f"{candidate}-{source_hash}"
+
+        suffix = 2
+        while candidate in used_slugs:
+            candidate = f"{base_slug}-{disambiguator}-{suffix}"
+            suffix += 1
+
+        record.slug = candidate
+        used_slugs.add(candidate)
+
+    return records
+
+
 def load_species_records(data_dir: str) -> list[SpeciesRecord]:
-    base = Path(data_dir)
+    base = resolve_data_dir(data_dir)
     if not base.exists():
         return []
 
     records: list[SpeciesRecord] = []
     for path in sorted(base.glob("*.txt")):
         records.append(parse_species_file(path))
-    return records
+    return ensure_unique_record_slugs(records)
+
+
+def resolve_data_dir(data_dir: str) -> Path:
+    repo_root = get_repo_root().resolve()
+    raw_value = data_dir.strip()
+    if not raw_value:
+        raise ValueError("Species data directory cannot be empty.")
+
+    candidate = Path(raw_value)
+    if candidate.is_absolute():
+        resolved = candidate.resolve()
+    else:
+        resolved = (repo_root / candidate).resolve()
+
+    try:
+        relative = resolved.relative_to(repo_root)
+    except ValueError as exc:
+        raise ValueError("Species data directory must stay within the codebase directory.") from exc
+
+    if not relative.parts:
+        raise ValueError("Species data directory must be a subdirectory of the codebase.")
+
+    for part in relative.parts:
+        if not SAFE_DIRECTORY_NAME_PATTERN.fullmatch(part):
+            raise ValueError(
+                f"Species data directory contains an unsafe directory name: {part!r}."
+            )
+
+    return resolved
+
+
 def get_default_data_dir() -> str:
-    return os.environ.get(
-        "ECOSPECIES_DATA_DIR",
-        str(
-            Path(__file__).resolve().parents[4].parent
-            / "01-legacy-code-and-data"
-            / "InputFiles - TXT"
-        ),
-    )
+    configured = os.environ.get("ECOSPECIES_DATA_DIR", "input-data/InputFiles")
+    return str(resolve_data_dir(configured))
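To make the containment rule concrete, a short sketch of how `resolve_data_dir` reacts to a few inputs (the paths are illustrative):

```python
resolve_data_dir("input-data/InputFiles")  # ok: a safe subdirectory of the repo root
resolve_data_dir("")                       # ValueError: directory cannot be empty
resolve_data_dir("../outside")             # ValueError: escapes the codebase directory
resolve_data_dir("input-data/bad name")    # ValueError: unsafe directory name
```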
File diff suppressed because it is too large
@@ -0,0 +1,21 @@
from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

TEST_PATH = ROOT / "tests" / "test_auth.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_auth", TEST_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
SPEC.loader.exec_module(MODULE)

for name in dir(MODULE):
    if name.startswith("Test") or name.endswith("Tests"):
        globals()[name] = getattr(MODULE, name)
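The next three hunks repeat this loader shim for the other test modules; the pattern reduces to the standard importlib recipe, restated here as a sketch (the helper name is illustrative):

```python
import importlib.util


def load_module_from(path, name):
    # Build a spec from a file path, materialize a module from it, then
    # execute the module body in that namespace -- exactly what each shim does
    # before re-exporting the Test* classes for unittest discovery.
    spec = importlib.util.spec_from_file_location(name, path)
    assert spec is not None and spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
```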
@@ -0,0 +1,21 @@
from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

TEST_PATH = ROOT / "tests" / "test_citation_enrichment.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_citation_enrichment", TEST_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
SPEC.loader.exec_module(MODULE)

for name in dir(MODULE):
    if name.startswith("Test") or name.endswith("Tests"):
        globals()[name] = getattr(MODULE, name)
@@ -0,0 +1,21 @@
from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

TEST_PATH = ROOT / "tests" / "test_document_format.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_document_format", TEST_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
SPEC.loader.exec_module(MODULE)

for name in dir(MODULE):
    if name.startswith("Test") or name.endswith("Tests"):
        globals()[name] = getattr(MODULE, name)
@@ -0,0 +1,21 @@
from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

TEST_PATH = ROOT / "tests" / "test_parser.py"
SPEC = importlib.util.spec_from_file_location("ecospecies_api_test_parser", TEST_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
SPEC.loader.exec_module(MODULE)

for name in dir(MODULE):
    if name.startswith("Test") or name.endswith("Tests"):
        globals()[name] = getattr(MODULE, name)
@@ -0,0 +1,58 @@
from __future__ import annotations

import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from ecospecies_api import auth, repository


class ContributorAuthTests(unittest.TestCase):
    def setUp(self) -> None:
        self.tempdir = tempfile.TemporaryDirectory()
        db_path = Path(self.tempdir.name) / "test.db"
        self.engine = create_engine(f"sqlite:///{db_path}", future=True)
        self.session_local = sessionmaker(
            bind=self.engine,
            autoflush=False,
            autocommit=False,
            future=True,
        )
        self.repository_engine_patch = patch.object(repository, "create_db_engine", return_value=self.engine)
        self.repository_session_patch = patch.object(repository, "SessionLocal", self.session_local)
        self.auth_engine_patch = patch.object(auth, "create_db_engine", return_value=self.engine)
        self.auth_session_patch = patch.object(auth, "SessionLocal", self.session_local)
        self.repository_engine_patch.start()
        self.repository_session_patch.start()
        self.auth_engine_patch.start()
        self.auth_session_patch.start()

    def tearDown(self) -> None:
        self.auth_session_patch.stop()
        self.auth_engine_patch.stop()
        self.repository_session_patch.stop()
        self.repository_engine_patch.stop()
        self.engine.dispose()
        self.tempdir.cleanup()

    def test_contributor_token_resolves_to_contributor_session(self) -> None:
        registration = repository.register_contributor("author@example.org", True)

        session = auth.resolve_auth_session({"Authorization": f"Bearer {registration['token']}"})

        self.assertIsNotNone(session)
        assert session is not None
        self.assertEqual(session.username, "author@example.org")
        self.assertEqual(session.role, "contributor")

    def test_contributor_role_does_not_satisfy_editor(self) -> None:
        self.assertTrue(auth.role_satisfies("editor", "contributor"))
        self.assertFalse(auth.role_satisfies("contributor", "editor"))


if __name__ == "__main__":
    unittest.main()
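The role assertions above pin down the ordering: an editor satisfies a contributor requirement but not the reverse. A minimal rank-based sketch consistent with those assertions (the real `ecospecies_api.auth.role_satisfies` is not shown in this diff):

```python
_ROLE_RANKS = {"contributor": 1, "editor": 2, "admin": 3}


def role_satisfies(actual_role: str, required_role: str) -> bool:
    # True when the held role is at least as privileged as the required one;
    # unknown roles rank below everything.
    return _ROLE_RANKS.get(actual_role, 0) >= _ROLE_RANKS.get(required_role, 0)
```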
@@ -0,0 +1,527 @@
from __future__ import annotations

import unittest
from unittest.mock import patch

from ecospecies_api.citation_enrichment import (
    _crossref_message_to_entry,
    _datacite_item_to_entry,
    _openalex_work_to_entry,
    _render_normalized_text,
    apply_citation_candidate_selection,
    discover_citation_candidates,
    enrich_citation_payload,
    LocalBibEntry,
    LocalMetadataResolver,
    LocalResolution,
)
from ecospecies_api.citegeist_bridge import extract_draft_citation, render_single_bibtex


class CitationEnrichmentTests(unittest.TestCase):
    def test_render_normalized_text_includes_volume_number_and_pages(self) -> None:
        rendered = _render_normalized_text(
            "article",
            {
                "author": "Daniell, W.C.",
                "year": "1872",
                "title": "Letters referring to experiments of W.C. Daniell",
                "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                "volume": "2",
                "number": "4",
                "pages": "387-390",
                "doi": "10.1000/example",
            },
        )

        self.assertEqual(
            rendered,
            "Daniell, W.C. (1872). Letters referring to experiments of W.C. Daniell. Comm. Rept. U.S. Comm. Fish & Fish., 2(4): 387-390. DOI:10.1000/example",
        )

    def test_crossref_mapping_keeps_volume_issue_and_pages(self) -> None:
        entry = _crossref_message_to_entry(
            {
                "type": "journal-article",
                "title": ["Example Work"],
                "issued": {"date-parts": [[1872]]},
                "author": [{"family": "Daniell", "given": "W.C."}],
                "container-title": ["Comm. Rept. U.S. Comm. Fish & Fish."],
                "DOI": "10.1000/example",
                "URL": "https://doi.org/10.1000/example",
                "volume": "2",
                "issue": "4",
                "page": "387-390",
            }
        )

        self.assertEqual(entry.fields["volume"], "2")
        self.assertEqual(entry.fields["number"], "4")
        self.assertEqual(entry.fields["pages"], "387-390")

    def test_openalex_mapping_keeps_biblio_fields(self) -> None:
        entry = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W12345",
                "display_name": "OpenAlex Discovered Work",
                "publication_year": 2022,
                "type": "article",
                "doi": "https://doi.org/10.1000/example-openalex",
                "authorships": [{"author": {"display_name": "J S, Smith"}}],
                "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
                "biblio": {"volume": "12", "issue": "3", "first_page": "101", "last_page": "118"},
                "abstract_inverted_index": {"Graphs": [0], "support": [1], "learning": [2]},
            }
        )

        self.assertEqual(entry.fields["author"], "Smith, J. S.")
        self.assertEqual(entry.fields["volume"], "12")
        self.assertEqual(entry.fields["number"], "3")
        self.assertEqual(entry.fields["pages"], "101-118")
        self.assertEqual(entry.fields["abstract"], "Graphs support learning")

    def test_openalex_mapping_handles_null_source(self) -> None:
        entry = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W54321",
                "display_name": "OpenAlex Work Without Source",
                "publication_year": 2021,
                "type": "article",
                "doi": "https://doi.org/10.1000/example-null-source",
                "authorships": [{"author": {"display_name": "Jane Smith"}}],
                "primary_location": {"source": None},
                "biblio": {"volume": "5", "issue": "1", "first_page": "10", "last_page": "20"},
            }
        )

        self.assertEqual(entry.fields["title"], "OpenAlex Work Without Source")
        self.assertNotIn("journal", entry.fields)
        self.assertEqual(entry.fields["volume"], "5")
        self.assertEqual(entry.fields["number"], "1")
        self.assertEqual(entry.fields["pages"], "10-20")

    def test_datacite_mapping_keeps_container_and_pages(self) -> None:
        entry = _datacite_item_to_entry(
            {
                "attributes": {
                    "titles": [{"title": "DataCite Work"}],
                    "creators": [{"name": "J R, Rivera"}],
                    "publicationYear": "2021",
                    "doi": "10.1000/datacite-work",
                    "url": "https://doi.org/10.1000/datacite-work",
                    "container": "Journal of Metadata",
                    "volume": "7",
                    "issue": "2",
                    "firstPage": "44",
                    "lastPage": "59",
                    "descriptions": [
                        {"descriptionType": "Abstract", "description": "Abstract: Metadata makes reuse easier."}
                    ],
                }
            }
        )

        self.assertEqual(entry.fields["author"], "Rivera, J. R.")
        self.assertEqual(entry.fields["journal"], "Journal of Metadata")
        self.assertEqual(entry.fields["volume"], "7")
        self.assertEqual(entry.fields["number"], "2")
        self.assertEqual(entry.fields["pages"], "44-59")
        self.assertEqual(entry.fields["abstract"], "Metadata makes reuse easier.")
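    # Editorial sketch: `abstract_inverted_index` maps each word to its token
    # positions, so the abstract used in the OpenAlex test above is recovered
    # by inverting the map and joining words in position order:
    #
    #     positions = {pos: word for word, posns in index.items() for pos in posns}
    #     abstract = " ".join(word for _, word in sorted(positions.items()))
    #
    # e.g. {"Graphs": [0], "support": [1], "learning": [2]} -> "Graphs support learning"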
    def test_render_single_bibtex_preserves_balanced_braces_and_repairs_unmatched_ones(self) -> None:
        rendered = render_single_bibtex(
            "misc",
            "example",
            {
                "title": "Alpha_beta {Gamma}",
                "note": "raw_reference = {Alpha } beta}",
            },
        )

        self.assertIn("title = {Alpha_beta {Gamma}}", rendered)
        self.assertIn("note = {raw_reference = {Alpha } beta)}", rendered)

    def test_extract_draft_citation_repairs_report_style_reference_fields(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
                "year": "1872",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
                legacy_reference_number="160",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(draft.fields["author"], "Daniell, W.C")
        self.assertEqual(
            draft.fields["title"],
            "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
        )
        self.assertEqual(draft.fields["journal"], "Comm. Rept. U.S. Comm. Fish & Fish")
        self.assertEqual(draft.fields["volume"], "2")
        self.assertEqual(draft.fields["pages"], "387-390")
        self.assertEqual(draft.citation_key, "daniell1872lettersreferringexperiments")

    def test_extract_draft_citation_does_not_split_title_on_report_word(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200.",
                "year": "1999",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Smith, J. 1999. Habitat report synthesis for Alabama shad. NOAA Tech. Memo. NMFS-SEFSC-200.",
                legacy_reference_number="42",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(draft.fields["author"], "Smith, J")
        self.assertEqual(draft.fields["title"], "Habitat report synthesis for Alabama shad")
        self.assertEqual(draft.fields["howpublished"], "NOAA Tech. Memo. NMFS-SEFSC-200")
        self.assertNotIn("journal", draft.fields)

    def test_extract_draft_citation_repairs_proc_abbreviation_venue(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134.",
                "year": "1954",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Bailey, R.M., H.E. Winn and C.L. Smith. 1954. Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes. Proc. Acad. Sci. Philad. 106: 109-134.",
                legacy_reference_number="26",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(
            draft.fields["title"],
            "Fishes from the Escambia River, Alabama and Florida, with ecologic and taxonomic notes",
        )
        self.assertEqual(draft.fields["journal"], "Proc. Acad. Sci. Philad")
        self.assertEqual(draft.fields["volume"], "106")
        self.assertEqual(draft.fields["pages"], "109-134")

    def test_extract_draft_citation_repairs_occasional_paper_venue(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
                "year": "1950",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
                legacy_reference_number="41",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(
            draft.fields["title"],
            "Annotated list of the fauna of the Grand Isle region, 1928-1946",
        )
        self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
        self.assertEqual(draft.fields["volume"], "6")
        self.assertEqual(draft.fields["number"], "6")
        self.assertEqual(draft.fields["pages"], "1-66")

    def test_extract_draft_citation_repairs_partial_existing_venue_stub(self) -> None:
        class MockEntry:
            entry_type = "misc"
            citation_key = "badkey"
            fields = {
                "title": "Annotated list of the fauna of the Grand Isle region, 1928-1946",
                "year": "1950",
                "howpublished": "Occas",
                "note": "extracted_reference = {true}",
            }

        with patch(
            "ecospecies_api.citegeist_bridge._load_citegeist_extract",
            return_value=lambda text: [MockEntry()],
        ):
            draft = extract_draft_citation(
                "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
                legacy_reference_number="41",
            )

        self.assertIsNotNone(draft)
        assert draft is not None
        self.assertEqual(draft.fields["journal"], "Occas. Pap. Mar. Lab., LSU")
        self.assertEqual(draft.fields["volume"], "6")
        self.assertEqual(draft.fields["number"], "6")
        self.assertEqual(draft.fields["pages"], "1-66")
    def test_falls_back_to_internal_resolver_when_citegeist_repo_is_unavailable(self) -> None:
        class MockResolver:
            def resolve_entry(self, entry):
                class Resolution:
                    source_label = "crossref:doi:10.1000/example"

                    class Entry:
                        entry_type = "article"
                        citation_key = "doi101000example"
                        fields = {
                            "author": "Smith, Jane",
                            "year": "2024",
                            "title": "Example Work",
                            "journal": "Journal of Examples",
                            "doi": "10.1000/example",
                            "url": "https://doi.org/10.1000/example",
                        }

                    entry = Entry()

                return Resolution()

        with patch(
            "ecospecies_api.citation_enrichment._load_citegeist_resolution_components",
            return_value=(None, None, None, None),
        ):
            result = enrich_citation_payload(
                {
                    "raw_text": "Smith, Jane. 2024. Example Work.",
                    "legacy_reference_number": "7",
                },
                resolver=MockResolver(),
            )

        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(result["doi"], "10.1000/example")
        self.assertEqual(result["source_url"], "https://doi.org/10.1000/example")
        self.assertEqual(result["resolver_source_label"], "crossref:doi:10.1000/example")
        self.assertIn("ecospecies_reference_number = {7}", result["draft_bibtex"])

    def test_enrichment_replaces_raw_reference_title_and_dedupes_legacy_note(self) -> None:
        class MockResolver:
            def resolve_entry(self, entry):
                class Resolution:
                    source_label = "crossref:search:Letters referring to experiments"

                    class Entry:
                        entry_type = "article"
                        citation_key = "daniell1872lettersshadalabama"
                        fields = {
                            "author": "Daniell, W.C.",
                            "year": "1872",
                            "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
                            "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
                            "url": "",
                        }

                    entry = Entry()

                return Resolution()

        result = enrich_citation_payload(
            {
                "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
                "legacy_reference_number": "160",
                "citation_key": "daniell1948daniellwc",
            },
            resolver=MockResolver(),
        )

        self.assertEqual(result["enrichment_status"], "resolved")
        self.assertEqual(result["citation_key"], "daniell1872lettersreferringexperiments")
        self.assertIn(
            "title = {Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River}",
            result["draft_bibtex"],
        )
        self.assertIn("year = {1872}", result["draft_bibtex"])
        self.assertEqual(result["draft_bibtex"].count("ecospecies_reference_number = {160}"), 1)

    def test_enrichment_rejects_conflicting_resolved_metadata(self) -> None:
        class MockResolver:
            def resolve_entry(self, entry):
                class Resolution:
                    source_label = "crossref:search:alabama-shad-false-positive"

                    class Entry:
                        entry_type = "article"
                        citation_key = "daniell2009habitatuseage"
                        fields = {
                            "author": "Daniell, W.C.",
                            "year": "2009",
                            "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
                            "journal": "Transactions of the American Fisheries Society",
                            "doi": "10.1111/j.1600-0633.2009.00395.x",
                            "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
                            "volume": "19",
                            "number": "1",
                            "pages": "107-115",
                        }

                    entry = Entry()

                return Resolution()

        result = enrich_citation_payload(
            {
                "raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
                "legacy_reference_number": "160",
            },
            resolver=MockResolver(),
        )

        self.assertEqual(result["enrichment_status"], "unresolved")
        self.assertIn("conflicts with citation seed fields", result["enrichment_error"])

    def test_unresolved_enrichment_still_returns_refreshed_seed_fields(self) -> None:
        class MockResolver:
            def resolve_entry(self, entry):
                return None

        result = enrich_citation_payload(
            {
                "raw_text": "Behre, E.H. 1950. Annotated list of the fauna of the Grand Isle region, 1928-1946. Occas. Pap. Mar. Lab., LSU 6(6): 1-66.",
                "legacy_reference_number": "41",
                "citation_key": "oldbadkey",
                "entry_type": "misc",
            },
            resolver=MockResolver(),
        )

        self.assertEqual(result["enrichment_status"], "unresolved")
        self.assertEqual(result["citation_key"], "behre1950annotatedlistfauna")
        self.assertIn("title = {Annotated list of the fauna of the Grand Isle region, 1928-1946}", result["draft_bibtex"])
        self.assertIn("Occas. Pap. Mar. Lab., LSU", result["draft_bibtex"])
def test_discover_citation_candidates_returns_scored_candidates(self) -> None:
|
||||||
|
class MockResolver:
|
||||||
|
def search_crossref_candidates(self, title):
|
||||||
|
return [
|
||||||
|
LocalResolution(
|
||||||
|
LocalBibEntry(
|
||||||
|
"article",
|
||||||
|
"daniell1872lettersreferringexperiments",
|
||||||
|
{
|
||||||
|
"author": "Daniell, W.C.",
|
||||||
|
"year": "1872",
|
||||||
|
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
|
||||||
|
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
|
||||||
|
"volume": "2",
|
||||||
|
"pages": "387-390",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
"crossref:search:1:daniell-good",
|
||||||
|
),
|
||||||
|
LocalResolution(
|
||||||
|
LocalBibEntry(
|
||||||
|
"article",
|
||||||
|
"daniell2009habitatuseage",
|
||||||
|
{
|
||||||
|
"author": "Daniell, W.C.",
|
||||||
|
"year": "2009",
|
||||||
|
"title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
|
||||||
|
"journal": "Transactions of the American Fisheries Society",
|
||||||
|
"volume": "19",
|
||||||
|
"number": "1",
|
||||||
|
"pages": "107-115",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
"crossref:search:2:daniell-bad",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
def search_datacite_candidates(self, title):
|
||||||
|
return []
|
||||||
|
|
||||||
|
def search_openalex_candidates(self, title):
|
||||||
|
return []
|
||||||
|
|
||||||
|
result = discover_citation_candidates(
|
||||||
|
{
|
||||||
|
"raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
|
||||||
|
"legacy_reference_number": "160",
|
||||||
|
},
|
||||||
|
resolver=MockResolver(),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(result["candidate_count"], 2)
|
||||||
|
self.assertGreater(result["candidates"][0]["score"], result["candidates"][1]["score"])
|
||||||
|
self.assertEqual(result["candidates"][0]["field_matches"]["year"]["status"], "exact")
|
||||||
|
self.assertEqual(result["candidates"][1]["field_matches"]["year"]["status"], "conflict")
|
||||||
|
|
||||||
|
def test_local_crossref_candidate_search_filters_placeholder_stub_entries(self) -> None:
|
||||||
|
resolver = LocalMetadataResolver()
|
||||||
|
resolver._safe_get_json = lambda url: {
|
||||||
|
"message": {
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"type": "journal-article",
|
||||||
|
"title": ["Referenced work 1"],
|
||||||
|
"issued": {"date-parts": [[2020]]},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "journal-article",
|
||||||
|
"title": ["Useful Paper"],
|
||||||
|
"issued": {"date-parts": [[2020]]},
|
||||||
|
"author": [{"family": "Smith", "given": "J S"}],
|
||||||
|
"container-title": ["Journal of Examples"],
|
||||||
|
"DOI": "10.1000/useful",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
results = resolver.search_crossref_candidates("Useful Paper")
|
||||||
|
|
||||||
|
self.assertEqual(len(results), 1)
|
||||||
|
self.assertEqual(results[0].entry.fields["title"], "Useful Paper")
|
||||||
|
|
||||||
|
def test_apply_citation_candidate_selection_uses_selected_candidate_fields(self) -> None:
|
||||||
|
result = apply_citation_candidate_selection(
|
||||||
|
{
|
||||||
|
"raw_text": "Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.",
|
||||||
|
"legacy_reference_number": "160",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source_label": "crossref:search:1:daniell-good",
|
||||||
|
"entry_type": "article",
|
||||||
|
"fields": {
|
||||||
|
"author": "Daniell, W.C.",
|
||||||
|
"year": "1872",
|
||||||
|
"title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
|
||||||
|
"journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
|
||||||
|
"volume": "2",
|
||||||
|
"pages": "387-390",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(result["enrichment_status"], "resolved")
|
||||||
|
self.assertEqual(result["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good")
|
||||||
|
self.assertIn("Comm. Rept. U.S. Comm. Fish & Fish., 2: 387-390", result["normalized_text"])
|
||||||
|
|
@@ -0,0 +1,195 @@
from __future__ import annotations

import json
import unittest

from ecospecies_api.document_format import (
    DocumentNode,
    StructuredDocument,
    build_document_from_species_payload,
    extract_citation_entries,
    extract_species_projection,
    export_markdown_document,
    parse_markdown_document,
    validate_markdown_document,
)


class StructuredMarkdownTests(unittest.TestCase):
    def test_round_trip_markdown_preserves_metadata_and_hierarchy(self) -> None:
        source = """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
taxon_identifiers:
  - authority: worms
    identifier: 159059
    label: AphiaID
    primary: true
primary_taxon_authority: worms
---

## Summary
Short abstract.

## Habitat

### Type
Estuarine.
"""

        document = parse_markdown_document(source)

        self.assertEqual(document.metadata["title"], "American Oyster")
        self.assertEqual(document.metadata["primary_taxon_authority"], "worms")
        self.assertEqual(document.metadata["legacy_identifiers"][0]["identifier"], "5192")
        self.assertEqual(document.metadata["taxon_identifiers"][0]["authority"], "worms")
        self.assertEqual(document.nodes[0].title, "Summary")
        self.assertEqual(document.nodes[1].children[0].title, "Type")
        self.assertIn("## Habitat", export_markdown_document(document))

    def test_build_document_from_species_payload_creates_markdown_sections(self) -> None:
        document = build_document_from_species_payload(
            {
                "title": "American Oyster",
                "common_name": "American Oyster",
                "scientific_name": "Crassostrea virginica",
                "flelmr_code": "5192",
                "source_file": "American Oyster.txt",
                "summary": "Short abstract.",
                "sections": [
                    {"heading": "HEADER", "content": "Ignored header"},
                    {"heading": "Habitat", "content": "Estuarine."},
                    {"heading": "Reproduction", "content": "Broadcast spawner."},
                ],
            }
        )

        self.assertEqual(document.metadata["legacy_identifiers"][0]["identifier"], "5192")
        self.assertEqual(document.metadata["legacy_identifiers"][0]["authority"], "legacy-ecospecies")
        self.assertEqual([node.title for node in document.nodes], ["Summary", "Habitat", "Reproduction"])
        self.assertEqual(document.nodes[1].body, "Estuarine.")

    def test_extract_species_projection_flattens_nested_headings(self) -> None:
        document = parse_markdown_document(
            """---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
---

## Summary
Short abstract.

## Habitat
General habitat.

### Type
Estuarine.
"""
        )

        projection = extract_species_projection(document)

        self.assertEqual(projection["summary"], "Short abstract.")
        self.assertEqual(projection["flelmr_code"], "5192")
        self.assertEqual(
            [section["heading"] for section in projection["sections"]],
            ["Habitat", "Habitat / Type"],
        )

    def test_extract_species_projection_accepts_legacy_species_code_front_matter(self) -> None:
        document = parse_markdown_document(
            """---
title: Legacy Fish
common_name: Legacy Fish
scientific_name: Pisces historicus
species_code: 4242
---

## Habitat
Estuarine.
"""
        )

        projection = extract_species_projection(document)

        self.assertEqual(projection["flelmr_code"], "4242")

    def test_validate_markdown_document_rejects_missing_front_matter_and_depth_jump(self) -> None:
        errors = validate_markdown_document(
            """## Habitat
Text

#### Type
Nested too deeply.
"""
        )

        self.assertTrue(any("front matter" in error for error in errors))
        self.assertTrue(any("Heading depth jumps" in error for error in errors))

    def test_extract_citation_entries_strips_legacy_comma_number_prefix(self) -> None:
        document = parse_markdown_document(
            """---
title: Alabama Shad
common_name: Alabama Shad
scientific_name: Alosa alabamae
---

## References
160, Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
"""
        )

        citations = extract_citation_entries(document)

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "160")
        self.assertTrue(citations[0]["raw_text"].startswith("Daniell, W.C. 1872."))
        self.assertFalse(citations[0]["raw_text"].startswith("160,"))

    def test_extract_citation_entries_accepts_colon_terminated_citation_heading(self) -> None:
        citations = extract_citation_entries(
            StructuredDocument(
                metadata={},
                nodes=[
                    DocumentNode(
                        node_type="section",
                        title="Citations:",
                        body="7, Ahmed, M. 1975. Speciation in living oysters.",
                        depth=2,
                    )
                ],
            )
        )

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "7")

    def test_extract_citation_entries_accepts_bare_number_prefix(self) -> None:
        document = parse_markdown_document(
            """---
title: Eastern Mosquitofish
common_name: Eastern Mosquitofish
scientific_name: Gambusia holbrooki
---

## Citations
848 Gilmore, R.G. 1977. Fishes of the Indian River Lagoon and adjacent waters, Florida.
"""
        )

        citations = extract_citation_entries(document)

        self.assertEqual(len(citations), 1)
        self.assertEqual(citations[0]["legacy_reference_number"], "848")
        self.assertTrue(citations[0]["raw_text"].startswith("Gilmore, R.G. 1977."))
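

# Across these tests, the legacy prefix may be comma-separated ("160,") or a
# bare number ("848 "); extract_citation_entries strips either form and
# records it as legacy_reference_number.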
@@ -0,0 +1,109 @@
from __future__ import annotations

import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from ecospecies_api import parser


class ParserPathResolutionTests(unittest.TestCase):
    def test_ecospecies_code_is_treated_as_flelmr_code(self) -> None:
        metadata = parser.extract_metadata(
            [
                "Title: Test Fish",
                "EcoSpecies Code: 4242",
            ]
        )

        self.assertEqual(metadata["ecospecies code"], "4242")
        self.assertEqual(metadata["flelmr"], "4242")

    def test_title_case_headings_are_split_into_sections(self) -> None:
        sections = parser.split_sections(
            [
                "Species profile: American oyster (Crassostrea virginica)",
                "",
                "Classification",
                " Phylum: Mollusca",
                "Value",
                "Commercial: Important fishery.",
                "Habitat",
                "Type: Estuarine.",
            ]
        )

        self.assertEqual(
            [section.heading for section in sections],
            ["HEADER", "Classification", "Value", "Habitat"],
        )

    def test_colon_terminated_title_case_headings_are_split_into_sections(self) -> None:
        sections = parser.split_sections(
            [
                "Ecological Interactions and Notes",
                "Predator text.",
                "",
                "Reference Numbers:",
                "",
                "Citations:",
                "7, Ahmed, M. 1975. Speciation in living oysters.",
            ]
        )

        self.assertEqual(
            [section.heading for section in sections],
            ["HEADER", "Citations"],
        )

    def test_default_data_dir_uses_in_repo_path_without_spaces(self) -> None:
        with patch.dict("os.environ", {}, clear=True):
            resolved = Path(parser.get_default_data_dir())

        self.assertEqual(resolved, parser.get_repo_root() / "input-data" / "InputFiles")

    def test_relative_override_must_stay_within_repo(self) -> None:
        with self.assertRaisesRegex(ValueError, "within the codebase directory"):
            parser.resolve_data_dir("../input-data/InputFiles")

    def test_absolute_override_outside_repo_is_rejected(self) -> None:
        with tempfile.TemporaryDirectory() as tempdir:
            with self.assertRaisesRegex(ValueError, "within the codebase directory"):
                parser.resolve_data_dir(tempdir)

    def test_directory_names_with_spaces_are_rejected(self) -> None:
        with self.assertRaisesRegex(ValueError, "unsafe directory name"):
            parser.resolve_data_dir("input-data/Bad Name")

    def test_directory_names_with_special_characters_are_rejected(self) -> None:
        with self.assertRaisesRegex(ValueError, "unsafe directory name"):
            parser.resolve_data_dir("input-data/bad@name")
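
    # Taken together, the checks above imply resolve_data_dir accepts only
    # repo-relative paths with conservative directory names (no spaces or
    # characters like "@"); the exact allowlist is inferred from these tests.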

    def test_load_species_records_resolves_repo_relative_paths(self) -> None:
        records = parser.load_species_records("input-data/InputFiles")

        self.assertGreater(len(records), 0)

    def test_duplicate_source_records_receive_unique_stable_slugs(self) -> None:
        records = parser.load_species_records("input-data/InputFiles")
        slug_by_source = {record.source_file: record.slug for record in records}

        self.assertEqual(len(records), len(set(record.slug for record in records)))
        self.assertEqual(
            slug_by_source["Red Snapper_SLH_Outline2012_0722.txt"],
            "red-snapper-red-snapper-slh-outline2012-0722",
        )
        self.assertEqual(
            slug_by_source["RedSnapper_SLH_2012_0830_combined.txt"],
            "red-snapper-redsnapper-slh-2012-0830-combined",
        )
        self.assertEqual(
            slug_by_source["Sailfin Molly SLH RGG.txt"],
            "sailfin-molly-sailfin-molly-slh-rgg",
        )
        self.assertTrue(
            slug_by_source["Sailfin_Molly SLH RGG.txt"].startswith(
                "sailfin-molly-sailfin-molly-slh-rgg-"
            )
        )

@@ -112,6 +112,35 @@ class RepositoryWorkflowTests(unittest.TestCase):
         self.assertEqual(detail["section_count"], 2)
         self.assertEqual([section["position"] for section in detail["sections"]], [1, 2])
         self.assertEqual([item["code"] for item in detail["diagnostics"]], ["missing_citations"])
+        self.assertEqual(
+            detail["legacy_identifiers"],
+            [
+                {
+                    "authority": "legacy-ecospecies",
+                    "identifier": "9999",
+                    "label": "FLELMR",
+                }
+            ],
+        )
+
+    def test_species_detail_includes_structured_document_and_legacy_source(self) -> None:
+        input_dir = Path(self.tempdir.name) / "input-data" / "InputFiles"
+        input_dir.mkdir(parents=True, exist_ok=True)
+        (input_dir / "Test Shad.txt").write_text("HEADER\nLegacy header content\n", encoding="utf-8")
+
+        with patch.object(repository, "get_default_data_dir", return_value=str(input_dir)):
+            detail = repository.get_species_by_slug("test-shad")
+
+        self.assertIsNotNone(detail)
+        assert detail is not None
+        self.assertEqual(detail["structured_document"]["source_format"], "ecospecies-markdown-v1")
+        self.assertIn(
+            "HABITAT",
+            [node["title"] for node in detail["structured_document"]["ast"]["nodes"]],
+        )
+        self.assertEqual(detail["legacy_source"]["source_file"], "Test Shad.txt")
+        self.assertIn("Legacy header content", detail["legacy_source"]["text"])
+        self.assertEqual(detail["taxon_identifiers"], [])
+
     def test_editorial_update_changes_publication_visibility_and_creates_audit(self) -> None:
         result = repository.update_species_editorial(
@@ -207,6 +236,60 @@ class RepositoryWorkflowTests(unittest.TestCase):
         self.assertEqual(len(audit), 2)
         self.assertEqual([entry["action"] for entry in audit], ["section_update", "editorial_update"])
+
+    def test_reimport_preserves_persisted_taxon_identifiers(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad
+common_name: Test Shad
+scientific_name: Alosa testus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 9999
+    label: FLELMR
+taxon_identifiers:
+  - authority: gbif
+    identifier: 12345
+    label: taxonKey
+    primary: true
+primary_taxon_authority: gbif
+---
+
+## Summary
+Taxon-reviewed summary.
+""",
+            username="edith",
+        )
+
+        repository.import_species_payload(UPDATED_PAYLOAD)
+
+        detail = repository.get_editor_species_detail("test-shad")
+
+        self.assertIsNotNone(detail)
+        self.assertEqual(detail["primary_taxon_authority"], "gbif")
+        self.assertEqual(
+            detail["primary_taxon_identifier"],
+            {
+                "authority": "gbif",
+                "identifier": "12345",
+                "label": "taxonKey",
+                "primary": True,
+                "source_url": "",
+            },
+        )
+        self.assertEqual(
+            detail["taxon_identifiers"],
+            [
+                {
+                    "authority": "gbif",
+                    "identifier": "12345",
+                    "label": "taxonKey",
+                    "primary": True,
+                    "source_url": "",
+                }
+            ],
+        )
+
     def test_reimport_updates_summary_when_no_editorial_override_exists(self) -> None:
         repository.import_species_payload(UPDATED_PAYLOAD)
@@ -302,6 +385,583 @@ class RepositoryWorkflowTests(unittest.TestCase):
         self.assertEqual(audit[0]["action"], "import_restore")
         self.assertEqual(audit[0]["details"]["is_archived"], {"from": True, "to": False})
+
+    def test_document_markdown_update_refreshes_flat_projection(self) -> None:
+        result = repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+species_code: 4242
+---
+
+## Summary
+Markdown summary.
+
+## Habitat
+Open water.
+
+### Type
+Pelagic.
+""",
+            username="frank",
+        )
+
+        detail = repository.get_editor_species_detail("test-shad")
+        document = repository.get_species_document("test-shad")
+        audit = repository.list_species_audit("test-shad")
+
+        self.assertIsNotNone(result)
+        self.assertIsNotNone(detail)
+        self.assertIsNotNone(document)
+        self.assertEqual(detail["title"], "Test Shad Markdown")
+        self.assertEqual(detail["scientific_name"], "Alosa markdownus")
+        self.assertEqual(detail["flelmr_code"], "4242")
+        self.assertEqual(detail["summary"], "Markdown summary.")
+        self.assertEqual(
+            [section["heading"] for section in detail["sections"]],
+            ["Habitat", "Habitat / Type"],
+        )
+        self.assertEqual(document["updated_by"], "frank")
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "document_update")
+
+    def test_document_markdown_update_extracts_citations(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## Summary
+Markdown summary.
+
+## References
+
+- Smith, J. 2024. Example paper. doi:10.1000/example-doi
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+
+        detail = repository.get_editor_species_detail("test-shad")
+
+        self.assertIsNotNone(detail)
+        self.assertEqual(detail["citation_count"], 2)
+        self.assertEqual(detail["citations"][0]["section_heading"], "References")
+        self.assertEqual(detail["citations"][0]["legacy_reference_number"], "")
+        self.assertEqual(detail["citations"][0]["doi"], "10.1000/example-doi")
+        self.assertTrue(detail["citations"][0]["citation_key"])
+        self.assertIn("@", detail["citations"][0]["draft_bibtex"])
+        self.assertEqual(detail["citations"][0]["review_status"], "draft")
+        self.assertEqual(detail["citations"][1]["legacy_reference_number"], "7")
+        self.assertEqual(detail["citations"][1]["doi"], "")
+        self.assertIn("ecospecies_reference_number = {7}", detail["citations"][1]["draft_bibtex"])
+
+    def test_editor_can_review_citations_and_reviews_survive_document_save(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        citation = citations["citations"][0]
+
+        result = repository.update_species_citation_review(
+            slug="test-shad",
+            citation_id=citation["id"],
+            review_status="accepted",
+            normalized_text="Jones, A. (2022). Fisheries review.",
+            doi="10.1000/review-doi",
+            citation_key="jones2022review",
+            entry_type="article",
+            draft_bibtex="@article{jones2022review,\n doi = {10.1000/review-doi}\n}",
+            username="edith",
+        )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation"]["review_status"], "accepted")
+        self.assertEqual(result["citation"]["source_type"], "editor_review")
+
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+
+        citations = repository.get_editor_species_citations("test-shad")
+        audit = repository.list_species_audit("test-shad")
+
+        self.assertIsNotNone(citations)
+        self.assertEqual(citations["citation_count"], 1)
+        self.assertEqual(citations["citations"][0]["review_status"], "accepted")
+        self.assertEqual(citations["citations"][0]["doi"], "10.1000/review-doi")
+        self.assertEqual(citations["citations"][0]["citation_key"], "jones2022review")
+        self.assertEqual(citations["citations"][0]["entry_type"], "article")
+        self.assertIn("10.1000/review-doi", citations["citations"][0]["draft_bibtex"])
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[1]["action"], "citation_review_update")
+
+    def test_editor_can_run_citation_enrichment(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        citation = citations["citations"][0]
+
+        with patch.object(
+            repository,
+            "enrich_citation_payload",
+            return_value={
+                "citation_key": "jones2022review",
+                "entry_type": "article",
+                "normalized_text": "Jones, A. (2022). Fisheries review. Journal of Tests. DOI:10.1000/review-doi",
+                "draft_bibtex": "@article{jones2022review,\n doi = {10.1000/review-doi},\n}",
+                "doi": "10.1000/review-doi",
+                "source_url": "https://doi.org/10.1000/review-doi",
+                "openalex_id": "W12345",
+                "resolver_source_label": "crossref:doi:10.1000/review-doi",
+                "enrichment_status": "resolved",
+                "enrichment_error": "",
+                "conflicts": [],
+            },
+        ):
+            result = repository.update_species_citation_enrichment(
+                slug="test-shad",
+                citation_id=citation["id"],
+                username="edith",
+            )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation"]["enrichment_status"], "resolved")
+        self.assertEqual(result["citation"]["doi"], "10.1000/review-doi")
+        self.assertEqual(result["citation"]["openalex_id"], "W12345")
+        self.assertEqual(result["citation"]["resolver_source_label"], "crossref:doi:10.1000/review-doi")
+        self.assertEqual(result["citation"]["source_url"], "https://doi.org/10.1000/review-doi")
+
+        citations = repository.get_editor_species_citations("test-shad")
+        audit = repository.list_species_audit("test-shad")
+
+        self.assertIsNotNone(citations)
+        self.assertEqual(citations["citations"][0]["citation_key"], "jones2022review")
+        self.assertEqual(citations["citations"][0]["entry_type"], "article")
+        self.assertEqual(citations["citations"][0]["enrichment_status"], "resolved")
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "citation_enrichment")
+
+    def test_editor_can_run_batch_citation_enrichment(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+- [8] Smith, B. 2021. Estuarine habitat paper.
+""",
+            username="frank",
+        )
+
+        payloads = [
+            {
+                "citation_key": "jones2022review",
+                "entry_type": "article",
+                "normalized_text": "Jones, A. (2022). Fisheries review.",
+                "draft_bibtex": "@article{jones2022review,\n}",
+                "doi": "10.1000/review-doi",
+                "source_url": "https://doi.org/10.1000/review-doi",
+                "openalex_id": "W12345",
+                "resolver_source_label": "crossref:doi:10.1000/review-doi",
+                "enrichment_status": "resolved",
+                "enrichment_error": "",
+                "conflicts": [],
+            },
+            {
+                "citation_key": "smith2021estuarine",
+                "entry_type": "misc",
+                "normalized_text": "",
+                "draft_bibtex": "",
+                "doi": "",
+                "source_url": "",
+                "openalex_id": "",
+                "resolver_source_label": "",
+                "enrichment_status": "unresolved",
+                "enrichment_error": "No metadata match found from DOI, title, or authority identifiers.",
+                "conflicts": [],
+            },
+        ]
+
+        with patch.object(repository, "enrich_citation_payload", side_effect=payloads):
+            result = repository.update_species_citations_enrichment_batch(
+                slug="test-shad",
+                username="edith",
+            )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation_count"], 2)
+        self.assertEqual(result["changed_count"], 2)
+        self.assertEqual(result["resolved_count"], 1)
+        self.assertEqual(result["unresolved_count"], 1)
+        self.assertEqual(result["error_count"], 0)
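+        # One resolved payload and one unresolved payload both count as
+        # "changed"; an unresolved lookup is a normal outcome, not an error.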
+
+    def test_editor_can_review_and_apply_citation_candidates(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
+""",
+            username="frank",
+        )
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        citation = citations["citations"][0]
+
+        with patch.object(
+            repository,
+            "discover_citation_candidates",
+            return_value={
+                "seed": {
+                    "fields": {
+                        "author": "Daniell, W.C.",
+                        "year": "1872",
+                        "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
+                        "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
+                        "volume": "2",
+                        "pages": "387-390",
+                    }
+                },
+                "candidate_count": 1,
+                "candidates": [
+                    {
+                        "candidate_id": "crossref-search-1-daniell-good",
+                        "source_label": "crossref:search:1:daniell-good",
+                        "entry_type": "article",
+                        "citation_key": "daniell1872lettersreferringexperiments",
+                        "fields": {
+                            "author": "Daniell, W.C.",
+                            "year": "1872",
+                            "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
+                            "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
+                            "volume": "2",
+                            "pages": "387-390",
+                        },
+                    }
+                ],
+            },
+        ):
+            candidates = repository.get_species_citation_candidates("test-shad", citation["id"])
+
+        self.assertIsNotNone(candidates)
+        self.assertEqual(candidates["candidate_count"], 1)
+
+        result = repository.apply_species_citation_candidate_selection(
+            slug="test-shad",
+            citation_id=citation["id"],
+            candidate={
+                "source_label": "crossref:search:1:daniell-good",
+                "entry_type": "article",
+                "fields": {
+                    "author": "Daniell, W.C.",
+                    "year": "1872",
+                    "title": "Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River",
+                    "journal": "Comm. Rept. U.S. Comm. Fish & Fish.",
+                    "volume": "2",
+                    "pages": "387-390",
+                },
+            },
+            username="edith",
+        )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation"]["resolver_source_label"], "editor:selected:crossref:search:1:daniell-good")
+        self.assertEqual(result["citation"]["source_type"], "editor_selected_candidate")
+        self.assertEqual(result["citation"]["review_status"], "accepted")
+        audit = repository.list_species_audit("test-shad")
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "citation_candidate_selection")
+
+    def test_editor_can_add_candidate_as_additional_citation_and_preserve_it(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Daniell, W.C. 1872. Letters referring to experiments of W.C. Daniell, M.D., in introducing shad into the Alabama River. Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.
+""",
+            username="frank",
+        )
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        source_citation = citations["citations"][0]
+
+        result = repository.add_species_citation_from_candidate(
+            slug="test-shad",
+            citation_id=source_citation["id"],
+            candidate={
+                "source_label": "crossref:search:1:daniell-related",
+                "entry_type": "article",
+                "fields": {
+                    "author": "Jordan, F.",
+                    "year": "2009",
+                    "title": "Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA",
+                    "journal": "Transactions of the American Fisheries Society",
+                    "volume": "19",
+                    "number": "1",
+                    "pages": "107-115",
+                    "doi": "10.1111/j.1600-0633.2009.00395.x",
+                    "url": "https://doi.org/10.1111/j.1600-0633.2009.00395.x",
+                },
+            },
+            username="edith",
+        )
+
+        self.assertIsNotNone(result)
+        self.assertEqual(result["citation"]["source_type"], "editor_added_candidate")
+        self.assertEqual(result["citation"]["review_status"], "accepted")
+
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        self.assertEqual(citations["citation_count"], 2)
+        self.assertEqual(citations["citations"][1]["section_heading"], "References")
+        document = repository.get_species_document("test-shad")
+        self.assertIsNotNone(document)
+        self.assertIn("Habitat use of age 0 Alabama shad in the Pascagoula River drainage, USA", document["markdown"])
+
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown=document["markdown"],
+            username="frank",
+        )
+
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        self.assertEqual(citations["citation_count"], 2)
+        self.assertEqual(citations["citations"][1]["source_type"], "editor_added_candidate")
+        audit = repository.list_species_audit("test-shad")
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "document_update")
+        self.assertEqual(audit[1]["action"], "citation_candidate_addition")
+
+    def test_contributor_can_view_only_owned_citations(self) -> None:
+        created = repository.create_contributor_species(
+            "writer@example.org",
+            """---
+title: Contributor Draft
+common_name: Contributor Fish
+scientific_name: Pisces contributoris
+species_code:
+---
+
+## References
+
+- [12] Example, A. 2025. Draft reference.
+""",
+        )
+
+        owned = repository.get_contributor_species_citations(created["slug"], "writer@example.org")
+        other = repository.get_contributor_species_citations(created["slug"], "other@example.org")
+
+        self.assertIsNotNone(owned)
+        self.assertEqual(owned["citation_count"], 1)
+        self.assertEqual(owned["citations"][0]["legacy_reference_number"], "12")
+        self.assertIsNone(other)
+
+    def test_public_bibliography_aggregates_species_citations(self) -> None:
+        repository.update_species_document_markdown(
+            slug="test-shad",
+            markdown="""---
+title: Test Shad Markdown
+common_name: Test Shad
+scientific_name: Alosa markdownus
+legacy_identifiers:
+  - authority: legacy-ecospecies
+    identifier: 4242
+    label: FLELMR
+---
+
+## References
+
+- [7] Jones, A. 2022. Fisheries review.
+""",
+            username="frank",
+        )
+
+        citations = repository.get_editor_species_citations("test-shad")
+        self.assertIsNotNone(citations)
+        citation = citations["citations"][0]
+        repository.update_species_citation_review(
+            slug="test-shad",
+            citation_id=citation["id"],
+            review_status="accepted",
+            normalized_text="Jones, A. (2022). Fisheries review.",
+            doi="10.1000/review-doi",
+            citation_key="jones2022review",
+            entry_type="article",
+            draft_bibtex="@article{jones2022review,\n doi = {10.1000/review-doi}\n}",
+            username="edith",
+            abstract_text="A short abstract about fisheries review.",
+        )
+
+        bibliography = repository.list_public_bibliography()
+
+        self.assertEqual(len(bibliography), 1)
+        self.assertEqual(bibliography[0]["citation_key"], "jones2022review")
+        self.assertEqual(bibliography[0]["abstract_text"], "A short abstract about fisheries review.")
+        self.assertEqual(bibliography[0]["legacy_reference_numbers"], ["7"])
+        self.assertEqual(bibliography[0]["species_count"], 1)
+        self.assertEqual(bibliography[0]["species_refs"][0]["slug"], "test-shad")
+
+    def test_register_contributor_creates_token_and_enforces_age_gate(self) -> None:
+        with self.assertRaisesRegex(ValueError, "at least 13 years old"):
+            repository.register_contributor("person@example.org", False)
+
+        result = repository.register_contributor("Person@Example.org", True)
+
+        self.assertEqual(result["username"], "person@example.org")
+        self.assertEqual(result["role"], "contributor")
+        self.assertEqual(result["minimum_age"], 13)
+        self.assertTrue(result["token"])
+
+    def test_contributor_can_create_and_edit_only_owned_species(self) -> None:
+        created = repository.create_contributor_species(
+            "writer@example.org",
+            """---
+title: Contributor Draft
+common_name: Contributor Fish
+scientific_name: Pisces contributoris
+species_code:
+---
+
+## Summary
+Draft summary.
+
+## Habitat
+Mangroves.
+""",
+        )
+
+        detail = repository.get_contributor_species_detail(created["slug"], "writer@example.org")
+        public_detail = repository.get_species_by_slug(created["slug"])
+
+        self.assertIsNotNone(detail)
+        self.assertIsNone(public_detail)
+        self.assertEqual(detail["publication_status"], "draft")
+        self.assertEqual(detail["common_name"], "Contributor Fish")
+
+        updated = repository.update_contributor_species_document_markdown(
+            created["slug"],
+            """---
+title: Contributor Draft Revised
+common_name: Contributor Fish
+scientific_name: Pisces contributoris
+species_code:
+---
+
+## Summary
+Revised summary.
+
+## Habitat
+Seagrass.
+
+### Depth
+Shallow bays.
+""",
+            "writer@example.org",
+        )
+
+        self.assertIsNotNone(updated)
+        detail = repository.get_contributor_species_detail(created["slug"], "writer@example.org")
+        other_user_detail = repository.get_contributor_species_detail(created["slug"], "other@example.org")
+        audit = repository.list_species_audit(created["slug"])
+
+        self.assertIsNotNone(detail)
+        self.assertEqual(detail["summary"], "Revised summary.")
+        self.assertEqual(
+            [section["heading"] for section in detail["sections"]],
+            ["Habitat", "Habitat / Depth"],
+        )
+        self.assertIsNone(other_user_detail)
+        self.assertIsNotNone(audit)
+        self.assertEqual(audit[0]["action"], "contributor_document_update")
+
+
 if __name__ == "__main__":
     unittest.main()
apps/web/app.js (1111 changed lines)
File diff suppressed because it is too large
@@ -0,0 +1,43 @@
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>EcoSpecies Bibliography</title>
  <link rel="stylesheet" href="./styles.css">
</head>
<body>
  <header class="site-header">
    <div class="site-header-inner">
      <div class="site-brand">
        <p class="site-brand-mark">Open Species Archive</p>
        <a href="./index.html" class="site-brand-link">EcoSpecies Atlas</a>
        <p class="site-brand-summary">Public field atlas for historical species life-history materials.</p>
      </div>
      <nav class="site-nav" aria-label="Primary">
        <a href="./index.html">Atlas</a>
        <a href="./bibliography.html">Bibliography</a>
      </nav>
    </div>
  </header>
  <main class="page">
    <section class="hero">
      <p class="eyebrow">EcoSpecies Atlas</p>
      <h1>Bibliography</h1>
      <p class="lede">
        A site-wide bibliography for the EcoSpecies atlas, including imported references and citations added during review.
      </p>
      <div class="auth-bar auth-panel-row">
        <input id="bibliography-search" type="search" placeholder="Search title, author, DOI, or abstract">
        <button id="bibliography-download" type="button" class="secondary-button">Download BibTeX</button>
        <p id="bibliography-status" class="auth-status">Loading bibliography...</p>
      </div>
    </section>

    <section class="panel">
      <div id="bibliography-list" class="public-citation-list"></div>
    </section>
  </main>
  <script src="./bibliography.js" defer></script>
</body>
</html>
@@ -0,0 +1,230 @@
function getAppBase() {
  const { pathname } = window.location;
  if (pathname === "/" || pathname === "/index.html") {
    return "";
  }
  if (pathname.endsWith("/index.html")) {
    return pathname.slice(0, -"/index.html".length);
  }
  return pathname.endsWith("/") ? pathname.slice(0, -1) : pathname;
}
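
// Illustrative behavior, inferred from the logic above (not in the original file):
//   "/bibliography.html"                 -> "/bibliography.html"
//   "/apps/ecospecies/bibliography.html" -> "/apps/ecospecies/bibliography.html"
// apiBase below then strips the trailing /bibliography.html, so a
// path-prefixed deployment yields apiBase = "/apps/ecospecies".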

const apiBase = getAppBase().replace(/\/bibliography\.html$/, "");
const bibliographyList = document.querySelector("#bibliography-list");
const bibliographySearch = document.querySelector("#bibliography-search");
const bibliographyStatus = document.querySelector("#bibliography-status");
const bibliographyDownload = document.querySelector("#bibliography-download");
let currentBibliographyItems = [];

function escapeHtml(value) {
  return String(value)
    .replaceAll("&", "&amp;")
    .replaceAll('"', "&quot;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;");
}
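
// escapeHtml is applied to every untrusted string before it is interpolated
// into the innerHTML templates below.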

function normalizeAbstractForDisplay(value) {
  const raw = String(value || "").trim();
  if (!raw) {
    return "";
  }
  const temp = document.createElement("div");
  temp.innerHTML = raw;
  return temp.textContent
    .replace(/^abstract\s*[:.\-]?\s*/i, "")
    .replace(/\s+/g, " ")
    .trim();
}

function parseBibtexFields(draftBibtex) {
  const fields = {};
  const text = String(draftBibtex || "");
  const pattern = /([a-zA-Z_]+)\s*=\s*\{([^}]*)\}/g;
  let match = pattern.exec(text);
  while (match) {
    fields[match[1].toLowerCase()] = match[2].trim();
    match = pattern.exec(text);
  }
  return fields;
}
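
// Example (illustrative): parseBibtexFields("@article{x,\n  year = {2022}\n}")
// yields { year: "2022" }. Values containing nested braces are not captured
// by this simple pattern.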

function collectBibtexRecords(items) {
  const seen = new Set();
  const records = [];
  for (const item of items || []) {
    const draftBibtex = String(item && item.draft_bibtex ? item.draft_bibtex : "").trim();
    if (!draftBibtex || seen.has(draftBibtex)) {
      continue;
    }
    seen.add(draftBibtex);
    records.push(draftBibtex);
  }
  return records;
}

function downloadBibtexRecords(items, filenameStem) {
  const records = collectBibtexRecords(items);
  if (!records.length) {
    return false;
  }
  const blob = new Blob([`${records.join("\n\n")}\n`], { type: "application/x-bibtex;charset=utf-8" });
  const url = URL.createObjectURL(blob);
  const link = document.createElement("a");
  link.href = url;
  link.download = `${filenameStem}.bib`;
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  window.setTimeout(() => URL.revokeObjectURL(url), 0);
  return true;
}
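
// Standard client-side download pattern: serialize to a Blob, click a
// temporary object-URL link, then revoke the URL on the next tick.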

function syncDownloadButton(items) {
  if (!bibliographyDownload) {
    return;
  }
  const recordCount = collectBibtexRecords(items).length;
  bibliographyDownload.disabled = !recordCount;
  bibliographyDownload.textContent = recordCount
    ? `Download BibTeX (${recordCount})`
    : "Download BibTeX";
}

function buildCitationText(item) {
  const fields = parseBibtexFields(item.draft_bibtex || "");
  if (item.normalized_text) {
    return escapeHtml(item.normalized_text);
  }
  const author = fields.author || "";
  const year = fields.year || "";
  const title = fields.title || "";
  const venue = fields.journal || fields.booktitle || fields.publisher || "";
  const volume = fields.volume || "";
  const issue = fields.number || "";
  const pages = fields.pages || "";
  const parts = [];
  const lead = [author, year ? `(${year})` : ""].filter(Boolean).join(" ");
  if (lead) {
    parts.push(lead);
  }
  if (title) {
    parts.push(title);
  }
  const venueBits = [venue, volume ? `${volume}${issue ? `(${issue})` : ""}` : issue ? `(${issue})` : "", pages]
    .filter(Boolean)
    .join(", ");
  if (venueBits) {
    parts.push(venueBits);
  }
  return escapeHtml(parts.join(". ").trim() || item.raw_text || "");
}

function renderSpeciesRefs(refs) {
  return refs
    .map(
      (ref) =>
        `<a href="./index.html#${escapeHtml(ref.slug)}">${escapeHtml(ref.common_name || ref.slug)}</a>`,
    )
    .join(", ");
}

function renderAbstractBlock(text) {
  const abstract = normalizeAbstractForDisplay(text);
  if (!abstract) {
    return "";
  }
  return `
    <div class="citation-abstract-shell">
      <button type="button" class="secondary-button citation-abstract-toggle" aria-expanded="false">
        Show Abstract
      </button>
      <div class="citation-abstract-display hidden">
        <p class="public-citation-abstract">${escapeHtml(abstract)}</p>
      </div>
    </div>
  `;
}

function attachCitationAbstractToggles(root) {
  for (const toggle of root.querySelectorAll(".citation-abstract-toggle")) {
    const shell = toggle.parentElement;
    const display = shell && shell.querySelector(".citation-abstract-display");
    if (!display) {
      continue;
    }
    toggle.addEventListener("click", () => {
      const hidden = display.classList.toggle("hidden");
      toggle.setAttribute("aria-expanded", hidden ? "false" : "true");
      toggle.textContent = hidden ? "Show Abstract" : "Hide Abstract";
    });
  }
}

function renderBibliography(items) {
  bibliographyList.innerHTML = "";
  if (!items.length) {
    bibliographyList.innerHTML = `<p class="editor-status">No bibliography entries match the current search.</p>`;
    return;
  }

  for (const item of items) {
    const links = [
      item.doi ? `<a href="https://doi.org/${encodeURIComponent(String(item.doi).replace(/^https?:\/\/doi\.org\//, ""))}" target="_blank" rel="noopener noreferrer">DOI</a>` : "",
      item.source_url ? `<a href="${escapeHtml(item.source_url)}" target="_blank" rel="noopener noreferrer">Source</a>` : "",
      item.openalex_id ? `<a href="https://openalex.org/${escapeHtml(String(item.openalex_id).replace(/^https?:\/\/openalex\.org\//, ""))}" target="_blank" rel="noopener noreferrer">OpenAlex</a>` : "",
    ]
      .filter(Boolean)
      .join(" · ");

    const article = document.createElement("article");
    article.className = "public-citation-entry";
    article.innerHTML = `
      <p class="public-citation-text">${buildCitationText(item)}</p>
      ${renderAbstractBlock(item.abstract_text || "")}
      <p class="public-citation-meta">
        Appears in ${item.species_count} species record${item.species_count === 1 ? "" : "s"}
        ${item.legacy_reference_numbers && item.legacy_reference_numbers.length ? ` • Imported references: ${item.legacy_reference_numbers.map((value) => escapeHtml(value)).join(", ")}` : ""}
      </p>
      <p class="public-citation-meta">Species: ${renderSpeciesRefs(item.species_refs || [])}</p>
      ${links ? `<p class="public-citation-links">${links}</p>` : ""}
    `;
    attachCitationAbstractToggles(article);
    bibliographyList.appendChild(article);
  }
}

async function loadBibliography(search = "") {
  bibliographyStatus.textContent = "Loading bibliography...";
  const query = search ? `?search=${encodeURIComponent(search)}` : "";
  const response = await fetch(`${apiBase}/api/bibliography${query}`);
  const data = await response.json();
  if (!response.ok) {
    bibliographyList.innerHTML = `<p class="error">${escapeHtml(data.error || "Unable to load bibliography.")}</p>`;
    bibliographyStatus.textContent = data.error || "Bibliography load failed";
    return;
  }

  currentBibliographyItems = data.items || [];
  renderBibliography(currentBibliographyItems);
  syncDownloadButton(currentBibliographyItems);
  bibliographyStatus.textContent = `${data.count || 0} bibliography entr${data.count === 1 ? "y" : "ies"}`;
}

bibliographySearch.addEventListener("input", async (event) => {
  await loadBibliography(event.target.value);
});

loadBibliography().catch((error) => {
  bibliographyList.innerHTML = `<p class="error">Failed to load bibliography: ${escapeHtml(String(error))}</p>`;
  bibliographyStatus.textContent = "Bibliography load failed";
});

if (bibliographyDownload) {
  bibliographyDownload.addEventListener("click", () => {
    const downloaded = downloadBibtexRecords(currentBibliographyItems, "ecospecies-bibliography");
    if (!downloaded) {
      bibliographyStatus.textContent = "No BibTeX records are available for download yet.";
    }
  });
}
@@ -7,20 +7,31 @@
   <link rel="stylesheet" href="./styles.css">
 </head>
 <body>
+  <header class="site-header">
+    <div class="site-header-inner">
+      <div class="site-brand">
+        <p class="site-brand-mark">Open Species Archive</p>
+        <a href="./index.html" class="site-brand-link">EcoSpecies Atlas</a>
+        <p class="site-brand-summary">Public field atlas for historical species life-history materials.</p>
+      </div>
+      <nav class="site-nav" aria-label="Primary">
+        <a href="./index.html">Atlas</a>
+        <a href="./bibliography.html">Bibliography</a>
+      </nav>
+    </div>
+  </header>
   <main class="page">
     <section class="hero">
-      <p class="eyebrow">Marine Species Knowledge System</p>
+      <p class="eyebrow">Open Biodiversity Reference</p>
-      <h1>EcoSpecies</h1>
+      <h1>EcoSpecies Atlas</h1>
       <p class="lede">
-        A modern follow-on for the legacy EcoSpecies archive, starting with direct ingestion
-        of historical Species Life History text files.
+        A modern follow-on for the legacy EcoSpecies archive, built as an open ecology and
+        biodiversity reference workspace.
+      </p>
+      <p class="hero-context">
+        Use EcoSpecies Atlas for species profiles, habitat evidence, ecological reading, and
+        citation-aware exploration grounded in the migrated legacy corpus.
       </p>
-      <div class="auth-bar">
-        <input id="auth-token" type="password" placeholder="Bearer token for editor access">
-        <button id="auth-save" type="button">Use Token</button>
-        <button id="auth-clear" type="button" class="secondary-button">Clear</button>
-        <p id="auth-status" class="auth-status">Public access</p>
-      </div>
       <div class="hero-stats">
         <div class="stat">
           <span id="species-count">0</span>

@@ -38,6 +49,7 @@
       <div class="panel-header">
         <h2>Species</h2>
         <input id="search" type="search" placeholder="Search common or scientific name">
+        <button id="contributor-create" type="button" class="secondary-button hidden">Create New Draft</button>
         <div id="archive-filter-group" class="archive-filter-group hidden">
           <button type="button" class="archive-filter-button is-active" data-archive-filter="active">Active</button>
           <button type="button" class="archive-filter-button" data-archive-filter="all">All</button>

@@ -66,16 +78,59 @@
             This record is archived. It is hidden from public endpoints but remains available to editors for audit and recovery.
           </p>
         </header>
-        <section id="editor-panel" class="detail-section editor-panel hidden">
-          <h3>Editor Controls</h3>
+        <div id="detail-sections" class="detail-sections"></div>
+        <div class="workflow-panels">
+          <section id="legacy-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Legacy Materials Under Review">
+            <div class="collapsible-header">
+              <h3>Legacy Materials Under Review</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="legacy-panel" data-label="Legacy Materials Under Review" aria-expanded="false">
+                Show Legacy Materials Under Review
+              </button>
+            </div>
+            <div class="collapsible-body">
+              <p id="legacy-source-meta" class="editor-status"></p>
+              <pre id="legacy-source-text" class="legacy-source"></pre>
+            </div>
+          </section>
+          <section id="access-panel" class="detail-section collapsible-panel collapsed" data-label="Access and Contribution">
+            <div class="collapsible-header">
+              <h3>Access and Contribution</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="access-panel" data-label="Access and Contribution" aria-expanded="false">
+                Show Access and Contribution
+              </button>
+            </div>
+            <div class="collapsible-body">
+              <div class="auth-bar auth-panel-row">
+                <input id="auth-token" type="password" placeholder="Bearer token for editor access">
+                <button id="auth-save" type="button">Use Token</button>
+                <button id="auth-clear" type="button" class="secondary-button">Clear</button>
+                <p id="auth-status" class="auth-status">Public access</p>
+              </div>
+              <div class="auth-bar contributor-signup auth-panel-row">
+                <input id="contributor-email" type="email" placeholder="Email for contributor access">
+                <label class="archive-toggle contributor-age-gate">
+                  <input id="contributor-age-gate" type="checkbox">
+                  <span>I confirm I am at least <span id="contributor-age-label">13</span> years old</span>
+                </label>
+                <button id="contributor-register" type="button" class="secondary-button">Become Contributor</button>
+                <p id="contributor-status" class="auth-status"></p>
+              </div>
+            </div>
+          </section>
+          <section id="editor-panel" class="detail-section collapsible-panel editor-panel collapsed hidden" data-label="Editing Workflow">
+            <div class="collapsible-header">
+              <h3>Editing Workflow</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="editor-panel" data-label="Editing Workflow" aria-expanded="false">
+                Show Editing Workflow
+              </button>
+            </div>
+            <div class="collapsible-body">
              <label class="editor-label" for="editor-publication-status">Publication Status</label>
              <select id="editor-publication-status">
                <option value="draft">Draft</option>
                <option value="review">Review</option>
                <option value="published">Published</option>
              </select>
-             <label class="editor-label" for="editor-summary">Summary</label>
-             <textarea id="editor-summary" rows="5" placeholder="Write a concise executive summary."></textarea>
              <label class="editor-label" for="editor-notes">Editor Notes</label>
              <textarea id="editor-notes" rows="4" placeholder="Internal editorial notes"></textarea>
              <label class="archive-toggle">

@@ -86,24 +141,103 @@
              <button id="editor-save" type="button">Save Editorial Changes</button>
              <p id="editor-status" class="editor-status"></p>
            </div>
+            </div>
          </section>
-        <section id="audit-panel" class="detail-section hidden">
+          <section id="document-panel" class="detail-section collapsible-panel editor-panel collapsed hidden" data-label="Metadata and Document Workflow">
+            <div class="collapsible-header">
+              <h3>Metadata and Document Workflow</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="document-panel" data-label="Metadata and Document Workflow" aria-expanded="false">
+                Show Metadata and Document Workflow
+              </button>
+            </div>
+            <div class="collapsible-body">
+              <div class="document-panel-header">
+                <div>
+                  <p class="editor-status">
+                    Markdown is the editable source of truth for hierarchy. Front matter and headings are validated on save.
+                  </p>
+                </div>
+                <div class="editor-actions">
+                  <button id="document-save" type="button">Save Document</button>
+                  <p id="document-status" class="editor-status"></p>
+                </div>
+              </div>
+              <label class="editor-label" for="document-markdown">Markdown Source</label>
+              <textarea id="document-markdown" class="document-editor" rows="18" spellcheck="false"></textarea>
+              <details class="document-preview-shell" open>
+                <summary>Outline Preview</summary>
+                <div id="document-preview" class="document-preview"></div>
+              </details>
+            </div>
+          </section>
+          <section id="citation-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Review Workflow">
+            <div class="collapsible-header">
+              <h3>Review Workflow</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="citation-panel" data-label="Review Workflow" aria-expanded="false">
+                Show Review Workflow
+              </button>
+            </div>
+            <div class="collapsible-body">
+              <div class="document-panel-header">
+                <div>
+                  <p id="citation-status" class="editor-status">
+                    Extracted bibliography entries and draft BibTeX records.
+                  </p>
+                </div>
+                <div class="editor-actions">
+                  <button id="citation-backfill-species" type="button" class="secondary-button hidden">Backfill This Species</button>
+                  <button id="citation-enrich-all" type="button" class="secondary-button hidden">Run Enrichment For All Citations</button>
+                </div>
+              </div>
+              <div id="citation-list" class="citation-list"></div>
+            </div>
+          </section>
+          <section id="audit-panel" class="detail-section collapsible-panel collapsed hidden" data-label="Audit History">
+            <div class="collapsible-header">
               <h3>Audit History</h3>
+              <button type="button" class="secondary-button collapsible-toggle" data-target="audit-panel" data-label="Audit History" aria-expanded="false">
+                Show Audit History
+              </button>
+            </div>
+            <div class="collapsible-body">
               <div id="audit-list" class="audit-list"></div>
+            </div>
          </section>
-        <div id="detail-sections" class="detail-sections"></div>
+        </div>
       </article>
     </section>
   </section>

   <footer class="footer">
     <p>
-      This migration path preserves attribution for Dr. Peter Rubec, Dr. Diane Blackwood,
+      EcoSpecies Atlas preserves attribution for Dr. Peter Rubec, Dr. Diane Blackwood,
       Dr. Welsbery R. Elsberry, and the Florida Fish and Wildlife Research Institute context
       documented in the legacy project materials.
     </p>
   </footer>
 </main>
+  <section id="citation-match-dialog" class="match-dialog-shell hidden" aria-hidden="true">
+    <div class="match-dialog-backdrop"></div>
+    <article class="match-dialog-card" role="dialog" aria-modal="true" aria-labelledby="citation-match-title">
+      <div class="match-dialog-header">
+        <div>
+          <h2 id="citation-match-title">Citation Candidate Review</h2>
+          <p id="citation-match-status" class="editor-status">Compare the parsed source citation against candidate metadata.</p>
+        </div>
+        <button id="citation-match-close" type="button" class="secondary-button">Close</button>
+      </div>
+      <div class="match-dialog-grid">
+        <section class="detail-section">
+          <h3>Parsed Source Metadata</h3>
+          <div id="citation-match-seed" class="match-seed"></div>
+        </section>
+        <section class="detail-section">
+          <h3>Candidate Matches</h3>
+          <div id="citation-match-candidates" class="match-candidates"></div>
+        </section>
+      </div>
+    </article>
+  </section>
 <script src="./app.js" defer></script>
 </body>
 </html>

@@ -5,6 +5,10 @@ server {
     root /usr/share/nginx/html;
     index index.html;

+    location = /apps/ecospecies {
+        return 301 /apps/ecospecies/;
+    }
+
     location /api/ {
         proxy_pass http://api:8000/api/;
         proxy_http_version 1.1;

@@ -14,19 +18,46 @@ server {
         proxy_set_header X-Forwarded-Proto $scheme;
     }

+    location /apps/ecospecies/api/ {
+        rewrite ^/apps/ecospecies/api/(.*)$ /api/$1 break;
+        proxy_pass http://api:8000;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+
     location /healthz {
         proxy_pass http://api:8000/healthz;
         proxy_http_version 1.1;
         proxy_set_header Host $host;
     }

+    location /apps/ecospecies/healthz {
+        proxy_pass http://api:8000/healthz;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+
     location /readyz {
         proxy_pass http://api:8000/readyz;
         proxy_http_version 1.1;
         proxy_set_header Host $host;
     }

+    location /apps/ecospecies/readyz {
+        proxy_pass http://api:8000/readyz;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+
     location / {
         try_files $uri $uri/ /index.html;
     }
+
+    location /apps/ecospecies/ {
+        rewrite ^/apps/ecospecies/(.*)$ /$1 break;
+        try_files $uri $uri/ /index.html;
+    }
 }

@@ -1,12 +1,12 @@
 :root {
-  --bg: #f4efe6;
+  --bg: #f4f7fb;
-  --paper: rgba(255, 252, 247, 0.78);
+  --paper: rgba(255, 255, 255, 0.88);
-  --ink: #16251f;
+  --ink: #182433;
-  --muted: #58655f;
+  --muted: #5f6b7d;
-  --accent: #0f766e;
+  --accent: #2457a6;
-  --accent-2: #bc6c25;
+  --accent-2: #1f7a5a;
-  --line: rgba(22, 37, 31, 0.12);
+  --line: rgba(24, 36, 51, 0.11);
-  --shadow: 0 24px 70px rgba(24, 35, 30, 0.15);
+  --shadow: 0 24px 70px rgba(33, 52, 84, 0.14);
 }

 * {

@@ -15,12 +15,83 @@
 body {
   margin: 0;
-  font-family: Georgia, "Times New Roman", serif;
+  font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif;
   color: var(--ink);
   background:
-    radial-gradient(circle at top left, rgba(15, 118, 110, 0.14), transparent 28%),
+    radial-gradient(circle at top left, rgba(36, 87, 166, 0.14), transparent 26%),
-    radial-gradient(circle at top right, rgba(188, 108, 37, 0.16), transparent 24%),
+    radial-gradient(circle at top right, rgba(31, 122, 90, 0.12), transparent 24%),
-    linear-gradient(180deg, #f8f4ec, #efe6d7 70%, #e7dcc9);
+    linear-gradient(180deg, #f4f7fb, #e4edf6 72%, #d9e6ef);
 }

+.site-header {
+  width: min(1320px, calc(100vw - 32px));
+  margin: 0 auto;
+  padding-top: 24px;
+}
+
+.site-header-inner {
+  display: flex;
+  gap: 18px;
+  align-items: center;
+  justify-content: space-between;
+  padding: 18px 22px;
+  border-radius: 24px;
+  backdrop-filter: blur(10px);
+  background: var(--paper);
+  border: 1px solid var(--line);
+  box-shadow: var(--shadow);
+}
+
+.site-brand {
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+
+.site-brand-mark {
+  margin: 0;
+  color: var(--accent);
+  text-transform: uppercase;
+  letter-spacing: 0.18em;
+  font-size: 0.76rem;
+}
+
+.site-brand-link {
+  color: var(--ink);
+  font-size: 1.5rem;
+  font-weight: 700;
+  text-decoration: none;
+}
+
+.site-brand-summary {
+  margin: 0;
+  color: var(--muted);
+  font-size: 0.94rem;
+}
+
+.site-nav {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 10px;
+  justify-content: flex-end;
+}
+
+.site-nav a {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  border-radius: 999px;
+  padding: 11px 16px;
+  text-decoration: none;
+  color: var(--ink);
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.72);
+  transition: transform 160ms ease, border-color 160ms ease;
+}
+
+.site-nav a:hover {
+  transform: translateY(-1px);
+  border-color: rgba(15, 118, 110, 0.45);
+}

 .page {

@@ -42,6 +113,9 @@ body {
 .hero {
   padding: 28px;
   margin-bottom: 20px;
+  background:
+    linear-gradient(135deg, rgba(255, 255, 255, 0.95), rgba(234, 244, 240, 0.92)),
+    var(--paper);
 }

 .eyebrow {

@@ -56,6 +130,7 @@ h1 {
   margin: 0;
   font-size: clamp(2.8rem, 7vw, 5.6rem);
   line-height: 0.92;
+  letter-spacing: -0.03em;
 }

 .lede {

@@ -64,6 +139,12 @@ h1 {
   font-size: 1.08rem;
 }

+.hero-context {
+  max-width: 68ch;
+  color: var(--muted);
+  line-height: 1.58;
+}
+
 .hero-stats {
   display: flex;
   gap: 16px;

@@ -79,6 +160,15 @@ h1 {
   margin-top: 18px;
 }

+.auth-panel-row {
+  margin-top: 0;
+}
+
+.contributor-signup {
+  padding-top: 14px;
+  border-top: 1px solid var(--line);
+}
+
 .auth-bar input {
   min-width: min(360px, 100%);
   flex: 1;

@@ -93,7 +183,7 @@ h1 {
   min-width: 180px;
   padding: 14px 16px;
   border-radius: 18px;
-  background: rgba(255, 255, 255, 0.6);
+  background: linear-gradient(180deg, rgba(255, 255, 255, 0.95), rgba(232, 242, 239, 0.92));
   border: 1px solid var(--line);
 }

@@ -158,6 +248,16 @@ input[type="search"] {
   background: rgba(255, 255, 255, 0.9);
 }

+input[type="text"],
+input[type="email"],
+input[type="password"] {
+  border: 1px solid var(--line);
+  border-radius: 18px;
+  padding: 12px 14px;
+  font: inherit;
+  background: rgba(255, 255, 255, 0.92);
+}
+
 select,
 textarea,
 button {

@@ -201,7 +301,7 @@ button {
   padding: 14px;
   border-radius: 18px;
   border: 1px solid var(--line);
-  background: linear-gradient(180deg, rgba(255, 255, 255, 0.95), rgba(241, 237, 230, 0.95));
+  background: linear-gradient(180deg, rgba(255, 255, 255, 0.97), rgba(239, 246, 244, 0.94));
   cursor: pointer;
   transition: transform 160ms ease, border-color 160ms ease;
 }

@@ -213,7 +313,7 @@ button {
 .species-card-archived {
   border-style: dashed;
-  background: linear-gradient(180deg, rgba(247, 241, 231, 0.98), rgba(233, 226, 214, 0.98));
+  background: linear-gradient(180deg, rgba(243, 247, 249, 0.98), rgba(227, 236, 242, 0.98));
 }

 .species-name,

@@ -273,6 +373,32 @@ button {
   display: none;
 }

+.match-dialog-shell {
+  position: fixed;
+  inset: 0;
+  z-index: 50;
+}
+
+.match-dialog-backdrop {
+  position: absolute;
+  inset: 0;
+  background: rgba(12, 20, 18, 0.46);
+}
+
+.match-dialog-card {
+  position: relative;
+  z-index: 1;
+  width: min(1180px, calc(100vw - 32px));
+  max-height: calc(100vh - 40px);
+  overflow: auto;
+  margin: 20px auto;
+  padding: 18px;
+  border-radius: 24px;
+  background: #fbf8f1;
+  border: 1px solid var(--line);
+  box-shadow: var(--shadow);
+}
+
 .detail-header {
   padding-bottom: 16px;
   border-bottom: 1px solid var(--line);

@@ -313,6 +439,12 @@ button {
   margin-top: 18px;
 }

+.workflow-panels {
+  display: grid;
+  gap: 16px;
+  margin-top: 20px;
+}
+
 .detail-section {
   padding: 16px;
   border-radius: 18px;

@@ -329,6 +461,44 @@ button {
   margin-top: 18px;
 }

+.workflow-panels .editor-panel,
+.workflow-panels .detail-section {
+  margin-top: 0;
+}
+
+.collapsible-panel {
+  padding-top: 14px;
+}
+
+.collapsible-header {
+  display: flex;
+  gap: 12px;
+  align-items: center;
+  justify-content: space-between;
+  flex-wrap: wrap;
+}
+
+.collapsible-header h3 {
+  margin-bottom: 0;
+}
+
+.collapsible-body {
+  margin-top: 16px;
+}
+
+.collapsible-panel.collapsed .collapsible-body {
+  display: none;
+}
+
+.document-panel-header {
+  display: flex;
+  gap: 16px;
+  align-items: flex-start;
+  justify-content: space-between;
+  flex-wrap: wrap;
+  margin-bottom: 14px;
+}
+
 .editor-label {
   display: block;
   margin: 0 0 8px;

@@ -349,6 +519,11 @@ button {
   font-weight: 700;
 }

+.contributor-age-gate {
+  margin: 0;
+  font-weight: 400;
+}
+
 .archive-toggle input {
   width: 18px;
   height: 18px;

@@ -372,6 +547,149 @@ button {
   gap: 12px;
 }

+.citation-list {
+  display: grid;
+  gap: 14px;
+}
+
+.citation-entry {
+  padding: 14px;
+  border-radius: 16px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.76);
+}
+
+.citation-entry-meta {
+  margin: 0 0 10px;
+  color: var(--muted);
+  font-size: 0.92rem;
+}
+
+.citation-entry-raw {
+  margin: 0 0 12px;
+  line-height: 1.5;
+}
+
+.citation-bibtex,
+.citation-bibtex-editor {
+  font-family: "Courier New", monospace;
+  font-size: 0.9rem;
+  line-height: 1.45;
+}
+
+.citation-abstract-shell {
+  display: grid;
+  gap: 8px;
+  margin: 4px 0 10px;
+}
+
+.citation-detail-shell {
+  display: grid;
+  gap: 8px;
+  margin: 4px 0 10px;
+}
+
+.citation-abstract-display {
+  padding: 10px 12px;
+  border-radius: 12px;
+  border: 1px solid var(--line);
+  background: rgba(15, 118, 110, 0.05);
+}
+
+.citation-detail-display {
+  padding: 10px 12px;
+  border-radius: 12px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.78);
+}
+
+.match-dialog-header,
+.match-dialog-grid,
+.match-candidate-header,
+.match-candidates,
+.match-candidate-card,
+.match-seed,
+.match-table {
+  display: grid;
+  gap: 12px;
+}
+
+.match-dialog-header {
+  grid-template-columns: minmax(0, 1fr) auto;
+  align-items: start;
+}
+
+.match-dialog-grid {
+  grid-template-columns: minmax(260px, 0.9fr) minmax(0, 1.6fr);
+  margin-top: 16px;
+}
+
+.match-candidate-card {
+  padding: 14px;
+  border-radius: 16px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.84);
+}
+
+.match-candidate-header {
+  grid-template-columns: minmax(0, 1fr) auto;
+  align-items: baseline;
+}
+
+.match-score {
+  font-weight: 700;
+  color: var(--accent);
+}
+
+.match-table {
+  border: 1px solid var(--line);
+  border-radius: 14px;
+  overflow: hidden;
+}
+
+.match-row {
+  display: grid;
+  grid-template-columns: 120px 110px minmax(0, 1fr) minmax(0, 1fr);
+  gap: 10px;
+  padding: 10px 12px;
+  border-top: 1px solid var(--line);
+  font-size: 0.92rem;
+}
+
+.match-row:first-child {
+  border-top: 0;
+}
+
+.match-row-head {
+  background: rgba(15, 118, 110, 0.08);
+  font-weight: 700;
+}
+
+.match-label {
+  color: var(--muted);
+  font-weight: 700;
+}
+
+.match-status {
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  font-size: 0.78rem;
+}
+
+.match-status-exact {
+  color: var(--accent);
+}
+
+.match-status-partial,
+.match-status-seed-missing,
+.match-status-candidate-missing {
+  color: var(--accent-2);
+}
+
+.match-status-conflict {
+  color: #a12626;
+}
+
 .audit-entry {
   padding: 14px;
   border-radius: 16px;

@@ -394,6 +712,62 @@ button {
   line-height: 1.45;
 }

+.document-editor,
+.document-preview {
+  font-family: "Courier New", monospace;
+  font-size: 0.92rem;
+  line-height: 1.5;
+}
+
+.document-editor {
+  min-height: 420px;
+  margin-bottom: 14px;
+  white-space: pre;
+  overflow: auto;
+}
+
+.document-preview-shell {
+  border: 1px solid var(--line);
+  border-radius: 18px;
+  background: rgba(255, 255, 255, 0.72);
+  overflow: hidden;
+}
+
+.document-preview-shell summary {
+  cursor: pointer;
+  padding: 12px 16px;
+  font-weight: 700;
+  color: var(--accent);
+}
+
+.document-preview {
+  padding: 0 16px 16px;
+}
+
+.document-preview-empty {
+  color: var(--muted);
+}
+
+.document-preview-list {
+  margin: 0;
+  padding-left: 22px;
+}
+
+.document-preview-list li + li {
+  margin-top: 8px;
+}
+
+.document-preview-metadata {
+  margin: 0 0 14px;
+  padding: 0;
+  list-style: none;
+  color: var(--muted);
+}
+
+.document-preview-metadata li + li {
+  margin-top: 6px;
+}
+
 .diagnostic-list {
   margin: 0;
   padding-left: 18px;

@@ -403,6 +777,100 @@ button {
   margin-top: 8px;
 }

+.structured-node {
+  display: grid;
+  gap: 12px;
+  background: linear-gradient(180deg, rgba(255, 255, 255, 0.84), rgba(242, 247, 252, 0.88));
+}
+
+.structured-node + .structured-node {
+  margin-top: 4px;
+}
+
+.structured-node h3,
+.structured-node h4,
+.structured-node h5,
+.structured-node h6 {
+  line-height: 1.18;
+  letter-spacing: -0.01em;
+}
+
+.structured-node-body {
+  margin: 0;
+  line-height: 1.58;
+  color: var(--ink);
+}
+
+.structured-node-children {
+  display: grid;
+  gap: 12px;
+  padding: 4px 0 0 18px;
+  border-left: 2px solid rgba(36, 87, 166, 0.12);
+}
+
+.public-citation-list {
+  display: grid;
+  gap: 14px;
+}
+
+.public-bibliography-actions {
+  display: flex;
+  gap: 12px;
+  align-items: center;
+  flex-wrap: wrap;
+}
+
+.public-bibliography-note {
+  margin: 0;
+  color: var(--muted);
+  font-size: 0.92rem;
+}
+
+.public-citation-entry {
+  display: grid;
+  gap: 8px;
+  padding: 14px;
+  border-radius: 16px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.76);
+}
+
+.public-citation-text,
+.public-citation-meta,
+.public-citation-links,
+.public-citation-abstract {
+  margin: 0;
+}
+
+.public-citation-text {
+  line-height: 1.56;
+}
+
+.public-citation-meta,
+.public-citation-links {
+  color: var(--muted);
+  font-size: 0.92rem;
+}
+
+.public-citation-links a {
+  color: var(--accent);
+}
+
+.public-citation-abstract {
+  padding-top: 2px;
+  color: var(--muted);
+  line-height: 1.58;
+}
+
+.legacy-source {
+  max-height: 28rem;
+  overflow: auto;
+  padding: 14px;
+  border-radius: 16px;
+  border: 1px solid var(--line);
+  background: rgba(255, 255, 255, 0.76);
+}
+
 pre {
   margin: 0;
   white-space: pre-wrap;

@@ -417,6 +885,15 @@ pre {
 }

 @media (max-width: 960px) {
+  .site-header-inner {
+    flex-direction: column;
+    align-items: stretch;
+  }
+
+  .site-nav {
+    justify-content: flex-start;
+  }
+
   .workspace {
     grid-template-columns: 1fr;
   }

@@ -424,4 +901,12 @@ pre {
   .species-list {
     max-height: 40vh;
   }
+
+  .match-dialog-grid {
+    grid-template-columns: 1fr;
+  }
+
+  .match-row {
+    grid-template-columns: 1fr;
+  }
 }

@@ -1,5 +1,6 @@
 services:
   db:
+    container_name: ecospecies-db
     image: postgres:16-alpine
     environment:
       POSTGRES_DB: ecospecies

@@ -17,6 +18,7 @@ services:
       - postgres_data:/var/lib/postgresql/data

   importer:
+    container_name: ecospecies-importer
     image: python:3.12-slim
     depends_on:
       db:

@@ -30,11 +32,12 @@ services:
     command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
     volumes:
       - .:/workspace
-      - ../01-legacy-code-and-data:/legacy-data:ro
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../legacy-corpus}:/legacy-data:ro
       - python_venv:/workspace/.docker/venv
       - pip_cache:/root/.cache/pip

   api:
+    container_name: ecospecies-api
     image: python:3.12-slim
     restart: unless-stopped
     depends_on:

@@ -56,11 +59,12 @@ services:
       - "${ECOSPECIES_API_PORT:-8000}:8000"
     volumes:
       - .:/workspace
-      - ../01-legacy-code-and-data:/legacy-data:ro
+      - ${ECOSPECIES_LEGACY_DATA_DIR:-../legacy-corpus}:/legacy-data:ro
       - python_venv:/workspace/.docker/venv
       - pip_cache:/root/.cache/pip

   web:
+    container_name: ecospecies-web
     image: nginx:1.27-alpine
     restart: unless-stopped
    depends_on:

@@ -0,0 +1,110 @@
## CiteGeist Review Notes

These notes capture parser issues seen while integrating CiteGeist-style extraction into EcoSpecies.

### Report-style references

Observed failure shape:

- references like `Daniell, W.C. 1872. Letters referring ... Comm. Rept. U.S. Comm. Fish & Fish. 2: 387-390.`
- extracted `title` may contain the full raw bibliography string
- abbreviated venue names such as `Comm. Rept.` are not separated cleanly from the title

Suggested upstream change in `citegeist.extract` (a minimal sketch follows this list):

- add a report-style parser path after year detection
- prefer sentence-boundary venue detection before naive keyword splits so words like `report` inside a real title do not trigger an early cut
- support abbreviation-heavy venue starters such as:
  - `comm. rept.`
  - `rept.`
  - `proc.`
  - `occas. pap.`
  - `bulletin`
  - `bull.`
  - `memoir`
- strip trailing volume/page blobs like `2: 387-390` from the venue field
- when a first parse leaves a partial venue stub such as `Occas`, reparse the full raw reference line and prefer the fuller repaired venue/title split
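
A sketch of the sentence-boundary split, assuming a plain post-year reference tail; the helper name and regexes here are illustrative, not existing `citegeist` API:

```python
import re

# Hypothetical sketch of the suggested venue split; names are illustrative.
VENUE_STARTERS = (
    "comm. rept.", "rept.", "proc.", "occas. pap.",
    "bulletin", "bull.", "memoir",
)

# Trailing volume/page blob such as "2: 387-390." at the end of a venue string.
VOLUME_PAGES_RE = re.compile(r"\s*\d+\s*:\s*\d+(?:-\d+)?\.?\s*$")

def split_title_and_venue(tail: str) -> tuple[str, str]:
    """Split the post-year reference tail into (title, venue).

    Only cut at a sentence boundary that is immediately followed by a
    known venue starter, so a word like "report" inside a real title
    does not trigger an early cut.
    """
    lowered = tail.lower()
    for boundary in re.finditer(r"\.\s+", tail):
        if lowered[boundary.end():].startswith(VENUE_STARTERS):
            title = tail[: boundary.start() + 1].strip()
            venue = VOLUME_PAGES_RE.sub("", tail[boundary.end():]).strip()
            return title, venue
    return tail.strip(), ""
```

For the `Daniell` example above, this keeps the full title intact and yields venue `Comm. Rept. U.S. Comm. Fish & Fish.` with the `2: 387-390` blob stripped.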

### Placeholder title merge behavior

Observed failure shape:

- a raw bibliography string may survive as `title` even after DOI/title resolution finds a better title

Suggested upstream change in `citegeist.resolve.merge_entries_with_conflicts` (sketched below):

- treat titles that look like raw bibliography strings as placeholders
- example heuristic:
  - starts with `Surname, ... YEAR.`
  - unusually long for a title
  - contains a resolved shorter title as a substring after punctuation normalization
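
One way that heuristic could look; the length thresholds and regex are assumptions, not settled values:

```python
import re
import string

# Hypothetical placeholder-title detector for merge_entries_with_conflicts.
RAW_REFERENCE_RE = re.compile(r"^[A-Z][\w'-]+,\s.*\b(1[5-9]\d{2}|20\d{2})\.")

def _normalize(text: str) -> str:
    stripped = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(stripped.lower().split())

def looks_like_placeholder_title(title: str, resolved_title: str | None) -> bool:
    """True when `title` looks like a raw bibliography string, not a real title."""
    if RAW_REFERENCE_RE.match(title) and len(title) > 120:
        return True
    if resolved_title and _normalize(resolved_title) in _normalize(title):
        # The shorter resolved title survives inside the raw string.
        return len(title) > 1.5 * len(resolved_title)
    return False
```

A merge pass would then prefer the resolved title whenever `looks_like_placeholder_title` fires.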

### Legacy note deduplication

Observed failure shape:

- note fragments like `ecospecies_reference_number = {160}` can be appended more than once downstream when re-merging enriched metadata

Suggested upstream change (sketched below):

- when joining note fragments, split on `;`, normalize whitespace, and dedupe per fragment rather than per whole note string
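
A minimal sketch of per-fragment deduplication, assuming notes are plain `;`-joined strings:

```python
def merge_note_fragments(existing: str, incoming: str) -> str:
    """Join two note strings on ';', deduping per normalized fragment."""
    merged: list[str] = []
    for part in f"{existing};{incoming}".split(";"):
        fragment = " ".join(part.split())  # normalize internal whitespace
        if fragment and fragment not in merged:
            merged.append(fragment)
    return "; ".join(merged)
```

With this, re-merging `ecospecies_reference_number = {160}` into a note that already carries it is a no-op.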

### Unresolved entries should still refresh local parses

Observed failure shape:

- parser improvements may correctly rebuild `title`, venue, `volume`, `number`, and `pages`
- but if no remote metadata source matches, the stored draft BibTeX can remain unchanged unless unresolved enrichment also writes the refreshed local seed back out

Suggested upstream change (see the sketch after this list):

- unresolved enrichment should still return the rebuilt local draft entry
- keep `citation_key`, normalized text, and draft BibTeX synchronized with the current local parser even when resolver status remains `unresolved`
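
A rough shape for that control flow, with hypothetical names (`resolvers` is assumed to be an iterable of callables returning a metadata dict or `None`):

```python
def enrich_entry(seed_entry: dict, resolvers) -> dict:
    """Return a refreshed draft entry even when every resolver misses."""
    for resolve in resolvers:
        candidate = resolve(seed_entry)
        if candidate is not None:
            return {**seed_entry, **candidate, "status": "resolved"}
    # No remote match: still hand back the rebuilt local parse so the stored
    # draft BibTeX tracks the current parser instead of a stale earlier run.
    return {**seed_entry, "status": "unresolved"}
```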

### Returned metadata not carried through

Observed concern:

- resolver/source payloads may include bibliographic details such as:
  - `volume`
  - `issue` / BibTeX `number`
  - `page` / BibTeX `pages`
- these should be preserved into the BibTeX entry whenever available

Current note:

- CiteGeist Crossref mapping already includes `volume`, `number`, and `pages`
- verify that all resolver paths, storage round-trips, and exports preserve those fields consistently
- OpenAlex/DataCite mappings should also be checked for analogous bibliographic fields in `biblio` / attribute payloads

### False-positive title-search acceptance

Observed failure shape:

- title search can return a thematically related but bibliographically different work
- downstream acceptance may keep some seed fields while adopting conflicting DOI/title/volume/pages from the returned match
- this is especially risky for historical references with sparse or abbreviated venue names

Suggested upstream change in `citegeist.resolve` and any title-search ranking path (sketched after this list):

- do not fall back to the first search hit when no strong title match exists
- prefer exact or near-exact title matches only
- reject a candidate when structured seed metadata conflicts on strong fields such as:
  - `year`
  - venue / journal
  - `volume`
  - `number`
  - `pages`
- treat those fields as match-validation inputs, not just merge-time metadata
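
A minimal validation gate along those lines; the field tuple and flat-dict record shape are assumptions:

```python
STRONG_FIELDS = ("year", "journal", "volume", "number", "pages")

def candidate_is_acceptable(seed: dict, candidate: dict) -> bool:
    """Reject a search hit that conflicts with the structured seed metadata.

    A conflict on any strong field that both records actually specify
    disqualifies the candidate outright, rather than being smoothed over
    later at merge time.
    """
    for field in STRONG_FIELDS:
        seed_value = str(seed.get(field) or "").strip().lower()
        candidate_value = str(candidate.get(field) or "").strip().lower()
        if seed_value and candidate_value and seed_value != candidate_value:
            return False
    return True
```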

### OpenAlex null-source handling

Observed failure shape:

- some OpenAlex works have `primary_location` present but `source: null`
- downstream mapping can crash if it assumes `source` is always a dictionary

Suggested upstream change (sketched below):

- treat null `source` payloads as empty dictionaries
- continue mapping title, year, DOI, and `biblio` fields even when venue/source is missing
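
A null-safe mapping sketch; the field names follow the public OpenAlex work shape, while the function itself is illustrative:

```python
def map_openalex_work(work: dict) -> dict:
    """Map an OpenAlex work record defensively; `source` may be null."""
    location = work.get("primary_location") or {}
    source = location.get("source") or {}  # null source becomes an empty dict
    biblio = work.get("biblio") or {}
    pages = "-".join(p for p in (biblio.get("first_page"), biblio.get("last_page")) if p)
    return {
        "title": work.get("title"),
        "year": work.get("publication_year"),
        "doi": work.get("doi"),
        "venue": source.get("display_name"),
        "volume": biblio.get("volume"),
        "number": biblio.get("issue"),
        "pages": pages or None,
    }
```

Venue simply comes back as `None` for null-source works instead of raising, and the `biblio` fields still flow through.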

@@ -0,0 +1,89 @@
services:
  db:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${ECOSPECIES_DB_NAME:-ecospecies}
      POSTGRES_USER: ${ECOSPECIES_DB_USER:-ecospecies}
      POSTGRES_PASSWORD: ${ECOSPECIES_DB_PASSWORD:?set ECOSPECIES_DB_PASSWORD}
      PGDATA: /var/lib/postgresql/data/pgdata
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${ECOSPECIES_DB_USER:-ecospecies} -d ${ECOSPECIES_DB_NAME:-ecospecies}"]
      interval: 5s
      timeout: 5s
      retries: 10
    volumes:
      - postgres_data:/var/lib/postgresql/data

  importer:
    image: python:3.12-slim
    restart: "no"
    depends_on:
      db:
        condition: service_healthy
    working_dir: /workspace
    environment:
      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/legacy-data/InputFiles - TXT}
      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
      PYTHONPATH: /workspace/apps/api/src
    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
    volumes:
      - ..:/workspace
      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
      - python_venv:/workspace/.docker/venv
      - pip_cache:/root/.cache/pip

  api:
    image: python:3.12-slim
    restart: unless-stopped
    depends_on:
      db:
        condition: service_healthy
      importer:
        condition: service_completed_successfully
    working_dir: /workspace
    environment:
      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/legacy-data/InputFiles - TXT}
      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
      ECOSPECIES_HOST: 0.0.0.0
      ECOSPECIES_PORT: "8000"
      ECOSPECIES_AUTH_TOKENS: ${ECOSPECIES_AUTH_TOKENS:-}
      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
      PYTHONPATH: /workspace/apps/api/src
    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-api.sh"]
    volumes:
      - ..:/workspace
      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
      - python_venv:/workspace/.docker/venv
      - pip_cache:/root/.cache/pip

  web:
    image: nginx:1.27-alpine
    restart: unless-stopped
    depends_on:
      api:
        condition: service_started
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=${TRAEFIK_NETWORK:-traefik-network}"
      - "traefik.http.routers.ecospecies-atlas.rule=Host(`${ECOSPECIES_HOSTNAME:?set ECOSPECIES_HOSTNAME}`)"
      - "traefik.http.routers.ecospecies-atlas.entrypoints=${TRAEFIK_ENTRYPOINTS:-websecure}"
      - "traefik.http.routers.ecospecies-atlas.tls.certresolver=${TRAEFIK_CERTRESOLVER:-myresolver}"
      - "traefik.http.services.ecospecies-atlas.loadbalancer.server.port=80"
    volumes:
      - ../apps/web:/usr/share/nginx/html:ro
      - ../apps/web/nginx.conf:/etc/nginx/conf.d/default.conf:ro
    networks:
      - default
      - traefik-network

volumes:
  postgres_data:
  python_venv:
  pip_cache:

networks:
  traefik-network:
    external: true
    name: ${TRAEFIK_NETWORK:-traefik-network}

@@ -0,0 +1,20 @@
# Required
ECOSPECIES_HOSTNAME=example.org
ECOSPECIES_BASE_PATH=/apps/ecospecies
ECOSPECIES_DB_PASSWORD=replace-with-strong-password

# Optional database settings
ECOSPECIES_DB_NAME=ecospecies
ECOSPECIES_DB_USER=ecospecies

# Optional application settings
ECOSPECIES_AUTH_TOKENS=
ECOSPECIES_DATA_DIR=/workspace/input-data/InputFiles

# Optional host path to the legacy corpus if it is not at ../path-to-legacy-corpus
ECOSPECIES_LEGACY_DATA_DIR=../path-to-legacy-corpus

# Optional Traefik settings
TRAEFIK_NETWORK=traefik-network
TRAEFIK_ENTRYPOINTS=websecure
TRAEFIK_CERTRESOLVER=myresolver

@@ -0,0 +1,93 @@
services:
  db:
    container_name: ecospecies-db
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${ECOSPECIES_DB_NAME:-ecospecies}
      POSTGRES_USER: ${ECOSPECIES_DB_USER:-ecospecies}
      POSTGRES_PASSWORD: ${ECOSPECIES_DB_PASSWORD:?set ECOSPECIES_DB_PASSWORD}
      PGDATA: /var/lib/postgresql/data/pgdata
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${ECOSPECIES_DB_USER:-ecospecies} -d ${ECOSPECIES_DB_NAME:-ecospecies}"]
      interval: 5s
      timeout: 5s
      retries: 10
    volumes:
      - postgres_data:/var/lib/postgresql/data

  importer:
    container_name: ecospecies-importer
    image: python:3.12-slim
    restart: "no"
    depends_on:
      db:
        condition: service_healthy
    working_dir: /workspace
    environment:
      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/workspace/input-data/InputFiles}
      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
      PYTHONPATH: /workspace/apps/api/src
    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-import.sh"]
    volumes:
      - ..:/workspace
      - ${ECOSPECIES_LEGACY_DATA_DIR:-../path-to-legacy-corpus}:/legacy-data:ro
      - python_venv:/workspace/.docker/venv
      - pip_cache:/root/.cache/pip

  api:
    container_name: ecospecies-api
    image: python:3.12-slim
    restart: unless-stopped
    depends_on:
      db:
        condition: service_healthy
      importer:
        condition: service_completed_successfully
    working_dir: /workspace
    environment:
      ECOSPECIES_DATA_DIR: ${ECOSPECIES_DATA_DIR:-/workspace/input-data/InputFiles}
      ECOSPECIES_DATABASE_URL: postgresql+psycopg://${ECOSPECIES_DB_USER:-ecospecies}:${ECOSPECIES_DB_PASSWORD}@db:5432/${ECOSPECIES_DB_NAME:-ecospecies}
      ECOSPECIES_HOST: 0.0.0.0
      ECOSPECIES_PORT: "8000"
      ECOSPECIES_AUTH_TOKENS: ${ECOSPECIES_AUTH_TOKENS:-}
      ECOSPECIES_VENV_DIR: /workspace/.docker/venv
      PYTHONPATH: /workspace/apps/api/src
    command: ["/bin/sh", "-lc", "./scripts/bootstrap-python-env.sh && ./scripts/run-api.sh"]
    volumes:
      - ..:/workspace
      - ${ECOSPECIES_LEGACY_DATA_DIR:-/input-data}:/legacy-data:ro
      - python_venv:/workspace/.docker/venv
      - pip_cache:/root/.cache/pip

  web:
    container_name: ecospecies-web
    image: nginx:1.27-alpine
    restart: unless-stopped
    depends_on:
      api:
        condition: service_started
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=${TRAEFIK_NETWORK:-traefik-network}"
      - "traefik.http.routers.ecospecies-atlas.rule=Host(`${ECOSPECIES_HOSTNAME:?set ECOSPECIES_HOSTNAME}`) && PathPrefix(`${ECOSPECIES_BASE_PATH:-/}`)"
      - "traefik.http.routers.ecospecies-atlas.entrypoints=${TRAEFIK_ENTRYPOINTS:-websecure}"
      - "traefik.http.routers.ecospecies-atlas.tls.certresolver=${TRAEFIK_CERTRESOLVER:-myresolver}"
      - "traefik.http.services.ecospecies-atlas.loadbalancer.server.port=80"
    volumes:
      - ../apps/web:/usr/share/nginx/html:ro
      - ../apps/web/nginx.conf:/etc/nginx/conf.d/default.conf:ro
    networks:
      - default
      - traefik-network

volumes:
  postgres_data:
  python_venv:
  pip_cache:

networks:
  traefik-network:
    external: true
    name: ${TRAEFIK_NETWORK:-traefik-network}

@@ -0,0 +1,48 @@
# PostgreSQL Backup Notes

This note applies to deployments that use the PostgreSQL volume defined by the Compose stack, including the Traefik deployment variant.

## What Needs Backup

At minimum, back up:

- the PostgreSQL data volume
- the deployment env file that contains the database credentials

For the Traefik deployment variant, that usually means:

- the Docker volume `postgres_data`
- `docs/docker-compose-traefik.env`

## Logical Backup

From the repository root, create a SQL dump with:

```bash
./scripts/backup-postgres.sh
```

To write to a specific file:

```bash
./scripts/backup-postgres.sh /path/to/ecospecies-backup.sql
```

## Restore From Logical Backup

Restore a SQL dump with:

```bash
./scripts/restore-postgres.sh /path/to/ecospecies-backup.sql
```

## Volume-Level Backup

If the host backup system can snapshot Docker volumes safely, include the PostgreSQL volume in that schedule. A volume snapshot is useful for full recovery, but a logical dump is still recommended for portability and validation.

## Operational Guidance

- Run backups on a schedule instead of relying on ad hoc dumps.
- Test restore procedures before relying on the backup policy.
- Keep backup artifacts outside the live Docker host when possible.
- The backup and restore scripts default to `docs/docker-compose-traefik.env` and `docs/docker-compose-traefik.yml`, but both can be overridden with `ECOSPECIES_ENV_FILE` and `ECOSPECIES_COMPOSE_FILE`.
115
docs/roadmap.md
115
docs/roadmap.md
|
|
@ -1,5 +1,22 @@
# EcoSpecies Modernization Roadmap

## Current Status

As of 2026-03-27, the repo is no longer at the pure planning stage. The following pieces are already implemented and working in the live stack:

- Docker Compose deployment with explicit `ecospecies-...` container names
- path-based hosting support for `/apps/ecospecies`
- in-repo-only source directory resolution with safe path validation
- legacy SLH ingest into PostgreSQL-backed species, sections, citations, audit, and document records
- editor/admin workflows for draft, review, publish, archive, and audit history
- contributor registration and draft-authoring workflow with token-based access
- structured Markdown document storage and editor/API round-trip
- persisted taxon identifier scaffolding with legacy identifiers separated from future-facing external identifiers
- citation extraction, review, enrichment, batch enrichment, candidate matching, and reviewed-candidate selection/addition
- citation persistence back into the structured Markdown source of truth

The roadmap below has been updated to reflect that actual state.

## Target Product

Create a Docker Compose-based, open-source EcoSpecies successor that:

@ -31,48 +48,91 @@ Create a Docker Compose-based, open-source EcoSpecies successor that:

### Phase 0: Discovery and migration planning

Status: completed

- Inventory legacy assets and user-facing capabilities.
- Capture the replacement architecture and ingestion strategy.
- Define acknowledgements, provenance, and licensing boundaries.

### Phase 1: Ingestion foundation

Status: substantially complete, with parser refinement ongoing

- Parse legacy `.txt` SLH inputs into structured JSON records.
- Normalize common metadata: title, scientific name, common name, FLELMR/EcoSpecies code, headings, references.
- Create ingest diagnostics to flag malformed files and missing metadata.
- Continue parser refinement for legacy edge cases in headings, citations, and historical bibliography formats.

### Phase 2: Public read experience

Status: implemented baseline

- Species listing and search.
- Species detail view with section navigation.
- Provenance and acknowledgement display.
- Summary metrics on corpus coverage.
- Path-based deployment under `/apps/ecospecies`.

### Phase 3: Structured persistence and editorial workflow

Status: implemented baseline, with editor UX still maturing

- PostgreSQL-backed persistence for species, sections, citations, documents, taxon identifiers, and audit history.
- Editor-safe import jobs and audit metadata.
- Raw-source preservation alongside normalized records.
- Authentication and role-based access for admin/editor/contributor workflows.
- Persisted editorial workflow state for draft, review, published, and archived records.
- Structured Markdown document storage and round-trip editing.
- Citation review, enrichment, candidate selection, and reviewed-candidate addition.
- Contributor draft creation and owner-scoped editing.

### Phase 4: Standards-aware identity and bibliography

Status: partially implemented

- Preserve legacy local identifiers as provenance.
- Persist taxon identifiers separately from legacy identifiers.
- Expose `legacy_identifiers`, `taxon_identifiers`, and `primary_taxon_*` API fields.
- Persist structured citation records with DOI/OpenAlex/DataCite-style enrichment fields.
- Continue toward multi-authority identifier review, richer citation entities, and CiteGeist-backed bibliography expansion.

### Phase 5: Editor ergonomics and advanced review

Status: in progress

- Structured Markdown editor is live.
- Citation match-review dialog is live.
- Remaining work:
  - CodeMirror-based Markdown editor with folding
  - inline parser diagnostics in the editor
  - richer citation diff/review affordances
  - clearer document-node and citation provenance in the UI

### Phase 6: Linkages and visualization

Status: not started

- Model predator/prey, habitat, and ecological association edges.
- Add graph endpoints and species-relationship views.
- Support public-friendly visual explanations and expert filters.

### Phase 7: Reports and export

Status: partially implemented

- JSON and Markdown exports exist through the API/document model.
- Structured Markdown is now the primary human-readable editor/export format.
- Remaining work:
  - recreate legacy-like text/RTF export
  - support export profiles for legacy compatibility and standards-forward outputs
  - improve citation/bibliography export fidelity

### Phase 8: Assisted research workflows

Status: planned

- Add local-LLM-assisted extraction and drafting in a human-review loop.
- Integrate bibliography tooling for citation consolidation and topic expansion.
- Support candidate-species intake for records not yet in the historical corpus.
- Restrict assisted drafting and publication actions to authenticated editorial roles.

@ -84,6 +144,9 @@ Initial core entities:

- `source_document`
- `document_section`
- `citation`
- `taxon_identifier`
- `citation_identifier`
- `bibliography_topic`
- `taxon`
- `linkage`
- `media_asset`

@ -95,6 +158,7 @@ Key design rules:

- retain provenance and import timestamps
- separate public published records from draft/editor states
- make sections addressable for citation and graph linking
- prefer a canonical document AST over direct projection from free-form source text

## LLM Extension Strategy

@ -103,6 +167,8 @@ Use local models only for assistive tasks, never silent publication:

- extracting candidate structured fields from new SLH text
- suggesting missing headings or linkage labels
- clustering similar citations
- resolving bibliography entries toward DOI/OpenAlex/DataCite where available
- treating local legacy codes as provenance, not canonical identifiers
- drafting summaries for editor review

Guardrails:

@ -111,16 +177,19 @@ Guardrails:

- all generated content is marked as draft
- every automated extraction stores source spans where possible

## Near-Term Priorities

1. Add CodeMirror-based folding and structure-aware editing to the Markdown document editor.
2. Expand taxon identifier review workflows for WoRMS, GBIF, Catalogue of Life, and related authorities.
3. Deepen citation quality controls, including better parsed-field visibility and stricter/manual review loops where resolver confidence is weak.
4. Add CiteGeist-style topic expansion and bibliography-suggestion review for under-cited species.
5. Improve document export fidelity so reviewed citations and standards-based identifiers are clearly represented in Markdown and downstream exports.
6. Begin the first ecological-linkage data model and API endpoints once citation/identifier workflows stabilize.

## Definition Of Done For The Initial Milestone

- `docker compose up` starts a working API and frontend.
- The system can enumerate the legacy corpus and show parsed species detail for real SLH files.
- Editors can curate structured Markdown documents and citations through authenticated workflows.
- Contributors can register, create drafts, and edit only their own submissions.
- Project docs describe both the implemented modernization state and the next phases.

@ -0,0 +1,315 @@
# EcoSpecies Standards Migration Plan

## Problem

The current EcoSpecies ingest and document model still treats legacy local fields such as `FLELMR code` / `species_code` as if they were primary identifiers. That is useful for historical provenance, but it is the wrong long-term center of gravity for a broader, modern biodiversity knowledge system.

The same problem exists for citations:

- legacy plaintext reference blocks are treated as local document text,
- citation identity is weak or missing,
- bibliography growth is tied to what happened to appear in the historical SLH file.

The new system should preserve legacy local identifiers and references, but it should not be structurally bound to them.

## Direction

Treat legacy local codes and freeform references as import-era artifacts, not canonical future-facing identifiers.

Going forward, EcoSpecies should prefer broadly recognized identifiers and registries:

- taxonomic name authority and taxon identifiers:
  - Catalogue of Life IDs and release DOIs
  - GBIF taxon keys
  - WoRMS AphiaIDs for marine taxa
  - ITIS TSNs where relevant
  - optional NCBI Taxonomy IDs for research interoperability
- literature and dataset identifiers:
  - DOI as the primary publication/dataset identifier
  - ISBN/ISSN where DOI is absent
  - OpenAlex IDs and DataCite metadata as enrichment layers
- contributor identity:
  - email-based local contributor accounts now
  - optional ORCID linkage later for editor and contributor identity

The system should be marine-forward because that matches the historical corpus, but not marine-exclusive. Identifier strategy should therefore be authority-aware rather than tied to a single domain-specific registry.

## Authority Selection Strategy

Choose the primary taxon authority by best-fit coverage, not by a single global rule.

- marine taxa:
  - prefer WoRMS AphiaID as primary when confidently matched
  - retain GBIF and Catalogue of Life as crosswalks
- non-marine or mixed-domain taxa:
  - prefer Catalogue of Life or GBIF as primary, depending on match quality and coverage
  - retain ITIS and other relevant identifiers as crosswalks
- unresolved or conflicting cases:
  - store all candidate identifiers
  - require editorial review before a primary identifier is asserted

This keeps the project ready for terrestrial expansion without discarding the value of WoRMS for the present corpus.
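A minimal sketch of this selection rule, assuming hypothetical `TaxonIdentifier` records and a caller that already knows whether the taxon is marine; the preference order and confidence threshold below are illustrative, not settled policy:

```python
# Sketch only: TaxonIdentifier and the 0.9 threshold are assumptions.
from dataclasses import dataclass


@dataclass
class TaxonIdentifier:
    authority: str           # e.g. "worms", "gbif", "col", "itis"
    identifier: str
    match_confidence: float  # resolver confidence in [0.0, 1.0]
    review_status: str       # "pending", "reviewed", "conflicting"


def choose_primary(candidates: list[TaxonIdentifier], marine: bool) -> TaxonIdentifier | None:
    """Pick a primary identifier by domain fit, confidence, and review state."""
    confident = [c for c in candidates if c.match_confidence >= 0.9]
    preferred = ["worms", "col", "gbif"] if marine else ["col", "gbif", "itis"]
    for authority in preferred:
        for candidate in confident:
            if candidate.authority == authority and candidate.review_status != "conflicting":
                return candidate
    # No confident best-fit match: keep every candidate and defer to editorial review.
    return None
```

Returning `None` rather than guessing is the point: unresolved cases stay visible for the editor review step above.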
## Important Taxonomic Note

PhyloCode is relevant for clade naming, not as a general-purpose replacement for species-level registry IDs. It should not become the primary EcoSpecies species identifier layer. It may be useful later for clade-aware ontology and higher-level phylogenetic naming, but not as the main substitute for local `species_code` values.

## Core Design Rules

1. Legacy local identifiers remain preserved exactly as imported.
2. Canonical taxon identity becomes multi-authority, not single-local-code.
3. Citations become first-class structured entities, not just text inside a section.
4. Bibliographies can be extended by topic and citation graph, not only by source-document inheritance.
5. Exports keep provenance visible so readers can distinguish legacy source metadata from normalized external identifiers.

## Schema Changes

### Species metadata

Retain `flelmr_code` for provenance, but demote it to a legacy metadata field.

Add a taxon-identity layer:

- `taxon_name_usage`
- `taxon_identifier`
- `taxon_authority`
- `taxon_match_review`

Suggested fields:

- `taxon_identifier.authority`
- `taxon_identifier.identifier`
- `taxon_identifier.rank`
- `taxon_identifier.label`
- `taxon_identifier.is_primary`
- `taxon_identifier.source_url`
- `taxon_identifier.asserted_by`
- `taxon_identifier.match_confidence`
- `taxon_identifier.review_status`

Examples:

- `authority = "worms", identifier = "159059", label = "AphiaID"`
- `authority = "gbif", identifier = "2290910", label = "taxonKey"`
- `authority = "col", identifier = "5T7L7", label = "taxonID"`
- `authority = "itis", identifier = "161989", label = "TSN"`
- `authority = "legacy-ecospecies", identifier = "5192", label = "FLELMR"`

### Citation model

Move from section text to structured bibliography entities:

- `citation`
- `citation_identifier`
- `citation_relation`
- `species_citation`
- `document_node_citation`
- `bibliography_topic`

Suggested citation identifier types:

- DOI
- ISBN
- ISSN
- PMID
- arXiv
- OpenAlex
- URL

## Markdown / AST Changes

Update the constrained Markdown profile so metadata stops implying that `species_code` is canonical.

Replace the current front matter recommendation:

```md
species_code: 5192
```

with a provenance-oriented shape:

```md
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
taxon_identifiers:
  - authority: worms
    identifier: 159059
    label: AphiaID
    primary: true
  - authority: gbif
    identifier: 2290910
    label: taxonKey
```

Also add explicit bibliography sections:

```md
## References

- id: doi:10.1000/example
  text: Smith, J. 2024. Example paper...
  relation: cites

## Suggested Reading

- topic: estuarine ecology
```

The AST should preserve:

- legacy identifiers
- normalized taxon identifiers
- structured references
- topic links used for bibliography expansion

## Import Pipeline Changes

### Species identity

Import should produce:

1. raw imported name fields,
2. legacy local identifiers,
3. unresolved candidate taxon identifiers,
4. optional matched external identifiers,
5. a review state for unresolved or conflicting authority matches.

Do not block ingest if no external authority match exists. Store the unresolved state explicitly.

Primary identifier assignment should be determined by:

1. domain fit of the authority
2. confidence of the match
3. editorial review status
4. future ability to crosswalk to other authorities

### Citations

Split citation processing into stages (the early stages are sketched after this list):

1. detect bibliography/reference sections in the imported SLH text,
2. extract plaintext reference strings,
3. convert plaintext references into draft structured entries,
4. enrich identifiers and metadata,
5. assign accepted citations back to species and document nodes,
6. optionally expand bibliography by topic and citation graph.
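A minimal sketch of stages 1-3, assuming reference sections are introduced by a plain `References` or `Literature Cited` heading and that draft entries carry the same `review_status`/`enrichment_status` fields used elsewhere in this plan; both the detection heuristic and the draft shape are illustrative:

```python
# Sketch only: real detection and enrichment live in the importer and resolver.
import re


def extract_reference_strings(slh_text: str) -> list[str]:
    """Stages 1-2: find a references heading and collect the lines under it."""
    refs: list[str] = []
    in_refs = False
    for line in slh_text.splitlines():
        if re.match(r"^\s*(references|literature cited)\s*$", line, re.IGNORECASE):
            in_refs = True
            continue
        if in_refs and line.strip():
            refs.append(line.strip())
    return refs


def draft_citations(reference_strings: list[str]) -> list[dict]:
    """Stage 3: wrap each plaintext reference as a draft structured entry."""
    return [
        {"raw_text": text, "review_status": "draft", "enrichment_status": "pending"}
        for text in reference_strings
    ]
```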
## CiteGeist Integration

`../CiteGeist` is a strong fit for this migration.

Observed capabilities in that repo already cover much of what EcoSpecies needs:

- extracting references from plaintext,
- converting rough references into draft structured entries,
- DOI/Crossref/DataCite/OpenAlex enrichment,
- citation graph expansion,
- topic-based bibliography expansion,
- duplicate clustering and canonicalization.

### Recommended integration boundary

Do not embed CiteGeist logic directly into the EcoSpecies parser.

Instead:

1. EcoSpecies exports candidate plaintext references and topic phrases (a payload sketch follows this list).
2. CiteGeist processes and enriches them into structured bibliography data.
3. EcoSpecies imports reviewed citation outputs into its own `citation` tables.
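The interchange format itself is still to be defined (see Immediate Next Steps), so the field names below are assumptions rather than a settled contract; the reference text reuses the citation example from the Structured Markdown plan in this same commit:

```python
# Hypothetical export payload for step 1; none of these field names are final.
import json

export_payload = {
    "species_slug": "american-oyster",
    "references": [
        "Ahmed, M. 1975. Speciation in living oysters. "
        "Advances in Marine Biology 13:357-397.",
    ],
    "topics": ["estuarine ecology"],
}

print(json.dumps(export_payload, indent=2))
```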
### First integration targets

- species-level bibliography cleanup from `References` sections
- DOI resolution and identifier assignment
- duplicate detection across species bibliographies
- topic expansion for subject areas such as habitat, trophic ecology, reproduction, invasive biology, and fisheries context

### Later integration targets

- node-level citation attachment
- bibliography review UI
- suggested-reading generation per species
- topic-seeded bibliography augmentation for under-cited species drafts

## API Changes

Add standards-aware endpoints:

- `/api/species/<slug>/identifiers`
- `/api/species/<slug>/citations`
- `/api/species/<slug>/bibliography/topics`
- `/api/editor/species/<slug>/identifier-review`
- `/api/editor/species/<slug>/citation-review`

Do not remove legacy fields immediately. Keep `flelmr_code` in payloads for compatibility while introducing (an example payload follows the list):

- `legacy_identifiers`
- `taxon_identifiers`
- `primary_taxon_identifier`
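An illustrative species payload during the compatibility window, reusing identifier values from the examples above; the surrounding field names are assumptions for the sketch, not the final API shape:

```python
# Compatibility-window sketch: legacy and standards-aware fields side by side.
species_payload = {
    "slug": "american-oyster",
    "flelmr_code": "5192",  # legacy field retained for compatibility
    "legacy_identifiers": [
        {"authority": "legacy-ecospecies", "identifier": "5192", "label": "FLELMR"},
    ],
    "taxon_identifiers": [
        {"authority": "worms", "identifier": "159059", "label": "AphiaID", "is_primary": True},
    ],
    "primary_taxon_identifier": {"authority": "worms", "identifier": "159059"},
}
```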
## UI Changes

The species detail page should distinguish:

- scientific name
- primary external taxon identifier
- legacy local identifiers
- bibliography
- suggested reading

Editors should see:

- unresolved authority matches
- conflicting taxon IDs
- citation enrichment candidates
- duplicate-reference clusters

Contributors should only author content and draft references; identifier normalization and bibliography publication remain editorial functions.

## Migration Phases

### Phase A: Demote legacy code

- Rename internal presentation from “species code” to “legacy identifier”.
- Keep `flelmr_code` only as legacy provenance.
- Add `legacy_identifiers` to Markdown export and AST.

### Phase B: Add external taxon identifiers

- Create taxon-identifier tables and API payloads.
- Add editor review workflows for selecting a primary authority identifier.
- Default marine taxa review toward WoRMS where available.
- Default broader cross-domain review toward Catalogue of Life and GBIF where WoRMS is not the right authority.
- Keep the model open to terrestrial species from the beginning rather than treating them as out-of-scope exceptions.

### Phase C: Structured bibliography

- Create citation tables.
- Extract plaintext references from imported documents.
- Store draft citations separately from accepted citations.

### Phase D: CiteGeist bridge

- Define import/export format between EcoSpecies and CiteGeist.
- Run draft-reference normalization and DOI enrichment.
- Import reviewed structured citations back into EcoSpecies.

### Phase E: Topic-aware bibliography growth

- Store species topic phrases.
- Use CiteGeist topic expansion for bibliography augmentation.
- Keep added citations flagged by source type:
  - imported
  - resolved
  - topic-expanded
  - editor-added

## Immediate Next Steps

1. Update the Markdown profile to replace `species_code` with `legacy_identifiers` plus `taxon_identifiers`.
2. Add `legacy_identifiers` and `taxon_identifiers` to the AST/document model.
3. Introduce taxon identifier tables in the PostgreSQL schema.
4. Define a minimal EcoSpecies-to-CiteGeist interchange format for plaintext references and topic phrases.
5. Add editor-facing citation review before attempting automatic bibliography publication.
@ -0,0 +1,338 @@
# Structured Markdown Document Plan

## Goal

Replace the current flat, parser-heavy free-form text handling with a document model that is:

- human-readable in plaintext
- editable in the browser with hierarchy folding
- permissive-license friendly
- suitable for first-pass conversion from legacy SLH text files
- suitable as the primary export format for a species life history
- able to project cleanly into a flexible database model with greater hierarchical depth

## Recommendation

Adopt a constrained Markdown-based authoring format as the primary human-facing document format, backed by an internal hierarchical document AST and a relational projection layer in PostgreSQL.

Use this three-layer model:

1. Source and export format: constrained EcoSpecies Markdown
2. Canonical application representation: hierarchical AST
3. Database representation: relational projection for querying, indexing, publishing, and editorial workflows

This avoids treating raw free-form text as both the storage format and the parser input.

## Why Markdown Instead Of Org

Markdown is the better fit for this codebase and licensing requirement because:

- it is familiar to most users
- it is easier to constrain than Org
- it maps naturally to hierarchical headings
- it works well with CodeMirror folding
- it does not require adopting GPL or AGPL editor code

Org-style authoring remains conceptually attractive, but embedding Org-specific tooling such as organice would introduce copyleft code, which is not aligned with a permissive-only implementation strategy.

## EcoSpecies Markdown Profile

The format should be Markdown-like, but intentionally narrower than unrestricted Markdown.

### Metadata

Use YAML front matter for canonical metadata fields:

```md
---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
taxon_identifiers:
  - authority: worms
    identifier: 159059
    label: AphiaID
    primary: true
source_file: American Oyster SLH NOAA SEA.txt
publication_status: published
---
```

Recommended canonical fields:

- `title`
- `common_name`
- `scientific_name`
- `legacy_identifiers`
- `taxon_identifiers`
- `primary_taxon_authority`
- `source_file`
- `publication_status`
- `source_format`
- `legacy_import_id`

### Hierarchy

Use headings as the sole structure-bearing primitive.

Example:

```md
---
title: American Oyster
common_name: American Oyster
scientific_name: Crassostrea virginica
legacy_identifiers:
  - authority: legacy-ecospecies
    identifier: 5192
    label: FLELMR
---

## Summary

Short editor-reviewed abstract.

## Habitat

### Type

Estuarine.

### Substrate

Hard bottom, shell, mud flats, and other suitable settlement surfaces.

## Reproduction

### Season

Spawning occurs from spring through fall in much of the Gulf.
```

Rules (a validation sketch for the skip-level rule follows this list):

- Heading depth is meaningful.
- Skip-level headings should be rejected or normalized.
- Body text belongs to the nearest preceding heading.
- `#` level is optional if the document title already exists in front matter.
- Tables, lists, and citations are allowed only where explicitly supported.
- Arbitrary embedded HTML should be disallowed.
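A minimal validation sketch for the skip-level rule, assuming front matter has already been stripped; a full validator would also cover the front matter and citation rules:

```python
# Sketch only: flags headings that skip a level relative to the previous one.
import re


def find_heading_jumps(markdown: str) -> list[str]:
    diagnostics: list[str] = []
    previous_depth = 1  # the document title may live in front matter
    for line in markdown.splitlines():
        match = re.match(r"^(#{1,6})\s+(.*)$", line)
        if not match:
            continue
        depth = len(match.group(1))
        if depth > previous_depth + 1:
            diagnostics.append(f"skip-level heading: {match.group(2)!r} (depth {depth})")
        previous_depth = depth
    return diagnostics
```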
### Citations

Keep citations readable in Markdown but structured enough to parse.

Preferred first-pass shape:

```md
## Citations

- [7] Ahmed, M. 1975. Speciation in living oysters. Advances in Marine Biology 13:357-397.
- [15] Andrews, J.D. 1979. Pelecypoda: Ostreidae. Reproduction of Marine Invertebrates...
```

This is intentionally simpler than trying to infer citations from arbitrary prose.
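Because the shape is this regular, the first-pass parser can be small; a sketch, where the bracketed number is the legacy citation index kept for provenance:

```python
# Sketch only: parses the "- [N] text" citation lines shown above.
import re

CITATION_LINE = re.compile(r"^- \[(\d+)\]\s+(.+)$")


def parse_citation_lines(section_body: str) -> list[dict[str, str]]:
    entries: list[dict[str, str]] = []
    for line in section_body.splitlines():
        match = CITATION_LINE.match(line.strip())
        if match:
            entries.append({"legacy_index": match.group(1), "text": match.group(2)})
    return entries
```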
## Canonical AST

Markdown should not be the sole internal representation. Parse it into an AST that preserves hierarchy explicitly.

Example conceptual shape:

```json
{
  "metadata": {
    "title": "American Oyster",
    "common_name": "American Oyster",
    "scientific_name": "Crassostrea virginica",
    "legacy_identifiers": [
      {
        "authority": "legacy-ecospecies",
        "identifier": "5192",
        "label": "FLELMR"
      }
    ]
  },
  "nodes": [
    {
      "id": "n1",
      "type": "section",
      "depth": 2,
      "title": "Summary",
      "body": "Short editor-reviewed abstract.",
      "children": []
    },
    {
      "id": "n2",
      "type": "section",
      "depth": 2,
      "title": "Habitat",
      "body": "",
      "children": [
        {
          "id": "n3",
          "type": "section",
          "depth": 3,
          "title": "Type",
          "body": "Estuarine.",
          "children": []
        }
      ]
    }
  ]
}
```

Required AST properties:

- arbitrary hierarchical depth
- stable node identifiers
- separate metadata from body structure
- support for editor audit and provenance
- support for extracting source spans from imported legacy text when available

## Database Direction

The current flat `document_section` model should evolve into a general document tree.

Suggested core tables:

- `species_document`
- `species_document_node`
- `species_document_node_revision`
- `species_document_metadata`
- `citation`
- `species_document_export`

Suggested `species_document_node` fields:

- `id`
- `document_id`
- `parent_id`
- `position`
- `depth`
- `node_type`
- `title`
- `body_markdown`
- `body_plaintext`
- `source_heading`
- `source_span_start`
- `source_span_end`

This enables:

- greater hierarchical depth
- stable editor operations on subtrees
- future insertion of machine-extracted nested content
- simplified export back to Markdown (a tree-reconstruction sketch follows this list)
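A sketch of how flat `species_document_node` rows reassemble into the tree, using only the `id`, `parent_id`, and `position` fields above; rows are simplified to plain dicts:

```python
# Sketch only: rebuilds the document tree from flat adjacency-list rows.
def build_tree(rows: list[dict]) -> list[dict]:
    nodes = {row["id"]: {**row, "children": []} for row in rows}
    roots: list[dict] = []
    # Sorting by position keeps siblings (and roots) in document order.
    for node in sorted(nodes.values(), key=lambda n: n["position"]):
        parent_id = node.get("parent_id")
        if parent_id in nodes:
            nodes[parent_id]["children"].append(node)
        else:
            roots.append(node)
    return roots
```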
## Import Flow

The legacy text parser should no longer attempt to infer the final database structure directly.

Instead:

1. Parse raw legacy text into a best-effort intermediate tree.
2. Normalize extracted metadata.
3. Emit constrained Markdown.
4. Parse constrained Markdown into AST.
5. Persist AST and project relationally.
6. Record diagnostics on uncertain conversions.

This changes the parser’s role from “infer final structure perfectly” to “produce a reviewable first draft”.
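The six steps as a control-flow sketch; every step body here is a trivial stand-in so the flow is runnable end to end, while the real implementations live in the importer and API service:

```python
# Sketch only: each numbered step is a placeholder for the real stage.
def import_legacy_file(raw_text: str) -> dict:
    first_line = raw_text.splitlines()[0].strip() if raw_text.strip() else ""
    tree = {"title": first_line, "sections": []}          # 1. intermediate tree
    metadata = {"title": tree["title"]}                   # 2. normalized metadata
    markdown = f"---\ntitle: {metadata['title']}\n---\n"  # 3. constrained Markdown
    ast = {"metadata": metadata, "nodes": []}             # 4. canonical AST
    # 5. persistence would store `ast` and project it into relational tables
    diagnostics = [] if metadata["title"] else ["missing title"]  # 6. diagnostics
    return {"markdown": markdown, "ast": ast, "diagnostics": diagnostics}
```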
## Editor Flow

The web editor should operate primarily on the Markdown representation, with a structured parse running on save or preview.

Recommended behavior:

- fold by heading depth in CodeMirror
- validate front matter and heading structure
- preview rendered sections
- show parser diagnostics inline
- save both Markdown source and parsed AST

The editor should reject or flag:

- invalid front matter
- duplicate canonical metadata keys
- heading depth jumps
- malformed citation entries in structured sections

## Export Policy

Markdown should be the primary export format for a species life history.

Export targets:

- constrained Markdown for editorial interchange
- JSON AST for machine workflows
- derived relational/API payloads for the application
- optional report-oriented exports later

The export path should be:

- database document tree -> canonical AST -> constrained Markdown

This ensures the exported plaintext remains stable and human-readable.
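A sketch of the AST-to-Markdown leg of that path, using the node shape from the Canonical AST example; the real exporter would also emit front matter and the structured citation sections:

```python
# Sketch only: serializes section nodes back to constrained Markdown headings.
def ast_to_markdown(nodes: list[dict]) -> str:
    lines: list[str] = []

    def emit(node: dict) -> None:
        lines.append(f"{'#' * node['depth']} {node['title']}")
        if node.get("body"):
            lines.append("")
            lines.append(node["body"])
        lines.append("")
        for child in node.get("children", []):
            emit(child)

    for node in nodes:
        emit(node)
    return "\n".join(lines).rstrip() + "\n"
```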
## Migration Strategy

### Stage 1: Introduce the document model

- add AST schema and persistence tables
- keep existing section-based reads working
- build Markdown import/export helpers

### Stage 2: Convert current parser output

- map current parsed sections into Markdown drafts
- preserve existing metadata and diagnostics
- store generated Markdown alongside current records

### Stage 3: Introduce Markdown editor

- add CodeMirror-based editor with heading folding
- add validation for front matter and heading structure
- add round-trip save through AST

### Stage 4: Move public reads to the new document model

- generate current API responses from the hierarchical document tree
- keep compatibility shims for legacy flat sections where needed

### Stage 5: Expand structured extraction

- add deeper parsing for habitat, reproduction, citations, and linkages
- add richer projections from AST to relational tables

## Immediate Implementation Tasks

Recommended first engineering tasks:

1. Define the constrained Markdown grammar and validation rules.
2. Design the AST schema and PostgreSQL tables.
3. Add Markdown import/export utilities in the API service.
4. Prototype a CodeMirror editor with heading folding.
5. Add a migration command that converts current species records into Markdown drafts.
6. Preserve current endpoints while introducing the document-tree backing model.

## Non-Goals For The First Pass

- full unrestricted Markdown feature support
- WYSIWYG editing
- arbitrary embedded HTML
- perfect citation parsing from all legacy free text
- replacing every existing API shape immediately

## Decision Summary

The planned direction is:

- constrained Markdown as the editable and exportable document format
- internal AST as the canonical application representation
- relational projection for queryable application state
- CodeMirror-based browser editing with heading folding

This is the most practical path toward human-editable hierarchy, permissive-only implementation, cleaner parsing, and deeper long-term document structure.
@ -0,0 +1,79 @@
# Traefik Deployment Notes

This note applies to the reverse-proxy deployment variant in `docs/docker-compose-traefik.yml`.

## Start The Stack

From the repository root:

```bash
cp docs/docker-compose-traefik.env.example docs/docker-compose-traefik.env
# edit docs/docker-compose-traefik.env
docker compose \
  --env-file docs/docker-compose-traefik.env \
  -f docs/docker-compose-traefik.yml \
  up -d
```

## Common Failure Modes

### Traefik cannot reach the web container

Check:

- the external Docker network named by `TRAEFIK_NETWORK` exists
- the Traefik instance is attached to that same Docker network
- the hostname in `ECOSPECIES_HOSTNAME` matches the Traefik router rule you expect
- the path in `ECOSPECIES_BASE_PATH` matches the published application prefix, for example `/apps/ecospecies`

### The site opens but the API fails

Check:

- the `api` service is healthy and running
- the `web` service is using the repo's `apps/web/nginx.conf`
- the `api` service finished waiting for `importer`
- the request path is under `ECOSPECIES_BASE_PATH` if you are publishing the app below a domain root

### Importer fails on startup

Check:

- `ECOSPECIES_LEGACY_DATA_DIR` points to a real host path
- that path contains `InputFiles - TXT`
- the mount is readable by Docker on the target host

### Database does not initialize

Check:

- `ECOSPECIES_DB_PASSWORD` is set
- the PostgreSQL volume is writable
- an old incompatible volume is not being reused unintentionally

### Editor login works but no editor state is available

Check:

- `ECOSPECIES_AUTH_TOKENS` is set on the `api` service
- the token you entered matches the configured value exactly

## Operational Notes

- This deployment variant intentionally exposes only the `web` container to Traefik.
- The `api`, `db`, and `importer` services stay on the internal Compose network.
- The `importer` runs before the API starts and seeds or synchronizes the dataset.
- The web container serves both the domain root and `/apps/ecospecies/`, but the Traefik router should target the intended public path.

## Apache Front Door

If Apache is the public front door for the hostname in `ECOSPECIES_HOSTNAME`, it must proxy the configured `ECOSPECIES_BASE_PATH` onward. Otherwise Apache can return its own `Not Found` page before the EcoSpecies stack sees the request.

Example Apache directives:

```apache
ProxyPass /apps/ecospecies http://127.0.0.1:80/apps/ecospecies
ProxyPassReverse /apps/ecospecies http://127.0.0.1:80/apps/ecospecies
```

Point the backend address at the actual Traefik listener on the host if it is not `127.0.0.1:80`, and adjust the published path if `ECOSPECIES_BASE_PATH` is different.
@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""Backfill citation enrichment across the species corpus in bounded, resumable passes."""
from __future__ import annotations

import argparse
from pathlib import Path

from ecospecies_api.repository import (
    get_editor_species_citations,
    get_editor_species_list,
    update_species_citation_enrichment,
)


def should_backfill(citation: dict[str, object], include_accepted: bool) -> bool:
    """Return True when a citation still looks like it needs enrichment."""
    review_status = str(citation.get("review_status", "")).strip().lower()
    source_type = str(citation.get("source_type", "")).strip().lower()
    enrichment_status = str(citation.get("enrichment_status", "")).strip().lower()
    normalized_text = str(citation.get("normalized_text", "")).strip()
    abstract_text = str(citation.get("abstract_text", "")).strip()

    # Editor-curated citations are skipped unless --include-accepted is set.
    if not include_accepted and review_status == "accepted":
        return False
    if source_type in {"editor_selected_candidate", "editor_added_candidate"} and not include_accepted:
        return False

    return (
        source_type in {"document_extract", "editor_review", ""}
        or enrichment_status in {"pending", "unresolved", "error", ""}
        or not normalized_text
        or not abstract_text
    )


def reorder_species_with_cursor(
    species_items: list[dict[str, object]],
    state_file: Path | None,
) -> list[dict[str, object]]:
    """Rotate the species list so a scheduled run resumes after the last processed slug."""
    if not state_file or not species_items:
        return species_items

    try:
        last_slug = state_file.read_text(encoding="utf-8").strip()
    except FileNotFoundError:
        return species_items

    if not last_slug:
        return species_items

    for index, item in enumerate(species_items):
        if str(item.get("slug", "")).strip() == last_slug:
            return species_items[index + 1 :] + species_items[: index + 1]
    return species_items


def write_cursor(state_file: Path | None, slug: str) -> None:
    """Record the last processed slug so the next run can resume after it."""
    if not state_file or not slug:
        return
    state_file.parent.mkdir(parents=True, exist_ok=True)
    state_file.write_text(f"{slug}\n", encoding="utf-8")


def main() -> int:
    parser = argparse.ArgumentParser(description="Backfill EcoSpecies citation enrichment.")
    parser.add_argument("--slug", help="Limit the backfill to a single species slug.")
    parser.add_argument("--username", default="citation-backfill", help="Audit username to record.")
    parser.add_argument(
        "--include-accepted",
        action="store_true",
        help="Also rerun accepted/editor-curated citations.",
    )
    parser.add_argument(
        "--max-species",
        type=int,
        default=0,
        help="Stop after this many species with eligible citations. 0 means no limit.",
    )
    parser.add_argument(
        "--max-citations",
        type=int,
        default=0,
        help="Stop after this many citations overall. 0 means no limit.",
    )
    parser.add_argument(
        "--state-file",
        help="Optional cursor file used to rotate scheduled runs through the species list.",
    )
    args = parser.parse_args()

    state_file = Path(args.state_file).expanduser() if args.state_file else None
    species_items = (
        [item for item in get_editor_species_list() if item["slug"] == args.slug]
        if args.slug
        else get_editor_species_list()
    )
    if not args.slug:
        # Rotate through the corpus so bounded scheduled runs eventually cover everything.
        species_items = reorder_species_with_cursor(species_items, state_file)

    if args.slug and not species_items:
        print(f"Species not found: {args.slug}")
        return 1

    species_count = 0
    citation_count = 0
    changed_count = 0
    resolved_count = 0
    unresolved_count = 0
    error_count = 0
    last_seen_slug = ""

    for species in species_items:
        if args.max_species and species_count >= args.max_species:
            break
        slug = str(species["slug"])
        last_seen_slug = slug
        citation_payload = get_editor_species_citations(slug)
        if citation_payload is None:
            continue

        eligible = [
            citation
            for citation in citation_payload["citations"]
            if should_backfill(citation, include_accepted=args.include_accepted)
        ]
        if not eligible:
            continue

        species_count += 1
        print(f"[{slug}] backfilling {len(eligible)} citation(s)", flush=True)

        for citation in eligible:
            if args.max_citations and citation_count >= args.max_citations:
                write_cursor(state_file, last_seen_slug)
                print("citation limit reached; stopping early", flush=True)
                print(
                    "summary:"
                    f" species={species_count}"
                    f" citations={citation_count}"
                    f" changed={changed_count}"
                    f" resolved={resolved_count}"
                    f" unresolved={unresolved_count}"
                    f" errors={error_count}",
                    flush=True,
                )
                return 0
            citation_count += 1
            result = update_species_citation_enrichment(
                slug=slug,
                citation_id=int(citation["id"]),
                username=args.username,
            )
            if result is None:
                print(f" - citation {citation['id']}: skipped (not found)", flush=True)
                continue

            changed_fields = result.get("changed_fields", {})
            status = str(result["citation"].get("enrichment_status", "")).strip().lower()
            if changed_fields:
                changed_count += 1
            if status == "resolved":
                resolved_count += 1
            elif status == "unresolved":
                unresolved_count += 1
            elif status == "error":
                error_count += 1
            print(
                f" - citation {citation['id']}: {status or 'unknown'}"
                + (f" ({len(changed_fields)} field changes)" if changed_fields else ""),
                flush=True,
            )

    write_cursor(state_file, last_seen_slug)
    print(
        "summary:"
        f" species={species_count}"
        f" citations={citation_count}"
        f" changed={changed_count}"
        f" resolved={resolved_count}"
        f" unresolved={unresolved_count}"
        f" errors={error_count}",
        flush=True,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@ -0,0 +1,28 @@
#!/bin/sh
set -eu

ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
ENV_FILE="${ECOSPECIES_ENV_FILE:-$ROOT_DIR/docs/docker-compose-traefik.env}"
COMPOSE_FILE="${ECOSPECIES_COMPOSE_FILE:-$ROOT_DIR/docs/docker-compose-traefik.yml}"
OUTPUT_FILE="${1:-$ROOT_DIR/ecospecies-backup.sql}"

if [ ! -f "$ENV_FILE" ]; then
  echo "Missing env file: $ENV_FILE" >&2
  exit 1
fi

set -a
. "$ENV_FILE"
set +a

DB_USER="${ECOSPECIES_DB_USER:-ecospecies}"
DB_NAME="${ECOSPECIES_DB_NAME:-ecospecies}"

docker compose \
  --env-file "$ENV_FILE" \
  -f "$COMPOSE_FILE" \
  exec -T db \
  pg_dump -U "$DB_USER" "$DB_NAME" \
  > "$OUTPUT_FILE"

printf 'Backup written to %s\n' "$OUTPUT_FILE"
@ -0,0 +1,37 @@
#!/bin/sh
set -eu

ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
ENV_FILE="${ECOSPECIES_ENV_FILE:-$ROOT_DIR/docs/docker-compose-traefik.env}"
COMPOSE_FILE="${ECOSPECIES_COMPOSE_FILE:-$ROOT_DIR/docs/docker-compose-traefik.yml}"
INPUT_FILE="${1:-}"

if [ -z "$INPUT_FILE" ]; then
  echo "Usage: $0 <sql-backup-file>" >&2
  exit 1
fi

if [ ! -f "$ENV_FILE" ]; then
  echo "Missing env file: $ENV_FILE" >&2
  exit 1
fi

if [ ! -f "$INPUT_FILE" ]; then
  echo "Missing backup file: $INPUT_FILE" >&2
  exit 1
fi

set -a
. "$ENV_FILE"
set +a

DB_USER="${ECOSPECIES_DB_USER:-ecospecies}"
DB_NAME="${ECOSPECIES_DB_NAME:-ecospecies}"

docker compose \
  --env-file "$ENV_FILE" \
  -f "$COMPOSE_FILE" \
  exec -T db \
  psql -U "$DB_USER" "$DB_NAME" \
  < "$INPUT_FILE"

printf 'Restore completed from %s\n' "$INPUT_FILE"
@ -0,0 +1,21 @@
#!/bin/sh
set -eu

ROOT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
LOG_DIR="${ECOSPECIES_BACKFILL_LOG_DIR:-$ROOT_DIR/var/logs}"
STATE_FILE="${ECOSPECIES_BACKFILL_STATE_FILE:-$ROOT_DIR/var/citation-backfill.cursor}"
LOCK_DIR="${ECOSPECIES_BACKFILL_LOCK_DIR:-$ROOT_DIR/var/citation-backfill.lock}"
MAX_SPECIES="${ECOSPECIES_BACKFILL_MAX_SPECIES:-3}"

mkdir -p "$LOG_DIR"
mkdir -p "$ROOT_DIR/var"

# mkdir is atomic, so the lock directory guards against overlapping scheduled runs.
if ! mkdir "$LOCK_DIR" 2>/dev/null; then
  echo "citation backfill already running; skipping"
  exit 0
fi

trap 'rmdir "$LOCK_DIR"' EXIT INT TERM

# Run docker without exec'ing over this shell: exec would replace the process,
# the EXIT trap would never fire, and the lock directory would never be released.
docker exec ecospecies-api /bin/sh -lc \
  "PYTHONPATH=/workspace/apps/api/src /workspace/.docker/venv/bin/python -u /workspace/scripts/backfill-citations.py --username citation-backfill --max-species ${MAX_SPECIES} --state-file ${STATE_FILE}"