from __future__ import annotations from collections import Counter from datetime import datetime, timezone import json import hashlib import os from pathlib import Path import re import secrets from sqlalchemy import inspect, select, text from sqlalchemy.exc import SQLAlchemyError from ecospecies_api.citation_enrichment import ( apply_citation_candidate_selection, discover_citation_candidates, enrich_citation_payload, ) from ecospecies_api.document_format import extract_species_projection, parse_markdown_document from ecospecies_api.document_format import add_citation_to_document, export_markdown_document from ecospecies_api.document_repository import ( get_species_document_payload, save_species_document, sync_species_document, ) from ecospecies_api.db import SessionLocal, create_db_engine from ecospecies_api.models import ( Base, ContributorAccount, DocumentSection, IngestDiagnosticRecord, Species, SpeciesAuditLog, SpeciesCitation, SpeciesTaxonIdentifier, ) from ecospecies_api.parser import get_default_data_dir, slugify WORKFLOW_STATUSES = {"draft", "review", "published"} CITATION_REVIEW_STATUSES = {"draft", "reviewed", "accepted", "rejected"} SYSTEM_IMPORT_USER = "system-import" CONTRIBUTOR_SUBMISSION_PREFIX = "contributor-submission" EMAIL_PATTERN = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$") def ensure_schema() -> None: engine = create_db_engine() Base.metadata.create_all(engine) inspector = inspect(engine) species_columns = {column["name"] for column in inspector.get_columns("species")} statements: list[str] = [] if "publication_status" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN publication_status VARCHAR(32) DEFAULT 'published'") if "is_archived" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN is_archived BOOLEAN DEFAULT FALSE") if "editor_notes" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN editor_notes TEXT DEFAULT ''") if "created_by" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN created_by VARCHAR(255) DEFAULT 'system-import'") if "owner_username" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN owner_username VARCHAR(255) DEFAULT ''") if "owner_role" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN owner_role VARCHAR(32) DEFAULT ''") if "last_modified_by" not in species_columns: statements.append("ALTER TABLE species ADD COLUMN last_modified_by VARCHAR(255) DEFAULT 'system-import'") if statements: with engine.begin() as connection: for statement in statements: connection.execute(text(statement)) connection.execute( text( "UPDATE species SET publication_status = COALESCE(publication_status, 'published'), " "is_archived = COALESCE(is_archived, FALSE), " "editor_notes = COALESCE(editor_notes, ''), " "created_by = COALESCE(created_by, 'system-import'), " "owner_username = COALESCE(owner_username, ''), " "owner_role = COALESCE(owner_role, ''), " "last_modified_by = COALESCE(last_modified_by, 'system-import')" ) ) tables = set(inspector.get_table_names()) if "species_citation" in tables: citation_columns = { column["name"] for column in inspector.get_columns("species_citation") } citation_statements: list[str] = [] if "legacy_reference_number" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN legacy_reference_number VARCHAR(64) DEFAULT ''" ) if "citation_key" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN citation_key VARCHAR(255) DEFAULT ''" ) if "entry_type" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN entry_type VARCHAR(64) DEFAULT 'misc'" ) if "draft_bibtex" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN draft_bibtex TEXT DEFAULT ''" ) if "abstract_text" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN abstract_text TEXT DEFAULT ''" ) if "source_url" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN source_url VARCHAR(500) DEFAULT ''" ) if "openalex_id" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN openalex_id VARCHAR(64) DEFAULT ''" ) if "resolver_source_label" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN resolver_source_label VARCHAR(255) DEFAULT ''" ) if "enrichment_status" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN enrichment_status VARCHAR(32) DEFAULT 'pending'" ) if "enrichment_error" not in citation_columns: citation_statements.append( "ALTER TABLE species_citation ADD COLUMN enrichment_error TEXT DEFAULT ''" ) if citation_statements: with engine.begin() as connection: for statement in citation_statements: connection.execute(text(statement)) connection.execute( text( "UPDATE species_citation SET " "legacy_reference_number = COALESCE(legacy_reference_number, ''), " "citation_key = COALESCE(citation_key, ''), " "entry_type = COALESCE(entry_type, 'misc'), " "abstract_text = COALESCE(abstract_text, ''), " "draft_bibtex = COALESCE(draft_bibtex, ''), " "source_url = COALESCE(source_url, ''), " "openalex_id = COALESCE(openalex_id, ''), " "resolver_source_label = COALESCE(resolver_source_label, ''), " "enrichment_status = COALESCE(enrichment_status, 'pending'), " "enrichment_error = COALESCE(enrichment_error, '')" ) ) def _citation_to_payload(citation: SpeciesCitation) -> dict[str, object]: return { "id": citation.id, "position": citation.position, "section_heading": citation.section_heading, "legacy_reference_number": citation.legacy_reference_number, "citation_key": citation.citation_key, "entry_type": citation.entry_type, "raw_text": citation.raw_text, "normalized_text": citation.normalized_text, "abstract_text": citation.abstract_text, "draft_bibtex": citation.draft_bibtex, "doi": citation.doi, "source_url": citation.source_url, "openalex_id": citation.openalex_id, "resolver_source_label": citation.resolver_source_label, "enrichment_status": citation.enrichment_status, "enrichment_error": citation.enrichment_error, "source_type": citation.source_type, "review_status": citation.review_status, } def _structured_document_to_payload(species: Species) -> dict[str, object] | None: if species.document is None: return None ast: dict[str, object] | None = None raw_ast = str(species.document.ast_json or "").strip() if raw_ast: try: parsed = json.loads(raw_ast) if isinstance(parsed, dict): ast = parsed except json.JSONDecodeError: ast = None return { "source_format": species.document.source_format, "updated_by": species.document.updated_by, "node_count": len(species.document.nodes), "ast": ast, } def _legacy_source_to_payload(species: Species) -> dict[str, object] | None: source_file = str(species.source_file or "").strip() if not source_file: return None try: data_dir = Path(get_default_data_dir()).resolve() candidate = (data_dir / source_file).resolve() if candidate.parent != data_dir or not candidate.is_file(): return None text = candidate.read_text(encoding="utf-8", errors="replace") except (OSError, ValueError): return None return { "source_file": source_file, "text": text, } def import_species_payload(payload: list[dict[str, object]]) -> None: ensure_schema() with SessionLocal() as session: existing_species = { item.slug: item for item in session.scalars(select(Species)).all() } incoming_slugs: set[str] = set() for item in payload: slug = item["slug"] incoming_slugs.add(slug) species = existing_species.get(slug) if species is None: species = Species( slug=slug, source_file=item["source_file"], title=item["title"], common_name=item["common_name"], scientific_name=item["scientific_name"], flelmr_code=item["flelmr_code"], summary=item["summary"], section_count=item["section_count"], publication_status="published", is_archived=False, editor_notes="", created_by=SYSTEM_IMPORT_USER, owner_username="", owner_role="", last_modified_by=SYSTEM_IMPORT_USER, ) session.add(species) session.flush() _ = species.sections _ = species.diagnostics _ = species.audit_entries editorial_fields, section_positions = _get_editor_preservation_state(species) species.source_file = item["source_file"] species.title = item["title"] species.common_name = item["common_name"] species.scientific_name = item["scientific_name"] species.flelmr_code = item["flelmr_code"] species.section_count = item["section_count"] if species.is_archived: species.is_archived = False session.add( SpeciesAuditLog( species_id=species.id, changed_by=SYSTEM_IMPORT_USER, changed_at=datetime.now(timezone.utc).isoformat(), action="import_restore", details_json=json.dumps( {"is_archived": {"from": True, "to": False}}, ensure_ascii=True, ), ) ) if "summary" not in editorial_fields: species.summary = item["summary"] if species.last_modified_by == "": species.last_modified_by = SYSTEM_IMPORT_USER existing_sections = {section.position: section for section in species.sections} incoming_positions: set[int] = set() for position, section_payload in enumerate(item["sections"], start=1): incoming_positions.add(position) section = existing_sections.get(position) if section is None: section = DocumentSection( species_id=species.id, position=position, heading=section_payload["heading"], content=section_payload["content"], ) session.add(section) continue section.heading = section_payload["heading"] if position not in section_positions: section.content = section_payload["content"] session.add(section) for position, section in existing_sections.items(): if position not in incoming_positions: session.delete(section) for diagnostic in list(species.diagnostics): session.delete(diagnostic) for diagnostic in item["diagnostics"]: if diagnostic["code"] == "missing_summary": continue session.add( IngestDiagnosticRecord( species_id=species.id, level=diagnostic["level"], code=diagnostic["code"], message=diagnostic["message"], ) ) sync_species_document(session, species, item) for slug, species in existing_species.items(): if slug in incoming_slugs: continue if not species.is_archived: species.is_archived = True session.add( SpeciesAuditLog( species_id=species.id, changed_by=SYSTEM_IMPORT_USER, changed_at=datetime.now(timezone.utc).isoformat(), action="import_archive", details_json=json.dumps( {"is_archived": {"from": False, "to": True}}, ensure_ascii=True, ), ) ) session.commit() def get_species_document(slug: str) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: return get_species_document_payload(session, slug) def _get_editor_preservation_state(species: Species) -> tuple[set[str], set[int]]: editorial_fields: set[str] = set() section_positions: set[int] = set() for entry in species.audit_entries: try: details = json.loads(entry.details_json) except json.JSONDecodeError: continue if entry.action == "editorial_update": editorial_fields.update(details.keys()) elif entry.action == "section_update": section_position = details.get("section_position") if isinstance(section_position, int): section_positions.add(section_position) return editorial_fields, section_positions def has_species_data() -> bool: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species.id).limit(1)) return species is not None def get_readiness_status() -> dict[str, object]: try: ensure_schema() with SessionLocal() as session: species_count = session.query(Species).count() return { "ready": True, "database": "ok", "species_count": species_count, "data_state": "loaded" if species_count > 0 else "empty", } except SQLAlchemyError as exc: return { "ready": False, "database": "error", "species_count": None, "data_state": "unavailable", "error": str(exc), } def _species_to_payload(species: Species, include_sections: bool = True) -> dict[str, object]: legacy_identifiers: list[dict[str, object]] = [] if species.flelmr_code: legacy_identifiers.append( { "authority": "legacy-ecospecies", "identifier": species.flelmr_code, "label": "FLELMR", } ) taxon_identifiers = [ { "authority": item.authority, "identifier": item.identifier, "label": item.label, "primary": item.is_primary, "source_url": item.source_url, } for item in species.taxon_identifiers ] primary_taxon_identifier = next( (item for item in taxon_identifiers if bool(item.get("primary"))), None, ) return { "slug": species.slug, "source_file": species.source_file, "title": species.title, "common_name": species.common_name, "scientific_name": species.scientific_name, "flelmr_code": species.flelmr_code, "legacy_identifiers": legacy_identifiers, "taxon_identifiers": taxon_identifiers, "primary_taxon_authority": ( str(primary_taxon_identifier.get("authority", "")) if primary_taxon_identifier else "" ), "primary_taxon_identifier": primary_taxon_identifier, "summary": species.summary, "section_count": species.section_count, "citation_count": len(species.citations), "publication_status": species.publication_status, "is_archived": species.is_archived, "editor_notes": species.editor_notes, "last_modified_by": species.last_modified_by, "diagnostics": [ {"level": diagnostic.level, "code": diagnostic.code, "message": diagnostic.message} for diagnostic in species.diagnostics ], "citations": [ _citation_to_payload(citation) for citation in species.citations ], "structured_document": _structured_document_to_payload(species) if include_sections else None, "legacy_source": _legacy_source_to_payload(species) if include_sections else None, "sections": ( [ { "id": section.id, "position": section.position, "heading": section.heading, "content": section.content, } for section in species.sections ] if include_sections else [] ), } def list_species( search: str = "", include_unpublished: bool = False, include_archived: bool = False, ) -> list[dict[str, object]]: ensure_schema() with SessionLocal() as session: query = select(Species).order_by(Species.common_name, Species.title) species = list(session.scalars(query)) payload = [_species_to_payload(item, include_sections=False) for item in species] if not include_archived: payload = [item for item in payload if not item["is_archived"]] if not include_unpublished: payload = [item for item in payload if item["publication_status"] == "published"] if search: needle = search.lower() payload = [ item for item in payload if needle in item["common_name"].lower() or needle in item["scientific_name"].lower() or needle in item["title"].lower() ] return payload def get_species_by_slug( slug: str, include_unpublished: bool = False, include_archived: bool = False, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None if not include_archived and species.is_archived: return None if not include_unpublished and species.publication_status != "published": return None _ = species.sections _ = species.diagnostics _ = species.citations return _species_to_payload(species, include_sections=True) def get_summary_metrics() -> dict[str, object]: species = list_species() section_total = sum(item["section_count"] for item in species) counter = Counter() for item in species: for diagnostic in item["diagnostics"]: counter[diagnostic["code"]] += 1 return { "species_count": len(species), "section_count": section_total, "diagnostic_counts": dict(counter), } def list_diagnostics() -> list[dict[str, object]]: species = list_species() return [ { "slug": item["slug"], "common_name": item["common_name"], "source_file": item["source_file"], "diagnostics": item["diagnostics"], } for item in species if item["diagnostics"] ] def list_public_bibliography(search: str = "") -> list[dict[str, object]]: ensure_schema() with SessionLocal() as session: species_records = list( session.scalars( select(Species) .where( Species.publication_status == "published", Species.is_archived.is_(False), ) .order_by(Species.common_name, Species.title) ) ) entries: dict[str, dict[str, object]] = {} for species in species_records: _ = species.citations for citation in species.citations: doi_key = str(citation.doi).strip().lower() openalex_key = str(citation.openalex_id).strip().lower() citation_key = str(citation.citation_key).strip().lower() normalized_key = " ".join(str(citation.normalized_text).split()).strip().lower() raw_key = " ".join(str(citation.raw_text).split()).strip().lower() dedupe_key = ( f"doi:{doi_key}" if doi_key else "" ) or ( f"openalex:{openalex_key}" if openalex_key else "" ) or ( f"key:{citation_key}" if citation_key else "" ) or ( f"normalized:{normalized_key}" if normalized_key else "" ) or ( f"raw:{raw_key}" if raw_key else "" ) if not dedupe_key: continue entry = entries.get(dedupe_key) if entry is None: entry = { **_citation_to_payload(citation), "species_refs": [], "_species_ref_keys": set(), "_legacy_reference_numbers": set(), } entries[dedupe_key] = entry if not entry.get("normalized_text") and citation.normalized_text: entry["normalized_text"] = citation.normalized_text if not entry.get("abstract_text") and citation.abstract_text: entry["abstract_text"] = citation.abstract_text if not entry.get("draft_bibtex") and citation.draft_bibtex: entry["draft_bibtex"] = citation.draft_bibtex if not entry.get("doi") and citation.doi: entry["doi"] = citation.doi if not entry.get("source_url") and citation.source_url: entry["source_url"] = citation.source_url if not entry.get("openalex_id") and citation.openalex_id: entry["openalex_id"] = citation.openalex_id species_ref_key = species.slug if species_ref_key not in entry["_species_ref_keys"]: entry["_species_ref_keys"].add(species_ref_key) entry["species_refs"].append( { "slug": species.slug, "common_name": species.common_name, "scientific_name": species.scientific_name, } ) if citation.legacy_reference_number: entry["_legacy_reference_numbers"].add(citation.legacy_reference_number) items: list[dict[str, object]] = [] needle = search.strip().lower() for entry in entries.values(): legacy_numbers = sorted(entry.pop("_legacy_reference_numbers")) entry.pop("_species_ref_keys", None) entry["legacy_reference_numbers"] = legacy_numbers entry["species_count"] = len(entry["species_refs"]) if needle: haystack = " ".join( [ str(entry.get("normalized_text", "")), str(entry.get("raw_text", "")), str(entry.get("citation_key", "")), str(entry.get("doi", "")), str(entry.get("abstract_text", "")), str(entry.get("draft_bibtex", "")), ] ).lower() if needle not in haystack: continue items.append(entry) items.sort(key=lambda item: (str(item.get("normalized_text", "") or item.get("raw_text", "")).lower(), str(item.get("citation_key", "")).lower())) return items def get_editor_species_list(search: str = "") -> list[dict[str, object]]: return list_species(search=search, include_unpublished=True, include_archived=True) def get_contributor_species_list(username: str, search: str = "") -> list[dict[str, object]]: ensure_schema() with SessionLocal() as session: query = ( select(Species) .where( Species.owner_role == "contributor", Species.owner_username == username, ) .order_by(Species.common_name, Species.title) ) species = list(session.scalars(query)) payload = [_species_to_payload(item, include_sections=False) for item in species] if search: needle = search.lower() payload = [ item for item in payload if needle in item["common_name"].lower() or needle in item["scientific_name"].lower() or needle in item["title"].lower() ] return payload def get_editor_species_workflow(slug: str) -> dict[str, object] | None: item = get_species_by_slug(slug, include_unpublished=True, include_archived=True) if item is None: return None return { "slug": item["slug"], "title": item["title"], "common_name": item["common_name"], "publication_status": item["publication_status"], "is_archived": item["is_archived"], "editor_notes": item["editor_notes"], "last_modified_by": item["last_modified_by"], "diagnostic_count": len(item["diagnostics"]), } def get_editor_species_detail(slug: str) -> dict[str, object] | None: return get_species_by_slug(slug, include_unpublished=True, include_archived=True) def get_contributor_species_detail(slug: str, username: str) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar( select(Species).where( Species.slug == slug, Species.owner_role == "contributor", Species.owner_username == username, ) ) if species is None: return None _ = species.sections _ = species.diagnostics _ = species.citations return _species_to_payload(species, include_sections=True) def _citation_list_payload(species: Species) -> dict[str, object]: return { "slug": species.slug, "title": species.title, "common_name": species.common_name, "scientific_name": species.scientific_name, "citation_count": len(species.citations), "citations": [_citation_to_payload(citation) for citation in species.citations], } def get_editor_species_citations(slug: str) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None _ = species.citations return _citation_list_payload(species) def get_contributor_species_citations(slug: str, username: str) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar( select(Species).where( Species.slug == slug, Species.owner_role == "contributor", Species.owner_username == username, ) ) if species is None: return None _ = species.citations return _citation_list_payload(species) def list_species_audit(slug: str) -> list[dict[str, object]] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None _ = species.audit_entries return [ { "id": entry.id, "changed_by": entry.changed_by, "changed_at": entry.changed_at, "action": entry.action, "details": json.loads(entry.details_json), } for entry in species.audit_entries ] def update_species_editorial( slug: str, publication_status: str | None, summary: str | None, editor_notes: str | None, is_archived: bool | None, username: str, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None before = { "publication_status": species.publication_status, "summary": species.summary, "editor_notes": species.editor_notes, "is_archived": species.is_archived, } if publication_status is not None: normalized = publication_status.strip().lower() if normalized not in WORKFLOW_STATUSES: raise ValueError( f"Unsupported publication_status: {publication_status}. " f"Expected one of {sorted(WORKFLOW_STATUSES)}" ) species.publication_status = normalized if summary is not None: species.summary = summary.strip() if editor_notes is not None: species.editor_notes = editor_notes.strip() if is_archived is not None: species.is_archived = is_archived after = { "publication_status": species.publication_status, "summary": species.summary, "editor_notes": species.editor_notes, "is_archived": species.is_archived, } changed_fields = { key: {"from": before[key], "to": after[key]} for key in before if before[key] != after[key] } if changed_fields: species.last_modified_by = username session.add( SpeciesAuditLog( species_id=species.id, changed_by=username, changed_at=datetime.now(timezone.utc).isoformat(), action="editorial_update", details_json=json.dumps(changed_fields, ensure_ascii=True), ) ) session.add(species) session.commit() session.refresh(species) return { "slug": species.slug, "summary": species.summary, "publication_status": species.publication_status, "editor_notes": species.editor_notes, "is_archived": species.is_archived, "last_modified_by": species.last_modified_by, "changed_fields": changed_fields, } def _normalize_email(email: str) -> str: normalized = email.strip().lower() if not EMAIL_PATTERN.fullmatch(normalized): raise ValueError("Contributor username must be a valid email address.") return normalized def get_minimum_contributor_age() -> int: configured = os.environ.get("ECOSPECIES_MINIMUM_CONTRIBUTOR_AGE", "13").strip() try: minimum_age = int(configured) except ValueError as exc: # pragma: no cover - misconfiguration path raise ValueError("ECOSPECIES_MINIMUM_CONTRIBUTOR_AGE must be an integer.") from exc if minimum_age < 1: raise ValueError("ECOSPECIES_MINIMUM_CONTRIBUTOR_AGE must be positive.") return minimum_age def register_contributor(email: str, age_gate_confirmed: bool) -> dict[str, object]: ensure_schema() normalized_email = _normalize_email(email) minimum_age = get_minimum_contributor_age() if not age_gate_confirmed: raise ValueError( f"Contributors must confirm they are at least {minimum_age} years old." ) token = secrets.token_urlsafe(24) token_hash = hashlib.sha256(token.encode("utf-8")).hexdigest() now = datetime.now(timezone.utc).isoformat() with SessionLocal() as session: existing = session.scalar( select(ContributorAccount).where(ContributorAccount.email == normalized_email) ) if existing is not None: raise ValueError("A contributor account already exists for that email address.") session.add( ContributorAccount( email=normalized_email, token_hash=token_hash, age_gate_confirmed=True, created_at=now, is_active=True, ) ) session.commit() return { "username": normalized_email, "role": "contributor", "token": token, "minimum_age": minimum_age, "warning": "Store this token now. You will not be able to access your contributed species later without it.", } def _build_initial_contributor_markdown(email: str) -> str: title = "New Species Draft" return ( "---\n" f"title: {title}\n" "common_name: \n" "scientific_name: \n" "species_code: \n" f"source_file: {CONTRIBUTOR_SUBMISSION_PREFIX}.md\n" "publication_status: draft\n" "source_format: ecospecies-markdown-v1\n" "---\n\n" "## Summary\n" "Provide a concise summary.\n\n" "## Habitat\n" "Describe habitat.\n" ) def _next_unique_slug(session, base_slug: str) -> str: candidate = base_slug suffix = 2 while session.scalar(select(Species.id).where(Species.slug == candidate)) is not None: candidate = f"{base_slug}-{suffix}" suffix += 1 return candidate def create_contributor_species(username: str, markdown: str | None = None) -> dict[str, object]: ensure_schema() normalized_email = _normalize_email(username) source_markdown = (markdown or _build_initial_contributor_markdown(normalized_email)).strip() if not source_markdown.endswith("\n"): source_markdown += "\n" with SessionLocal() as session: document_model = parse_markdown_document(source_markdown) projection = extract_species_projection(document_model) slug_base = slugify( str(projection.get("common_name") or projection.get("title") or CONTRIBUTOR_SUBMISSION_PREFIX) ) slug = _next_unique_slug(session, slug_base) species = Species( slug=slug, source_file=f"{CONTRIBUTOR_SUBMISSION_PREFIX}-{slug}.md", title=str(projection.get("title") or "New Species Draft"), common_name=str(projection.get("common_name") or ""), scientific_name=str(projection.get("scientific_name") or ""), flelmr_code=str(projection.get("flelmr_code") or ""), summary=str(projection.get("summary") or ""), section_count=len(projection["sections"]), publication_status="draft", is_archived=False, editor_notes="", created_by=normalized_email, owner_username=normalized_email, owner_role="contributor", last_modified_by=normalized_email, ) session.add(species) session.flush() save_species_document(session, species, source_markdown, normalized_email) session.add( SpeciesAuditLog( species_id=species.id, changed_by=normalized_email, changed_at=datetime.now(timezone.utc).isoformat(), action="contributor_create", details_json=json.dumps({"publication_status": "draft"}, ensure_ascii=True), ) ) session.commit() return { "slug": species.slug, "publication_status": species.publication_status, "last_modified_by": species.last_modified_by, } def get_contributor_species_document(slug: str, username: str) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar( select(Species).where( Species.slug == slug, Species.owner_role == "contributor", Species.owner_username == username, ) ) if species is None: return None return get_species_document_payload(session, slug) def update_species_section( slug: str, section_position: int, content: str, username: str, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None section = session.scalar( select(DocumentSection).where( DocumentSection.species_id == species.id, DocumentSection.position == section_position, ) ) if section is None: return None new_content = content.strip() changed_fields = {} if section.content != new_content: changed_fields["section_content"] = { "from": section.content, "to": new_content, } if changed_fields: section.content = new_content species.last_modified_by = username session.add( SpeciesAuditLog( species_id=species.id, changed_by=username, changed_at=datetime.now(timezone.utc).isoformat(), action="section_update", details_json=json.dumps( { "section_position": section.position, "section_heading": section.heading, **changed_fields, }, ensure_ascii=True, ), ) ) session.add(section) session.add(species) session.commit() session.refresh(section) return { "slug": species.slug, "section": { "id": section.id, "position": section.position, "heading": section.heading, "content": section.content, }, "last_modified_by": species.last_modified_by, "changed_fields": changed_fields, } def update_species_document_markdown( slug: str, markdown: str, username: str, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None result = save_species_document(session, species, markdown, username) session.add( SpeciesAuditLog( species_id=species.id, changed_by=username, changed_at=datetime.now(timezone.utc).isoformat(), action="document_update", details_json=json.dumps( {"source_format": "ecospecies-markdown-v1"}, ensure_ascii=True, ), ) ) session.commit() return result def update_species_citation_review( slug: str, citation_id: int, review_status: str | None, normalized_text: str | None, doi: str | None, citation_key: str | None, entry_type: str | None, draft_bibtex: str | None, username: str, *, abstract_text: str | None = None, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None citation = session.scalar( select(SpeciesCitation).where( SpeciesCitation.species_id == species.id, SpeciesCitation.id == citation_id, ) ) if citation is None: return None before = _citation_to_payload(citation) if review_status is not None: normalized_status = review_status.strip().lower() if normalized_status not in CITATION_REVIEW_STATUSES: raise ValueError( f"Unsupported review_status: {review_status}. " f"Expected one of {sorted(CITATION_REVIEW_STATUSES)}" ) citation.review_status = normalized_status if normalized_text is not None: citation.normalized_text = normalized_text.strip() if abstract_text is not None: citation.abstract_text = abstract_text.strip() if doi is not None: citation.doi = doi.strip() if citation_key is not None: citation.citation_key = citation_key.strip() if entry_type is not None: citation.entry_type = entry_type.strip() or "misc" if draft_bibtex is not None: citation.draft_bibtex = draft_bibtex.strip() citation.source_type = "editor_review" after = _citation_to_payload(citation) changed_fields = { field: {"from": before[field], "to": after[field]} for field in ( "review_status", "normalized_text", "abstract_text", "doi", "citation_key", "entry_type", "draft_bibtex", "source_type", ) if before[field] != after[field] } if changed_fields: species.last_modified_by = username session.add( SpeciesAuditLog( species_id=species.id, changed_by=username, changed_at=datetime.now(timezone.utc).isoformat(), action="citation_review_update", details_json=json.dumps( {"citation_id": citation.id, **changed_fields}, ensure_ascii=True, ), ) ) session.add(citation) session.add(species) session.commit() session.refresh(citation) return { "slug": species.slug, "citation": _citation_to_payload(citation), "last_modified_by": species.last_modified_by, "changed_fields": changed_fields, } def get_species_citation_candidates(slug: str, citation_id: int) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None citation = session.scalar( select(SpeciesCitation).where( SpeciesCitation.species_id == species.id, SpeciesCitation.id == citation_id, ) ) if citation is None: return None candidates = discover_citation_candidates(_citation_to_payload(citation)) return { "slug": species.slug, "citation": _citation_to_payload(citation), **candidates, } def _apply_citation_enrichment( session, species: Species, citation: SpeciesCitation, username: str, ) -> dict[str, object]: before = _citation_to_payload(citation) enrichment = enrich_citation_payload(before) for field in ( "citation_key", "entry_type", "normalized_text", "abstract_text", "draft_bibtex", "doi", "source_url", "openalex_id", "resolver_source_label", "enrichment_status", "enrichment_error", ): if field in enrichment: setattr(citation, field, str(enrichment.get(field, "")).strip()) after = _citation_to_payload(citation) changed_fields = { field: {"from": before[field], "to": after[field]} for field in ( "citation_key", "entry_type", "normalized_text", "abstract_text", "draft_bibtex", "doi", "source_url", "openalex_id", "resolver_source_label", "enrichment_status", "enrichment_error", ) if before[field] != after[field] } conflicts = enrichment.get("conflicts") if conflicts: changed_fields["resolver_conflicts"] = list(conflicts) if changed_fields: species.last_modified_by = username session.add( SpeciesAuditLog( species_id=species.id, changed_by=username, changed_at=datetime.now(timezone.utc).isoformat(), action="citation_enrichment", details_json=json.dumps( {"citation_id": citation.id, **changed_fields}, ensure_ascii=True, ), ) ) session.add(citation) session.add(species) return { "citation": _citation_to_payload(citation), "changed_fields": changed_fields, } def _next_citation_position(species: Species) -> int: if not species.citations: return 1 return max(citation.position for citation in species.citations) + 1 def update_species_citation_enrichment( slug: str, citation_id: int, username: str, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None citation = session.scalar( select(SpeciesCitation).where( SpeciesCitation.species_id == species.id, SpeciesCitation.id == citation_id, ) ) if citation is None: return None result = _apply_citation_enrichment(session, species, citation, username) session.commit() session.refresh(citation) return { "slug": species.slug, "citation": result["citation"], "last_modified_by": species.last_modified_by, "changed_fields": result["changed_fields"], } def apply_species_citation_candidate_selection( slug: str, citation_id: int, candidate: dict[str, object], username: str, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None citation = session.scalar( select(SpeciesCitation).where( SpeciesCitation.species_id == species.id, SpeciesCitation.id == citation_id, ) ) if citation is None: return None before = _citation_to_payload(citation) enrichment = apply_citation_candidate_selection(before, candidate) for field in ( "citation_key", "entry_type", "normalized_text", "abstract_text", "draft_bibtex", "doi", "source_url", "openalex_id", "resolver_source_label", "enrichment_status", "enrichment_error", ): if field in enrichment: setattr(citation, field, str(enrichment.get(field, "")).strip()) citation.source_type = "editor_selected_candidate" citation.review_status = "accepted" after = _citation_to_payload(citation) changed_fields = { field: {"from": before[field], "to": after[field]} for field in ( "citation_key", "entry_type", "normalized_text", "abstract_text", "draft_bibtex", "doi", "source_url", "openalex_id", "resolver_source_label", "enrichment_status", "enrichment_error", "source_type", "review_status", ) if before[field] != after[field] } if changed_fields: species.last_modified_by = username session.add( SpeciesAuditLog( species_id=species.id, changed_by=username, changed_at=datetime.now(timezone.utc).isoformat(), action="citation_candidate_selection", details_json=json.dumps( { "citation_id": citation.id, "selected_source_label": str(candidate.get("source_label", "")).strip(), **changed_fields, }, ensure_ascii=True, ), ) ) session.add(citation) session.add(species) session.commit() session.refresh(citation) return { "slug": species.slug, "citation": _citation_to_payload(citation), "last_modified_by": species.last_modified_by, "changed_fields": changed_fields, } def add_species_citation_from_candidate( slug: str, citation_id: int, candidate: dict[str, object], username: str, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None citation = session.scalar( select(SpeciesCitation).where( SpeciesCitation.species_id == species.id, SpeciesCitation.id == citation_id, ) ) if citation is None: return None enrichment = apply_citation_candidate_selection(_citation_to_payload(citation), candidate) raw_text = ( str(enrichment.get("normalized_text", "")).strip() or str(candidate.get("fields", {}).get("title", "")).strip() or str(citation.raw_text).strip() ) document_markdown = species.document.markdown_content if species.document is not None else "" document_model = parse_markdown_document(document_markdown) added = add_citation_to_document(document_model, raw_text, heading_title="Related References") updated_markdown = export_markdown_document(document_model) save_species_document(session, species, updated_markdown, username) new_citation = session.scalar( select(SpeciesCitation).where( SpeciesCitation.species_id == species.id, SpeciesCitation.raw_text == raw_text, ) ) if new_citation is None: return None new_citation.source_type = "editor_added_candidate" new_citation.review_status = "accepted" new_citation.citation_key = str(enrichment.get("citation_key", "")).strip() new_citation.entry_type = str(enrichment.get("entry_type", "misc")).strip() or "misc" new_citation.normalized_text = str(enrichment.get("normalized_text", "")).strip() new_citation.abstract_text = str(enrichment.get("abstract_text", "")).strip() new_citation.draft_bibtex = str(enrichment.get("draft_bibtex", "")).strip() new_citation.doi = str(enrichment.get("doi", "")).strip() new_citation.source_url = str(enrichment.get("source_url", "")).strip() new_citation.openalex_id = str(enrichment.get("openalex_id", "")).strip() new_citation.resolver_source_label = str(enrichment.get("resolver_source_label", "")).strip() new_citation.enrichment_status = str(enrichment.get("enrichment_status", "resolved")).strip() new_citation.enrichment_error = str(enrichment.get("enrichment_error", "")).strip() if not added: new_citation.source_type = "editor_added_candidate" new_citation.review_status = "accepted" session.add(new_citation) species.last_modified_by = username session.add( SpeciesAuditLog( species_id=species.id, changed_by=username, changed_at=datetime.now(timezone.utc).isoformat(), action="citation_candidate_addition", details_json=json.dumps( { "source_citation_id": citation.id, "selected_source_label": str(candidate.get("source_label", "")).strip(), "new_citation_key": new_citation.citation_key, }, ensure_ascii=True, ), ) ) session.commit() session.refresh(new_citation) return { "slug": species.slug, "citation": _citation_to_payload(new_citation), "last_modified_by": species.last_modified_by, } def update_species_citations_enrichment_batch( slug: str, username: str, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None _ = species.citations updated_citations: list[dict[str, object]] = [] changed_count = 0 resolved_count = 0 unresolved_count = 0 error_count = 0 for citation in species.citations: result = _apply_citation_enrichment(session, species, citation, username) updated_citations.append(result["citation"]) if result["changed_fields"]: changed_count += 1 status = str(result["citation"].get("enrichment_status", "")).strip() if status == "resolved": resolved_count += 1 elif status == "unresolved": unresolved_count += 1 elif status == "error": error_count += 1 session.commit() return { "slug": species.slug, "citation_count": len(updated_citations), "changed_count": changed_count, "resolved_count": resolved_count, "unresolved_count": unresolved_count, "error_count": error_count, "citations": updated_citations, "last_modified_by": species.last_modified_by, } def _should_backfill_citation(citation: SpeciesCitation, include_accepted: bool = False) -> bool: review_status = str(citation.review_status or "").strip().lower() source_type = str(citation.source_type or "").strip().lower() enrichment_status = str(citation.enrichment_status or "").strip().lower() normalized_text = str(citation.normalized_text or "").strip() abstract_text = str(citation.abstract_text or "").strip() if not include_accepted and review_status == "accepted": return False if source_type in {"editor_selected_candidate", "editor_added_candidate"} and not include_accepted: return False return ( source_type in {"document_extract", "editor_review", ""} or enrichment_status in {"pending", "unresolved", "error", ""} or not normalized_text or not abstract_text ) def backfill_species_citations( slug: str, username: str, include_accepted: bool = False, ) -> dict[str, object] | None: ensure_schema() with SessionLocal() as session: species = session.scalar(select(Species).where(Species.slug == slug)) if species is None: return None _ = species.citations updated_citations: list[dict[str, object]] = [] changed_count = 0 resolved_count = 0 unresolved_count = 0 error_count = 0 backfilled_count = 0 for citation in species.citations: if _should_backfill_citation(citation, include_accepted=include_accepted): backfilled_count += 1 result = _apply_citation_enrichment(session, species, citation, username) payload = result["citation"] if result["changed_fields"]: changed_count += 1 else: payload = _citation_to_payload(citation) updated_citations.append(payload) status = str(payload.get("enrichment_status", "")).strip() if status == "resolved": resolved_count += 1 elif status == "unresolved": unresolved_count += 1 elif status == "error": error_count += 1 session.commit() return { "slug": species.slug, "citation_count": len(updated_citations), "backfilled_count": backfilled_count, "changed_count": changed_count, "resolved_count": resolved_count, "unresolved_count": unresolved_count, "error_count": error_count, "citations": updated_citations, "last_modified_by": species.last_modified_by, } def update_contributor_species_document_markdown( slug: str, markdown: str, username: str, ) -> dict[str, object] | None: ensure_schema() normalized_email = _normalize_email(username) with SessionLocal() as session: species = session.scalar( select(Species).where( Species.slug == slug, Species.owner_role == "contributor", Species.owner_username == normalized_email, ) ) if species is None: return None result = save_species_document(session, species, markdown, normalized_email) if species.publication_status == "published": species.publication_status = "review" session.add( SpeciesAuditLog( species_id=species.id, changed_by=normalized_email, changed_at=datetime.now(timezone.utc).isoformat(), action="contributor_document_update", details_json=json.dumps( {"source_format": "ecospecies-markdown-v1"}, ensure_ascii=True, ), ) ) session.commit() return { **result, "publication_status": species.publication_status, }