"""Loaders that normalize DocLift, GroundRecall, Didactopus, and CiteGeist
exports into ``ContentCard`` lists for site rendering."""

from __future__ import annotations

import json
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from .render import html_escape


@dataclass(slots=True)
class ContentCard:
    title: str
    body: str
    href: str = ""
    meta: str = ""
    kind: str = "feature"
    source: str = ""
    link_label: str = "Read More"


@dataclass(slots=True)
class SiteContent:
    feature_cards: list[ContentCard] = field(default_factory=list)
    section_cards: list[ContentCard] = field(default_factory=list)
    app_cards: list[ContentCard] = field(default_factory=list)
    bibliography_entries: list[ContentCard] = field(default_factory=list)
    notes: list[str] = field(default_factory=list)


def cards_from_config(items: list[dict[str, Any]], *, default_kind: str) -> list[ContentCard]:
    cards: list[ContentCard] = []
    for item in items:
        if not isinstance(item, dict):
            continue
        title = str(item.get("title") or item.get("name") or "Item")
        cards.append(
            ContentCard(
                title=title,
                body=str(item.get("body") or item.get("description") or item.get("summary") or ""),
                href=str(item.get("href") or item.get("url") or ""),
                meta=str(item.get("meta") or item.get("kind") or default_kind),
                kind=str(item.get("kind") or default_kind),
                source=str(item.get("source") or item.get("id") or title.lower().replace(" ", "-")),
                link_label=str(item.get("link_label") or item.get("label") or "Read More"),
            )
        )
    return cards


def _first_paragraph(text: str) -> str:
    paragraphs = [chunk.strip() for chunk in re.split(r"\n\s*\n", text) if chunk.strip()]
    return paragraphs[0] if paragraphs else text.strip()


def _read_json(path: Path) -> dict[str, Any]:
    return json.loads(path.read_text(encoding="utf-8"))


def _read_yaml(path: Path) -> Any:
    """Parse YAML via PyYAML when available, else JSON-shaped text, else the
    minimal in-house parser below."""
    text = path.read_text(encoding="utf-8")
    try:  # pragma: no cover - exercised only if PyYAML is installed
        import yaml  # type: ignore

        return yaml.safe_load(text) or {}
    except Exception:
        stripped = text.strip()
        if stripped.startswith("{") or stripped.startswith("["):
            return json.loads(stripped)
        return _parse_minimal_yaml(text)


def _parse_scalar(value: str) -> Any:
    value = value.strip()
    if value in {"", "null", "~"}:
        return None
    if value == "[]":
        return []
    if value.startswith("[") and value.endswith("]"):
        inner = value[1:-1].strip()
        if not inner:
            return []
        return [_parse_scalar(part) for part in inner.split(",")]
    if value.startswith('"') and value.endswith('"'):
        return value[1:-1]
    if value.startswith("'") and value.endswith("'"):
        return value[1:-1]
    if value.isdigit():
        return int(value)
    if value.lower() in {"true", "false"}:
        return value.lower() == "true"
    return value
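
# Illustrative behaviour of _parse_scalar (hand-checked examples, not an
# exhaustive spec of the subset it accepts):
#   _parse_scalar("~")        -> None
#   _parse_scalar("[a, b]")   -> ["a", "b"]
#   _parse_scalar("'quoted'") -> "quoted"
#   _parse_scalar("42")       -> 42
#   _parse_scalar("True")     -> True   (booleans are case-insensitive)
#   _parse_scalar("-3")       -> "-3"   (only unsigned digit runs become ints)
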
def _parse_minimal_yaml(text: str) -> dict[str, Any]:
    """Parse the small YAML subset used by pack files: top-level scalars plus
    one level of nested lists or mappings."""
    lines = [
        line.rstrip()
        for line in text.splitlines()
        if line.strip() and not line.strip().startswith("#")
    ]
    root: dict[str, Any] = {}
    current_key: str | None = None
    current_item: dict[str, Any] | None = None
    for index, raw in enumerate(lines):
        stripped = raw.lstrip(" ")
        indent = len(raw) - len(stripped)
        if indent == 0:
            current_item = None
            if ":" not in stripped:
                continue
            key, value = stripped.split(":", 1)
            key = key.strip()
            value = value.strip()
            if value:
                root[key] = _parse_scalar(value)
            else:
                # Peek ahead to decide whether the block holds a list or a mapping.
                next_line = lines[index + 1] if index + 1 < len(lines) else ""
                root[key] = [] if next_line.lstrip(" ").startswith("- ") else {}
            current_key = key
            continue
        if current_key is None:
            continue
        container = root.get(current_key)
        if isinstance(container, list) and stripped.startswith("- "):
            item_text = stripped[2:].strip()
            if not item_text:
                current_item = {}
                container.append(current_item)
            elif ":" in item_text:
                item_key, item_value = item_text.split(":", 1)
                current_item = {item_key.strip(): _parse_scalar(item_value)}
                container.append(current_item)
            else:
                current_item = None
                container.append(_parse_scalar(item_text))
            continue
        target = current_item if isinstance(current_item, dict) else container
        if isinstance(target, dict) and ":" in stripped:
            key, value = stripped.split(":", 1)
            target[key.strip()] = _parse_scalar(value)
        elif isinstance(target, list) and stripped.startswith("- "):
            target.append(_parse_scalar(stripped[2:]))
    return root


def load_doclift_cards(bundle_root: str | Path) -> list[ContentCard]:
    base = Path(bundle_root)
    manifest = _read_json(base / "manifest.json")
    cards: list[ContentCard] = []
    for item in manifest.get("documents", []):
        if not isinstance(item, dict):
            continue
        title = str(item.get("title") or item.get("document_id") or "Document")
        body = str(item.get("summary") or item.get("description") or item.get("document_kind") or "")
        markdown_path = item.get("markdown_path")
        source_href = str(item.get("canonical_url") or item.get("source_path") or "")
        if markdown_path:
            md_path = base / str(markdown_path)
            if md_path.exists():
                # Prefer the document's own opening paragraph over manifest metadata.
                body = _first_paragraph(md_path.read_text(encoding="utf-8"))
        cards.append(
            ContentCard(
                title=title,
                body=body,
                href=source_href,
                meta=str(item.get("document_kind") or "document"),
                kind="notebook",
                source=str(item.get("document_id") or title.lower().replace(" ", "-")),
            )
        )
    return cards


def load_groundrecall_cards(bundle_root: str | Path) -> list[ContentCard]:
    base = Path(bundle_root)
    bundle_path = base / "groundrecall_query_bundle.json"
    if not bundle_path.exists():
        bundle_path = base / "exports" / "codex" / "codex_bundle.json"
    if not bundle_path.exists():
        return []
    payload = _read_json(bundle_path)
    concept = payload.get("concept") or {}
    title = str(concept.get("title") or payload.get("title") or "GroundRecall concept")
    body = str(payload.get("summary") or payload.get("explanation") or payload.get("body") or "")
    claims = payload.get("claims") or payload.get("related_claims") or []
    claim_count = len(claims) if isinstance(claims, list) else 0
    cards = [
        ContentCard(
            title=title,
            body=body or f"{claim_count} related claims and observations are bundled here.",
            href=str(payload.get("source_url") or ""),
            meta=f"GroundRecall bundle · {claim_count} claims",
            kind="section",
            source=str(concept.get("concept_id") or title.lower().replace(" ", "-")),
        )
    ]
    for claim in (claims if isinstance(claims, list) else []):
        if not isinstance(claim, dict):
            continue
        cards.append(
            ContentCard(
                title=str(claim.get("claim_text") or claim.get("title") or "Claim"),
                body=str(claim.get("support") or claim.get("notes") or ""),
                href=str(claim.get("source_url") or ""),
                meta=str(claim.get("claim_kind") or "claim"),
                kind="section",
                source=str(claim.get("claim_id") or claim.get("id") or ""),
            )
        )
    return cards
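
# Pack layout assumed by load_didactopus_cards below, inferred from the key
# lookups it performs (real Didactopus packs may carry additional fields):
#
#   pack.yaml:
#     display_name: Example Pack
#   concepts.yaml:
#     concepts:
#       - id: intro
#         title: Introduction
#         description: Starting point.
#         prerequisites: []
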
def load_didactopus_cards(pack_root: str | Path) -> list[ContentCard]:
    base = Path(pack_root)
    pack_path = base / "pack.yaml"
    concepts_path = base / "concepts.yaml"
    if not pack_path.exists() or not concepts_path.exists():
        return []
    pack = _read_yaml(pack_path) or {}
    concepts = _read_yaml(concepts_path) or {}
    cards: list[ContentCard] = []
    for concept in concepts.get("concepts", []):
        if not isinstance(concept, dict):
            continue
        title = str(concept.get("title") or concept.get("id") or "Concept")
        description = str(concept.get("description") or "")
        prerequisites = concept.get("prerequisites") or []
        prereq_text = ", ".join(str(item) for item in prerequisites) if prerequisites else "None"
        body = description or f"Prerequisites: {prereq_text}."
        cards.append(
            ContentCard(
                title=title,
                body=body,
                href=str(pack.get("display_name") or pack.get("name") or ""),
                meta=f"Didactopus concept · {prereq_text}",
                kind="app",
                source=str(concept.get("id") or title.lower().replace(" ", "-")),
            )
        )
    return cards


def load_citegeist_cards(source_root: str | Path) -> list[ContentCard]:
    root = Path(source_root)
    bib_files = sorted(
        path
        for path in root.rglob("*.bib")
        if path.is_file() and not path.name.endswith("-bak.bib") and not path.name.startswith(".")
    )
    if not bib_files:
        return []
    cards: list[ContentCard] = []
    try:
        from citegeist.bibtex import parse_bibtex  # type: ignore
    except Exception:
        parse_bibtex = None
    for bib_path in bib_files:
        text = bib_path.read_text(encoding="utf-8")
        entries = parse_bibtex(text) if parse_bibtex is not None else _fallback_parse_bibtex(text)
        for entry in entries:
            data = entry if isinstance(entry, dict) else entry.__dict__
            title = str(data.get("title") or data.get("citation_key") or "Reference")
            author = str(data.get("author") or data.get("editor") or "")
            year = str(data.get("year") or "")
            body = " · ".join(part for part in [author, year] if part).strip()
            cards.append(
                ContentCard(
                    title=title,
                    body=body or bib_path.name,
                    href=str(bib_path.relative_to(root)),
                    meta="CiteGeist bibliography",
                    kind="bibliography",
                    source=str(data.get("citation_key") or title.lower().replace(" ", "-")),
                )
            )
    return cards


def _fallback_parse_bibtex(text: str) -> list[dict[str, str]]:
    """Best-effort line-oriented BibTeX parser used when citegeist is absent."""
    entries: list[dict[str, str]] = []
    current: dict[str, str] | None = None
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("@") and "{" in stripped:
            if current:
                entries.append(current)
            kind, rest = stripped[1:].split("{", 1)
            key = rest.split(",", 1)[0].strip()
            current = {"entry_type": kind.strip(), "citation_key": key}
            continue
        if current and "=" in stripped:
            field_name, value = stripped.split("=", 1)
            current[field_name.strip().lower()] = value.strip().strip(",{}")
    if current:
        entries.append(current)
    return entries
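
# Minimal smoke test under stated assumptions: the root directory may hold any
# of the bundle layouts above, and the module is run as part of its package
# (the relative import of html_escape rules out plain `python file.py`), e.g.
# `python -m yoursite.content <root>` where `yoursite` is a hypothetical name.
if __name__ == "__main__":
    import sys

    root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    content = SiteContent(
        section_cards=load_groundrecall_cards(root),
        app_cards=load_didactopus_cards(root),
        bibliography_entries=load_citegeist_cards(root),
    )
    # load_doclift_cards assumes manifest.json exists, so guard it explicitly.
    if (root / "manifest.json").exists():
        content.feature_cards.extend(load_doclift_cards(root))
    for card in (
        content.feature_cards
        + content.section_cards
        + content.app_cards
        + content.bibliography_entries
    ):
        print(f"{card.kind:>12}  {card.title}")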