SciSiteForge/scisiteforge/content.py

301 lines
11 KiB
Python

from __future__ import annotations
from dataclasses import dataclass, field
import json
import re
from pathlib import Path
from typing import Any
from .render import html_escape
@dataclass(slots=True)
class ContentCard:
    """One renderable card of site content.

    The loaders in this module build cards for features, sections, apps,
    notebooks, and bibliography entries; ``kind`` records which of those
    categories a card belongs to.
    """

    # Card heading text.
    title: str
    # Main body / summary text.
    body: str
    # Link target; empty string when the card has no link.
    href: str = ""
    # Short meta line (e.g. document kind or a source label).
    meta: str = ""
    # Card category; loaders here use "feature", "notebook", "section",
    # "app", and "bibliography".
    kind: str = "feature"
    # Stable identifier, typically an id or a slug derived from the title.
    source: str = ""
    # Label for the card's link/button.
    link_label: str = "Read More"
@dataclass(slots=True)
class SiteContent:
    """Aggregated card collections for a generated site."""

    # Cards shown as top-level features.
    feature_cards: list[ContentCard] = field(default_factory=list)
    # Cards for content sections.
    section_cards: list[ContentCard] = field(default_factory=list)
    # Cards for applications / interactive concepts.
    app_cards: list[ContentCard] = field(default_factory=list)
    # Bibliography reference cards.
    bibliography_entries: list[ContentCard] = field(default_factory=list)
    # Free-form notes; not populated by any loader visible in this module.
    notes: list[str] = field(default_factory=list)
def cards_from_config(items: list[dict[str, Any]], *, default_kind: str) -> list[ContentCard]:
    """Build ContentCard objects from raw configuration dictionaries.

    Non-dict entries are skipped.  Each card field falls back through a
    chain of alternative keys and ends at a default: the title defaults to
    "Item", the kind/meta to *default_kind*, the source to a slug derived
    from the title, and the link label to "Read More".
    """

    def _pick(entry: dict[str, Any], *keys: str, fallback: str = "") -> str:
        # Return the first truthy value among *keys*, stringified.
        for key in keys:
            value = entry.get(key)
            if value:
                return str(value)
        return str(fallback)

    result: list[ContentCard] = []
    for entry in items:
        if not isinstance(entry, dict):
            continue
        card_title = _pick(entry, "title", "name", fallback="Item")
        result.append(
            ContentCard(
                title=card_title,
                body=_pick(entry, "body", "description", "summary"),
                href=_pick(entry, "href", "url"),
                meta=_pick(entry, "meta", "kind", fallback=default_kind),
                kind=_pick(entry, "kind", fallback=default_kind),
                source=_pick(entry, "source", "id", fallback=card_title.lower().replace(" ", "-")),
                link_label=_pick(entry, "link_label", "label", fallback="Read More"),
            )
        )
    return result
def _first_paragraph(text: str) -> str:
paragraphs = [chunk.strip() for chunk in re.split(r"\n\s*\n", text) if chunk.strip()]
return paragraphs[0] if paragraphs else text.strip()
def _read_json(path: Path) -> dict[str, Any]:
return json.loads(path.read_text(encoding="utf-8"))
def _read_yaml(path: Path) -> Any:
text = path.read_text(encoding="utf-8")
try: # pragma: no cover - exercised only if PyYAML is installed
import yaml # type: ignore
return yaml.safe_load(text) or {}
except Exception:
stripped = text.strip()
if stripped.startswith("{") or stripped.startswith("["):
return json.loads(stripped)
return _parse_minimal_yaml(text)
def _parse_scalar(value: str) -> Any:
value = value.strip()
if value in {"", "null", "~"}:
return None
if value == "[]":
return []
if value.startswith("[") and value.endswith("]"):
inner = value[1:-1].strip()
if not inner:
return []
return [_parse_scalar(part) for part in inner.split(",")]
if value.startswith('"') and value.endswith('"'):
return value[1:-1]
if value.startswith("'") and value.endswith("'"):
return value[1:-1]
if value.isdigit():
return int(value)
if value.lower() in {"true", "false"}:
return value.lower() == "true"
return value
def _parse_minimal_yaml(text: str) -> dict[str, Any]:
    """Parse a small indentation-based subset of YAML into a dict.

    Supports top-level "key: value" scalars, one level of nested mappings,
    and lists whose items are scalars or flat mappings ("- key: value"
    followed by indented "key: value" lines).  Blank lines and full-line
    comments are ignored.  Not a general YAML parser.
    """
    # Filter blanks/comments up front; the look-ahead below indexes this
    # filtered list, so it skips those lines too.
    lines = [line.rstrip() for line in text.splitlines() if line.strip() and not line.strip().startswith("#")]
    root: dict[str, Any] = {}
    # Last top-level key whose value is a container (list or dict).
    current_key: str | None = None
    # Dict item currently being filled inside a list, if any.
    current_item: dict[str, Any] | None = None
    for index, raw in enumerate(lines):
        stripped = raw.lstrip(" ")
        indent = len(raw) - len(stripped)
        if indent == 0:
            # A new top-level entry terminates any in-progress list item.
            current_item = None
            if ":" not in stripped:
                continue
            key, value = stripped.split(":", 1)
            key = key.strip()
            value = value.strip()
            if value:
                root[key] = _parse_scalar(value)
                # NOTE(review): current_key is intentionally(?) not reset
                # here, so stray indented lines after a scalar key would
                # still attach to the previous container — confirm this
                # never occurs in the YAML this parser is fed.
            else:
                # Key with no inline value: peek at the next line to decide
                # whether it introduces a list or a nested mapping.
                next_line = lines[index + 1] if index + 1 < len(lines) else ""
                root[key] = [] if next_line.lstrip(" ").startswith("- ") else {}
            current_key = key
            continue
        if current_key is None:
            # Indented line before any top-level key: nothing to attach to.
            continue
        container = root.get(current_key)
        if isinstance(container, list) and stripped.startswith("- "):
            item_text = stripped[2:].strip()
            if not item_text:
                # Bare "-": open an empty mapping item to be filled by the
                # following indented "key: value" lines.
                current_item = {}
                container.append(current_item)
            elif ":" in item_text:
                # "- key: value" starts a mapping item inline.
                item_key, item_value = item_text.split(":", 1)
                current_item = {item_key.strip(): _parse_scalar(item_value)}
                container.append(current_item)
            else:
                # Plain scalar list entry closes any open mapping item.
                current_item = None
                container.append(_parse_scalar(item_text))
            continue
        # Indented "key: value" attaches to the open list item when there is
        # one, otherwise to the container itself.
        target = current_item if isinstance(current_item, dict) else container
        if isinstance(target, dict) and ":" in stripped:
            key, value = stripped.split(":", 1)
            target[key.strip()] = _parse_scalar(value)
        elif isinstance(target, list) and stripped.startswith("- "):
            target.append(_parse_scalar(stripped[2:]))
    return root
def load_doclift_cards(bundle_root: str | Path) -> list[ContentCard]:
    """Build notebook cards from a DocLift bundle's manifest.json.

    Each dict entry under the manifest's "documents" key becomes one card.
    When the entry references a markdown file that exists in the bundle,
    its first paragraph replaces the manifest summary as the card body.
    """
    base = Path(bundle_root)
    manifest = _read_json(base / "manifest.json")
    documents = [doc for doc in manifest.get("documents", []) if isinstance(doc, dict)]
    cards: list[ContentCard] = []
    for doc in documents:
        card_title = str(doc.get("title") or doc.get("document_id") or "Document")
        summary = str(doc.get("summary") or doc.get("description") or doc.get("document_kind") or "")
        relative_md = doc.get("markdown_path")
        if relative_md:
            md_file = base / str(relative_md)
            if md_file.exists():
                summary = _first_paragraph(md_file.read_text(encoding="utf-8"))
        cards.append(
            ContentCard(
                title=card_title,
                body=summary,
                href=str(doc.get("canonical_url") or doc.get("source_path") or ""),
                meta=str(doc.get("document_kind") or "document"),
                kind="notebook",
                source=str(doc.get("document_id") or card_title.lower().replace(" ", "-")),
            )
        )
    return cards
def load_groundrecall_cards(bundle_root: str | Path) -> list[ContentCard]:
    """Build section cards from a GroundRecall query bundle.

    Prefers ``groundrecall_query_bundle.json`` at the bundle root, falling
    back to the codex export; returns an empty list when neither exists.
    The first card summarizes the concept, followed by one card per claim.
    """
    base = Path(bundle_root)
    candidates = (
        base / "groundrecall_query_bundle.json",
        base / "exports" / "codex" / "codex_bundle.json",
    )
    bundle_path = next((candidate for candidate in candidates if candidate.exists()), None)
    if bundle_path is None:
        return []
    payload = _read_json(bundle_path)
    concept = payload.get("concept") or {}
    concept_title = str(concept.get("title") or payload.get("title") or "GroundRecall concept")
    summary = str(payload.get("summary") or payload.get("explanation") or payload.get("body") or "")
    raw_claims = payload.get("claims") or payload.get("related_claims") or []
    claims = raw_claims if isinstance(raw_claims, list) else []
    claim_count = len(claims)
    cards = [
        ContentCard(
            title=concept_title,
            body=summary or f"{claim_count} related claims and observations are bundled here.",
            href=str(payload.get("source_url") or ""),
            meta=f"GroundRecall bundle · {claim_count} claims",
            kind="section",
            source=str(concept.get("concept_id") or concept_title.lower().replace(" ", "-")),
        )
    ]
    for claim in claims:
        if not isinstance(claim, dict):
            continue
        cards.append(
            ContentCard(
                title=str(claim.get("claim_text") or claim.get("title") or "Claim"),
                body=str(claim.get("support") or claim.get("notes") or ""),
                href=str(claim.get("source_url") or ""),
                meta=str(claim.get("claim_kind") or "claim"),
                kind="section",
                source=str(claim.get("claim_id") or claim.get("id") or ""),
            )
        )
    return cards
def load_didactopus_cards(pack_root: str | Path) -> list[ContentCard]:
    """Build app cards from a Didactopus pack (pack.yaml + concepts.yaml).

    Returns an empty list unless both files are present.  Each dict entry
    under the concepts file's "concepts" key becomes one card, with its
    prerequisites summarized in the meta line.
    """
    base = Path(pack_root)
    pack_path = base / "pack.yaml"
    concepts_path = base / "concepts.yaml"
    if not (pack_path.exists() and concepts_path.exists()):
        return []
    pack = _read_yaml(pack_path) or {}
    concept_doc = _read_yaml(concepts_path) or {}
    cards: list[ContentCard] = []
    for concept in concept_doc.get("concepts", []):
        if not isinstance(concept, dict):
            continue
        concept_title = str(concept.get("title") or concept.get("id") or "Concept")
        summary = str(concept.get("description") or "")
        prereqs = concept.get("prerequisites") or []
        prereq_text = ", ".join(str(item) for item in prereqs) if prereqs else "None"
        cards.append(
            ContentCard(
                title=concept_title,
                body=summary or f"Prerequisites: {prereq_text}.",
                href=str(pack.get("display_name") or pack.get("name") or ""),
                meta=f"Didactopus concept · {prereq_text}",
                kind="app",
                source=str(concept.get("id") or concept_title.lower().replace(" ", "-")),
            )
        )
    return cards
def load_citegeist_cards(source_root: str | Path) -> list[ContentCard]:
    """Build bibliography cards from every .bib file under *source_root*.

    Backup ("-bak.bib") and hidden (dot-prefixed) files are skipped.  Uses
    citegeist's BibTeX parser when importable, otherwise the line-based
    fallback defined in this module.
    """
    root = Path(source_root)
    bib_files = sorted(
        candidate
        for candidate in root.rglob("*.bib")
        if candidate.is_file()
        and not candidate.name.endswith("-bak.bib")
        and not candidate.name.startswith(".")
    )
    if not bib_files:
        return []
    try:
        from citegeist.bibtex import parse_bibtex  # type: ignore
    except Exception:
        parse_bibtex = None
    cards: list[ContentCard] = []
    for bib_path in bib_files:
        raw = bib_path.read_text(encoding="utf-8")
        if parse_bibtex is not None:
            entries = parse_bibtex(raw)
        else:
            entries = _fallback_parse_bibtex(raw)
        for entry in entries:
            # Entries may be dicts (fallback parser) or objects with
            # attribute fields (citegeist parser).
            data = entry if isinstance(entry, dict) else entry.__dict__
            ref_title = str(data.get("title") or data.get("citation_key") or "Reference")
            author = str(data.get("author") or data.get("editor") or "")
            year = str(data.get("year") or "")
            byline = " · ".join(part for part in [author, year] if part).strip()
            cards.append(
                ContentCard(
                    title=ref_title,
                    body=byline or bib_path.name,
                    href=str(bib_path.relative_to(root)),
                    meta="CiteGeist bibliography",
                    kind="bibliography",
                    source=str(data.get("citation_key") or ref_title.lower().replace(" ", "-")),
                )
            )
    return cards
def _fallback_parse_bibtex(text: str) -> list[dict[str, str]]:
entries: list[dict[str, str]] = []
current: dict[str, str] | None = None
for line in text.splitlines():
stripped = line.strip()
if not stripped:
continue
if stripped.startswith("@") and "{" in stripped:
if current:
entries.append(current)
kind, rest = stripped[1:].split("{", 1)
key = rest.split(",", 1)[0].strip()
current = {"entry_type": kind.strip(), "citation_key": key}
continue
if current and "=" in stripped:
field, value = stripped.split("=", 1)
current[field.strip().lower()] = value.strip().strip(",{}")
if current:
entries.append(current)
return entries