# GroundRecall/src/groundrecall/ingest.py

from __future__ import annotations
import argparse
import inspect
import json
import shutil
import socket
import subprocess
from collections import OrderedDict
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from .groundrecall_discovery import DiscoveredArtifact
from .groundrecall_lint import lint_import_directory
from .groundrecall_normalizer import (
ImportContext,
build_artifact_record,
build_claim_record,
build_concept_records,
build_fragment_record,
build_observation_record,
build_relation_records,
manifest_record,
standardize_concept_rows,
)
from .groundrecall_review_bridge import export_review_bundle_from_import
from .groundrecall_review_queue import build_review_queue
from .groundrecall_segmenter import SegmentedPage, segment_markdown_artifact
from .groundrecall_source_adapters.base import detect_source_adapter
import groundrecall.groundrecall_source_adapters # noqa: F401
# Import modes accepted by run_groundrecall_import; "archive" skips claim extraction.
VALID_MODES = {"archive", "quick", "grounded"}
@dataclass
class ImportResult:
    """In-memory result of one import run: all generated rows plus the output directory.

    Each row list mirrors one of the JSONL files written by
    run_groundrecall_import (artifacts.jsonl, fragments.jsonl, ...).
    """
    manifest: dict[str, Any]            # payload written to manifest.json
    artifacts: list[dict[str, Any]]     # one row per discovered artifact
    fragments: list[dict[str, Any]]     # text fragments extracted from artifacts
    observations: list[dict[str, Any]]  # per-fragment observations
    claims: list[dict[str, Any]]        # claims derived from claim/summary observations
    concepts: list[dict[str, Any]]      # standardized, deduped concept rows
    relations: list[dict[str, Any]]     # concept/link relations
    out_dir: Path                       # import directory all files were written to
def _timestamp() -> str:
return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
def _default_import_id(source_root: Path) -> str:
stem = source_root.name.lower().replace("_", "-")
stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
return f"{stem}-{stamp}"
def _portable_source_root_ref(source_path: Path, output_root: Path) -> tuple[str, str]:
anchor = output_root.resolve().parent
if source_path.is_relative_to(anchor):
relative = source_path.relative_to(anchor)
if relative == Path("."):
return source_path.name, "source_label"
return relative.as_posix(), "output_root_parent_relative"
return source_path.name, "source_label"
def _write_json(path: Path, payload: dict[str, Any]) -> None:
path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
text = "\n".join(json.dumps(row, sort_keys=True) for row in rows)
if text:
text += "\n"
path.write_text(text, encoding="utf-8")
def _dedupe_by_key(rows: list[dict[str, Any]], key: str) -> list[dict[str, Any]]:
unique: OrderedDict[str, dict[str, Any]] = OrderedDict()
for row in rows:
unique.setdefault(str(row[key]), row)
return list(unique.values())
def _convert_tex_to_markdown(path: Path) -> str | None:
pandoc = shutil.which("pandoc")
if pandoc is None:
return None
result = subprocess.run(
[pandoc, "-f", "latex", "-t", "gfm", str(path)],
capture_output=True,
text=True,
check=False,
)
if result.returncode != 0:
return None
markdown = result.stdout.strip()
return markdown or None
def _segment_artifact(artifact: DiscoveredArtifact) -> SegmentedPage | None:
if not artifact.is_text:
return None
suffix = artifact.path.suffix.lower()
if suffix not in {".md", ".markdown", ".txt", ".tex", ".log"}:
return None
if suffix == ".tex":
converted = _convert_tex_to_markdown(artifact.path)
if converted is not None:
return segment_markdown_artifact(artifact, text=converted)
return segment_markdown_artifact(artifact)
def run_groundrecall_import(
    source_root: str | Path,
    out_root: str | Path | None = None,
    mode: str = "quick",
    import_id: str | None = None,
    machine_id: str | None = None,
    agent_id: str = "groundrecall.ingest",
) -> ImportResult:
    """Import a source tree into GroundRecall JSONL artifacts on disk.

    Discovers artifacts under ``source_root`` via a source adapter, builds
    artifact/fragment/observation/claim/concept/relation rows (either taken
    from the adapter's structured output or produced by segmenting text
    files), dedupes them, writes the manifest plus one JSONL file per row
    kind into the import directory, then generates lint findings, a review
    queue, and a review bundle export.

    Args:
        source_root: Directory to import.
        out_root: Root for import directories; defaults to ``<source_root>/imports``.
        mode: One of VALID_MODES; "archive" skips claim extraction.
        import_id: Name of the import directory; autogenerated when None.
        machine_id: Recorded in the import context; defaults to the local hostname.
        agent_id: Recorded in the import context as the importing agent.

    Returns:
        ImportResult with all generated rows and the output directory.

    Raises:
        ValueError: If ``mode`` is not in VALID_MODES.
    """
    source_path = Path(source_root).resolve()
    if mode not in VALID_MODES:
        raise ValueError(f"Unsupported import mode: {mode}")
    adapter = detect_source_adapter(source_path)
    discovered = adapter.discover(source_path)
    # Normalize discovered items into plain DiscoveredArtifact records
    # carrying only the four core fields.
    artifacts = [
        DiscoveredArtifact(
            path=item.path,
            relative_path=item.relative_path,
            artifact_kind=item.artifact_kind,
            is_text=item.is_text,
        )
        for item in discovered
    ]
    actual_import_id = import_id or _default_import_id(source_path)
    output_root = Path(out_root) if out_root else source_path / "imports"
    source_root_ref, source_root_kind = _portable_source_root_ref(source_path, output_root)
    output_dir = output_root / actual_import_id
    output_dir.mkdir(parents=True, exist_ok=True)
    context = ImportContext(
        import_id=actual_import_id,
        import_mode=mode,
        machine_id=machine_id or socket.gethostname(),
        agent_id=agent_id,
        source_root=source_root_ref,
        imported_at=_timestamp(),
    )
    artifact_rows: list[dict[str, Any]] = []
    fragment_rows: list[dict[str, Any]] = []
    observation_rows: list[dict[str, Any]] = []
    claim_rows: list[dict[str, Any]] = []
    concept_rows: list[dict[str, Any]] = []
    relation_rows: list[dict[str, Any]] = []
    # Not every adapter's build_rows declares a ``root`` keyword; pass it
    # only when the signature accepts it so both variants keep working.
    build_rows_params = inspect.signature(adapter.build_rows).parameters
    if "root" in build_rows_params:
        structured_rows = adapter.build_rows(context, discovered, root=source_path)
    else:
        structured_rows = adapter.build_rows(context, discovered)
    if structured_rows is not None:
        # The adapter produced structured rows directly; take them as-is.
        artifact_rows.extend(structured_rows.artifact_rows)
        fragment_rows.extend(structured_rows.fragment_rows)
        observation_rows.extend(structured_rows.observation_rows)
        claim_rows.extend(structured_rows.claim_rows)
        concept_rows.extend(structured_rows.concept_rows)
        relation_rows.extend(structured_rows.relation_rows)
    else:
        # Fallback: segment each text artifact ourselves and build rows here.
        for artifact in artifacts:
            page = _segment_artifact(artifact)
            artifact_row = build_artifact_record(context, artifact, page)
            artifact_rows.append(artifact_row)
            if page is None:
                # Binary or unsupported artifact: record it, nothing to segment.
                continue
            concept_rows.extend(build_concept_records(context, artifact_row, page.concepts))
            relation_rows.extend(build_relation_records(context, artifact_row, page.concepts, page.links))
            for index, observation in enumerate(page.observations, start=1):
                fragment_row = build_fragment_record(context, artifact_row, observation, index)
                fragment_rows.append(fragment_row)
                observation_row = build_observation_record(context, artifact_row, observation, index)
                observation_rows.append(observation_row)
                if mode == "archive":
                    # Archive mode keeps observations only — no claims.
                    continue
                if observation.role not in {"claim", "summary"}:
                    continue
                claim_rows.append(
                    build_claim_record(
                        context,
                        observation_row,
                        observation,
                        page.concepts[:3],  # at most three concepts linked per claim
                        index,
                        fragment_ids=[fragment_row["fragment_id"]],
                    )
                )
    # Standardize concept rows first (presumably canonicalizes concept ids —
    # verify in groundrecall_normalizer), then dedupe so duplicates collapse.
    fragment_rows = _dedupe_by_key(fragment_rows, "fragment_id")
    concept_rows, claim_rows, relation_rows = standardize_concept_rows(concept_rows, claim_rows, relation_rows)
    concept_rows = _dedupe_by_key(concept_rows, "concept_id")
    relation_rows = _dedupe_by_key(relation_rows, "relation_id")
    artifact_rows = _dedupe_by_key(artifact_rows, "artifact_id")
    observation_rows = _dedupe_by_key(observation_rows, "observation_id")
    claim_rows = _dedupe_by_key(claim_rows, "claim_id")
    # Counts are computed after dedupe so the manifest matches the files.
    manifest = manifest_record(context) | {
        "source_adapter": adapter.name,
        "import_intent": adapter.import_intent(),
        "source_root_kind": source_root_kind,
        "artifact_count": len(artifact_rows),
        "fragment_count": len(fragment_rows),
        "observation_count": len(observation_rows),
        "claim_count": len(claim_rows),
        "concept_count": len(concept_rows),
        "relation_count": len(relation_rows),
    }
    _write_json(output_dir / "manifest.json", manifest)
    _write_jsonl(output_dir / "artifacts.jsonl", artifact_rows)
    _write_jsonl(output_dir / "fragments.jsonl", fragment_rows)
    _write_jsonl(output_dir / "observations.jsonl", observation_rows)
    _write_jsonl(output_dir / "claims.jsonl", claim_rows)
    _write_jsonl(output_dir / "concepts.jsonl", concept_rows)
    _write_jsonl(output_dir / "relations.jsonl", relation_rows)
    # Derived outputs read the files just written from output_dir.
    lint_payload = lint_import_directory(output_dir)
    _write_json(output_dir / "lint_findings.json", lint_payload)
    review_queue = build_review_queue(output_dir)
    _write_json(output_dir / "review_queue.json", review_queue)
    export_review_bundle_from_import(output_dir)
    return ImportResult(
        manifest=manifest,
        artifacts=artifact_rows,
        fragments=fragment_rows,
        observations=observation_rows,
        claims=claim_rows,
        concepts=concept_rows,
        relations=relation_rows,
        out_dir=output_dir,
    )
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the GroundRecall import CLI."""
    cli = argparse.ArgumentParser(
        description="Import an llmwiki-style repository into GroundRecall import artifacts."
    )
    cli.add_argument("source_root")
    cli.add_argument("--out-root", default=None)
    cli.add_argument("--mode", choices=sorted(VALID_MODES), default="quick")
    cli.add_argument("--import-id", default=None)
    cli.add_argument("--machine-id", default=None)
    cli.add_argument("--agent-id", default="groundrecall.ingest")
    return cli
def main() -> None:
    """CLI entry point: run an import with the parsed arguments and print the output directory."""
    # argparse dests (source_root, out_root, mode, import_id, machine_id,
    # agent_id) match run_groundrecall_import's parameter names exactly.
    options = vars(build_parser().parse_args())
    result = run_groundrecall_import(**options)
    print(f"Wrote import artifacts to {result.out_dir}")