diff --git a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py index 04e6b91..22cbf5c 100755 --- a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py +++ b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py @@ -88,6 +88,7 @@ class DocliftBundleSourceAdapter: if figures_path.exists(): figure_payload = json.loads(figures_path.read_text(encoding="utf-8")) source_path = str(figure_payload.get("source_path") or document.get("source_path") or relative_markdown) + source_path_kind = str(figure_payload.get("source_path_kind") or document.get("source_path_kind") or "source_root_relative") concept_rows.append( { @@ -113,6 +114,7 @@ class DocliftBundleSourceAdapter: "line_start": 0, "line_end": 0, "source_url": source_path, + "metadata": {"source_path_kind": source_path_kind}, "grounding_status": "grounded", "support_kind": "direct_source", "confidence_hint": 0.85, diff --git a/src/groundrecall/ingest.py b/src/groundrecall/ingest.py index 27854e3..feec996 100644 --- a/src/groundrecall/ingest.py +++ b/src/groundrecall/ingest.py @@ -53,6 +53,13 @@ def _default_import_id(source_root: Path) -> str: return f"{stem}-{stamp}" +def _portable_source_root_ref(source_path: Path, output_root: Path) -> tuple[str, str]: + anchor = output_root.resolve().parent + if source_path.is_relative_to(anchor): + return source_path.relative_to(anchor).as_posix(), "output_root_parent_relative" + return source_path.name, "source_label" + + def _write_json(path: Path, payload: dict[str, Any]) -> None: path.write_text(json.dumps(payload, indent=2), encoding="utf-8") @@ -124,6 +131,7 @@ def run_groundrecall_import( ] actual_import_id = import_id or _default_import_id(source_path) output_root = Path(out_root) if out_root else source_path / "imports" + source_root_ref, source_root_kind = _portable_source_root_ref(source_path, output_root) output_dir = output_root / actual_import_id output_dir.mkdir(parents=True, exist_ok=True) @@ -132,7 +140,7 @@ def run_groundrecall_import( import_mode=mode, machine_id=machine_id or socket.gethostname(), agent_id=agent_id, - source_root=str(source_path), + source_root=source_root_ref, imported_at=_timestamp(), ) @@ -177,6 +185,7 @@ def run_groundrecall_import( manifest = manifest_record(context) | { "source_adapter": adapter.name, "import_intent": adapter.import_intent(), + "source_root_kind": source_root_kind, "artifact_count": len(artifact_rows), "observation_count": len(observation_rows), "claim_count": len(claim_rows), diff --git a/src/groundrecall/review_export.py b/src/groundrecall/review_export.py index 22628ea..f0442ea 100644 --- a/src/groundrecall/review_export.py +++ b/src/groundrecall/review_export.py @@ -163,6 +163,15 @@ def _extract_citation_keys(text: str) -> list[str]: return sorted(set(keys)) +def _resolve_source_root(import_dir: Path, source_root: str) -> str: + if not source_root: + return "" + root = Path(source_root) + if root.is_absolute(): + return str(root) + return str((import_dir.parent.parent / root).resolve()) + + def _artifact_citation_payloads( artifacts: list[dict[str, Any]], *, @@ -223,14 +232,15 @@ def _artifact_citation_payloads( def build_citation_review_entries_from_import(import_dir: str | Path) -> list[CitationReviewEntry]: base = Path(import_dir) manifest = _read_json(base / "manifest.json") + resolved_source_root = _resolve_source_root(base, manifest.get("source_root", "")) artifacts = _read_jsonl(base / "artifacts.jsonl") observations = _read_jsonl(base / "observations.jsonl") claims = _read_jsonl(base / "claims.jsonl") - bibliography_index = load_bibliography_index(manifest.get("source_root", "")) + bibliography_index = load_bibliography_index(resolved_source_root) artifact_payloads, _ = _artifact_citation_payloads( artifacts, - source_root=manifest.get("source_root", ""), + source_root=resolved_source_root, ) observations_by_id = {item["observation_id"]: item for item in observations} artifact_claim_links: dict[str, dict[str, set[str]]] = defaultdict(lambda: {"claim_ids": set(), "concept_ids": set()}) @@ -299,6 +309,7 @@ def build_citation_review_entries_from_import(import_dir: str | Path) -> list[Ci def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> dict[str, Any]: manifest = _read_json(import_dir / "manifest.json") + resolved_source_root = _resolve_source_root(import_dir, manifest.get("source_root", "")) lint_payload = _read_json(import_dir / "lint_findings.json") queue_payload = _read_json(import_dir / "review_queue.json") artifacts = _read_jsonl(import_dir / "artifacts.jsonl") @@ -316,7 +327,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di artifact_citations, artifact_citation_summary = _artifact_citation_payloads( artifacts, - source_root=manifest.get("source_root", ""), + source_root=resolved_source_root, ) artifact_by_id = {item["artifact_id"]: item for item in artifacts} diff --git a/tests/test_groundrecall_source_adapters.py b/tests/test_groundrecall_source_adapters.py index b00f3c4..31b4852 100644 --- a/tests/test_groundrecall_source_adapters.py +++ b/tests/test_groundrecall_source_adapters.py @@ -214,7 +214,10 @@ def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> result = run_groundrecall_import(_copied_fixture_doclift_bundle(tmp_path), mode="quick", import_id="doclift-test") assert result.manifest["source_adapter"] == "doclift_bundle" assert result.manifest["import_intent"] == "both" + assert result.manifest["source_root"] == "doclift_bundle_minimal" + assert result.manifest["source_root_kind"] == "source_label" concept_ids = {item["concept_id"] for item in result.concepts} assert "concept::lecture-1" in concept_ids claim_ids = {item["claim_id"] for item in result.claims} assert "clm_doclift_1" in claim_ids + assert result.observations[0]["source_url"] == "legacy/lecture-1.doc"