Store portable source roots for imports

This commit is contained in:
welsberr 2026-04-23 10:27:41 -04:00
parent 1731e0006a
commit 836ad9c110
4 changed files with 29 additions and 4 deletions

View File

@@ -88,6 +88,7 @@ class DocliftBundleSourceAdapter:
if figures_path.exists():
figure_payload = json.loads(figures_path.read_text(encoding="utf-8"))
source_path = str(figure_payload.get("source_path") or document.get("source_path") or relative_markdown)
source_path_kind = str(figure_payload.get("source_path_kind") or document.get("source_path_kind") or "source_root_relative")
concept_rows.append(
{
@@ -113,6 +114,7 @@ class DocliftBundleSourceAdapter:
"line_start": 0,
"line_end": 0,
"source_url": source_path,
"metadata": {"source_path_kind": source_path_kind},
"grounding_status": "grounded",
"support_kind": "direct_source",
"confidence_hint": 0.85,

View File

@@ -53,6 +53,13 @@ def _default_import_id(source_root: Path) -> str:
return f"{stem}-{stamp}"
def _portable_source_root_ref(source_path: Path, output_root: Path) -> tuple[str, str]:
anchor = output_root.resolve().parent
if source_path.is_relative_to(anchor):
return source_path.relative_to(anchor).as_posix(), "output_root_parent_relative"
return source_path.name, "source_label"
def _write_json(path: Path, payload: dict[str, Any]) -> None:
path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
@@ -124,6 +131,7 @@ def run_groundrecall_import(
]
actual_import_id = import_id or _default_import_id(source_path)
output_root = Path(out_root) if out_root else source_path / "imports"
source_root_ref, source_root_kind = _portable_source_root_ref(source_path, output_root)
output_dir = output_root / actual_import_id
output_dir.mkdir(parents=True, exist_ok=True)
@@ -132,7 +140,7 @@ def run_groundrecall_import(
import_mode=mode,
machine_id=machine_id or socket.gethostname(),
agent_id=agent_id,
source_root=str(source_path),
source_root=source_root_ref,
imported_at=_timestamp(),
)
@@ -177,6 +185,7 @@ def run_groundrecall_import(
manifest = manifest_record(context) | {
"source_adapter": adapter.name,
"import_intent": adapter.import_intent(),
"source_root_kind": source_root_kind,
"artifact_count": len(artifact_rows),
"observation_count": len(observation_rows),
"claim_count": len(claim_rows),

View File

@@ -163,6 +163,15 @@ def _extract_citation_keys(text: str) -> list[str]:
return sorted(set(keys))
def _resolve_source_root(import_dir: Path, source_root: str) -> str:
if not source_root:
return ""
root = Path(source_root)
if root.is_absolute():
return str(root)
return str((import_dir.parent.parent / root).resolve())
def _artifact_citation_payloads(
artifacts: list[dict[str, Any]],
*,
@@ -223,14 +232,15 @@ def _artifact_citation_payloads(
def build_citation_review_entries_from_import(import_dir: str | Path) -> list[CitationReviewEntry]:
base = Path(import_dir)
manifest = _read_json(base / "manifest.json")
resolved_source_root = _resolve_source_root(base, manifest.get("source_root", ""))
artifacts = _read_jsonl(base / "artifacts.jsonl")
observations = _read_jsonl(base / "observations.jsonl")
claims = _read_jsonl(base / "claims.jsonl")
bibliography_index = load_bibliography_index(manifest.get("source_root", ""))
bibliography_index = load_bibliography_index(resolved_source_root)
artifact_payloads, _ = _artifact_citation_payloads(
artifacts,
source_root=manifest.get("source_root", ""),
source_root=resolved_source_root,
)
observations_by_id = {item["observation_id"]: item for item in observations}
artifact_claim_links: dict[str, dict[str, set[str]]] = defaultdict(lambda: {"claim_ids": set(), "concept_ids": set()})
@@ -299,6 +309,7 @@ def build_citation_review_entries_from_import(import_dir: str | Path) -> list[Ci
def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> dict[str, Any]:
manifest = _read_json(import_dir / "manifest.json")
resolved_source_root = _resolve_source_root(import_dir, manifest.get("source_root", ""))
lint_payload = _read_json(import_dir / "lint_findings.json")
queue_payload = _read_json(import_dir / "review_queue.json")
artifacts = _read_jsonl(import_dir / "artifacts.jsonl")
@@ -316,7 +327,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
artifact_citations, artifact_citation_summary = _artifact_citation_payloads(
artifacts,
source_root=manifest.get("source_root", ""),
source_root=resolved_source_root,
)
artifact_by_id = {item["artifact_id"]: item for item in artifacts}

View File

@@ -214,7 +214,10 @@ def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) ->
result = run_groundrecall_import(_copied_fixture_doclift_bundle(tmp_path), mode="quick", import_id="doclift-test")
assert result.manifest["source_adapter"] == "doclift_bundle"
assert result.manifest["import_intent"] == "both"
assert result.manifest["source_root"] == "doclift_bundle_minimal"
assert result.manifest["source_root_kind"] == "source_label"
concept_ids = {item["concept_id"] for item in result.concepts}
assert "concept::lecture-1" in concept_ids
claim_ids = {item["claim_id"] for item in result.claims}
assert "clm_doclift_1" in claim_ids
assert result.observations[0]["source_url"] == "legacy/lecture-1.doc"