Store portable source roots for imports
This commit is contained in:
parent
1731e0006a
commit
836ad9c110
|
|
@ -88,6 +88,7 @@ class DocliftBundleSourceAdapter:
|
|||
if figures_path.exists():
|
||||
figure_payload = json.loads(figures_path.read_text(encoding="utf-8"))
|
||||
source_path = str(figure_payload.get("source_path") or document.get("source_path") or relative_markdown)
|
||||
source_path_kind = str(figure_payload.get("source_path_kind") or document.get("source_path_kind") or "source_root_relative")
|
||||
|
||||
concept_rows.append(
|
||||
{
|
||||
|
|
@ -113,6 +114,7 @@ class DocliftBundleSourceAdapter:
|
|||
"line_start": 0,
|
||||
"line_end": 0,
|
||||
"source_url": source_path,
|
||||
"metadata": {"source_path_kind": source_path_kind},
|
||||
"grounding_status": "grounded",
|
||||
"support_kind": "direct_source",
|
||||
"confidence_hint": 0.85,
|
||||
|
|
|
|||
|
|
@ -53,6 +53,13 @@ def _default_import_id(source_root: Path) -> str:
|
|||
return f"{stem}-{stamp}"
|
||||
|
||||
|
||||
def _portable_source_root_ref(source_path: Path, output_root: Path) -> tuple[str, str]:
|
||||
anchor = output_root.resolve().parent
|
||||
if source_path.is_relative_to(anchor):
|
||||
return source_path.relative_to(anchor).as_posix(), "output_root_parent_relative"
|
||||
return source_path.name, "source_label"
|
||||
|
||||
|
||||
def _write_json(path: Path, payload: dict[str, Any]) -> None:
|
||||
path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
||||
|
||||
|
|
@ -124,6 +131,7 @@ def run_groundrecall_import(
|
|||
]
|
||||
actual_import_id = import_id or _default_import_id(source_path)
|
||||
output_root = Path(out_root) if out_root else source_path / "imports"
|
||||
source_root_ref, source_root_kind = _portable_source_root_ref(source_path, output_root)
|
||||
output_dir = output_root / actual_import_id
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
|
@ -132,7 +140,7 @@ def run_groundrecall_import(
|
|||
import_mode=mode,
|
||||
machine_id=machine_id or socket.gethostname(),
|
||||
agent_id=agent_id,
|
||||
source_root=str(source_path),
|
||||
source_root=source_root_ref,
|
||||
imported_at=_timestamp(),
|
||||
)
|
||||
|
||||
|
|
@ -177,6 +185,7 @@ def run_groundrecall_import(
|
|||
manifest = manifest_record(context) | {
|
||||
"source_adapter": adapter.name,
|
||||
"import_intent": adapter.import_intent(),
|
||||
"source_root_kind": source_root_kind,
|
||||
"artifact_count": len(artifact_rows),
|
||||
"observation_count": len(observation_rows),
|
||||
"claim_count": len(claim_rows),
|
||||
|
|
|
|||
|
|
@ -163,6 +163,15 @@ def _extract_citation_keys(text: str) -> list[str]:
|
|||
return sorted(set(keys))
|
||||
|
||||
|
||||
def _resolve_source_root(import_dir: Path, source_root: str) -> str:
|
||||
if not source_root:
|
||||
return ""
|
||||
root = Path(source_root)
|
||||
if root.is_absolute():
|
||||
return str(root)
|
||||
return str((import_dir.parent.parent / root).resolve())
|
||||
|
||||
|
||||
def _artifact_citation_payloads(
|
||||
artifacts: list[dict[str, Any]],
|
||||
*,
|
||||
|
|
@ -223,14 +232,15 @@ def _artifact_citation_payloads(
|
|||
def build_citation_review_entries_from_import(import_dir: str | Path) -> list[CitationReviewEntry]:
|
||||
base = Path(import_dir)
|
||||
manifest = _read_json(base / "manifest.json")
|
||||
resolved_source_root = _resolve_source_root(base, manifest.get("source_root", ""))
|
||||
artifacts = _read_jsonl(base / "artifacts.jsonl")
|
||||
observations = _read_jsonl(base / "observations.jsonl")
|
||||
claims = _read_jsonl(base / "claims.jsonl")
|
||||
bibliography_index = load_bibliography_index(manifest.get("source_root", ""))
|
||||
bibliography_index = load_bibliography_index(resolved_source_root)
|
||||
|
||||
artifact_payloads, _ = _artifact_citation_payloads(
|
||||
artifacts,
|
||||
source_root=manifest.get("source_root", ""),
|
||||
source_root=resolved_source_root,
|
||||
)
|
||||
observations_by_id = {item["observation_id"]: item for item in observations}
|
||||
artifact_claim_links: dict[str, dict[str, set[str]]] = defaultdict(lambda: {"claim_ids": set(), "concept_ids": set()})
|
||||
|
|
@ -299,6 +309,7 @@ def build_citation_review_entries_from_import(import_dir: str | Path) -> list[Ci
|
|||
|
||||
def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> dict[str, Any]:
|
||||
manifest = _read_json(import_dir / "manifest.json")
|
||||
resolved_source_root = _resolve_source_root(import_dir, manifest.get("source_root", ""))
|
||||
lint_payload = _read_json(import_dir / "lint_findings.json")
|
||||
queue_payload = _read_json(import_dir / "review_queue.json")
|
||||
artifacts = _read_jsonl(import_dir / "artifacts.jsonl")
|
||||
|
|
@ -316,7 +327,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
|
|||
|
||||
artifact_citations, artifact_citation_summary = _artifact_citation_payloads(
|
||||
artifacts,
|
||||
source_root=manifest.get("source_root", ""),
|
||||
source_root=resolved_source_root,
|
||||
)
|
||||
artifact_by_id = {item["artifact_id"]: item for item in artifacts}
|
||||
|
||||
|
|
|
|||
|
|
@ -214,7 +214,10 @@ def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) ->
|
|||
result = run_groundrecall_import(_copied_fixture_doclift_bundle(tmp_path), mode="quick", import_id="doclift-test")
|
||||
assert result.manifest["source_adapter"] == "doclift_bundle"
|
||||
assert result.manifest["import_intent"] == "both"
|
||||
assert result.manifest["source_root"] == "doclift_bundle_minimal"
|
||||
assert result.manifest["source_root_kind"] == "source_label"
|
||||
concept_ids = {item["concept_id"] for item in result.concepts}
|
||||
assert "concept::lecture-1" in concept_ids
|
||||
claim_ids = {item["claim_id"] for item in result.claims}
|
||||
assert "clm_doclift_1" in claim_ids
|
||||
assert result.observations[0]["source_url"] == "legacy/lecture-1.doc"
|
||||
|
|
|
|||
Loading…
Reference in New Issue