Add augmentation bundle probe for Notebook hubs
This commit is contained in:
parent
5a25e41043
commit
ce2188816a
|
|
@ -0,0 +1,119 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
import re
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from .augmentation_bundle import load_augmentation_bundle
|
||||||
|
|
||||||
|
|
||||||
|
def _load_json(path: Path) -> dict[str, Any]:
|
||||||
|
return json.loads(path.read_text(encoding="utf-8")) if path.exists() else {}
|
||||||
|
|
||||||
|
|
||||||
|
def _load_alignment(path: Path) -> list[dict[str, str]]:
    """Read the concept-alignment YAML at *path*.

    Returns the entries under the top-level ``alignments`` key, keeping only
    dict-shaped rows; a missing file or empty document yields an empty list.
    """
    if not path.exists():
        return []
    document = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    entries = document.get("alignments", []) or []
    rows: list[dict[str, str]] = []
    for entry in entries:
        # Drop malformed rows (strings, None, ...) rather than propagating them.
        if isinstance(entry, dict):
            rows.append(entry)
    return rows
|
||||||
|
|
||||||
|
|
||||||
|
def _slugify(text: str) -> str:
|
||||||
|
cleaned = re.sub(r"[^a-zA-Z0-9]+", "-", text.strip().lower()).strip("-")
|
||||||
|
return cleaned or "untitled"
|
||||||
|
|
||||||
|
|
||||||
|
def probe_augmentation_bundle(
    augmentation_bundle_dir: str | Path,
    groundrecall_query_bundle_path: str | Path,
) -> dict[str, Any]:
    """Compare an augmentation bundle's concept alignments against a
    GroundRecall hub query bundle.

    Loads the bundle at *augmentation_bundle_dir* via
    :func:`load_augmentation_bundle`, reads the query-bundle JSON at
    *groundrecall_query_bundle_path*, and classifies each alignment row by
    whether its slugified ``target_title`` matches the hub concept, one of
    the related concepts (by title/label or by ``concept::`` id), or neither.

    Returns a JSON-serializable report with snippet paths/counts, alignment
    counts, and the classified alignment rows.
    """
    bundle = load_augmentation_bundle(augmentation_bundle_dir)
    bundle_payload = _load_json(Path(groundrecall_query_bundle_path))
    concept = bundle_payload.get("concept", {}) or {}
    related = bundle_payload.get("related_concepts", []) or []

    # A missing hub title leaves "" in the set; it is filtered out below.
    target_titles = {str(concept.get("title", "")).strip()}
    related_titles = set()
    related_ids = set()
    for item in related:
        # Robustness: tolerate malformed (non-dict) entries, mirroring the
        # isinstance filtering done in _load_alignment, instead of raising
        # AttributeError on item.get.
        if not isinstance(item, dict):
            continue
        title = str(item.get("title", "") or item.get("label", "") or "").strip()
        if title:
            related_titles.add(title)
        concept_id = str(item.get("id", "") or "").strip()
        if concept_id:
            # Strip at most one leading "concept::" namespace prefix.
            related_ids.add(concept_id.replace("concept::", "", 1))
    normalized_hub = {_slugify(title) for title in target_titles if title}
    normalized_related = {_slugify(title) for title in related_titles if title} | {
        _slugify(item) for item in related_ids if item
    }

    # Markdown snippets, excluding the bundle's own README.
    snippets_dir = Path(bundle["snippets_dir"])
    snippet_paths = sorted(
        str(path)
        for path in snippets_dir.glob("*.md")
        if path.name != "README.md"
    )
    alignments = _load_alignment(Path(bundle["concept_alignment"]))

    # Classify each alignment row by its slugified target title.
    matched_hub = []
    matched_related = []
    unmatched = []
    for item in alignments:
        source_title = str(item.get("source_title", "")).strip()
        target_title = str(item.get("target_title", "")).strip()
        row = {"source_title": source_title, "target_title": target_title}
        normalized_target = _slugify(target_title)
        if normalized_target in normalized_hub:
            matched_hub.append(row)
        elif normalized_target in normalized_related:
            matched_related.append(row)
        else:
            unmatched.append(row)

    return {
        "bundle_title": bundle.get("title", ""),
        "bundle_dir": bundle.get("bundle_dir", ""),
        "groundrecall_query_bundle_path": str(Path(groundrecall_query_bundle_path).resolve()),
        "hub_concept_title": next(iter(target_titles), ""),
        "related_concept_titles": sorted(related_titles),
        "snippet_count": len(snippet_paths),
        "snippet_paths": snippet_paths,
        "alignment_count": len(alignments),
        "matched_hub_alignment_count": len(matched_hub),
        "matched_related_alignment_count": len(matched_related),
        "unmatched_alignment_count": len(unmatched),
        "matched_hub_alignments": matched_hub,
        "matched_related_alignments": matched_related,
        "unmatched_alignments": unmatched,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def write_probe_report(
    augmentation_bundle_dir: str | Path,
    groundrecall_query_bundle_path: str | Path,
    out_path: str | Path,
) -> dict[str, Any]:
    """Run the augmentation-bundle probe and persist its JSON report.

    Parent directories of *out_path* are created as needed. The probe payload
    is returned so callers can inspect it without re-reading the file.
    """
    report = probe_augmentation_bundle(augmentation_bundle_dir, groundrecall_query_bundle_path)
    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, indent=2), encoding="utf-8")
    return report
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: probe an augmentation bundle and print the report.

    Positional args: augmentation_bundle, groundrecall_query_bundle.
    With ``--out PATH`` the report is also written to PATH (parent
    directories created as needed). The JSON report is always printed.
    """
    parser = argparse.ArgumentParser(description="Probe an augmentation bundle against a GroundRecall hub bundle.")
    parser.add_argument("augmentation_bundle")
    parser.add_argument("groundrecall_query_bundle")
    parser.add_argument("--out")
    args = parser.parse_args()

    if args.out:
        # Delegate to write_probe_report instead of duplicating the
        # serialization here; this also gains its parent-directory creation,
        # which the previous inline write_text lacked.
        payload = write_probe_report(args.augmentation_bundle, args.groundrecall_query_bundle, args.out)
    else:
        payload = probe_augmentation_bundle(args.augmentation_bundle, args.groundrecall_query_bundle)
    print(json.dumps(payload, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the CLI when this module is executed directly.
if __name__ == "__main__":
    main()
|
||||||
|
|
@ -7,6 +7,7 @@ from pathlib import Path
|
||||||
from .config import load_config
|
from .config import load_config
|
||||||
from .doclift_bundle_demo import run_doclift_bundle_demo
|
from .doclift_bundle_demo import run_doclift_bundle_demo
|
||||||
from .groundrecall_pack_bridge import run_doclift_bundle_with_groundrecall
|
from .groundrecall_pack_bridge import run_doclift_bundle_with_groundrecall
|
||||||
|
from .augmentation_bundle_probe import write_probe_report
|
||||||
from .notebook_page import export_notebook_page_from_groundrecall_bundle
|
from .notebook_page import export_notebook_page_from_groundrecall_bundle
|
||||||
from .notebook_page import export_notebook_page_from_groundrecall_store
|
from .notebook_page import export_notebook_page_from_groundrecall_store
|
||||||
from .review_loader import load_draft_pack
|
from .review_loader import load_draft_pack
|
||||||
|
|
@ -65,6 +66,14 @@ def build_parser() -> argparse.ArgumentParser:
|
||||||
notebook_gr_parser.add_argument("groundrecall_store_dir")
|
notebook_gr_parser.add_argument("groundrecall_store_dir")
|
||||||
notebook_gr_parser.add_argument("groundrecall_concept_ref")
|
notebook_gr_parser.add_argument("groundrecall_concept_ref")
|
||||||
notebook_gr_parser.add_argument("output_dir")
|
notebook_gr_parser.add_argument("output_dir")
|
||||||
|
|
||||||
|
augmentation_probe_parser = subparsers.add_parser(
|
||||||
|
"augmentation-bundle-probe",
|
||||||
|
help="Probe an augmentation bundle against an existing GroundRecall query bundle",
|
||||||
|
)
|
||||||
|
augmentation_probe_parser.add_argument("augmentation_bundle")
|
||||||
|
augmentation_probe_parser.add_argument("groundrecall_query_bundle")
|
||||||
|
augmentation_probe_parser.add_argument("output_path")
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -152,4 +161,12 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
print(summary)
|
print(summary)
|
||||||
return
|
return
|
||||||
|
if args.command == "augmentation-bundle-probe":
|
||||||
|
summary = write_probe_report(
|
||||||
|
args.augmentation_bundle,
|
||||||
|
args.groundrecall_query_bundle,
|
||||||
|
args.output_path,
|
||||||
|
)
|
||||||
|
print(summary)
|
||||||
|
return
|
||||||
build_parser().print_help()
|
build_parser().print_help()
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
|
||||||
|
from didactopus.augmentation_bundle_probe import probe_augmentation_bundle
|
||||||
|
|
||||||
|
|
||||||
|
def test_probe_augmentation_bundle_reports_hub_and_related_matches(tmp_path: Path) -> None:
    """End-to-end probe: hub, related, and unmatched alignments are counted.

    Builds a minimal on-disk augmentation bundle (two snippets, three
    alignments) plus a GroundRecall query bundle whose hub concept is
    "Thermodynamics and Entropy" with one related id and one related label,
    then checks the probe's counters.
    """
    # Minimal bundle layout: bundle.yaml manifest + snippets/ directory.
    bundle = tmp_path / "bundle"
    snippets = bundle / "snippets"
    snippets.mkdir(parents=True)
    (bundle / "bundle.yaml").write_text(
        "\n".join(
            [
                "title: Demo Bundle",
                "snippets_dir: snippets",
                "source_inventory: sources.yaml",
                "concept_alignment: snippets/concept-alignment.yaml",
            ]
        ),
        encoding="utf-8",
    )
    (bundle / "sources.yaml").write_text("sources: []\n", encoding="utf-8")
    # Two markdown snippets -> expected snippet_count == 2.
    (snippets / "a.md").write_text("# A\n", encoding="utf-8")
    (snippets / "b.md").write_text("# B\n", encoding="utf-8")
    # Three alignments: one targets the hub concept, one a related concept,
    # one neither ("Missing").
    # NOTE(review): leading whitespace in these YAML lines was mangled in the
    # scraped diff; reconstructed here with standard 2/4-space list
    # indentation — confirm against the repository copy.
    (snippets / "concept-alignment.yaml").write_text(
        "\n".join(
            [
                "alignments:",
                "  - source_title: Entropy Comparison",
                "    target_title: Thermodynamics and Entropy",
                "  - source_title: Drift Note",
                "    target_title: Genetic drift",
                "  - source_title: Extra",
                "    target_title: Missing",
            ]
        ),
        encoding="utf-8",
    )
    # Query bundle: hub title plus one related concept by id and one by label
    # only (exercises both the id and label matching paths).
    groundrecall_bundle = tmp_path / "groundrecall_query_bundle.json"
    groundrecall_bundle.write_text(
        json.dumps(
            {
                "concept": {"title": "Thermodynamics and Entropy"},
                "related_concepts": [{"id": "concept::genetic-drift", "label": "Genetic drift"}, {"label": "Natural selection"}],
            }
        ),
        encoding="utf-8",
    )

    payload = probe_augmentation_bundle(bundle, groundrecall_bundle)
    assert payload["snippet_count"] == 2
    assert payload["matched_hub_alignment_count"] == 1
    assert payload["matched_related_alignment_count"] == 1
    assert payload["unmatched_alignment_count"] == 1
|
||||||
Loading…
Reference in New Issue