diff --git a/src/didactopus/augmentation_bundle_probe.py b/src/didactopus/augmentation_bundle_probe.py new file mode 100644 index 0000000..96aafba --- /dev/null +++ b/src/didactopus/augmentation_bundle_probe.py @@ -0,0 +1,119 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any +import re + +import yaml + +from .augmentation_bundle import load_augmentation_bundle + + +def _load_json(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) if path.exists() else {} + + +def _load_alignment(path: Path) -> list[dict[str, str]]: + if not path.exists(): + return [] + payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + rows = payload.get("alignments", []) or [] + return [item for item in rows if isinstance(item, dict)] + + +def _slugify(text: str) -> str: + cleaned = re.sub(r"[^a-zA-Z0-9]+", "-", text.strip().lower()).strip("-") + return cleaned or "untitled" + + +def probe_augmentation_bundle( + augmentation_bundle_dir: str | Path, + groundrecall_query_bundle_path: str | Path, +) -> dict[str, Any]: + bundle = load_augmentation_bundle(augmentation_bundle_dir) + bundle_payload = _load_json(Path(groundrecall_query_bundle_path)) + concept = bundle_payload.get("concept", {}) or {} + related = bundle_payload.get("related_concepts", []) or [] + + target_titles = {str(concept.get("title", "")).strip()} + related_titles = set() + related_ids = set() + for item in related: + title = str(item.get("title", "") or item.get("label", "") or "").strip() + if title: + related_titles.add(title) + concept_id = str(item.get("id", "") or "").strip() + if concept_id: + related_ids.add(concept_id.replace("concept::", "", 1)) + normalized_hub = {_slugify(title) for title in target_titles if title} + normalized_related = {_slugify(title) for title in related_titles if title} | {_slugify(item) for item in related_ids if item} + + snippets_dir = Path(bundle["snippets_dir"]) + 
snippet_paths = sorted( + str(path) + for path in snippets_dir.glob("*.md") + if path.name != "README.md" + ) + alignments = _load_alignment(Path(bundle["concept_alignment"])) + matched_hub = [] + matched_related = [] + unmatched = [] + for item in alignments: + source_title = str(item.get("source_title", "")).strip() + target_title = str(item.get("target_title", "")).strip() + row = {"source_title": source_title, "target_title": target_title} + normalized_target = _slugify(target_title) + if normalized_target in normalized_hub: + matched_hub.append(row) + elif normalized_target in normalized_related: + matched_related.append(row) + else: + unmatched.append(row) + + return { + "bundle_title": bundle.get("title", ""), + "bundle_dir": bundle.get("bundle_dir", ""), + "groundrecall_query_bundle_path": str(Path(groundrecall_query_bundle_path).resolve()), + "hub_concept_title": next(iter(target_titles), ""), + "related_concept_titles": sorted(related_titles), + "snippet_count": len(snippet_paths), + "snippet_paths": snippet_paths, + "alignment_count": len(alignments), + "matched_hub_alignment_count": len(matched_hub), + "matched_related_alignment_count": len(matched_related), + "unmatched_alignment_count": len(unmatched), + "matched_hub_alignments": matched_hub, + "matched_related_alignments": matched_related, + "unmatched_alignments": unmatched, + } + + +def write_probe_report( + augmentation_bundle_dir: str | Path, + groundrecall_query_bundle_path: str | Path, + out_path: str | Path, +) -> dict[str, Any]: + payload = probe_augmentation_bundle(augmentation_bundle_dir, groundrecall_query_bundle_path) + out = Path(out_path) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(payload, indent=2), encoding="utf-8") + return payload + + +def main() -> None: + parser = argparse.ArgumentParser(description="Probe an augmentation bundle against a GroundRecall hub bundle.") + parser.add_argument("augmentation_bundle") + 
parser.add_argument("groundrecall_query_bundle") + parser.add_argument("--out") + args = parser.parse_args() + + payload = probe_augmentation_bundle(args.augmentation_bundle, args.groundrecall_query_bundle) + if args.out: + Path(args.out).write_text(json.dumps(payload, indent=2), encoding="utf-8") + print(json.dumps(payload, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/src/didactopus/main.py b/src/didactopus/main.py index f4e607f..fd6dbdc 100644 --- a/src/didactopus/main.py +++ b/src/didactopus/main.py @@ -7,6 +7,7 @@ from pathlib import Path from .config import load_config from .doclift_bundle_demo import run_doclift_bundle_demo from .groundrecall_pack_bridge import run_doclift_bundle_with_groundrecall +from .augmentation_bundle_probe import write_probe_report from .notebook_page import export_notebook_page_from_groundrecall_bundle from .notebook_page import export_notebook_page_from_groundrecall_store from .review_loader import load_draft_pack @@ -65,6 +66,14 @@ def build_parser() -> argparse.ArgumentParser: notebook_gr_parser.add_argument("groundrecall_store_dir") notebook_gr_parser.add_argument("groundrecall_concept_ref") notebook_gr_parser.add_argument("output_dir") + + augmentation_probe_parser = subparsers.add_parser( + "augmentation-bundle-probe", + help="Probe an augmentation bundle against an existing GroundRecall query bundle", + ) + augmentation_probe_parser.add_argument("augmentation_bundle") + augmentation_probe_parser.add_argument("groundrecall_query_bundle") + augmentation_probe_parser.add_argument("output_path") return parser @@ -152,4 +161,12 @@ def main() -> None: ) print(summary) return + if args.command == "augmentation-bundle-probe": + summary = write_probe_report( + args.augmentation_bundle, + args.groundrecall_query_bundle, + args.output_path, + ) + print(summary) + return build_parser().print_help() diff --git a/tests/test_augmentation_bundle_probe.py b/tests/test_augmentation_bundle_probe.py new file mode 100644 index 
import json
from pathlib import Path

from didactopus.augmentation_bundle_probe import probe_augmentation_bundle


def test_probe_augmentation_bundle_reports_hub_and_related_matches(tmp_path: Path) -> None:
    """End-to-end probe over a minimal on-disk bundle fixture.

    Builds a bundle with two snippets and three alignment rows (one hub
    match, one related-concept match, one miss) plus a GroundRecall payload,
    then verifies the report counts.
    """
    bundle_root = tmp_path / "bundle"
    snippet_dir = bundle_root / "snippets"
    snippet_dir.mkdir(parents=True)

    manifest_text = (
        "title: Demo Bundle\n"
        "snippets_dir: snippets\n"
        "source_inventory: sources.yaml\n"
        "concept_alignment: snippets/concept-alignment.yaml"
    )
    (bundle_root / "bundle.yaml").write_text(manifest_text, encoding="utf-8")
    (bundle_root / "sources.yaml").write_text("sources: []\n", encoding="utf-8")

    for name, body in (("a.md", "# A\n"), ("b.md", "# B\n")):
        (snippet_dir / name).write_text(body, encoding="utf-8")

    alignment_text = (
        "alignments:\n"
        "  - source_title: Entropy Comparison\n"
        "    target_title: Thermodynamics and Entropy\n"
        "  - source_title: Drift Note\n"
        "    target_title: Genetic drift\n"
        "  - source_title: Extra\n"
        "    target_title: Missing"
    )
    (snippet_dir / "concept-alignment.yaml").write_text(alignment_text, encoding="utf-8")

    groundrecall_payload = {
        "concept": {"title": "Thermodynamics and Entropy"},
        "related_concepts": [
            {"id": "concept::genetic-drift", "label": "Genetic drift"},
            {"label": "Natural selection"},
        ],
    }
    query_bundle_path = tmp_path / "groundrecall_query_bundle.json"
    query_bundle_path.write_text(json.dumps(groundrecall_payload), encoding="utf-8")

    payload = probe_augmentation_bundle(bundle_root, query_bundle_path)

    assert payload["snippet_count"] == 2
    assert payload["matched_hub_alignment_count"] == 1
    assert payload["matched_related_alignment_count"] == 1
    assert payload["unmatched_alignment_count"] == 1