Compare commits
4 Commits
9549961d10
...
b7e2f9f540
| Author | SHA1 | Date |
|---|---|---|
|
|
b7e2f9f540 | |
|
|
8e616f6bc6 | |
|
|
8d2b6928a8 | |
|
|
bb64c01123 |
20
README.md
20
README.md
|
|
@ -140,6 +140,26 @@ For the fastest included example, use the MIT OCW Information and Entropy demo.
|
|||
- progress visualization
|
||||
- skill export
|
||||
|
||||
## `doclift` Bundle Ingestion
|
||||
|
||||
When your source material starts as legacy office documents, the intended
|
||||
boundary is:
|
||||
|
||||
1. `doclift` normalizes the source tree into a bundle.
|
||||
2. `Didactopus` turns that bundle into a draft pack and learning path.
|
||||
3. `GroundRecall` can import the same bundle directly when you need canonical
|
||||
knowledge storage instead of a learner pack.
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
doclift convert-dir /path/to/legacy-course /tmp/doclift-bundle --asset-root /path/to/legacy-course
|
||||
didactopus doclift-bundle /tmp/doclift-bundle /tmp/didactopus-pack --course-title "Example Course"
|
||||
```
|
||||
|
||||
The `didactopus doclift-bundle` command writes the normal draft-pack outputs plus a
|
||||
`doclift_bundle_summary.json` file that records the bundle-to-pack conversion.
|
||||
|
||||
## Didactopus As Pedagogy Support
|
||||
|
||||
Didactopus is broader than a learner chat loop.
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ dependencies = [
|
|||
]
|
||||
|
||||
[project.scripts]
|
||||
didactopus = "didactopus.main:main"
|
||||
didactopus-api = "didactopus.api:main"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,86 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from .artifact_registry import validate_pack
|
||||
from .document_adapters import adapt_documents
|
||||
from .knowledge_graph import write_knowledge_graph
|
||||
from .pack_emitter import build_draft_pack, write_draft_pack, write_source_corpus
|
||||
from .rule_policy import RuleContext, build_default_rules, run_rules
|
||||
from .topic_ingest import build_topic_bundle, document_to_course, extract_concept_candidates, merge_courses_into_topic_course
|
||||
|
||||
|
||||
def run_doclift_bundle_demo(
    bundle_dir: str | Path,
    course_title: str,
    pack_dir: str | Path,
    author: str = "doclift bundle import",
    license_name: str = "See source bundle metadata",
) -> dict:
    """Convert a doclift bundle into a validated draft pack and summarize it.

    The bundle's documents are adapted, converted to courses, and merged into
    a single topic course. Concept candidates are restricted to those that
    correspond to lesson titles before the default rules run (with project
    and review rules disabled). The draft pack, source corpus, and knowledge
    graph are written to ``pack_dir``, the pack is validated, and a
    ``doclift_bundle_summary.json`` file is written next to them.

    Args:
        bundle_dir: Directory holding the doclift bundle.
        course_title: Title used for the per-document courses and the merged
            topic course.
        pack_dir: Output directory for the generated pack.
        author: Author recorded in the draft pack.
        license_name: License string recorded in the draft pack.

    Returns:
        The summary dict that is also written to
        ``doclift_bundle_summary.json``.

    Raises:
        ValueError: If the bundle yields no documents, or if the generated
            pack fails validation.
    """
    bundle_dir = Path(bundle_dir)
    pack_dir = Path(pack_dir)

    docs = adapt_documents(bundle_dir)
    if not docs:
        raise ValueError(f"No documents found in doclift bundle {bundle_dir}")

    courses = [document_to_course(doc, course_title) for doc in docs]
    merged = merge_courses_into_topic_course(build_topic_bundle(course_title, courses))
    concepts = extract_concept_candidates(merged)

    # Build the lesson-title set once instead of once per concept.
    lesson_titles = {lesson.title for module in merged.modules for lesson in module.lessons}
    # Filter by id (not title) so every concept sharing an id with a
    # lesson-titled concept is kept, exactly as before.
    lesson_concept_ids = {concept.id for concept in concepts if concept.title in lesson_titles}
    concepts = [concept for concept in concepts if concept.id in lesson_concept_ids]

    ctx = RuleContext(course=merged, concepts=concepts)
    run_rules(ctx, build_default_rules(enable_projects=False, enable_review=False))

    draft = build_draft_pack(
        merged,
        ctx.concepts,
        author=author,
        license_name=license_name,
        review_flags=ctx.review_flags,
        conflicts=[],
    )
    write_draft_pack(draft, pack_dir)
    write_source_corpus(merged, pack_dir)
    write_knowledge_graph(merged, ctx.concepts, pack_dir)

    # Validate before writing the summary so a summary file is only ever
    # produced for a pack that passed validation.
    validation = validate_pack(pack_dir)
    if not validation.is_valid:
        raise ValueError(f"Generated pack failed validation: {validation.errors}")

    summary = {
        "bundle_dir": str(bundle_dir),
        "course_title": course_title,
        "pack_dir": str(pack_dir),
        "source_document_count": len(docs),
        "module_count": len(merged.modules),
        "concept_count": len(ctx.concepts),
        "review_flags": list(ctx.review_flags),
    }
    (pack_dir / "doclift_bundle_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
    return summary
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the doclift bundle conversion.

    Parses the bundle directory, the pack directory, and the course metadata
    options, runs the conversion, and prints the resulting summary as
    indented JSON.
    """
    cli = argparse.ArgumentParser(description="Generate a Didactopus draft pack from a doclift bundle.")
    for positional in ("bundle_dir", "pack_dir"):
        cli.add_argument(positional)
    cli.add_argument("--course-title", required=True)
    for flag, fallback in (
        ("--author", "doclift bundle import"),
        ("--license-name", "See source bundle metadata"),
    ):
        cli.add_argument(flag, default=fallback)
    options = cli.parse_args()

    report = run_doclift_bundle_demo(
        bundle_dir=options.bundle_dir,
        course_title=options.course_title,
        pack_dir=options.pack_dir,
        author=options.author,
        license_name=options.license_name,
    )
    print(json.dumps(report, indent=2))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
import re
|
||||
from .course_schema import NormalizedDocument, Section
|
||||
|
|
@ -31,6 +32,12 @@ def read_textish(path: str | Path) -> str:
|
|||
return Path(path).read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def _safe_read_json(path: Path) -> dict:
    """Load a JSON object from *path*, or return ``{}`` when none is available.

    An empty dict is returned when *path* is not an existing regular file, or
    when the decoded top-level value is not a JSON object, so callers can
    always call ``.get`` on the result. Malformed JSON still raises
    ``json.JSONDecodeError``.
    """
    # is_file() also rejects directories, which exists() would let through
    # to read_text() and crash.
    if not path.is_file():
        return {}
    payload = json.loads(path.read_text(encoding="utf-8"))
    # Honour the declared return type: a top-level list or scalar is treated
    # as "no usable metadata" rather than handed to callers expecting a dict.
    return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
def adapt_markdown(path: str | Path) -> NormalizedDocument:
|
||||
text = read_textish(path)
|
||||
return NormalizedDocument(
|
||||
|
|
@ -108,8 +115,62 @@ def adapt_pptx(path: str | Path) -> NormalizedDocument:
|
|||
)
|
||||
|
||||
|
||||
def is_doclift_bundle(path: str | Path) -> bool:
    """Return True when *path* has the on-disk shape of a doclift bundle.

    A bundle is a directory that contains a ``manifest.json`` regular file
    and a ``documents`` subdirectory. The entry types are checked (not just
    their existence) because the bundle reader iterates ``documents`` as a
    directory and reads ``manifest.json`` as a file.
    """
    base = Path(path)
    return base.is_dir() and (base / "manifest.json").is_file() and (base / "documents").is_dir()
|
||||
|
||||
|
||||
def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]:
    """Adapt each document directory of a doclift bundle into a NormalizedDocument.

    Subdirectories of ``<bundle>/documents`` are visited in sorted order;
    those without a ``document.md`` file are skipped. Manifest entries are
    matched to directories by the final path component of their
    ``output_dir``. The source path comes from the figures sidecar, then the
    tables sidecar, and finally falls back to the markdown path itself.
    """
    bundle_root = Path(path)
    manifest = _safe_read_json(bundle_root / "manifest.json")

    # Index manifest entries by output directory name; later duplicates win.
    manifest_entries = {}
    for entry in manifest.get("documents", []):
        if not isinstance(entry, dict):
            continue
        if not entry.get("output_dir"):
            continue
        manifest_entries[Path(entry.get("output_dir", "")).name] = entry

    document_dirs = sorted(child for child in (bundle_root / "documents").iterdir() if child.is_dir())

    normalized: list[NormalizedDocument] = []
    for document_dir in document_dirs:
        markdown_file = document_dir / "document.md"
        if not markdown_file.exists():
            continue

        body = markdown_file.read_text(encoding="utf-8")
        parsed_sections = _simple_section_split(body)
        entry = manifest_entries.get(document_dir.name, {})
        figures = _safe_read_json(document_dir / "document.figures.json")
        tables = _safe_read_json(document_dir / "document.tables.json")
        origin = figures.get("source_path") or tables.get("source_path") or str(markdown_file)

        # Manifest values take precedence; sidecar paths default to the
        # conventional file names inside the document directory.
        details = {
            "doclift_bundle": True,
            "bundle_root": str(bundle_root),
            "bundle_document_dir": str(document_dir),
            "bundle_markdown_path": str(markdown_file),
            "document_kind": entry.get("document_kind", "document"),
            "layout_path": entry.get("layout_path", str(document_dir / "document.layout.json")),
            "tables_path": entry.get("tables_path", str(document_dir / "document.tables.json")),
            "figures_path": entry.get("figures_path", str(document_dir / "document.figures.json")),
            "table_count": entry.get("table_count", 0),
            "figure_reference_count": entry.get("figure_reference_count", 0),
        }
        document = NormalizedDocument(
            source_path=str(origin),
            source_type="doclift_bundle",
            title=str(entry.get("title") or _title_from_path(document_dir.name)),
            text=body,
            sections=parsed_sections,
            metadata=details,
        )
        normalized.append(document)
    return normalized
|
||||
|
||||
|
||||
def detect_adapter(path: str | Path) -> str:
|
||||
p = Path(path)
|
||||
if is_doclift_bundle(p):
|
||||
return "doclift_bundle"
|
||||
suffix = p.suffix.lower()
|
||||
if suffix == ".md":
|
||||
return "markdown"
|
||||
|
|
@ -128,11 +189,13 @@ def detect_adapter(path: str | Path) -> str:
|
|||
|
||||
def is_supported_document(path: str | Path) -> bool:
|
||||
p = Path(path)
|
||||
return p.is_file() and detect_adapter(p) in {"markdown", "text", "html", "pdf", "docx", "pptx"}
|
||||
return detect_adapter(p) in {"markdown", "text", "html", "pdf", "docx", "pptx", "doclift_bundle"} and (p.is_file() or p.is_dir())
|
||||
|
||||
|
||||
def adapt_documents(path: str | Path) -> list[NormalizedDocument]:
|
||||
p = Path(path)
|
||||
if is_doclift_bundle(p):
|
||||
return adapt_doclift_bundle(p)
|
||||
if p.is_dir():
|
||||
docs = [adapt_document(child) for child in sorted(p.rglob("*")) if is_supported_document(child)]
|
||||
return docs
|
||||
|
|
@ -141,6 +204,11 @@ def adapt_documents(path: str | Path) -> list[NormalizedDocument]:
|
|||
|
||||
def adapt_document(path: str | Path) -> NormalizedDocument:
|
||||
adapter = detect_adapter(path)
|
||||
if adapter == "doclift_bundle":
|
||||
docs = adapt_doclift_bundle(path)
|
||||
if not docs:
|
||||
raise ValueError(f"No documents found in doclift bundle {path}")
|
||||
return docs[0]
|
||||
if adapter == "markdown":
|
||||
return adapt_markdown(path)
|
||||
if adapter == "html":
|
||||
|
|
|
|||
|
|
@ -1,16 +1,18 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from .config import load_config
|
||||
from .doclift_bundle_demo import run_doclift_bundle_demo
|
||||
from .review_loader import load_draft_pack
|
||||
from .review_schema import ReviewSession, ReviewAction
|
||||
from .review_actions import apply_action
|
||||
from .review_export import export_review_state_json, export_promoted_pack, export_review_ui_data
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
|
||||
def build_review_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(description="Didactopus interactive review workflow scaffold")
|
||||
parser.add_argument("--draft-pack", required=True, help="Path to draft pack directory")
|
||||
parser.add_argument("--output-dir", default="review-output")
|
||||
|
|
@ -18,8 +20,25 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
return parser
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = build_parser().parse_args()
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level command-line parser with its subcommands.

    Two subcommands are registered, with the chosen one stored on the
    ``command`` attribute of the parsed namespace: ``review`` and
    ``doclift-bundle``.
    """
    root = argparse.ArgumentParser(description="Didactopus command-line tools")
    commands = root.add_subparsers(dest="command")

    # review: options for the interactive review workflow.
    review = commands.add_parser("review", help="Run the interactive review workflow scaffold")
    review.add_argument("--draft-pack", required=True, help="Path to draft pack directory")
    for flag, fallback in (
        ("--output-dir", "review-output"),
        ("--config", "configs/config.example.yaml"),
    ):
        review.add_argument(flag, default=fallback)

    # doclift-bundle: two positional directories plus course metadata.
    doclift = commands.add_parser("doclift-bundle", help="Generate a draft pack from a doclift bundle")
    for positional in ("bundle_dir", "pack_dir"):
        doclift.add_argument(positional)
    doclift.add_argument("--course-title", required=True)
    for flag, fallback in (
        ("--author", "doclift bundle import"),
        ("--license-name", "See source bundle metadata"),
    ):
        doclift.add_argument(flag, default=fallback)

    return root
|
||||
|
||||
|
||||
def run_review_workflow(args: argparse.Namespace) -> None:
|
||||
config = load_config(Path(args.config))
|
||||
draft = load_draft_pack(args.draft_pack)
|
||||
session = ReviewSession(reviewer=config.review.default_reviewer, draft_pack=draft)
|
||||
|
|
@ -53,3 +72,27 @@ def main() -> None:
|
|||
print(f"Concepts: {len(session.draft_pack.concepts)}")
|
||||
print(f"Ledger entries: {len(session.ledger)}")
|
||||
print(f"Output dir: {outdir}")
|
||||
|
||||
|
||||
def main() -> None:
    """Dispatch the command line to the review workflow or a subcommand.

    An invocation with no arguments, or whose first argument is an option
    (for example ``--draft-pack``), is parsed by the review parser and runs
    the review workflow, so the option-only form keeps working. Any other
    invocation is parsed by the subcommand parser: ``review`` runs the review
    workflow and ``doclift-bundle`` converts a bundle and prints the
    conversion summary as indented JSON.
    """
    # Imported locally so this block does not depend on a file-level import.
    import json

    argv = sys.argv[1:]
    if not argv or argv[0].startswith("-"):
        run_review_workflow(build_review_parser().parse_args(argv))
        return

    parser = build_parser()
    args = parser.parse_args(argv)
    if args.command == "review":
        run_review_workflow(args)
        return
    if args.command == "doclift-bundle":
        summary = run_doclift_bundle_demo(
            bundle_dir=args.bundle_dir,
            course_title=args.course_title,
            pack_dir=args.pack_dir,
            author=args.author,
            license_name=args.license_name,
        )
        # Emit JSON rather than the dict repr so the output is
        # machine-readable.
        print(json.dumps(summary, indent=2))
        return
    # No recognised subcommand: show usage from the parser already built.
    parser.print_help()
|
||||
|
|
|
|||
9
tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json
vendored
Executable file
9
tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json
vendored
Executable file
|
|
@ -0,0 +1,9 @@
|
|||
{
|
||||
"source_path": "legacy/lecture-1.doc",
|
||||
"figure_references": [
|
||||
{
|
||||
"label": "Figure 1",
|
||||
"caption": "Example figure caption"
|
||||
}
|
||||
]
|
||||
}
|
||||
8
tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json
vendored
Executable file
8
tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json
vendored
Executable file
|
|
@ -0,0 +1,8 @@
|
|||
[
|
||||
{
|
||||
"line_index": 0,
|
||||
"text": "Lecture 1. Example",
|
||||
"kind": "heading",
|
||||
"indent": 0
|
||||
}
|
||||
]
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
# Lecture 1. Example
|
||||
|
||||
## Module A
|
||||
|
||||
### Lesson A
|
||||
|
||||
- Objective: Explain lesson A.
|
||||
|
||||
Body text that grounds the example lesson.
|
||||
13
tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json
vendored
Executable file
13
tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json
vendored
Executable file
|
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"source_path": "legacy/lecture-1.doc",
|
||||
"tables": [
|
||||
{
|
||||
"table_id": "table-1",
|
||||
"caption": "Example table",
|
||||
"rows": [
|
||||
["Column A", "Column B"],
|
||||
["1", "2"]
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"documents": [
|
||||
{
|
||||
"document_id": "lecture-1",
|
||||
"title": "Lecture 1. Example",
|
||||
"document_kind": "lecture",
|
||||
"output_dir": "documents/lecture-1",
|
||||
"markdown_path": "documents/lecture-1/document.md",
|
||||
"layout_path": "documents/lecture-1/document.layout.json",
|
||||
"tables_path": "documents/lecture-1/document.tables.json",
|
||||
"figures_path": "documents/lecture-1/document.figures.json",
|
||||
"table_count": 1,
|
||||
"figure_reference_count": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from didactopus.doclift_bundle_demo import run_doclift_bundle_demo
|
||||
|
||||
|
||||
def _fixture_bundle() -> Path:
    """Return the path of the minimal doclift bundle fixture beside this file."""
    here = Path(__file__).parent
    return here.joinpath("fixtures", "doclift_bundle_minimal")
|
||||
|
||||
|
||||
def test_doclift_bundle_demo_generates_pack(tmp_path: Path) -> None:
    """Converting the fixture bundle reports one document and writes every pack artifact."""
    pack_root = tmp_path / "pack"

    report = run_doclift_bundle_demo(_fixture_bundle(), "Example Course", pack_root)

    assert report["source_document_count"] == 1
    expected_files = (
        "pack.yaml",
        "source_corpus.json",
        "knowledge_graph.json",
        "doclift_bundle_summary.json",
    )
    for filename in expected_files:
        assert (pack_root / filename).exists(), filename
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import didactopus.main as main_module
|
||||
|
||||
|
||||
def test_main_doclift_bundle_subcommand(monkeypatch, capsys, tmp_path: Path) -> None:
    """The ``doclift-bundle`` subcommand forwards its arguments and prints the summary."""
    seen: dict = {}

    def _fake_run_doclift_bundle_demo(bundle_dir, course_title, pack_dir, author, license_name):
        # Record what main() passed through, then return a small summary.
        seen["bundle_dir"] = str(bundle_dir)
        seen["course_title"] = course_title
        seen["pack_dir"] = str(pack_dir)
        seen["author"] = author
        seen["license_name"] = license_name
        return {"pack_dir": str(pack_dir), "course_title": course_title}

    command_line = [
        "didactopus",
        "doclift-bundle",
        str(tmp_path / "bundle"),
        str(tmp_path / "pack"),
        "--course-title",
        "Example Course",
    ]
    monkeypatch.setattr(main_module, "run_doclift_bundle_demo", _fake_run_doclift_bundle_demo)
    monkeypatch.setattr(main_module.sys, "argv", command_line)

    main_module.main()
    printed = capsys.readouterr().out

    assert seen["course_title"] == "Example Course"
    assert "Example Course" in printed
|
||||
|
||||
|
||||
def test_main_legacy_review_mode_uses_review_parser(monkeypatch, tmp_path: Path) -> None:
    """An option-only command line (no subcommand) still reaches the review workflow."""
    observed: dict = {}
    draft_arg = str(tmp_path / "draft")
    output_arg = str(tmp_path / "out")

    def _fake_run_review_workflow(args):
        # Capture the parsed options instead of running the real workflow.
        observed.update(draft_pack=args.draft_pack, output_dir=args.output_dir)

    monkeypatch.setattr(main_module, "run_review_workflow", _fake_run_review_workflow)
    monkeypatch.setattr(
        main_module.sys,
        "argv",
        ["didactopus", "--draft-pack", draft_arg, "--output-dir", output_arg],
    )

    main_module.main()

    assert observed["draft_pack"] == draft_arg
    assert observed["output_dir"] == output_arg
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
import json
|
||||
from pathlib import Path
|
||||
from didactopus.document_adapters import adapt_document
|
||||
from didactopus.document_adapters import adapt_documents
|
||||
from didactopus.topic_ingest import document_to_course, build_topic_bundle, merge_courses_into_topic_course, extract_concept_candidates
|
||||
|
||||
|
||||
|
|
@ -60,3 +62,41 @@ def test_extract_concepts_retains_lessons_but_filters_generic_terms(tmp_path: Pa
|
|||
assert "MIT OCW 6.050J Information and Entropy: Syllabus" in titles
|
||||
assert "Explain" not in titles
|
||||
assert "Channel Capacity" in titles
|
||||
|
||||
|
||||
def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
    """A minimal on-disk bundle is adapted into a single doclift-tagged document."""
    bundle_root = tmp_path / "bundle"
    lesson_dir = bundle_root / "documents" / "lesson-a"
    lesson_dir.mkdir(parents=True)

    manifest = {
        "documents": [
            {
                "title": "Lecture 1. Example",
                "document_kind": "lecture",
                "output_dir": str(lesson_dir),
                "layout_path": str(lesson_dir / "document.layout.json"),
                "tables_path": str(lesson_dir / "document.tables.json"),
                "figures_path": str(lesson_dir / "document.figures.json"),
                "table_count": 1,
                "figure_reference_count": 0,
            }
        ]
    }
    (bundle_root / "manifest.json").write_text(json.dumps(manifest), encoding="utf-8")
    (lesson_dir / "document.md").write_text("# Lecture 1. Example\n\n## Module\n### Lesson A\nBody.", encoding="utf-8")
    (lesson_dir / "document.layout.json").write_text("[]", encoding="utf-8")

    # Both sidecars name the same original source file.
    sidecars = {
        "document.tables.json": {"source_path": "/tmp/source.doc", "tables": []},
        "document.figures.json": {"source_path": "/tmp/source.doc", "figure_references": []},
    }
    for filename, payload in sidecars.items():
        (lesson_dir / filename).write_text(json.dumps(payload), encoding="utf-8")

    adapted = adapt_documents(bundle_root)

    assert len(adapted) == 1
    only = adapted[0]
    assert only.source_type == "doclift_bundle"
    assert only.title == "Lecture 1. Example"
    assert only.metadata["document_kind"] == "lecture"
    assert only.metadata["doclift_bundle"] is True
    assert only.source_path == "/tmp/source.doc"
|
||||
|
|
|
|||
Loading…
Reference in New Issue