Add archive phrase inventory for Notebook seeding
This commit is contained in:
parent ce2188816a
commit 80a4d02caa

@@ -0,0 +1,365 @@
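"""Heuristic phrase inventory over archive-style source bundles.

Discovers document files, extracts 2-4 word candidate phrases, scores them
against definition/distinction/qualification cues and optional seed terms,
and renders the result as JSON and Markdown reports.
"""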
from __future__ import annotations

import json
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any


_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
_TOKEN_RE = re.compile(r"[A-Za-z][A-Za-z\-']*")
_HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s+(.*)$", re.MULTILINE)
_BULLET_RE = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
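# Sentence-level cue tables: a sentence matching a bucket marks every
# candidate phrase it contains as definition-, distinction-, or
# qualification-bearing, and those hit counts feed _score_phrase below.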
_DEFINITION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bdefined as\b",
        r"\brefers to\b",
        r"\bmeans\b",
        r"\bis (?:a|an|the)\b",
        r"\bdescribes\b",
        r"\bconsists of\b",
    )
]
_DISTINCTION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bversus\b|\bvs\.?\b",
        r"\bnot\b.+\bbut\b",
        r"\bdistinguish\b",
        r"\bcontrast\b",
        r"\bcompare\b",
        r"\bdifferent from\b",
        r"\bdoes not imply\b",
        r"\bnot identical\b",
    )
]
_QUALIFICATION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bhowever\b",
        r"\balthough\b",
        r"\bbut\b",
        r"\bunless\b",
        r"\bonly if\b",
        r"\bdepends on\b",
        r"\bmay\b",
        r"\bcan\b",
        r"\brequires\b",
        r"\bcannot\b",
        r"\bdoes not\b",
    )
]
_STOPWORDS = {
    "a", "about", "after", "all", "also", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "between", "both", "but", "by", "can", "could",
    "did", "do", "does", "each", "for", "from", "had", "has", "have", "how", "if",
    "in", "into", "is", "it", "its", "may", "more", "most", "must", "no", "not",
    "of", "on", "only", "or", "other", "our", "out", "over", "same", "should", "so",
    "some", "such", "than", "that", "the", "their", "them", "there", "these", "they",
    "this", "those", "through", "to", "too", "under", "unless", "up", "use", "using",
    "very", "was", "we", "were", "what", "when", "which", "while", "with", "would",
}
_GENERIC_PHRASES = {
    "source file",
    "new york",
}

def _slugify(text: str) -> str:
    return re.sub(r"[^a-z0-9]+", "-", text.strip().lower()).strip("-")

def _discover_source_paths(inputs: list[str | Path]) -> list[Path]:
    discovered: list[Path] = []
    seen: set[Path] = set()
    for item in inputs:
        path = Path(item)
        candidates: list[Path]
        if path.is_file():
            candidates = [path]
        elif (path / "documents").exists():
            # Bundle layout: one document.md per entry under documents/.
            candidates = sorted((path / "documents").glob("*/document.md"))
        elif path.is_dir():
            # Plain directory: recursively pick up Markdown and text files.
            candidates = sorted(
                p for p in path.rglob("*") if p.is_file() and p.suffix.lower() in {".md", ".txt"}
            )
        else:
            candidates = []
        for candidate in candidates:
            # Bundle control files carry metadata, not source prose.
            if candidate.name in {"concept-alignment.yaml", "bundle.yaml", "bundle.yml"}:
                continue
            resolved = candidate.resolve()
            if resolved not in seen:
                seen.add(resolved)
                discovered.append(candidate)
    return discovered

def _read_text(path: Path) -> str:
    return path.read_text(encoding="utf-8", errors="ignore")

def _document_title(path: Path, text: str) -> str:
    match = _HEADING_RE.search(text)
    if match:
        return " ".join(match.group(1).split()).strip()
    return path.parent.name.replace("-", " ").strip() or path.stem.replace("-", " ").strip()

def _clean_sentence(text: str) -> str:
    text = _BULLET_RE.sub("", text)
    return " ".join(text.split()).strip(" -")

def _sentences(text: str) -> list[str]:
    return [cleaned for item in _SENTENCE_SPLIT_RE.split(text.replace("\n", " ")) if (cleaned := _clean_sentence(item))]

def _inventory_text_window(
    text: str,
    *,
    max_chars: int = 80000,
    max_sentences: int = 400,
) -> str:
    trimmed = text[:max_chars]
    sentences = _sentences(trimmed)
    if len(sentences) > max_sentences:
        sentences = sentences[:max_sentences]
    return " ".join(sentences)

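# Candidate phrases are 2-4 word token windows that neither start nor end on
# a stopword, contain at most one stopword in total, and reject very short
# tokens, so "natural selection" qualifies while "selection is" does not.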
def _extract_candidate_phrases(text: str, min_words: int = 2, max_words: int = 4) -> list[str]:
    tokens = _TOKEN_RE.findall(text)
    lowered = [token.lower() for token in tokens]
    out: list[str] = []
    seen: set[str] = set()
    for start in range(len(lowered)):
        for size in range(min_words, max_words + 1):
            window = lowered[start : start + size]
            if len(window) != size:
                continue
            if window[0] in _STOPWORDS or window[-1] in _STOPWORDS:
                continue
            if sum(1 for token in window if token in _STOPWORDS) > 1:
                continue
            if any(len(token) < 3 and token not in {"of", "vs"} for token in window):
                continue
            phrase = " ".join(window)
            if phrase in seen:
                continue
            seen.add(phrase)
            out.append(phrase)
    return out

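# Scoring weights: document spread counts triple; definition, distinction,
# and heading evidence count double; raw occurrences and qualification
# evidence count once; exact or partial seed-term overlap adds 3 or 2.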
def _score_phrase(entry: dict[str, Any], seed_slugs: set[str]) -> float:
    phrase_slug = _slugify(entry["phrase"])
    seed_bonus = 3 if phrase_slug in seed_slugs else 0
    prefix_bonus = 2 if any(phrase_slug in seed or seed in phrase_slug for seed in seed_slugs) else 0
    return (
        entry["document_count"] * 3
        + entry["occurrence_count"]
        + entry["definition_hits"] * 2
        + entry["distinction_hits"] * 2
        + entry["qualification_hits"]
        + entry["heading_hits"] * 2
        + seed_bonus
        + prefix_bonus
    )

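# Subphrase suppression: a shorter phrase that never appears outside one of
# its longer superphrases (equal occurrence and document counts, and no
# stronger cue hits of its own) is dropped; seed terms are always kept.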
def _is_redundant_subphrase(
    entry: dict[str, Any],
    group_rows: list[dict[str, Any]],
    seed_slugs: set[str],
) -> bool:
    phrase = str(entry["phrase"])
    phrase_slug = _slugify(phrase)
    if phrase_slug in seed_slugs:
        return False
    for other in group_rows:
        if other is entry:
            continue
        other_phrase = str(other["phrase"])
        if len(other_phrase) <= len(phrase):
            continue
        if not (other_phrase.startswith(f"{phrase} ") or other_phrase.endswith(f" {phrase}")):
            continue
        if (
            other["occurrence_count"] == entry["occurrence_count"]
            and other["document_count"] == entry["document_count"]
            and other["definition_hits"] >= entry["definition_hits"]
            and other["distinction_hits"] >= entry["distinction_hits"]
            and other["qualification_hits"] >= entry["qualification_hits"]
        ):
            return True
    return False

def _filter_phrase_rows(rows: list[dict[str, Any]], seed_slugs: set[str]) -> list[dict[str, Any]]:
    grouped: defaultdict[tuple[int, int], list[dict[str, Any]]] = defaultdict(list)
    for row in rows:
        if row["phrase"] in _GENERIC_PHRASES:
            continue
        grouped[(int(row["occurrence_count"]), int(row["document_count"]))].append(row)

    filtered: list[dict[str, Any]] = []
    for group in grouped.values():
        for row in group:
            if not _is_redundant_subphrase(row, group, seed_slugs):
                filtered.append(row)
    return filtered

def build_archive_phrase_inventory(
    inputs: list[str | Path],
    *,
    seed_terms: list[str] | None = None,
    top_n: int = 50,
) -> dict[str, Any]:
    paths = _discover_source_paths(inputs)
    seed_terms = [term.strip() for term in (seed_terms or []) if str(term).strip()]
    seed_slugs = {_slugify(term) for term in seed_terms}
    phrase_stats: dict[str, dict[str, Any]] = {}
    document_rows: list[dict[str, Any]] = []

    # Pass 1: count phrases per document and record cue-bucket hits.
    for path in paths:
        text = _read_text(path)
        title = _document_title(path, text)
        inventory_text = _inventory_text_window(text)
        heading_phrases = _extract_candidate_phrases(title, min_words=2, max_words=4)
        sentence_rows = _sentences(inventory_text)
        per_doc_counter: Counter[str] = Counter()
        per_doc_hits: defaultdict[str, dict[str, int]] = defaultdict(
            lambda: {"definition_hits": 0, "distinction_hits": 0, "qualification_hits": 0, "heading_hits": 0}
        )

        for phrase in heading_phrases:
            per_doc_counter[phrase] += 1
            per_doc_hits[phrase]["heading_hits"] += 1

        for sentence in sentence_rows:
            sentence_phrases = _extract_candidate_phrases(sentence)
            definition_hit = any(pattern.search(sentence) for pattern in _DEFINITION_PATTERNS)
            distinction_hit = any(pattern.search(sentence) for pattern in _DISTINCTION_PATTERNS)
            qualification_hit = any(pattern.search(sentence) for pattern in _QUALIFICATION_PATTERNS)
            for phrase in sentence_phrases:
                per_doc_counter[phrase] += 1
                if definition_hit:
                    per_doc_hits[phrase]["definition_hits"] += 1
                if distinction_hit:
                    per_doc_hits[phrase]["distinction_hits"] += 1
                if qualification_hit:
                    per_doc_hits[phrase]["qualification_hits"] += 1

        document_rows.append(
            {
                "path": str(path),
                "title": title,
                "top_phrases": [{"phrase": phrase, "count": count} for phrase, count in per_doc_counter.most_common(8)],
            }
        )

        # Pass 2: fold the per-document counters into corpus-level stats.
        for phrase, count in per_doc_counter.items():
            stats = phrase_stats.setdefault(
                phrase,
                {
                    "phrase": phrase,
                    "occurrence_count": 0,
                    "document_count": 0,
                    "definition_hits": 0,
                    "distinction_hits": 0,
                    "qualification_hits": 0,
                    "heading_hits": 0,
                    "source_paths": [],
                },
            )
            stats["occurrence_count"] += count
            stats["document_count"] += 1
            stats["definition_hits"] += per_doc_hits[phrase]["definition_hits"]
            stats["distinction_hits"] += per_doc_hits[phrase]["distinction_hits"]
            stats["qualification_hits"] += per_doc_hits[phrase]["qualification_hits"]
            stats["heading_hits"] += per_doc_hits[phrase]["heading_hits"]
            stats["source_paths"].append(str(path))

    # Score, drop generic and redundant phrases, then rank.
    phrase_rows = list(phrase_stats.values())
    for row in phrase_rows:
        row["seed_match"] = _slugify(row["phrase"]) in seed_slugs
        row["score"] = _score_phrase(row, seed_slugs)
        row["translation_priority"] = bool(
            row["seed_match"] or row["definition_hits"] or row["distinction_hits"] or row["qualification_hits"]
        )
        row["source_paths"] = sorted(set(row["source_paths"]))
    phrase_rows = _filter_phrase_rows(phrase_rows, seed_slugs)
    phrase_rows.sort(
        key=lambda item: (-float(item["score"]), -int(item["document_count"]), -int(item["occurrence_count"]), item["phrase"])
    )

    return {
        "summary": {
            "document_count": len(paths),
            "distinct_phrase_count": len(phrase_rows),
            "seed_term_count": len(seed_terms),
            "translation_priority_count": sum(1 for row in phrase_rows if row["translation_priority"]),
        },
        "input_paths": [str(Path(item)) for item in inputs],
        "seed_terms": seed_terms,
        "prioritized_concepts": [
            {
                "phrase": row["phrase"],
                "score": row["score"],
                "document_count": row["document_count"],
                "occurrence_count": row["occurrence_count"],
                "seed_match": row["seed_match"],
                "translation_priority": row["translation_priority"],
                "definition_hits": row["definition_hits"],
                "distinction_hits": row["distinction_hits"],
                "qualification_hits": row["qualification_hits"],
            }
            for row in phrase_rows[:top_n]
        ],
        "phrase_rows": phrase_rows[:top_n],
        "documents": document_rows,
    }

def write_archive_phrase_inventory_report(
    inputs: list[str | Path],
    out_path: str | Path,
    *,
    seed_terms: list[str] | None = None,
    top_n: int = 50,
) -> dict[str, Any]:
    report = build_archive_phrase_inventory(inputs, seed_terms=seed_terms, top_n=top_n)
    out = Path(out_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(report, indent=2), encoding="utf-8")
    md_path = out.with_suffix(".md")
    lines = [
        "# Archive Phrase Inventory",
        "",
        f"- documents: `{report['summary']['document_count']}`",
        f"- distinct phrases: `{report['summary']['distinct_phrase_count']}`",
        f"- seed terms: `{report['summary']['seed_term_count']}`",
        f"- translation-priority phrases: `{report['summary']['translation_priority_count']}`",
        "",
        "## Prioritized Concepts",
    ]
    for item in report["prioritized_concepts"][:20]:
        flags: list[str] = []
        if item["seed_match"]:
            flags.append("seed")
        if item["translation_priority"]:
            flags.append("translation")
        if item["distinction_hits"]:
            flags.append(f"distinctions={item['distinction_hits']}")
        if item["definition_hits"]:
            flags.append(f"definitions={item['definition_hits']}")
        if item["qualification_hits"]:
            flags.append(f"qualifications={item['qualification_hits']}")
        suffix = f" ({', '.join(flags)})" if flags else ""
        lines.append(f"- `{item['phrase']}` score={item['score']} docs={item['document_count']} hits={item['occurrence_count']}{suffix}")
    md_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return {"report_path": str(out), "markdown_path": str(md_path), "summary": report["summary"]}
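A minimal usage sketch, assuming the module lands as didactopus.archive_phrase_inventory (the import path used by the new tests); the bundle path and seed terms here are hypothetical:

    from didactopus.archive_phrase_inventory import build_archive_phrase_inventory

    # Hypothetical bundle directory and seed terms.
    report = build_archive_phrase_inventory(
        ["archives/evolution-bundle"],
        seed_terms=["natural selection", "genetic drift"],
        top_n=10,
    )
    for item in report["prioritized_concepts"]:
        print(item["phrase"], item["score"], item["translation_priority"])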

@@ -8,6 +8,7 @@ from .config import load_config
 from .doclift_bundle_demo import run_doclift_bundle_demo
 from .groundrecall_pack_bridge import run_doclift_bundle_with_groundrecall
 from .augmentation_bundle_probe import write_probe_report
+from .archive_phrase_inventory import write_archive_phrase_inventory_report
 from .notebook_page import export_notebook_page_from_groundrecall_bundle
 from .notebook_page import export_notebook_page_from_groundrecall_store
 from .review_loader import load_draft_pack
@@ -74,6 +75,15 @@ def build_parser() -> argparse.ArgumentParser:
     augmentation_probe_parser.add_argument("augmentation_bundle")
     augmentation_probe_parser.add_argument("groundrecall_query_bundle")
     augmentation_probe_parser.add_argument("output_path")
+
+    phrase_inventory_parser = subparsers.add_parser(
+        "archive-phrase-inventory",
+        help="Extract and rank repeated phrase candidates from archive-style source bundles",
+    )
+    phrase_inventory_parser.add_argument("output_path")
+    phrase_inventory_parser.add_argument("input_paths", nargs="+")
+    phrase_inventory_parser.add_argument("--seed-term", action="append", default=[])
+    phrase_inventory_parser.add_argument("--top-n", type=int, default=50)
     return parser

@@ -169,4 +179,13 @@ def main() -> None:
         )
         print(summary)
         return
+    if args.command == "archive-phrase-inventory":
+        summary = write_archive_phrase_inventory_report(
+            args.input_paths,
+            args.output_path,
+            seed_terms=args.seed_term,
+            top_n=args.top_n,
+        )
+        print(summary)
+        return
     build_parser().print_help()
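A sketch of the programmatic call behind the new subcommand, with hypothetical paths; the argument order mirrors "archive-phrase-inventory <output_path> <input_paths...> --seed-term ... --top-n ...":

    from didactopus.archive_phrase_inventory import write_archive_phrase_inventory_report

    # Hypothetical inputs; a sibling out/report.md is written next to the JSON.
    summary = write_archive_phrase_inventory_report(
        ["notes/"],
        "out/report.json",
        seed_terms=["genetic drift"],
        top_n=25,
    )
    print(summary["markdown_path"])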
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,74 @@
from __future__ import annotations

import json
from pathlib import Path

from didactopus.archive_phrase_inventory import build_archive_phrase_inventory
from didactopus.archive_phrase_inventory import write_archive_phrase_inventory_report

def test_build_archive_phrase_inventory_prioritizes_repeated_seeded_distinctions(tmp_path: Path) -> None:
    docs = tmp_path / "bundle" / "documents"
    (docs / "intro").mkdir(parents=True)
    (docs / "drift").mkdir(parents=True)
    (docs / "intro" / "document.md").write_text(
        "# Introduction to Evolutionary Biology\n\n"
        "Natural selection is not identical to genetic drift. "
        "Common descent refers to the branching history of populations.\n"
    )
    (docs / "drift" / "document.md").write_text(
        "# Drift and Selection\n\n"
        "Genetic drift can change allele frequencies. "
        "Natural selection and genetic drift should be distinguished in explanation.\n"
    )

    report = build_archive_phrase_inventory(
        [tmp_path / "bundle"],
        seed_terms=["natural selection", "genetic drift", "common descent"],
        top_n=10,
    )

    phrases = [item["phrase"] for item in report["prioritized_concepts"]]
    assert "natural selection" in phrases
    assert "genetic drift" in phrases
    row = next(item for item in report["prioritized_concepts"] if item["phrase"] == "genetic drift")
    assert row["seed_match"] is True
    assert row["translation_priority"] is True
    assert row["document_count"] >= 2

def test_write_archive_phrase_inventory_report_writes_json_and_markdown(tmp_path: Path) -> None:
    source = tmp_path / "notes.md"
    source.write_text(
        "# Heritable Change\n\n"
        "Phenotypic plasticity does not by itself imply heritable evolutionary change.\n"
    )
    out = tmp_path / "report.json"
    summary = write_archive_phrase_inventory_report(
        [source],
        out,
        seed_terms=["phenotypic plasticity"],
        top_n=5,
    )

    assert out.exists()
    assert out.with_suffix(".md").exists()
    payload = json.loads(out.read_text())
    assert payload["summary"]["document_count"] == 1
    assert summary["summary"]["distinct_phrase_count"] >= 1

def test_bundle_control_files_are_skipped(tmp_path: Path) -> None:
    bundle = tmp_path / "augmentation"
    snippets = bundle / "snippets"
    snippets.mkdir(parents=True)
    (bundle / "bundle.yaml").write_text("title: test\n")
    (snippets / "concept-alignment.yaml").write_text("items: []\n")
    (snippets / "plasticity.md").write_text(
        "# Plasticity\n\nPlasticity can mislead evolutionary inference if heredity is not checked.\n"
    )

    report = build_archive_phrase_inventory([bundle], top_n=10)
    assert report["summary"]["document_count"] == 1
    phrases = [item["phrase"] for item in report["prioritized_concepts"]]
    assert any("plasticity" in phrase or "evolutionary inference" in phrase for phrase in phrases)
|
||||||
Loading…
Reference in New Issue