Add archive phrase inventory for Notebook seeding
This commit is contained in:
parent
ce2188816a
commit
80a4d02caa
|
|
@ -0,0 +1,365 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Sentence boundary: split after '.', '!', or '?' followed by whitespace.
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
# Word token: a letter followed by letters, hyphens, or apostrophes.
_TOKEN_RE = re.compile(r"[A-Za-z][A-Za-z\-']*")
# Markdown ATX heading (up to 3 leading spaces, 1-6 '#'); group 1 is the heading text.
_HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s+(.*)$", re.MULTILINE)
# Markdown bullet prefix ('-', '*', '+') at the start of a line.
_BULLET_RE = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)

# Cue patterns: a sentence matching any of these is counted as defining a term.
_DEFINITION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bdefined as\b",
        r"\brefers to\b",
        r"\bmeans\b",
        r"\bis (?:a|an|the)\b",
        r"\bdescribes\b",
        r"\bconsists of\b",
    )
]
# Cue patterns for sentences that contrast or distinguish two concepts.
_DISTINCTION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bversus\b|\bvs\.?\b",
        r"\bnot\b.+\bbut\b",
        r"\bdistinguish\b",
        r"\bcontrast\b",
        r"\bcompare\b",
        r"\bdifferent from\b",
        r"\bdoes not imply\b",
        r"\bnot identical\b",
    )
]
# Cue patterns for hedges / qualifications that limit a claim's scope.
_QUALIFICATION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bhowever\b",
        r"\balthough\b",
        r"\bbut\b",
        r"\bunless\b",
        r"\bonly if\b",
        r"\bdepends on\b",
        r"\bmay\b",
        r"\bcan\b",
        r"\brequires\b",
        r"\bcannot\b",
        r"\bdoes not\b",
    )
]
# Common English function words; candidate phrases may not start or end with these,
# and may contain at most one of them internally (see _extract_candidate_phrases).
_STOPWORDS = {
    "a", "about", "after", "all", "also", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "between", "both", "but", "by", "can", "could",
    "did", "do", "does", "each", "for", "from", "had", "has", "have", "how", "if",
    "in", "into", "is", "it", "its", "may", "more", "most", "must", "no", "not",
    "of", "on", "only", "or", "other", "our", "out", "over", "same", "should", "so",
    "some", "such", "than", "that", "the", "their", "them", "there", "these", "they",
    "this", "those", "through", "to", "too", "under", "unless", "up", "use", "using",
    "very", "was", "we", "were", "what", "when", "which", "while", "with", "would",
}
# Phrases too generic to be useful concepts; unconditionally filtered out.
_GENERIC_PHRASES = {
    "source file",
    "new york",
}
|
||||
|
||||
|
||||
def _slugify(text: str) -> str:
|
||||
return re.sub(r"[^a-z0-9]+", "-", text.strip().lower()).strip("-")
|
||||
|
||||
|
||||
def _discover_source_paths(inputs: list[str | Path]) -> list[Path]:
|
||||
discovered: list[Path] = []
|
||||
seen: set[Path] = set()
|
||||
for item in inputs:
|
||||
path = Path(item)
|
||||
candidates: list[Path]
|
||||
if path.is_file():
|
||||
candidates = [path]
|
||||
elif (path / "documents").exists():
|
||||
candidates = sorted((path / "documents").glob("*/document.md"))
|
||||
elif path.is_dir():
|
||||
candidates = sorted(
|
||||
p for p in path.rglob("*") if p.is_file() and p.suffix.lower() in {".md", ".txt"}
|
||||
)
|
||||
else:
|
||||
candidates = []
|
||||
for candidate in candidates:
|
||||
if candidate.name in {"concept-alignment.yaml", "bundle.yaml", "bundle.yml"}:
|
||||
continue
|
||||
resolved = candidate.resolve()
|
||||
if resolved not in seen:
|
||||
seen.add(resolved)
|
||||
discovered.append(candidate)
|
||||
return discovered
|
||||
|
||||
|
||||
def _read_text(path: Path) -> str:
|
||||
return path.read_text(encoding="utf-8", errors="ignore")
|
||||
|
||||
|
||||
def _document_title(path: Path, text: str) -> str:
|
||||
match = _HEADING_RE.search(text)
|
||||
if match:
|
||||
return " ".join(match.group(1).split()).strip()
|
||||
return path.parent.name.replace("-", " ").strip() or path.stem.replace("-", " ").strip()
|
||||
|
||||
|
||||
def _clean_sentence(text: str) -> str:
|
||||
text = _BULLET_RE.sub("", text)
|
||||
return " ".join(text.split()).strip(" -")
|
||||
|
||||
|
||||
def _sentences(text: str) -> list[str]:
    """Split *text* into cleaned, non-empty sentences; newlines count as spaces."""
    flattened = text.replace("\n", " ")
    results: list[str] = []
    for raw in _SENTENCE_SPLIT_RE.split(flattened):
        cleaned = _clean_sentence(raw)
        if cleaned:
            results.append(cleaned)
    return results
|
||||
|
||||
|
||||
def _inventory_text_window(
    text: str,
    *,
    max_chars: int = 80000,
    max_sentences: int = 400,
) -> str:
    """Bound the text considered for inventory: keep only the first *max_chars*
    characters, then at most *max_sentences* cleaned sentences, joined by spaces."""
    window = _sentences(text[:max_chars])
    return " ".join(window[:max_sentences])
|
||||
|
||||
|
||||
def _extract_candidate_phrases(text: str, min_words: int = 2, max_words: int = 4) -> list[str]:
    """Return unique lowercase n-gram phrases (min_words..max_words tokens) from
    *text*, in first-seen order, filtered by stopword and token-length rules."""
    words = [token.lower() for token in _TOKEN_RE.findall(text)]
    phrases: list[str] = []
    emitted: set[str] = set()
    for begin in range(len(words)):
        for width in range(min_words, max_words + 1):
            gram = words[begin : begin + width]
            if len(gram) != width:
                # Window runs past the end; wider windows would be short too.
                break
            # Phrases may not start or end with a stopword ...
            if gram[0] in _STOPWORDS or gram[-1] in _STOPWORDS:
                continue
            # ... and may contain at most one stopword overall.
            if sum(token in _STOPWORDS for token in gram) > 1:
                continue
            # Reject very short tokens except the connectives "of" / "vs".
            if any(len(token) < 3 and token not in {"of", "vs"} for token in gram):
                continue
            candidate = " ".join(gram)
            if candidate not in emitted:
                emitted.add(candidate)
                phrases.append(candidate)
    return phrases
|
||||
|
||||
|
||||
def _score_phrase(entry: dict[str, Any], seed_slugs: set[str]) -> float:
|
||||
phrase_slug = _slugify(entry["phrase"])
|
||||
seed_bonus = 3 if phrase_slug in seed_slugs else 0
|
||||
prefix_bonus = 2 if any(phrase_slug in seed or seed in phrase_slug for seed in seed_slugs) else 0
|
||||
return (
|
||||
entry["document_count"] * 3
|
||||
+ entry["occurrence_count"]
|
||||
+ entry["definition_hits"] * 2
|
||||
+ entry["distinction_hits"] * 2
|
||||
+ entry["qualification_hits"]
|
||||
+ entry["heading_hits"] * 2
|
||||
+ seed_bonus
|
||||
+ prefix_bonus
|
||||
)
|
||||
|
||||
|
||||
def _is_redundant_subphrase(
|
||||
entry: dict[str, Any],
|
||||
group_rows: list[dict[str, Any]],
|
||||
seed_slugs: set[str],
|
||||
) -> bool:
|
||||
phrase = str(entry["phrase"])
|
||||
phrase_slug = _slugify(phrase)
|
||||
if phrase_slug in seed_slugs:
|
||||
return False
|
||||
for other in group_rows:
|
||||
if other is entry:
|
||||
continue
|
||||
other_phrase = str(other["phrase"])
|
||||
if len(other_phrase) <= len(phrase):
|
||||
continue
|
||||
if not (other_phrase.startswith(f"{phrase} ") or other_phrase.endswith(f" {phrase}")):
|
||||
continue
|
||||
if (
|
||||
other["occurrence_count"] == entry["occurrence_count"]
|
||||
and other["document_count"] == entry["document_count"]
|
||||
and other["definition_hits"] >= entry["definition_hits"]
|
||||
and other["distinction_hits"] >= entry["distinction_hits"]
|
||||
and other["qualification_hits"] >= entry["qualification_hits"]
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _filter_phrase_rows(rows: list[dict[str, Any]], seed_slugs: set[str]) -> list[dict[str, Any]]:
    """Drop generic phrases, then prune redundant sub-phrases within each
    (occurrence_count, document_count) group."""
    buckets: defaultdict[tuple[int, int], list[dict[str, Any]]] = defaultdict(list)
    for row in rows:
        if row["phrase"] in _GENERIC_PHRASES:
            continue
        bucket_key = (int(row["occurrence_count"]), int(row["document_count"]))
        buckets[bucket_key].append(row)
    return [
        row
        for bucket in buckets.values()
        for row in bucket
        if not _is_redundant_subphrase(row, bucket, seed_slugs)
    ]
|
||||
|
||||
|
||||
def build_archive_phrase_inventory(
    inputs: list[str | Path],
    *,
    seed_terms: list[str] | None = None,
    top_n: int = 50,
) -> dict[str, Any]:
    """Scan archive-style sources and rank repeated candidate phrases.

    Args:
        inputs: Files, bundle roots (with a ``documents/`` dir), or plain
            directories; expanded via ``_discover_source_paths``.
        seed_terms: Optional terms whose slug matches get score bonuses and
            are protected from sub-phrase pruning.
        top_n: Cap on rows returned in ``prioritized_concepts`` / ``phrase_rows``.

    Returns:
        Dict with ``summary``, ``input_paths``, ``seed_terms``,
        ``prioritized_concepts`` (trimmed projection), ``phrase_rows``
        (full rows), and per-document ``documents``.
    """
    paths = _discover_source_paths(inputs)
    # Normalize seeds: drop blanks, slugify for matching.
    seed_terms = [term.strip() for term in (seed_terms or []) if str(term).strip()]
    seed_slugs = {_slugify(term) for term in seed_terms}
    phrase_stats: dict[str, dict[str, Any]] = {}
    document_rows: list[dict[str, Any]] = []

    for path in paths:
        text = _read_text(path)
        title = _document_title(path, text)
        inventory_text = _inventory_text_window(text)
        heading_phrases = _extract_candidate_phrases(title, min_words=2, max_words=4)
        sentence_rows = _sentences(inventory_text)
        # Per-document accumulators, merged into phrase_stats below.
        per_doc_counter: Counter[str] = Counter()
        per_doc_hits: defaultdict[str, dict[str, int]] = defaultdict(
            lambda: {"definition_hits": 0, "distinction_hits": 0, "qualification_hits": 0, "heading_hits": 0}
        )

        # Title phrases count once and are flagged as heading evidence.
        for phrase in heading_phrases:
            per_doc_counter[phrase] += 1
            per_doc_hits[phrase]["heading_hits"] += 1

        for sentence in sentence_rows:
            sentence_phrases = _extract_candidate_phrases(sentence)
            # Sentence-level cues apply to every phrase found in the sentence.
            definition_hit = any(pattern.search(sentence) for pattern in _DEFINITION_PATTERNS)
            distinction_hit = any(pattern.search(sentence) for pattern in _DISTINCTION_PATTERNS)
            qualification_hit = any(pattern.search(sentence) for pattern in _QUALIFICATION_PATTERNS)
            for phrase in sentence_phrases:
                per_doc_counter[phrase] += 1
                if definition_hit:
                    per_doc_hits[phrase]["definition_hits"] += 1
                if distinction_hit:
                    per_doc_hits[phrase]["distinction_hits"] += 1
                if qualification_hit:
                    per_doc_hits[phrase]["qualification_hits"] += 1

        document_rows.append(
            {
                "path": str(path),
                "title": title,
                "top_phrases": [{"phrase": phrase, "count": count} for phrase, count in per_doc_counter.most_common(8)],
            }
        )

        # Fold this document's counts into the corpus-wide stats.
        for phrase, count in per_doc_counter.items():
            stats = phrase_stats.setdefault(
                phrase,
                {
                    "phrase": phrase,
                    "occurrence_count": 0,
                    "document_count": 0,
                    "definition_hits": 0,
                    "distinction_hits": 0,
                    "qualification_hits": 0,
                    "heading_hits": 0,
                    "source_paths": [],
                },
            )
            stats["occurrence_count"] += count
            stats["document_count"] += 1  # one increment per document containing the phrase
            stats["definition_hits"] += per_doc_hits[phrase]["definition_hits"]
            stats["distinction_hits"] += per_doc_hits[phrase]["distinction_hits"]
            stats["qualification_hits"] += per_doc_hits[phrase]["qualification_hits"]
            stats["heading_hits"] += per_doc_hits[phrase]["heading_hits"]
            stats["source_paths"].append(str(path))

    phrase_rows = list(phrase_stats.values())
    for row in phrase_rows:
        row["seed_match"] = _slugify(row["phrase"]) in seed_slugs
        row["score"] = _score_phrase(row, seed_slugs)
        # Priority for translation: seeded, or carries any definitional /
        # distinction / qualification evidence.
        row["translation_priority"] = bool(
            row["seed_match"] or row["definition_hits"] or row["distinction_hits"] or row["qualification_hits"]
        )
        row["source_paths"] = sorted(set(row["source_paths"]))
    phrase_rows = _filter_phrase_rows(phrase_rows, seed_slugs)
    # Descending score, then spread, then frequency; phrase text breaks ties deterministically.
    phrase_rows.sort(
        key=lambda item: (-float(item["score"]), -int(item["document_count"]), -int(item["occurrence_count"]), item["phrase"])
    )

    return {
        "summary": {
            "document_count": len(paths),
            "distinct_phrase_count": len(phrase_rows),
            "seed_term_count": len(seed_terms),
            "translation_priority_count": sum(1 for row in phrase_rows if row["translation_priority"]),
        },
        "input_paths": [str(Path(item)) for item in inputs],
        "seed_terms": seed_terms,
        "prioritized_concepts": [
            {
                "phrase": row["phrase"],
                "score": row["score"],
                "document_count": row["document_count"],
                "occurrence_count": row["occurrence_count"],
                "seed_match": row["seed_match"],
                "translation_priority": row["translation_priority"],
                "definition_hits": row["definition_hits"],
                "distinction_hits": row["distinction_hits"],
                "qualification_hits": row["qualification_hits"],
            }
            for row in phrase_rows[:top_n]
        ],
        "phrase_rows": phrase_rows[:top_n],
        "documents": document_rows,
    }
|
||||
|
||||
|
||||
def write_archive_phrase_inventory_report(
    inputs: list[str | Path],
    out_path: str | Path,
    *,
    seed_terms: list[str] | None = None,
    top_n: int = 50,
) -> dict[str, Any]:
    """Build the phrase inventory, write it as JSON plus a sibling markdown
    digest, and return both output paths alongside the summary."""
    report = build_archive_phrase_inventory(inputs, seed_terms=seed_terms, top_n=top_n)
    json_path = Path(out_path)
    json_path.parent.mkdir(parents=True, exist_ok=True)
    json_path.write_text(json.dumps(report, indent=2), encoding="utf-8")

    summary = report["summary"]
    markdown_path = json_path.with_suffix(".md")
    body: list[str] = [
        "# Archive Phrase Inventory",
        "",
        f"- documents: `{summary['document_count']}`",
        f"- distinct phrases: `{summary['distinct_phrase_count']}`",
        f"- seed terms: `{summary['seed_term_count']}`",
        f"- translation-priority phrases: `{summary['translation_priority_count']}`",
        "",
        "## Prioritized Concepts",
    ]
    # Only the top 20 concepts make the markdown digest.
    for item in report["prioritized_concepts"][:20]:
        flags: list[str] = []
        if item["seed_match"]:
            flags.append("seed")
        if item["translation_priority"]:
            flags.append("translation")
        if item["distinction_hits"]:
            flags.append(f"distinctions={item['distinction_hits']}")
        if item["definition_hits"]:
            flags.append(f"definitions={item['definition_hits']}")
        if item["qualification_hits"]:
            flags.append(f"qualifications={item['qualification_hits']}")
        suffix = f" ({', '.join(flags)})" if flags else ""
        body.append(
            f"- `{item['phrase']}` score={item['score']} docs={item['document_count']} hits={item['occurrence_count']}{suffix}"
        )
    markdown_path.write_text("\n".join(body) + "\n", encoding="utf-8")
    return {"report_path": str(json_path), "markdown_path": str(markdown_path), "summary": summary}
|
||||
|
|
@ -8,6 +8,7 @@ from .config import load_config
|
|||
from .doclift_bundle_demo import run_doclift_bundle_demo
|
||||
from .groundrecall_pack_bridge import run_doclift_bundle_with_groundrecall
|
||||
from .augmentation_bundle_probe import write_probe_report
|
||||
from .archive_phrase_inventory import write_archive_phrase_inventory_report
|
||||
from .notebook_page import export_notebook_page_from_groundrecall_bundle
|
||||
from .notebook_page import export_notebook_page_from_groundrecall_store
|
||||
from .review_loader import load_draft_pack
|
||||
|
|
@ -74,6 +75,15 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
augmentation_probe_parser.add_argument("augmentation_bundle")
|
||||
augmentation_probe_parser.add_argument("groundrecall_query_bundle")
|
||||
augmentation_probe_parser.add_argument("output_path")
|
||||
|
||||
phrase_inventory_parser = subparsers.add_parser(
|
||||
"archive-phrase-inventory",
|
||||
help="Extract and rank repeated phrase candidates from archive-style source bundles",
|
||||
)
|
||||
phrase_inventory_parser.add_argument("output_path")
|
||||
phrase_inventory_parser.add_argument("input_paths", nargs="+")
|
||||
phrase_inventory_parser.add_argument("--seed-term", action="append", default=[])
|
||||
phrase_inventory_parser.add_argument("--top-n", type=int, default=50)
|
||||
return parser
|
||||
|
||||
|
||||
|
|
@ -169,4 +179,13 @@ def main() -> None:
|
|||
)
|
||||
print(summary)
|
||||
return
|
||||
if args.command == "archive-phrase-inventory":
|
||||
summary = write_archive_phrase_inventory_report(
|
||||
args.input_paths,
|
||||
args.output_path,
|
||||
seed_terms=args.seed_term,
|
||||
top_n=args.top_n,
|
||||
)
|
||||
print(summary)
|
||||
return
|
||||
build_parser().print_help()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,74 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from didactopus.archive_phrase_inventory import build_archive_phrase_inventory
|
||||
from didactopus.archive_phrase_inventory import write_archive_phrase_inventory_report
|
||||
|
||||
|
||||
def test_build_archive_phrase_inventory_prioritizes_repeated_seeded_distinctions(tmp_path: Path) -> None:
    """Seeded phrases repeated across documents with distinction cues surface in the ranking."""
    documents_root = tmp_path / "bundle" / "documents"
    for folder in ("intro", "drift"):
        (documents_root / folder).mkdir(parents=True)
    (documents_root / "intro" / "document.md").write_text(
        "# Introduction to Evolutionary Biology\n\n"
        "Natural selection is not identical to genetic drift. "
        "Common descent refers to the branching history of populations.\n"
    )
    (documents_root / "drift" / "document.md").write_text(
        "# Drift and Selection\n\n"
        "Genetic drift can change allele frequencies. "
        "Natural selection and genetic drift should be distinguished in explanation.\n"
    )

    report = build_archive_phrase_inventory(
        [tmp_path / "bundle"],
        seed_terms=["natural selection", "genetic drift", "common descent"],
        top_n=10,
    )

    ranked = [entry["phrase"] for entry in report["prioritized_concepts"]]
    assert "natural selection" in ranked
    assert "genetic drift" in ranked
    drift_row = next(entry for entry in report["prioritized_concepts"] if entry["phrase"] == "genetic drift")
    assert drift_row["seed_match"] is True
    assert drift_row["translation_priority"] is True
    assert drift_row["document_count"] >= 2
|
||||
|
||||
|
||||
def test_write_archive_phrase_inventory_report_writes_json_and_markdown(tmp_path: Path) -> None:
    """The report writer emits a JSON payload plus a sibling markdown digest."""
    source = tmp_path / "notes.md"
    source.write_text(
        "# Heritable Change\n\n"
        "Phenotypic plasticity does not by itself imply heritable evolutionary change.\n"
    )
    json_target = tmp_path / "report.json"

    result = write_archive_phrase_inventory_report(
        [source],
        json_target,
        seed_terms=["phenotypic plasticity"],
        top_n=5,
    )

    assert json_target.exists()
    assert json_target.with_suffix(".md").exists()
    payload = json.loads(json_target.read_text())
    assert payload["summary"]["document_count"] == 1
    assert result["summary"]["distinct_phrase_count"] >= 1
|
||||
|
||||
|
||||
def test_bundle_control_files_are_skipped(tmp_path: Path) -> None:
    """YAML control files inside a bundle are excluded from phrase discovery."""
    bundle_root = tmp_path / "augmentation"
    snippet_dir = bundle_root / "snippets"
    snippet_dir.mkdir(parents=True)
    (bundle_root / "bundle.yaml").write_text("title: test\n")
    (snippet_dir / "concept-alignment.yaml").write_text("items: []\n")
    (snippet_dir / "plasticity.md").write_text(
        "# Plasticity\n\nPlasticity can mislead evolutionary inference if heredity is not checked.\n"
    )

    report = build_archive_phrase_inventory([bundle_root], top_n=10)

    assert report["summary"]["document_count"] == 1
    ranked = [entry["phrase"] for entry in report["prioritized_concepts"]]
    assert any("plasticity" in phrase or "evolutionary inference" in phrase for phrase in ranked)
|
||||
Loading…
Reference in New Issue