Classify converted documents and improve titles

2026-04-22 21:26:37 -04:00 · 2026-04-22 21:26:37 -04:00 · bb8d54aa15
parent 787e3e7330
commit bb8d54aa15
4 changed files with 104 additions and 1 deletions
--- a/src/doclift/convert.py
+++ b/src/doclift/convert.py
@ -4,6 +4,7 @@ from pathlib import Path
 from .legacy_doc import (
    build_layout_manifest,
    classify_document,
    clean_text,
    collect_figure_assets,
    extract_references,
@ -27,6 +28,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
    raw = run_catdoc(source_path)
    cleaned = clean_text(raw)
    title = extract_title(cleaned, source_path.stem)
    document_kind = classify_document(cleaned, source_path)
    body = strip_title(cleaned, title)
    layout_body = normalize_text_preserve_layout(strip_title(raw, title))
    tables = extract_tables(layout_body)
@ -64,6 +66,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
    return DocumentBundle(
        document_id=slugify(title),
        title=title,
        document_kind=document_kind,
        source_path=str(source_path),
        output_dir=str(doc_out),
        markdown_path=str(markdown_path),
@ -98,6 +101,7 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
                    {
                        "document_id": bundle.document_id,
                        "title": bundle.title,
                        "document_kind": bundle.document_kind,
                        "table_count": bundle.table_count,
                        "figure_reference_count": bundle.figure_reference_count,
                    }
--- a/src/doclift/legacy_doc.py
+++ b/src/doclift/legacy_doc.py
@ -55,6 +55,42 @@ def normalize_text_preserve_layout(text: str) -> str:
 def extract_title(text: str, fallback: str) -> str:
    lines = text.splitlines()
    nonempty = [line.strip() for line in lines if line.strip()]
    if not nonempty:
        return fallback
    joined = " ".join(nonempty[:8])
    upper_joined = joined.upper()
    first = nonempty[0]
    if first.upper().startswith("MAKE-UP EXAM"):
        return first
    if first.upper() in {"EXAM I", "EXAM II"}:
        return first
    if "FINAL EXAM" in upper_joined:
        for line in nonempty[:8]:
            if "FINAL EXAM" in line.upper():
                return line
    if "CLASS NOTES" in upper_joined:
        title_parts: list[str] = []
        started = False
        for line in nonempty[:6]:
            upper = line.upper()
            if upper.startswith("MARB "):
                continue
            if upper == "CLASS NOTES":
                break
            if upper in {"SPRING 2000", "SPRING 1999", "SPRING 2001"}:
                continue
            started = True
            title_parts.append(line)
        if started:
            return " ".join(title_parts)
    if first.upper().startswith("MARB ") and len(nonempty) > 1:
        second = nonempty[1]
        if re.match(r"^\d+\s+Credit\b", second, re.IGNORECASE):
            return first
    for index, line in enumerate(lines):
        stripped = line.strip()
        if not stripped:
@ -71,12 +107,15 @@ def extract_title(text: str, fallback: str) -> str:
            return stripped
        if stripped.upper() in {
            "SPRING 2000",
            "SPRING 1999",
            "MARB 401",
            "MARB 482 SEMINAR IN MARINE BIOLOGY",
            "COURSE SYLLABUS",
            "EXAM I",
            "EXAM II",
            "FINAL EXAM SPRING 1999",
            "CLASS NOTES",
            "OF",
        }:
            continue
        if stripped.startswith(("February ", "April ")):
@ -85,6 +124,24 @@ def extract_title(text: str, fallback: str) -> str:
    return fallback
 def classify_document(text: str, source_path: Path) -> str:
    """Return a coarse kind label for a converted document.

    The decision is based on two upper-cased inputs: the first ten non-blank
    lines of ``text`` joined with single spaces (the "header"), and the file
    name of ``source_path``.  Rules are evaluated in order; the first match
    wins.

    Args:
        text: Cleaned document text.
        source_path: Path of the source file; only its ``name`` is used.

    Returns:
        One of ``"syllabus"``, ``"final_exam"``, ``"exam"``,
        ``"cover_notes"``, ``"lecture"`` or ``"document"`` (the default when
        no rule matches, including for empty text).
    """
    nonempty = [line.strip() for line in text.splitlines() if line.strip()]
    joined = " ".join(nonempty[:10]).upper()
    name = source_path.name.upper()

    if name.startswith("SYLLABUS") or "COURSE SYLLABUS" in joined:
        return "syllabus"
    # Checked before the generic exam rule so final exams get their own kind.
    if "FINAL EXAM" in joined:
        return "final_exam"
    # extract_title treats a first line starting with "MAKE-UP EXAM" as an
    # exam title, so recognise that header here too instead of relying on the
    # file name alone.
    if (
        name.startswith(("EXAM", "MAKE-UP"))
        or joined.startswith("MAKE-UP EXAM")
        or re.match(r"^EXAM\b", joined)
    ):
        return "exam"
    if "CLASS NOTES" in joined or name == "COVER.DOC":
        return "cover_notes"
    if re.match(r"^LECTURE\s+\d+\.", joined):
        return "lecture"
    return "document"
 def strip_title(text: str, title: str) -> str:
    lines = text.splitlines()
    normalized_title = " ".join(title.split())
--- a/src/doclift/schemas.py
+++ b/src/doclift/schemas.py
@ -34,6 +34,7 @@ class FigureAsset(BaseModel):
 class DocumentBundle(BaseModel):
    document_id: str
    title: str
    document_kind: str = "document"
    source_path: str
    output_dir: str
    markdown_path: str
--- a/tests/test_legacy_doc.py
+++ b/tests/test_legacy_doc.py
@ -1,4 +1,13 @@
-from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets
+from pathlib import Path
 from doclift.legacy_doc import (
    FigureAsset,
    classify_document,
    extract_references,
    extract_tables,
    extract_title,
    link_related_assets,
 )
 def test_extract_references_dedupes() -> None:
@ -45,3 +54,35 @@ def test_link_related_assets_matches_explicit_figure_refs() -> None:
    ]
    matched = link_related_assets(["Fig. 5.1"], assets)
    assert [asset.asset_id for asset in matched] == ["a1"]
 def test_extract_title_prefers_exam_headers() -> None:
    """A leading ``EXAM I`` line is returned as the title, not the fallback."""
    exam_lines = (
        "EXAM I",
        "February 25, 1999",
        "Answer three of the following essay questions.",
    )
    document = "\n".join(exam_lines)
    title = extract_title(document, "fallback")
    assert title == "EXAM I"
 def test_extract_title_handles_cover_sheet() -> None:
    """A multi-line cover sheet yields the course name as a single-line title."""
    cover_sheet_lines = (
        "MARB 401",
        "PHYSIOLOGICAL ECOLOGY",
        "OF",
        "MARINE MAMMALS",
        "CLASS NOTES",
        "SPRING 2000",
    )
    expected_title = "PHYSIOLOGICAL ECOLOGY OF MARINE MAMMALS"
    actual_title = extract_title("\n".join(cover_sheet_lines), "fallback")
    assert actual_title == expected_title
 def test_classify_document_kinds() -> None:
    """Check the kind label for one exam, final exam, cover sheet and syllabus sample."""
    # (document text, source file name, expected kind)
    samples = (
        ("EXAM II\nApril 6, 1999\n", "Exam II-99.doc", "exam"),
        ("FINAL EXAM SPRING 1999\nAnswer 3 questions\n", "final exam.991.doc", "final_exam"),
        ("MARB 401\nPHYSIOLOGICAL ECOLOGY\nOF\nMARINE MAMMALS\nCLASS NOTES\n", "COVER.doc", "cover_notes"),
        ("SPRING 2000\nMARB 401\nPhysiological Ecology of Marine Mammals\n", "Syllabus 401.001.doc", "syllabus"),
    )
    for body, file_name, expected_kind in samples:
        assert classify_document(body, Path(file_name)) == expected_kind