Classify converted documents and improve titles

2026-04-22 21:26:37 -04:00 · 2026-04-22 21:26:37 -04:00 · bb8d54aa15
parent 787e3e7330
commit bb8d54aa15
4 changed files with 104 additions and 1 deletion
--- a/src/doclift/convert.py
+++ b/src/doclift/convert.py
@ -4,6 +4,7 @@ from pathlib import Path

 from .legacy_doc import (
    build_layout_manifest,
+    classify_document,
    clean_text,
    collect_figure_assets,
    extract_references,
@ -27,6 +28,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
    raw = run_catdoc(source_path)
    cleaned = clean_text(raw)
    title = extract_title(cleaned, source_path.stem)
+    document_kind = classify_document(cleaned, source_path)
    body = strip_title(cleaned, title)
    layout_body = normalize_text_preserve_layout(strip_title(raw, title))
    tables = extract_tables(layout_body)
@ -64,6 +66,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
    return DocumentBundle(
        document_id=slugify(title),
        title=title,
+        document_kind=document_kind,
        source_path=str(source_path),
        output_dir=str(doc_out),
        markdown_path=str(markdown_path),
@ -98,6 +101,7 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
                    {
                        "document_id": bundle.document_id,
                        "title": bundle.title,
+                        "document_kind": bundle.document_kind,
                        "table_count": bundle.table_count,
                        "figure_reference_count": bundle.figure_reference_count,
                    }
--- a/src/doclift/legacy_doc.py
+++ b/src/doclift/legacy_doc.py
@ -55,6 +55,42 @@ def normalize_text_preserve_layout(text: str) -> str:

 def extract_title(text: str, fallback: str) -> str:
    lines = text.splitlines()
+    nonempty = [line.strip() for line in lines if line.strip()]
+    if not nonempty:
+        return fallback
+
+    joined = " ".join(nonempty[:8])
+    upper_joined = joined.upper()
+    first = nonempty[0]
+
+    if first.upper().startswith("MAKE-UP EXAM"):
+        return first
+    if first.upper() in {"EXAM I", "EXAM II"}:
+        return first
+    if "FINAL EXAM" in upper_joined:
+        for line in nonempty[:8]:
+            if "FINAL EXAM" in line.upper():
+                return line
+    if "CLASS NOTES" in upper_joined:
+        title_parts: list[str] = []
+        started = False
+        for line in nonempty[:6]:
+            upper = line.upper()
+            if upper.startswith("MARB "):
+                continue
+            if upper == "CLASS NOTES":
+                break
+            if upper in {"SPRING 2000", "SPRING 1999", "SPRING 2001"}:
+                continue
+            started = True
+            title_parts.append(line)
+        if started:
+            return " ".join(title_parts)
+    if first.upper().startswith("MARB ") and len(nonempty) > 1:
+        second = nonempty[1]
+        if re.match(r"^\d+\s+Credit\b", second, re.IGNORECASE):
+            return first
+
    for index, line in enumerate(lines):
        stripped = line.strip()
        if not stripped:
@ -71,12 +107,15 @@ def extract_title(text: str, fallback: str) -> str:
            return stripped
        if stripped.upper() in {
            "SPRING 2000",
+            "SPRING 1999",
            "MARB 401",
            "MARB 482 SEMINAR IN MARINE BIOLOGY",
            "COURSE SYLLABUS",
            "EXAM I",
            "EXAM II",
            "FINAL EXAM SPRING 1999",
+            "CLASS NOTES",
+            "OF",
        }:
            continue
        if stripped.startswith(("February ", "April ")):
@ -85,6 +124,24 @@ def extract_title(text: str, fallback: str) -> str:
    return fallback


+def classify_document(text: str, source_path: Path) -> str:  # Return a kind label for the document from its opening text and file name; the first matching rule below wins.
+    nonempty = [line.strip() for line in text.splitlines() if line.strip()]  # stripped lines with blank ones dropped
+    joined = " ".join(nonempty[:10]).upper()  # only the first 10 non-blank lines are inspected, space-joined and upper-cased
+    name = source_path.name.upper()  # upper-cased so the file-name checks below ignore case
+
+    if name.startswith("SYLLABUS") or "COURSE SYLLABUS" in joined:  # checked first, so a syllabus match beats every later rule
+        return "syllabus"
+    if "FINAL EXAM" in joined:  # tested before the generic exam rule, so a final is never labelled plain "exam"
+        return "final_exam"
+    if name.startswith("EXAM") or name.startswith("MAKE-UP") or re.match(r"^EXAM\b", joined):  # re.match anchors at the start of joined, i.e. the first non-blank line
+        return "exam"
+    if "CLASS NOTES" in joined or name == "COVER.DOC":  # name is already upper-cased, so any casing of cover.doc matches
+        return "cover_notes"
+    if re.match(r"^LECTURE\s+\d+\.", joined):  # text must open with LECTURE, whitespace, digits and a literal dot
+        return "lecture"
+    return "document"  # fallback when no rule matched
+
+
 def strip_title(text: str, title: str) -> str:
    lines = text.splitlines()
    normalized_title = " ".join(title.split())
--- a/src/doclift/schemas.py
+++ b/src/doclift/schemas.py
@ -34,6 +34,7 @@ class FigureAsset(BaseModel):
 class DocumentBundle(BaseModel):
    document_id: str
    title: str
+    document_kind: str = "document"
    source_path: str
    output_dir: str
    markdown_path: str
--- a/tests/test_legacy_doc.py
+++ b/tests/test_legacy_doc.py
@ -1,4 +1,13 @@
-from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets
+from pathlib import Path
+
+from doclift.legacy_doc import (
+    FigureAsset,
+    classify_document,
+    extract_references,
+    extract_tables,
+    extract_title,
+    link_related_assets,
+)


 def test_extract_references_dedupes() -> None:
@ -45,3 +54,35 @@ def test_link_related_assets_matches_explicit_figure_refs() -> None:
    ]
    matched = link_related_assets(["Fig. 5.1"], assets)
    assert [asset.asset_id for asset in matched] == ["a1"]
+
+
+def test_extract_title_prefers_exam_headers() -> None:  # an exam header on the first non-blank line is returned as the title
+    text = "\n".join(
+        [
+            "EXAM I",  # first non-blank line; extract_title returns it when it is exactly EXAM I or EXAM II
+            "February 25, 1999",
+            "Answer three of the following essay questions.",
+        ]
+    )
+    assert extract_title(text, "fallback") == "EXAM I"  # the header itself, not the fallback string
+
+
+def test_extract_title_handles_cover_sheet() -> None:  # a multi-line cover-sheet title is joined into a single title string
+    text = "\n".join(
+        [
+            "MARB 401",  # skipped by extract_title: starts with "MARB "
+            "PHYSIOLOGICAL ECOLOGY",
+            "OF",
+            "MARINE MAMMALS",
+            "CLASS NOTES",  # extract_title stops collecting title parts at this line
+            "SPRING 2000",
+        ]
+    )
+    assert extract_title(text, "fallback") == "PHYSIOLOGICAL ECOLOGY OF MARINE MAMMALS"  # the three collected lines joined with single spaces
+
+
+def test_classify_document_kinds() -> None:  # one representative input per kind: exam, final_exam, cover_notes, syllabus
+    assert classify_document("EXAM II\nApril 6, 1999\n", Path("Exam II-99.doc")) == "exam"  # file name starts with EXAM (case-insensitive)
+    assert classify_document("FINAL EXAM SPRING 1999\nAnswer 3 questions\n", Path("final exam.991.doc")) == "final_exam"  # text contains FINAL EXAM
+    assert classify_document("MARB 401\nPHYSIOLOGICAL ECOLOGY\nOF\nMARINE MAMMALS\nCLASS NOTES\n", Path("COVER.doc")) == "cover_notes"  # text contains CLASS NOTES; the COVER.doc file name would match as well
+    assert classify_document("SPRING 2000\nMARB 401\nPhysiological Ecology of Marine Mammals\n", Path("Syllabus 401.001.doc")) == "syllabus"  # matched by the file name alone; the text has no COURSE SYLLABUS