diff --git a/src/doclift/convert.py b/src/doclift/convert.py index b2489c6..fe43260 100755 --- a/src/doclift/convert.py +++ b/src/doclift/convert.py @@ -4,6 +4,7 @@ from pathlib import Path from .legacy_doc import ( build_layout_manifest, + classify_document, clean_text, collect_figure_assets, extract_references, @@ -27,6 +28,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = raw = run_catdoc(source_path) cleaned = clean_text(raw) title = extract_title(cleaned, source_path.stem) + document_kind = classify_document(cleaned, source_path) body = strip_title(cleaned, title) layout_body = normalize_text_preserve_layout(strip_title(raw, title)) tables = extract_tables(layout_body) @@ -64,6 +66,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = return DocumentBundle( document_id=slugify(title), title=title, + document_kind=document_kind, source_path=str(source_path), output_dir=str(doc_out), markdown_path=str(markdown_path), @@ -98,6 +101,7 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None { "document_id": bundle.document_id, "title": bundle.title, + "document_kind": bundle.document_kind, "table_count": bundle.table_count, "figure_reference_count": bundle.figure_reference_count, } diff --git a/src/doclift/legacy_doc.py b/src/doclift/legacy_doc.py index 98e7ee9..2391390 100755 --- a/src/doclift/legacy_doc.py +++ b/src/doclift/legacy_doc.py @@ -55,6 +55,42 @@ def normalize_text_preserve_layout(text: str) -> str: def extract_title(text: str, fallback: str) -> str: lines = text.splitlines() + nonempty = [line.strip() for line in lines if line.strip()] + if not nonempty: + return fallback + + joined = " ".join(nonempty[:8]) + upper_joined = joined.upper() + first = nonempty[0] + + if first.upper().startswith("MAKE-UP EXAM"): + return first + if first.upper() in {"EXAM I", "EXAM II"}: + return first + if "FINAL EXAM" in upper_joined: + for line in nonempty[:8]: + if "FINAL EXAM" in line.upper(): + return line + if "CLASS NOTES" in upper_joined: + title_parts: list[str] = [] + started = False + for line in nonempty[:6]: + upper = line.upper() + if upper.startswith("MARB "): + continue + if upper == "CLASS NOTES": + break + if upper in {"SPRING 2000", "SPRING 1999", "SPRING 2001"}: + continue + started = True + title_parts.append(line) + if started: + return " ".join(title_parts) + if first.upper().startswith("MARB ") and len(nonempty) > 1: + second = nonempty[1] + if re.match(r"^\d+\s+Credit\b", second, re.IGNORECASE): + return first + for index, line in enumerate(lines): stripped = line.strip() if not stripped: @@ -71,12 +107,15 @@ def extract_title(text: str, fallback: str) -> str: return stripped if stripped.upper() in { "SPRING 2000", + "SPRING 1999", "MARB 401", "MARB 482 SEMINAR IN MARINE BIOLOGY", "COURSE SYLLABUS", "EXAM I", "EXAM II", "FINAL EXAM SPRING 1999", + "CLASS NOTES", + "OF", }: continue if stripped.startswith(("February ", "April ")): @@ -85,6 +124,24 @@ def extract_title(text: str, fallback: str) -> str: return fallback +def classify_document(text: str, source_path: Path) -> str: + nonempty = [line.strip() for line in text.splitlines() if line.strip()] + joined = " ".join(nonempty[:10]).upper() + name = source_path.name.upper() + + if name.startswith("SYLLABUS") or "COURSE SYLLABUS" in joined: + return "syllabus" + if "FINAL EXAM" in joined: + return "final_exam" + if name.startswith("EXAM") or name.startswith("MAKE-UP") or re.match(r"^EXAM\b", joined): + return "exam" + if "CLASS NOTES" in joined or name == "COVER.DOC": + return "cover_notes" + if re.match(r"^LECTURE\s+\d+\.", joined): + return "lecture" + return "document" + + def strip_title(text: str, title: str) -> str: lines = text.splitlines() normalized_title = " ".join(title.split()) diff --git a/src/doclift/schemas.py b/src/doclift/schemas.py index 286f220..a69ff09 100755 --- a/src/doclift/schemas.py +++ b/src/doclift/schemas.py @@ -34,6 +34,7 @@ class FigureAsset(BaseModel): class DocumentBundle(BaseModel): document_id: str title: str + document_kind: str = "document" source_path: str output_dir: str markdown_path: str diff --git a/tests/test_legacy_doc.py b/tests/test_legacy_doc.py index 468c7be..345a6d4 100755 --- a/tests/test_legacy_doc.py +++ b/tests/test_legacy_doc.py @@ -1,4 +1,13 @@ -from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets +from pathlib import Path + +from doclift.legacy_doc import ( + FigureAsset, + classify_document, + extract_references, + extract_tables, + extract_title, + link_related_assets, +) def test_extract_references_dedupes() -> None: @@ -45,3 +54,35 @@ def test_link_related_assets_matches_explicit_figure_refs() -> None: ] matched = link_related_assets(["Fig. 5.1"], assets) assert [asset.asset_id for asset in matched] == ["a1"] + + +def test_extract_title_prefers_exam_headers() -> None: + text = "\n".join( + [ + "EXAM I", + "February 25, 1999", + "Answer three of the following essay questions.", + ] + ) + assert extract_title(text, "fallback") == "EXAM I" + + +def test_extract_title_handles_cover_sheet() -> None: + text = "\n".join( + [ + "MARB 401", + "PHYSIOLOGICAL ECOLOGY", + "OF", + "MARINE MAMMALS", + "CLASS NOTES", + "SPRING 2000", + ] + ) + assert extract_title(text, "fallback") == "PHYSIOLOGICAL ECOLOGY OF MARINE MAMMALS" + + +def test_classify_document_kinds() -> None: + assert classify_document("EXAM II\nApril 6, 1999\n", Path("Exam II-99.doc")) == "exam" + assert classify_document("FINAL EXAM SPRING 1999\nAnswer 3 questions\n", Path("final exam.991.doc")) == "final_exam" + assert classify_document("MARB 401\nPHYSIOLOGICAL ECOLOGY\nOF\nMARINE MAMMALS\nCLASS NOTES\n", Path("COVER.doc")) == "cover_notes" + assert classify_document("SPRING 2000\nMARB 401\nPhysiological Ecology of Marine Mammals\n", Path("Syllabus 401.001.doc")) == "syllabus"