Classify converted documents and improve titles

This commit is contained in:
welsberr 2026-04-22 21:26:37 -04:00
parent 787e3e7330
commit bb8d54aa15
4 changed files with 104 additions and 1 deletions

View File

@ -4,6 +4,7 @@ from pathlib import Path
from .legacy_doc import ( from .legacy_doc import (
build_layout_manifest, build_layout_manifest,
classify_document,
clean_text, clean_text,
collect_figure_assets, collect_figure_assets,
extract_references, extract_references,
@ -27,6 +28,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
raw = run_catdoc(source_path) raw = run_catdoc(source_path)
cleaned = clean_text(raw) cleaned = clean_text(raw)
title = extract_title(cleaned, source_path.stem) title = extract_title(cleaned, source_path.stem)
document_kind = classify_document(cleaned, source_path)
body = strip_title(cleaned, title) body = strip_title(cleaned, title)
layout_body = normalize_text_preserve_layout(strip_title(raw, title)) layout_body = normalize_text_preserve_layout(strip_title(raw, title))
tables = extract_tables(layout_body) tables = extract_tables(layout_body)
@ -64,6 +66,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
return DocumentBundle( return DocumentBundle(
document_id=slugify(title), document_id=slugify(title),
title=title, title=title,
document_kind=document_kind,
source_path=str(source_path), source_path=str(source_path),
output_dir=str(doc_out), output_dir=str(doc_out),
markdown_path=str(markdown_path), markdown_path=str(markdown_path),
@ -98,6 +101,7 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
{ {
"document_id": bundle.document_id, "document_id": bundle.document_id,
"title": bundle.title, "title": bundle.title,
"document_kind": bundle.document_kind,
"table_count": bundle.table_count, "table_count": bundle.table_count,
"figure_reference_count": bundle.figure_reference_count, "figure_reference_count": bundle.figure_reference_count,
} }

View File

@ -55,6 +55,42 @@ def normalize_text_preserve_layout(text: str) -> str:
def extract_title(text: str, fallback: str) -> str: def extract_title(text: str, fallback: str) -> str:
lines = text.splitlines() lines = text.splitlines()
nonempty = [line.strip() for line in lines if line.strip()]
if not nonempty:
return fallback
joined = " ".join(nonempty[:8])
upper_joined = joined.upper()
first = nonempty[0]
if first.upper().startswith("MAKE-UP EXAM"):
return first
if first.upper() in {"EXAM I", "EXAM II"}:
return first
if "FINAL EXAM" in upper_joined:
for line in nonempty[:8]:
if "FINAL EXAM" in line.upper():
return line
if "CLASS NOTES" in upper_joined:
title_parts: list[str] = []
started = False
for line in nonempty[:6]:
upper = line.upper()
if upper.startswith("MARB "):
continue
if upper == "CLASS NOTES":
break
if upper in {"SPRING 2000", "SPRING 1999", "SPRING 2001"}:
continue
started = True
title_parts.append(line)
if started:
return " ".join(title_parts)
if first.upper().startswith("MARB ") and len(nonempty) > 1:
second = nonempty[1]
if re.match(r"^\d+\s+Credit\b", second, re.IGNORECASE):
return first
for index, line in enumerate(lines): for index, line in enumerate(lines):
stripped = line.strip() stripped = line.strip()
if not stripped: if not stripped:
@ -71,12 +107,15 @@ def extract_title(text: str, fallback: str) -> str:
return stripped return stripped
if stripped.upper() in { if stripped.upper() in {
"SPRING 2000", "SPRING 2000",
"SPRING 1999",
"MARB 401", "MARB 401",
"MARB 482 SEMINAR IN MARINE BIOLOGY", "MARB 482 SEMINAR IN MARINE BIOLOGY",
"COURSE SYLLABUS", "COURSE SYLLABUS",
"EXAM I", "EXAM I",
"EXAM II", "EXAM II",
"FINAL EXAM SPRING 1999", "FINAL EXAM SPRING 1999",
"CLASS NOTES",
"OF",
}: }:
continue continue
if stripped.startswith(("February ", "April ")): if stripped.startswith(("February ", "April ")):
@ -85,6 +124,24 @@ def extract_title(text: str, fallback: str) -> str:
return fallback return fallback
def classify_document(text: str, source_path: Path) -> str:
    """Return a coarse kind label for a converted document.

    The decision is based on two inputs: the upper-cased file name of
    ``source_path`` and a "header" built from the first ten non-blank lines
    of ``text`` (each stripped, joined with single spaces, upper-cased).
    Rules are tried in order and the first match wins:

    * ``"syllabus"``    - file name starts with ``SYLLABUS`` or the header
      contains ``COURSE SYLLABUS``.
    * ``"final_exam"``  - the header contains ``FINAL EXAM``.
    * ``"exam"``        - file name starts with ``EXAM`` or ``MAKE-UP``, or
      the header starts with ``MAKE-UP EXAM`` or with the word ``EXAM``.
    * ``"cover_notes"`` - the header contains ``CLASS NOTES`` or the file is
      named ``COVER.DOC`` (case-insensitive).
    * ``"lecture"``     - the header starts with ``LECTURE <number>.``.
    * ``"document"``    - anything else, including empty text.

    Args:
        text: Cleaned document text.
        source_path: Path of the source file; only its final component is
            inspected.

    Returns:
        One of the kind labels listed above.
    """
    nonempty = [line.strip() for line in text.splitlines() if line.strip()]
    header = " ".join(nonempty[:10]).upper()
    name = source_path.name.upper()

    if name.startswith("SYLLABUS") or "COURSE SYLLABUS" in header:
        return "syllabus"
    # Checked before the generic exam rule so finals keep their own label.
    if "FINAL EXAM" in header:
        return "final_exam"
    # The "MAKE-UP EXAM" header check mirrors the header extract_title
    # accepts, so a make-up exam is recognised even when the file name
    # gives no hint.
    if (
        name.startswith(("EXAM", "MAKE-UP"))
        or header.startswith("MAKE-UP EXAM")
        or re.match(r"^EXAM\b", header)
    ):
        return "exam"
    if "CLASS NOTES" in header or name == "COVER.DOC":
        return "cover_notes"
    if re.match(r"^LECTURE\s+\d+\.", header):
        return "lecture"
    return "document"
def strip_title(text: str, title: str) -> str: def strip_title(text: str, title: str) -> str:
lines = text.splitlines() lines = text.splitlines()
normalized_title = " ".join(title.split()) normalized_title = " ".join(title.split())

View File

@ -34,6 +34,7 @@ class FigureAsset(BaseModel):
class DocumentBundle(BaseModel): class DocumentBundle(BaseModel):
document_id: str document_id: str
title: str title: str
document_kind: str = "document"
source_path: str source_path: str
output_dir: str output_dir: str
markdown_path: str markdown_path: str

View File

@ -1,4 +1,13 @@
from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets from pathlib import Path
from doclift.legacy_doc import (
FigureAsset,
classify_document,
extract_references,
extract_tables,
extract_title,
link_related_assets,
)
def test_extract_references_dedupes() -> None: def test_extract_references_dedupes() -> None:
@ -45,3 +54,35 @@ def test_link_related_assets_matches_explicit_figure_refs() -> None:
] ]
matched = link_related_assets(["Fig. 5.1"], assets) matched = link_related_assets(["Fig. 5.1"], assets)
assert [asset.asset_id for asset in matched] == ["a1"] assert [asset.asset_id for asset in matched] == ["a1"]
def test_extract_title_prefers_exam_headers() -> None:
    """An exam sheet whose first line is 'EXAM I' keeps that line as its title."""
    exam_lines = (
        "EXAM I",
        "February 25, 1999",
        "Answer three of the following essay questions.",
    )
    title = extract_title("\n".join(exam_lines), "fallback")
    assert title == "EXAM I"
def test_extract_title_handles_cover_sheet() -> None:
    """A class-notes cover sheet yields the course name joined onto one line."""
    cover_lines = (
        "MARB 401",
        "PHYSIOLOGICAL ECOLOGY",
        "OF",
        "MARINE MAMMALS",
        "CLASS NOTES",
        "SPRING 2000",
    )
    expected = "PHYSIOLOGICAL ECOLOGY OF MARINE MAMMALS"
    title = extract_title("\n".join(cover_lines), "fallback")
    assert title == expected
def test_classify_document_kinds() -> None:
    """Check the kind label for exam, final-exam, cover-sheet and syllabus samples."""
    # Each row: (document text, source file name, expected kind label).
    samples = (
        ("EXAM II\nApril 6, 1999\n", "Exam II-99.doc", "exam"),
        (
            "FINAL EXAM SPRING 1999\nAnswer 3 questions\n",
            "final exam.991.doc",
            "final_exam",
        ),
        (
            "MARB 401\nPHYSIOLOGICAL ECOLOGY\nOF\nMARINE MAMMALS\nCLASS NOTES\n",
            "COVER.doc",
            "cover_notes",
        ),
        (
            "SPRING 2000\nMARB 401\nPhysiological Ecology of Marine Mammals\n",
            "Syllabus 401.001.doc",
            "syllabus",
        ),
    )
    for text, file_name, expected_kind in samples:
        assert classify_document(text, Path(file_name)) == expected_kind