Classify converted documents and improve titles

This commit is contained in:
welsberr 2026-04-22 21:26:37 -04:00
parent 787e3e7330
commit bb8d54aa15
4 changed files with 104 additions and 1 deletions

View File

@ -4,6 +4,7 @@ from pathlib import Path
from .legacy_doc import (
build_layout_manifest,
classify_document,
clean_text,
collect_figure_assets,
extract_references,
@ -27,6 +28,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
raw = run_catdoc(source_path)
cleaned = clean_text(raw)
title = extract_title(cleaned, source_path.stem)
document_kind = classify_document(cleaned, source_path)
body = strip_title(cleaned, title)
layout_body = normalize_text_preserve_layout(strip_title(raw, title))
tables = extract_tables(layout_body)
@ -64,6 +66,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
return DocumentBundle(
document_id=slugify(title),
title=title,
document_kind=document_kind,
source_path=str(source_path),
output_dir=str(doc_out),
markdown_path=str(markdown_path),
@ -98,6 +101,7 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
{
"document_id": bundle.document_id,
"title": bundle.title,
"document_kind": bundle.document_kind,
"table_count": bundle.table_count,
"figure_reference_count": bundle.figure_reference_count,
}

View File

@ -55,6 +55,42 @@ def normalize_text_preserve_layout(text: str) -> str:
def extract_title(text: str, fallback: str) -> str:
lines = text.splitlines()
nonempty = [line.strip() for line in lines if line.strip()]
if not nonempty:
return fallback
joined = " ".join(nonempty[:8])
upper_joined = joined.upper()
first = nonempty[0]
if first.upper().startswith("MAKE-UP EXAM"):
return first
if first.upper() in {"EXAM I", "EXAM II"}:
return first
if "FINAL EXAM" in upper_joined:
for line in nonempty[:8]:
if "FINAL EXAM" in line.upper():
return line
if "CLASS NOTES" in upper_joined:
title_parts: list[str] = []
started = False
for line in nonempty[:6]:
upper = line.upper()
if upper.startswith("MARB "):
continue
if upper == "CLASS NOTES":
break
if upper in {"SPRING 2000", "SPRING 1999", "SPRING 2001"}:
continue
started = True
title_parts.append(line)
if started:
return " ".join(title_parts)
if first.upper().startswith("MARB ") and len(nonempty) > 1:
second = nonempty[1]
if re.match(r"^\d+\s+Credit\b", second, re.IGNORECASE):
return first
for index, line in enumerate(lines):
stripped = line.strip()
if not stripped:
@ -71,12 +107,15 @@ def extract_title(text: str, fallback: str) -> str:
return stripped
if stripped.upper() in {
"SPRING 2000",
"SPRING 1999",
"MARB 401",
"MARB 482 SEMINAR IN MARINE BIOLOGY",
"COURSE SYLLABUS",
"EXAM I",
"EXAM II",
"FINAL EXAM SPRING 1999",
"CLASS NOTES",
"OF",
}:
continue
if stripped.startswith(("February ", "April ")):
@ -85,6 +124,24 @@ def extract_title(text: str, fallback: str) -> str:
return fallback
def classify_document(text: str, source_path: Path) -> str:
    """Return a coarse kind label for a converted document.

    The decision uses the upper-cased file name of ``source_path`` and a
    "header": the first ten non-empty lines of ``text``, stripped, joined
    with single spaces and upper-cased. Rules are tried in order and the
    first match wins:

    * ``"syllabus"`` -- file name starts with ``SYLLABUS`` or the header
      contains ``COURSE SYLLABUS``.
    * ``"final_exam"`` -- header contains ``FINAL EXAM``.
    * ``"exam"`` -- file name starts with ``EXAM`` or ``MAKE-UP``, or the
      header starts with the word ``EXAM`` or with ``MAKE-UP EXAM``.
    * ``"cover_notes"`` -- header contains ``CLASS NOTES`` or the file is
      named ``COVER.DOC`` (case-insensitive).
    * ``"lecture"`` -- header starts with ``LECTURE <digits>.``.
    * ``"document"`` -- anything else, including empty text.

    Args:
        text: Cleaned document text.
        source_path: Path of the source file; only its ``name`` is used.

    Returns:
        One of the kind labels listed above.
    """
    nonempty = [line.strip() for line in text.splitlines() if line.strip()]
    header = " ".join(nonempty[:10]).upper()
    name = source_path.name.upper()

    if name.startswith("SYLLABUS") or "COURSE SYLLABUS" in header:
        return "syllabus"
    # Checked before the generic exam rule so finals are not labelled "exam".
    if "FINAL EXAM" in header:
        return "final_exam"
    if (
        name.startswith(("EXAM", "MAKE-UP"))
        or re.match(r"^EXAM\b", header)
        # extract_title accepts a leading "MAKE-UP EXAM" line as an exam
        # header; recognise the same header here so a make-up exam is still
        # classified as an exam when the file name gives no hint.
        or header.startswith("MAKE-UP EXAM")
    ):
        return "exam"
    if "CLASS NOTES" in header or name == "COVER.DOC":
        return "cover_notes"
    if re.match(r"^LECTURE\s+\d+\.", header):
        return "lecture"
    return "document"
def strip_title(text: str, title: str) -> str:
lines = text.splitlines()
normalized_title = " ".join(title.split())

View File

@ -34,6 +34,7 @@ class FigureAsset(BaseModel):
class DocumentBundle(BaseModel):
document_id: str
title: str
document_kind: str = "document"
source_path: str
output_dir: str
markdown_path: str

View File

@ -1,4 +1,13 @@
from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets
from pathlib import Path
from doclift.legacy_doc import (
FigureAsset,
classify_document,
extract_references,
extract_tables,
extract_title,
link_related_assets,
)
def test_extract_references_dedupes() -> None:
@ -45,3 +54,35 @@ def test_link_related_assets_matches_explicit_figure_refs() -> None:
]
matched = link_related_assets(["Fig. 5.1"], assets)
assert [asset.asset_id for asset in matched] == ["a1"]
def test_extract_title_prefers_exam_headers() -> None:
    """An exam header on the first line is returned as the title unchanged."""
    header_lines = (
        "EXAM I",
        "February 25, 1999",
        "Answer three of the following essay questions.",
    )
    document_text = "\n".join(header_lines)

    title = extract_title(document_text, "fallback")

    assert title == "EXAM I"
def test_extract_title_handles_cover_sheet() -> None:
    """A class-notes cover sheet yields the title lines joined by spaces.

    The course-code line and everything from "CLASS NOTES" onward are not
    expected to appear in the title.
    """
    cover_sheet = (
        "MARB 401",
        "PHYSIOLOGICAL ECOLOGY",
        "OF",
        "MARINE MAMMALS",
        "CLASS NOTES",
        "SPRING 2000",
    )
    document_text = "\n".join(cover_sheet)

    title = extract_title(document_text, "fallback")

    assert title == "PHYSIOLOGICAL ECOLOGY OF MARINE MAMMALS"
def test_classify_document_kinds() -> None:
    """Check one representative header and file name per recognised kind."""
    # (document text, source file name, expected kind)
    cases = [
        ("EXAM II\nApril 6, 1999\n", "Exam II-99.doc", "exam"),
        (
            "FINAL EXAM SPRING 1999\nAnswer 3 questions\n",
            "final exam.991.doc",
            "final_exam",
        ),
        (
            "MARB 401\nPHYSIOLOGICAL ECOLOGY\nOF\nMARINE MAMMALS\nCLASS NOTES\n",
            "COVER.doc",
            "cover_notes",
        ),
        (
            "SPRING 2000\nMARB 401\nPhysiological Ecology of Marine Mammals\n",
            "Syllabus 401.001.doc",
            "syllabus",
        ),
    ]
    for text, file_name, expected_kind in cases:
        assert classify_document(text, Path(file_name)) == expected_kind