Classify converted documents and improve titles
This commit is contained in:
parent
787e3e7330
commit
bb8d54aa15
|
|
@ -4,6 +4,7 @@ from pathlib import Path
|
|||
|
||||
from .legacy_doc import (
|
||||
build_layout_manifest,
|
||||
classify_document,
|
||||
clean_text,
|
||||
collect_figure_assets,
|
||||
extract_references,
|
||||
|
|
@ -27,6 +28,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
|||
raw = run_catdoc(source_path)
|
||||
cleaned = clean_text(raw)
|
||||
title = extract_title(cleaned, source_path.stem)
|
||||
document_kind = classify_document(cleaned, source_path)
|
||||
body = strip_title(cleaned, title)
|
||||
layout_body = normalize_text_preserve_layout(strip_title(raw, title))
|
||||
tables = extract_tables(layout_body)
|
||||
|
|
@ -64,6 +66,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
|||
return DocumentBundle(
|
||||
document_id=slugify(title),
|
||||
title=title,
|
||||
document_kind=document_kind,
|
||||
source_path=str(source_path),
|
||||
output_dir=str(doc_out),
|
||||
markdown_path=str(markdown_path),
|
||||
|
|
@ -98,6 +101,7 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
|
|||
{
|
||||
"document_id": bundle.document_id,
|
||||
"title": bundle.title,
|
||||
"document_kind": bundle.document_kind,
|
||||
"table_count": bundle.table_count,
|
||||
"figure_reference_count": bundle.figure_reference_count,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,6 +55,42 @@ def normalize_text_preserve_layout(text: str) -> str:
|
|||
|
||||
def extract_title(text: str, fallback: str) -> str:
|
||||
lines = text.splitlines()
|
||||
nonempty = [line.strip() for line in lines if line.strip()]
|
||||
if not nonempty:
|
||||
return fallback
|
||||
|
||||
joined = " ".join(nonempty[:8])
|
||||
upper_joined = joined.upper()
|
||||
first = nonempty[0]
|
||||
|
||||
if first.upper().startswith("MAKE-UP EXAM"):
|
||||
return first
|
||||
if first.upper() in {"EXAM I", "EXAM II"}:
|
||||
return first
|
||||
if "FINAL EXAM" in upper_joined:
|
||||
for line in nonempty[:8]:
|
||||
if "FINAL EXAM" in line.upper():
|
||||
return line
|
||||
if "CLASS NOTES" in upper_joined:
|
||||
title_parts: list[str] = []
|
||||
started = False
|
||||
for line in nonempty[:6]:
|
||||
upper = line.upper()
|
||||
if upper.startswith("MARB "):
|
||||
continue
|
||||
if upper == "CLASS NOTES":
|
||||
break
|
||||
if upper in {"SPRING 2000", "SPRING 1999", "SPRING 2001"}:
|
||||
continue
|
||||
started = True
|
||||
title_parts.append(line)
|
||||
if started:
|
||||
return " ".join(title_parts)
|
||||
if first.upper().startswith("MARB ") and len(nonempty) > 1:
|
||||
second = nonempty[1]
|
||||
if re.match(r"^\d+\s+Credit\b", second, re.IGNORECASE):
|
||||
return first
|
||||
|
||||
for index, line in enumerate(lines):
|
||||
stripped = line.strip()
|
||||
if not stripped:
|
||||
|
|
@ -71,12 +107,15 @@ def extract_title(text: str, fallback: str) -> str:
|
|||
return stripped
|
||||
if stripped.upper() in {
|
||||
"SPRING 2000",
|
||||
"SPRING 1999",
|
||||
"MARB 401",
|
||||
"MARB 482 SEMINAR IN MARINE BIOLOGY",
|
||||
"COURSE SYLLABUS",
|
||||
"EXAM I",
|
||||
"EXAM II",
|
||||
"FINAL EXAM SPRING 1999",
|
||||
"CLASS NOTES",
|
||||
"OF",
|
||||
}:
|
||||
continue
|
||||
if stripped.startswith(("February ", "April ")):
|
||||
|
|
@ -85,6 +124,24 @@ def extract_title(text: str, fallback: str) -> str:
|
|||
return fallback
|
||||
|
||||
|
||||
def classify_document(text: str, source_path: Path) -> str:
    """Return a coarse kind label for a converted document.

    The label is derived from the upper-cased file name of ``source_path``
    and from the first ten non-blank lines of ``text`` (stripped, joined with
    single spaces, then upper-cased). Rules are evaluated in a fixed order
    and the first one that matches decides the result.

    Returns one of ``"syllabus"``, ``"final_exam"``, ``"exam"``,
    ``"cover_notes"``, ``"lecture"`` or, when nothing matches, ``"document"``.
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    head_lines = [entry for entry in stripped_lines if entry][:10]
    header = " ".join(head_lines).upper()
    file_name = source_path.name.upper()

    # Order matters: a syllabus or final exam must win over the generic
    # "exam" rule, which in turn is checked before cover notes and lectures.
    if file_name.startswith("SYLLABUS") or "COURSE SYLLABUS" in header:
        kind = "syllabus"
    elif "FINAL EXAM" in header:
        kind = "final_exam"
    elif file_name.startswith(("EXAM", "MAKE-UP")) or re.match(r"^EXAM\b", header):
        kind = "exam"
    elif "CLASS NOTES" in header or file_name == "COVER.DOC":
        kind = "cover_notes"
    elif re.match(r"^LECTURE\s+\d+\.", header):
        kind = "lecture"
    else:
        kind = "document"
    return kind
|
||||
|
||||
|
||||
def strip_title(text: str, title: str) -> str:
|
||||
lines = text.splitlines()
|
||||
normalized_title = " ".join(title.split())
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ class FigureAsset(BaseModel):
|
|||
class DocumentBundle(BaseModel):
|
||||
document_id: str
|
||||
title: str
|
||||
document_kind: str = "document"
|
||||
source_path: str
|
||||
output_dir: str
|
||||
markdown_path: str
|
||||
|
|
|
|||
|
|
@ -1,4 +1,13 @@
|
|||
from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets
|
||||
from pathlib import Path
|
||||
|
||||
from doclift.legacy_doc import (
|
||||
FigureAsset,
|
||||
classify_document,
|
||||
extract_references,
|
||||
extract_tables,
|
||||
extract_title,
|
||||
link_related_assets,
|
||||
)
|
||||
|
||||
|
||||
def test_extract_references_dedupes() -> None:
|
||||
|
|
@ -45,3 +54,35 @@ def test_link_related_assets_matches_explicit_figure_refs() -> None:
|
|||
]
|
||||
matched = link_related_assets(["Fig. 5.1"], assets)
|
||||
assert [asset.asset_id for asset in matched] == ["a1"]
|
||||
|
||||
|
||||
def test_extract_title_prefers_exam_headers() -> None:
    """A leading ``EXAM I`` header line is returned as the title."""
    exam_lines = (
        "EXAM I",
        "February 25, 1999",
        "Answer three of the following essay questions.",
    )
    document_text = "\n".join(exam_lines)
    assert extract_title(document_text, "fallback") == "EXAM I"
|
||||
|
||||
|
||||
def test_extract_title_handles_cover_sheet() -> None:
    """Cover-sheet lines between the course code and ``CLASS NOTES`` form the title."""
    cover_lines = (
        "MARB 401",
        "PHYSIOLOGICAL ECOLOGY",
        "OF",
        "MARINE MAMMALS",
        "CLASS NOTES",
        "SPRING 2000",
    )
    document_text = "\n".join(cover_lines)
    expected_title = "PHYSIOLOGICAL ECOLOGY OF MARINE MAMMALS"
    assert extract_title(document_text, "fallback") == expected_title
|
||||
|
||||
|
||||
def test_classify_document_kinds() -> None:
    """Check the kind label for one sample each of exam, final exam, cover notes and syllabus."""
    samples = (
        ("EXAM II\nApril 6, 1999\n", "Exam II-99.doc", "exam"),
        ("FINAL EXAM SPRING 1999\nAnswer 3 questions\n", "final exam.991.doc", "final_exam"),
        ("MARB 401\nPHYSIOLOGICAL ECOLOGY\nOF\nMARINE MAMMALS\nCLASS NOTES\n", "COVER.doc", "cover_notes"),
        ("SPRING 2000\nMARB 401\nPhysiological Ecology of Marine Mammals\n", "Syllabus 401.001.doc", "syllabus"),
    )
    for body, file_name, expected_kind in samples:
        assert classify_document(body, Path(file_name)) == expected_kind
|
||||
|
|
|
|||
Loading…
Reference in New Issue