# Didactopus/src/didactopus/document_adapters.py
from __future__ import annotations
from pathlib import Path
import re
from .course_schema import NormalizedDocument, Section
def _title_from_path(path: str | Path) -> str:
p = Path(path)
return p.stem.replace("_", " ").replace("-", " ").title()
_HEADING_RE = re.compile(r"^(#{1,3})\s+")


def _simple_section_split(text: str) -> list[Section]:
    """Split markdown-ish text into sections at ``#``/``##``/``###`` headings.

    Lines before the first heading are collected under an implicit "Main"
    section, which is emitted only when it actually has content.

    Bug fix: previously a section was flushed only when it had body lines,
    so a heading immediately followed by another heading (or a heading at
    the very end of the text) was silently dropped. Explicit headings are
    now always kept, even with an empty body.

    Parameters:
        text: the raw document text.

    Returns:
        A list of ``Section`` objects in document order.
    """
    sections: list[Section] = []
    current_heading = "Main"
    current_lines: list[str] = []
    seen_heading = False  # distinguishes the implicit "Main" preamble
    for line in text.splitlines():
        if _HEADING_RE.match(line):
            # Flush the previous section: explicit headings always,
            # the implicit preamble only when non-empty.
            if current_lines or seen_heading:
                sections.append(
                    Section(heading=current_heading, body="\n".join(current_lines).strip())
                )
            current_heading = _HEADING_RE.sub("", line).strip()
            current_lines = []
            seen_heading = True
        else:
            current_lines.append(line)
    if current_lines or seen_heading:
        sections.append(
            Section(heading=current_heading, body="\n".join(current_lines).strip())
        )
    return sections
def read_textish(path: str | Path) -> str:
    """Read a file as UTF-8 text and return its full contents."""
    with open(path, encoding="utf-8") as handle:
        return handle.read()
def adapt_markdown(path: str | Path) -> NormalizedDocument:
    """Normalize a Markdown file: full text plus heading-based sections."""
    content = read_textish(path)
    document = NormalizedDocument(
        source_path=str(path),
        source_type="markdown",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={},
    )
    return document
def adapt_text(path: str | Path) -> NormalizedDocument:
    """Normalize a plain-text file: full text plus heading-based sections."""
    content = read_textish(path)
    document = NormalizedDocument(
        source_path=str(path),
        source_type="text",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={},
    )
    return document
def adapt_html(path: str | Path) -> NormalizedDocument:
    """Normalize an HTML file by crudely stripping markup.

    Anything that looks like a tag is replaced with a space, then runs of
    whitespace are collapsed. The whole extract becomes a single section.
    """
    raw = read_textish(path)
    without_tags = re.sub(r"<[^>]+>", " ", raw)
    flattened = re.sub(r"\s+", " ", without_tags).strip()
    return NormalizedDocument(
        source_path=str(path),
        source_type="html",
        title=_title_from_path(path),
        text=flattened,
        sections=[Section(heading="HTML Extract", body=flattened)],
        metadata={"extraction": "stub-html-strip"},
    )
def adapt_pdf(path: str | Path) -> NormalizedDocument:
    """Normalize a PDF file.

    Stub implementation: the file is read as UTF-8 text; a real PDF text
    extractor should be plugged in here.
    """
    content = read_textish(path)
    return NormalizedDocument(
        source_path=str(path),
        source_type="pdf",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={"extraction": "stub-pdf-text"},
    )
def adapt_docx(path: str | Path) -> NormalizedDocument:
    """Normalize a DOCX file.

    Stub implementation: the file is read as UTF-8 text; a real DOCX
    extractor should be plugged in here.
    """
    content = read_textish(path)
    return NormalizedDocument(
        source_path=str(path),
        source_type="docx",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={"extraction": "stub-docx-text"},
    )
def adapt_pptx(path: str | Path) -> NormalizedDocument:
    """Normalize a PPTX file.

    Stub implementation: the file is read as UTF-8 text; a real PPTX
    extractor should be plugged in here.
    """
    content = read_textish(path)
    return NormalizedDocument(
        source_path=str(path),
        source_type="pptx",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={"extraction": "stub-pptx-text"},
    )
def detect_adapter(path: str | Path) -> str:
    """Choose an adapter name from the file extension.

    Matching is case-insensitive. Unknown extensions fall back to "text".
    Generalization: ``.markdown`` is now recognized as Markdown (it
    previously fell through to the plain-text adapter); all previously
    supported extensions map exactly as before.

    Parameters:
        path: the document path (string or ``Path``).

    Returns:
        One of "markdown", "text", "html", "pdf", "docx", "pptx".
    """
    suffix_map = {
        ".md": "markdown",
        ".markdown": "markdown",
        ".txt": "text",
        ".html": "html",
        ".htm": "html",
        ".pdf": "pdf",
        ".docx": "docx",
        ".pptx": "pptx",
    }
    return suffix_map.get(Path(path).suffix.lower(), "text")
def adapt_document(path: str | Path) -> NormalizedDocument:
    """Detect the right adapter for *path* and run it.

    Dispatches on :func:`detect_adapter`; anything not explicitly mapped
    (including the "text" adapter name itself) goes through ``adapt_text``.
    """
    dispatch = {
        "markdown": adapt_markdown,
        "html": adapt_html,
        "pdf": adapt_pdf,
        "docx": adapt_docx,
        "pptx": adapt_pptx,
    }
    adapter_fn = dispatch.get(detect_adapter(path), adapt_text)
    return adapter_fn(path)