# Didactopus/src/didactopus/document_adapters.py
from __future__ import annotations
from pathlib import Path
import re
from .course_schema import NormalizedDocument, Section
def _title_from_path(path: str | Path) -> str:
p = Path(path)
return p.stem.replace("_", " ").replace("-", " ").title()
_HEADING_RE = re.compile(r"^(#{1,3})\s+")


def _simple_section_split(text: str) -> list[Section]:
    """Split markdown-ish text into sections at ``#``/``##``/``###`` headings.

    Lines before the first heading are collected under an implicit "Main"
    section, which is emitted only when it actually has content.

    Bug fix: previously a section was flushed only when it had body lines,
    so a heading immediately followed by another heading (or a heading at
    the very end of the text) was silently dropped. Explicit headings are
    now always kept, even with an empty body.

    Parameters:
        text: the raw document text.

    Returns:
        A list of ``Section`` objects in document order.
    """
    sections: list[Section] = []
    current_heading = "Main"
    current_lines: list[str] = []
    seen_heading = False  # distinguishes the implicit "Main" preamble
    for line in text.splitlines():
        if _HEADING_RE.match(line):
            # Flush the previous section: explicit headings always,
            # the implicit preamble only when non-empty.
            if current_lines or seen_heading:
                sections.append(
                    Section(heading=current_heading, body="\n".join(current_lines).strip())
                )
            current_heading = _HEADING_RE.sub("", line).strip()
            current_lines = []
            seen_heading = True
        else:
            current_lines.append(line)
    if current_lines or seen_heading:
        sections.append(
            Section(heading=current_heading, body="\n".join(current_lines).strip())
        )
    return sections
def read_textish(path: str | Path) -> str:
    """Read a file as UTF-8 text and return its full contents."""
    with open(path, encoding="utf-8") as handle:
        return handle.read()
def adapt_markdown(path: str | Path) -> NormalizedDocument:
    """Normalize a Markdown file: full text plus heading-based sections."""
    content = read_textish(path)
    document = NormalizedDocument(
        source_path=str(path),
        source_type="markdown",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={},
    )
    return document
def adapt_text(path: str | Path) -> NormalizedDocument:
    """Normalize a plain-text file: full text plus heading-based sections."""
    content = read_textish(path)
    document = NormalizedDocument(
        source_path=str(path),
        source_type="text",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={},
    )
    return document
def adapt_html(path: str | Path) -> NormalizedDocument:
    """Normalize an HTML file by crudely stripping markup.

    Anything that looks like a tag is replaced with a space, then runs of
    whitespace are collapsed. The whole extract becomes a single section.
    """
    raw = read_textish(path)
    without_tags = re.sub(r"<[^>]+>", " ", raw)
    flattened = re.sub(r"\s+", " ", without_tags).strip()
    return NormalizedDocument(
        source_path=str(path),
        source_type="html",
        title=_title_from_path(path),
        text=flattened,
        sections=[Section(heading="HTML Extract", body=flattened)],
        metadata={"extraction": "stub-html-strip"},
    )
def adapt_pdf(path: str | Path) -> NormalizedDocument:
    """Normalize a PDF file.

    Stub implementation: the file is read as UTF-8 text; a real PDF text
    extractor should be plugged in here.
    """
    content = read_textish(path)
    return NormalizedDocument(
        source_path=str(path),
        source_type="pdf",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={"extraction": "stub-pdf-text"},
    )
def adapt_docx(path: str | Path) -> NormalizedDocument:
    """Normalize a DOCX file.

    Stub implementation: the file is read as UTF-8 text; a real DOCX
    extractor should be plugged in here.
    """
    content = read_textish(path)
    return NormalizedDocument(
        source_path=str(path),
        source_type="docx",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={"extraction": "stub-docx-text"},
    )
def adapt_pptx(path: str | Path) -> NormalizedDocument:
    """Normalize a PPTX file.

    Stub implementation: the file is read as UTF-8 text; a real PPTX
    extractor should be plugged in here.
    """
    content = read_textish(path)
    return NormalizedDocument(
        source_path=str(path),
        source_type="pptx",
        title=_title_from_path(path),
        text=content,
        sections=_simple_section_split(content),
        metadata={"extraction": "stub-pptx-text"},
    )
def detect_adapter(path: str | Path) -> str:
    """Choose an adapter name from the file extension.

    Matching is case-insensitive. Unknown extensions fall back to "text".
    Generalization: ``.markdown`` is now recognized as Markdown (it
    previously fell through to the plain-text adapter); all previously
    supported extensions map exactly as before.

    Parameters:
        path: the document path (string or ``Path``).

    Returns:
        One of "markdown", "text", "html", "pdf", "docx", "pptx".
    """
    suffix_map = {
        ".md": "markdown",
        ".markdown": "markdown",
        ".txt": "text",
        ".html": "html",
        ".htm": "html",
        ".pdf": "pdf",
        ".docx": "docx",
        ".pptx": "pptx",
    }
    return suffix_map.get(Path(path).suffix.lower(), "text")
def adapt_document(path: str | Path) -> NormalizedDocument:
    """Detect the right adapter for *path* and run it.

    Dispatches on :func:`detect_adapter`; anything not explicitly mapped
    (including the "text" adapter name itself) goes through ``adapt_text``.
    """
    dispatch = {
        "markdown": adapt_markdown,
        "html": adapt_html,
        "pdf": adapt_pdf,
        "docx": adapt_docx,
        "pptx": adapt_pptx,
    }
    adapter_fn = dispatch.get(detect_adapter(path), adapt_text)
    return adapter_fn(path)