Emit chunk sidecars in doclift bundles

This commit is contained in:
welsberr 2026-04-27 10:53:52 -04:00
parent 28aea13192
commit 07fe114626
3 changed files with 119 additions and 1 deletions

View File

@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
from pathlib import Path from pathlib import Path
import re
from .legacy_doc import ( from .legacy_doc import (
build_layout_manifest, build_layout_manifest,
@ -16,7 +17,7 @@ from .legacy_doc import (
run_catdoc, run_catdoc,
strip_title, strip_title,
) )
from .schemas import ConversionReport, DocumentBundle from .schemas import ConversionReport, DocumentBundle, DocumentChunk
from .utils import slugify, write_json from .utils import slugify, write_json
@ -28,6 +29,94 @@ def _relative_to_root(path: Path, root: Path) -> str:
return path.relative_to(root).as_posix() return path.relative_to(root).as_posix()
def _build_document_chunks(title: str, body: str, layout_body: str, tables: list) -> list[DocumentChunk]:
    """Split *body* into paragraph-level chunks aligned against the layout text.

    Table captions and raw rows are stripped from the body first so tables do
    not produce duplicate paragraph chunks. Each chunk records the 1-based
    line span where its text appears in *layout_body* (0/0 when unlocated).
    """
    chunk_source = _body_for_chunking(body, tables)
    layout_lines = layout_body.splitlines()
    cursor = 0
    result: list[DocumentChunk] = []
    for position, paragraph in enumerate(_extract_paragraphs(chunk_source), start=1):
        kind = _classify_chunk_role(paragraph)
        span_start, span_end, cursor = _locate_chunk_span(paragraph, layout_lines, cursor)
        result.append(
            DocumentChunk(
                chunk_id=f"{slugify(title)}-c{position}",
                role=kind,
                section=title,
                line_start=span_start,
                line_end=span_end,
                text=paragraph,
                # Claims get a slightly higher heuristic confidence.
                confidence_hint=0.8 if kind == "claim" else 0.75,
            )
        )
    return result
def _extract_paragraphs(body: str) -> list[str]:
paragraphs: list[str] = []
current: list[str] = []
for line in body.splitlines():
stripped = line.strip()
if not stripped:
if current:
paragraphs.append(" ".join(current).strip())
current = []
continue
current.append(stripped)
if current:
paragraphs.append(" ".join(current).strip())
return paragraphs
def _body_for_chunking(body: str, tables: list) -> str:
excluded = {
line.strip()
for table in tables
for line in [table.caption, *table.raw_lines]
if line.strip()
}
kept_lines: list[str] = []
for line in body.splitlines():
if line.strip() in excluded:
continue
kept_lines.append(line)
return "\n".join(kept_lines)
def _classify_chunk_role(paragraph: str) -> str:
if paragraph.startswith(("- ", "* ")):
return "claim"
if re.match(r"^(objective|claim|finding|result|conclusion):", paragraph, re.IGNORECASE):
return "claim"
return "summary"
def _locate_chunk_span(paragraph: str, layout_lines: list[str], start_index: int) -> tuple[int, int, int]:
paragraph_lines = [line.strip() for line in paragraph.splitlines() if line.strip()]
if not paragraph_lines:
paragraph_lines = [paragraph.strip()]
normalized_layout = [line.strip() for line in layout_lines]
tokens = " ".join(part for part in paragraph_lines if part).split()
token_count = len(tokens)
if token_count == 0:
return 0, 0, start_index
for offset in range(start_index, len(normalized_layout)):
if not normalized_layout[offset]:
continue
collected: list[str] = []
end_offset = offset
while end_offset < len(normalized_layout) and len(" ".join(collected).split()) < token_count:
candidate = normalized_layout[end_offset]
if candidate:
collected.append(candidate)
end_offset += 1
if " ".join(collected).split() == tokens:
return offset + 1, end_offset, end_offset
return 0, 0, start_index
def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle: def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle:
raw = run_catdoc(source_path) raw = run_catdoc(source_path)
cleaned = clean_text(raw) cleaned = clean_text(raw)
@ -47,6 +136,8 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass
layout_path = doc_out / "document.layout.json" layout_path = doc_out / "document.layout.json"
tables_path = doc_out / "document.tables.json" tables_path = doc_out / "document.tables.json"
figures_path = doc_out / "document.figures.json" figures_path = doc_out / "document.figures.json"
chunks_path = doc_out / "document.chunks.json"
chunks = _build_document_chunks(title, body, layout_body, tables)
markdown_path.write_text(render_markdown(title, body, tables, figure_refs, related_assets), encoding="utf-8") markdown_path.write_text(render_markdown(title, body, tables, figure_refs, related_assets), encoding="utf-8")
write_json(layout_path, layout) write_json(layout_path, layout)
@ -68,6 +159,7 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass
"related_assets": [asset.model_dump() for asset in related_assets], "related_assets": [asset.model_dump() for asset in related_assets],
}, },
) )
write_json(chunks_path, {"chunks": [chunk.model_dump() for chunk in chunks]})
return DocumentBundle( return DocumentBundle(
document_id=slugify(title), document_id=slugify(title),
@ -80,9 +172,11 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass
layout_path=_relative_to_root(layout_path, out_root), layout_path=_relative_to_root(layout_path, out_root),
tables_path=_relative_to_root(tables_path, out_root), tables_path=_relative_to_root(tables_path, out_root),
figures_path=_relative_to_root(figures_path, out_root), figures_path=_relative_to_root(figures_path, out_root),
chunks_path=_relative_to_root(chunks_path, out_root),
bundle_path_kind="bundle_root_relative", bundle_path_kind="bundle_root_relative",
table_count=len(tables), table_count=len(tables),
figure_reference_count=len(figure_refs), figure_reference_count=len(figure_refs),
chunk_count=len(chunks),
) )

View File

@ -31,6 +31,16 @@ class FigureAsset(BaseModel):
looks_like_figure: bool = False looks_like_figure: bool = False
class DocumentChunk(BaseModel):
    """One paragraph-level chunk extracted from a converted document."""

    # Stable identifier of the form "<document-slug>-c<ordinal>".
    chunk_id: str
    # Either "claim" (bullet item / labelled assertion) or "summary".
    role: str = "summary"
    # Title of the source document the chunk was taken from.
    section: str = ""
    # 1-based inclusive line span within the layout text; 0/0 when the
    # chunk could not be located.
    line_start: int = 0
    line_end: int = 0
    # Paragraph text collapsed onto a single line.
    text: str
    # Heuristic extraction confidence (0.8 for claims, 0.75 otherwise).
    confidence_hint: float = 0.75
class DocumentBundle(BaseModel): class DocumentBundle(BaseModel):
document_id: str document_id: str
title: str title: str
@ -42,9 +52,11 @@ class DocumentBundle(BaseModel):
layout_path: str layout_path: str
tables_path: str tables_path: str
figures_path: str figures_path: str
chunks_path: str
bundle_path_kind: str = "bundle_root_relative" bundle_path_kind: str = "bundle_root_relative"
table_count: int = 0 table_count: int = 0
figure_reference_count: int = 0 figure_reference_count: int = 0
chunk_count: int = 0
class ConversionReport(BaseModel): class ConversionReport(BaseModel):

View File

@ -40,11 +40,18 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path,
encoding="utf-8" encoding="utf-8"
) )
) )
chunks_payload = json.loads(
(out_root / "documents" / "sample-lecture-1-example-legacy-document" / "document.chunks.json").read_text(
encoding="utf-8"
)
)
assert manifest["document_count"] == 1 assert manifest["document_count"] == 1
assert manifest["source_root"] == "src" assert manifest["source_root"] == "src"
assert manifest["documents"][0]["source_path"] == "sample.doc" assert manifest["documents"][0]["source_path"] == "sample.doc"
assert manifest["documents"][0]["markdown_path"] == "documents/sample-lecture-1-example-legacy-document/document.md" assert manifest["documents"][0]["markdown_path"] == "documents/sample-lecture-1-example-legacy-document/document.md"
assert manifest["documents"][0]["chunks_path"] == "documents/sample-lecture-1-example-legacy-document/document.chunks.json"
assert manifest["documents"][0]["chunk_count"] == 1
assert conversion_report["summary"]["documents_with_tables"] == 1 assert conversion_report["summary"]["documents_with_tables"] == 1
assert conversion_report["summary"]["documents_with_figure_references"] == 1 assert conversion_report["summary"]["documents_with_figure_references"] == 1
assert figures_payload["source_path"] == "sample.doc" assert figures_payload["source_path"] == "sample.doc"
@ -52,3 +59,8 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path,
assert figures_payload["figure_references"] == ["Fig. 5.1"] assert figures_payload["figure_references"] == ["Fig. 5.1"]
assert len(figures_payload["related_assets"]) == 1 assert len(figures_payload["related_assets"]) == 1
assert figures_payload["related_assets"][0]["path"] == "Fig. 5.1.bmp" assert figures_payload["related_assets"][0]["path"] == "Fig. 5.1.bmp"
assert len(chunks_payload["chunks"]) == 1
assert chunks_payload["chunks"][0]["chunk_id"] == "lecture-1-example-legacy-document-c1"
assert chunks_payload["chunks"][0]["role"] == "summary"
assert chunks_payload["chunks"][0]["line_start"] >= 1
assert chunks_payload["chunks"][0]["text"] == "See Fig. 5.1 and Table 1."