Emit chunk sidecars in doclift bundles

2026-04-27 10:53:52 -04:00 · 2026-04-27 10:53:52 -04:00 · 07fe114626
parent 28aea13192
commit 07fe114626
3 changed files with 119 additions and 1 deletions
--- a/src/doclift/convert.py
+++ b/src/doclift/convert.py
@ -1,6 +1,7 @@
 from __future__ import annotations

 from pathlib import Path
+import re

 from .legacy_doc import (
    build_layout_manifest,
@ -16,7 +17,7 @@ from .legacy_doc import (
    run_catdoc,
    strip_title,
 )
-from .schemas import ConversionReport, DocumentBundle
+from .schemas import ConversionReport, DocumentBundle, DocumentChunk
 from .utils import slugify, write_json


@ -28,6 +29,94 @@ def _relative_to_root(path: Path, root: Path) -> str:
    return path.relative_to(root).as_posix()


+def _build_document_chunks(title: str, body: str, layout_body: str, tables: list) -> list[DocumentChunk]:
+    """Split a document body into paragraph chunks with layout line spans.
+
+    Table captions and raw table lines are removed from ``body`` first, the
+    remainder is grouped into paragraphs, and each paragraph is looked up in
+    ``layout_body`` to recover the lines it occupies.
+
+    Args:
+        title: Document title; used as every chunk's ``section`` and, once
+            slugified, as the prefix of each ``chunk_id``.
+        body: Document body text to chunk.
+        layout_body: Layout text whose lines are searched for each paragraph.
+        tables: Table objects exposing ``caption`` and ``raw_lines``.
+
+    Returns:
+        One ``DocumentChunk`` per paragraph, numbered from 1 in body order.
+        ``line_start`` and ``line_end`` are 0 when the paragraph could not be
+        found in the layout text.
+    """
+    paragraphs = _extract_paragraphs(_body_for_chunking(body, tables))
+    layout_lines = layout_body.splitlines()
+    # The slug depends only on the title, so compute it once instead of per chunk.
+    id_prefix = slugify(title)
+    # Index into layout_lines where the next search starts; it only moves
+    # forward when a paragraph is found, so chunks are matched in order.
+    layout_cursor = 0
+    chunks: list[DocumentChunk] = []
+
+    for index, paragraph in enumerate(paragraphs, start=1):
+        role = _classify_chunk_role(paragraph)
+        line_start, line_end, layout_cursor = _locate_chunk_span(paragraph, layout_lines, layout_cursor)
+        chunks.append(
+            DocumentChunk(
+                chunk_id=f"{id_prefix}-c{index}",
+                role=role,
+                section=title,
+                line_start=line_start,
+                line_end=line_end,
+                text=paragraph,
+                confidence_hint=0.8 if role == "claim" else 0.75,
+            )
+        )
+    return chunks
+
+
+def _extract_paragraphs(body: str) -> list[str]:
+    """Group consecutive non-blank lines of ``body`` into paragraphs.
+
+    Each line is stripped, and the lines of a paragraph are joined with a
+    single space. Blank or whitespace-only lines separate paragraphs and never
+    produce empty entries.
+    """
+    result: list[str] = []
+    pending: list[str] = []
+    # The trailing empty string acts as a final separator so the last
+    # paragraph is flushed by the same branch as every other one.
+    for raw_line in [*body.splitlines(), ""]:
+        text = raw_line.strip()
+        if text:
+            pending.append(text)
+        elif pending:
+            result.append(" ".join(pending).strip())
+            pending = []
+    return result
+
+
+def _body_for_chunking(body: str, tables: list) -> str:
+    """Return ``body`` without the lines that belong to any table.
+
+    A body line is dropped when its stripped text equals the stripped caption
+    or a stripped raw line of one of ``tables``. Blank lines are always kept,
+    and the surviving lines are re-joined with ``"\n"``.
+    """
+    table_text: set[str] = set()
+    for table in tables:
+        for entry in (table.caption, *table.raw_lines):
+            cleaned = entry.strip()
+            if cleaned:
+                table_text.add(cleaned)
+    return "\n".join(line for line in body.splitlines() if line.strip() not in table_text)
+
+
+def _classify_chunk_role(paragraph: str) -> str:
+    """Label a paragraph as ``"claim"`` or ``"summary"``.
+
+    A paragraph is a claim when it starts with a ``- `` or ``* `` bullet, or
+    with one of the labels objective/claim/finding/result/conclusion followed
+    by a colon (case-insensitive). Anything else is a summary.
+    """
+    is_claim = paragraph.startswith(("- ", "* ")) or (
+        re.match(r"^(objective|claim|finding|result|conclusion):", paragraph, re.IGNORECASE) is not None
+    )
+    return "claim" if is_claim else "summary"
+
+
+def _locate_chunk_span(paragraph: str, layout_lines: list[str], start_index: int) -> tuple[int, int, int]:
+    """Find the layout lines whose words spell out ``paragraph``.
+
+    The search starts at ``start_index`` (0-based) and compares
+    whitespace-separated tokens, so differences in line wrapping and spacing
+    are ignored and blank layout lines inside a span are stepped over.
+
+    Args:
+        paragraph: Paragraph text to locate.
+        layout_lines: Lines of the layout text, in order.
+        start_index: 0-based index of the first layout line to consider.
+
+    Returns:
+        ``(line_start, line_end, next_index)``. On a match, ``line_start`` and
+        ``line_end`` are the 1-based first and last layout lines of the span
+        and ``next_index`` is the 0-based index just past it. When nothing
+        matches, or the paragraph has no tokens, the result is
+        ``(0, 0, start_index)``.
+    """
+    tokens = paragraph.split()
+    token_count = len(tokens)
+    if token_count == 0:
+        return 0, 0, start_index
+
+    # Tokenize each layout line once up front instead of re-joining and
+    # re-splitting the accumulated text on every step of the scan.
+    line_tokens = [line.split() for line in layout_lines]
+    line_total = len(line_tokens)
+
+    for offset in range(start_index, line_total):
+        if not line_tokens[offset]:
+            continue  # a span never starts on a blank line
+        matched = 0
+        end_offset = offset
+        while end_offset < line_total and matched < token_count:
+            piece = line_tokens[end_offset]
+            end_offset += 1
+            # Stop at the first divergence. A line that overshoots the
+            # paragraph compares against a shorter slice and fails here too.
+            if tokens[matched:matched + len(piece)] != piece:
+                matched = -1
+                break
+            matched += len(piece)
+        if matched == token_count:
+            return offset + 1, end_offset, end_offset
+    return 0, 0, start_index
+
+
 def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle:
    raw = run_catdoc(source_path)
    cleaned = clean_text(raw)
@ -47,6 +136,8 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass
    layout_path = doc_out / "document.layout.json"
    tables_path = doc_out / "document.tables.json"
    figures_path = doc_out / "document.figures.json"
+    chunks_path = doc_out / "document.chunks.json"
+    chunks = _build_document_chunks(title, body, layout_body, tables)

    markdown_path.write_text(render_markdown(title, body, tables, figure_refs, related_assets), encoding="utf-8")
    write_json(layout_path, layout)
@ -68,6 +159,7 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass
            "related_assets": [asset.model_dump() for asset in related_assets],
        },
    )
+    write_json(chunks_path, {"chunks": [chunk.model_dump() for chunk in chunks]})

    return DocumentBundle(
        document_id=slugify(title),
@ -80,9 +172,11 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass
        layout_path=_relative_to_root(layout_path, out_root),
        tables_path=_relative_to_root(tables_path, out_root),
        figures_path=_relative_to_root(figures_path, out_root),
+        chunks_path=_relative_to_root(chunks_path, out_root),
        bundle_path_kind="bundle_root_relative",
        table_count=len(tables),
        figure_reference_count=len(figure_refs),
+        chunk_count=len(chunks),
    )


--- a/src/doclift/schemas.py
+++ b/src/doclift/schemas.py
@ -31,6 +31,16 @@ class FigureAsset(BaseModel):
    looks_like_figure: bool = False


+class DocumentChunk(BaseModel):
+    # One paragraph-level chunk of a converted document, as serialized into
+    # the document.chunks.json sidecar by convert.py.
+    # Identifier; convert.py builds it as "<slugified title>-c<index>".
+    chunk_id: str
+    # Chunk role; convert.py assigns either "claim" or "summary".
+    role: str = "summary"
+    # Section label; convert.py currently passes the document title.
+    section: str = ""
+    # Layout line span of the chunk; convert.py stores 1-based line numbers
+    # and leaves both at 0 when the paragraph was not found in the layout.
+    line_start: int = 0
+    line_end: int = 0
+    # Paragraph text of the chunk.
+    text: str
+    # Heuristic score; convert.py uses 0.8 for "claim" chunks and 0.75 otherwise.
+    confidence_hint: float = 0.75
+
+
 class DocumentBundle(BaseModel):
    document_id: str
    title: str
@ -42,9 +52,11 @@ class DocumentBundle(BaseModel):
    layout_path: str
    tables_path: str
    figures_path: str
+    chunks_path: str
    bundle_path_kind: str = "bundle_root_relative"
    table_count: int = 0
    figure_reference_count: int = 0
+    chunk_count: int = 0


 class ConversionReport(BaseModel):
--- a/tests/test_convert.py
+++ b/tests/test_convert.py
@ -40,11 +40,18 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path,
            encoding="utf-8"
        )
    )
+    chunks_payload = json.loads(
+        (out_root / "documents" / "sample-lecture-1-example-legacy-document" / "document.chunks.json").read_text(
+            encoding="utf-8"
+        )
+    )

    assert manifest["document_count"] == 1
    assert manifest["source_root"] == "src"
    assert manifest["documents"][0]["source_path"] == "sample.doc"
    assert manifest["documents"][0]["markdown_path"] == "documents/sample-lecture-1-example-legacy-document/document.md"
+    assert manifest["documents"][0]["chunks_path"] == "documents/sample-lecture-1-example-legacy-document/document.chunks.json"
+    assert manifest["documents"][0]["chunk_count"] == 1
    assert conversion_report["summary"]["documents_with_tables"] == 1
    assert conversion_report["summary"]["documents_with_figure_references"] == 1
    assert figures_payload["source_path"] == "sample.doc"
@ -52,3 +59,8 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path,
    assert figures_payload["figure_references"] == ["Fig. 5.1"]
    assert len(figures_payload["related_assets"]) == 1
    assert figures_payload["related_assets"][0]["path"] == "Fig. 5.1.bmp"
+    assert len(chunks_payload["chunks"]) == 1
+    assert chunks_payload["chunks"][0]["chunk_id"] == "lecture-1-example-legacy-document-c1"
+    assert chunks_payload["chunks"][0]["role"] == "summary"
+    assert chunks_payload["chunks"][0]["line_start"] >= 1
+    assert chunks_payload["chunks"][0]["text"] == "See Fig. 5.1 and Table 1."