From 07fe11462690561e754e235e634eebc90dd92630 Mon Sep 17 00:00:00 2001
From: welsberr
Date: Mon, 27 Apr 2026 10:53:52 -0400
Subject: [PATCH] Emit chunk sidecars in doclift bundles

---
 src/doclift/convert.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/doclift/schemas.py | 12 ++++++
 tests/test_convert.py  | 12 ++++++
 3 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/src/doclift/convert.py b/src/doclift/convert.py
index 1d9b7b9..0f8465e 100755
--- a/src/doclift/convert.py
+++ b/src/doclift/convert.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from pathlib import Path
+import re
 
 from .legacy_doc import (
     build_layout_manifest,
@@ -16,7 +17,7 @@
     run_catdoc,
     strip_title,
 )
-from .schemas import ConversionReport, DocumentBundle
+from .schemas import ConversionReport, DocumentBundle, DocumentChunk
 from .utils import slugify, write_json
 
 
@@ -28,6 +29,94 @@ def _relative_to_root(path: Path, root: Path) -> str:
     return path.relative_to(root).as_posix()
 
 
+def _build_document_chunks(title: str, body: str, layout_body: str, tables: list) -> list[DocumentChunk]:
+    paragraphs = _extract_paragraphs(_body_for_chunking(body, tables))
+    layout_lines = layout_body.splitlines()
+    layout_cursor = 0
+    chunks: list[DocumentChunk] = []
+
+    for index, paragraph in enumerate(paragraphs, start=1):
+        role = _classify_chunk_role(paragraph)
+        line_start, line_end, layout_cursor = _locate_chunk_span(paragraph, layout_lines, layout_cursor)
+        chunks.append(
+            DocumentChunk(
+                chunk_id=f"{slugify(title)}-c{index}",
+                role=role,
+                section=title,
+                line_start=line_start,
+                line_end=line_end,
+                text=paragraph,
+                confidence_hint=0.8 if role == "claim" else 0.75,
+            )
+        )
+    return chunks
+
+
+def _extract_paragraphs(body: str) -> list[str]:
+    paragraphs: list[str] = []
+    current: list[str] = []
+    for line in body.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            if current:
+                paragraphs.append(" ".join(current).strip())
+            current = []
+            continue
+        current.append(stripped)
+    if current:
+        paragraphs.append(" ".join(current).strip())
+    return paragraphs
+
+
+def _body_for_chunking(body: str, tables: list) -> str:
+    excluded = {
+        line.strip()
+        for table in tables
+        for line in [table.caption, *table.raw_lines]
+        if line.strip()
+    }
+    kept_lines: list[str] = []
+    for line in body.splitlines():
+        if line.strip() in excluded:
+            continue
+        kept_lines.append(line)
+    return "\n".join(kept_lines)
+
+
+def _classify_chunk_role(paragraph: str) -> str:
+    if paragraph.startswith(("- ", "* ")):
+        return "claim"
+    if re.match(r"^(objective|claim|finding|result|conclusion):", paragraph, re.IGNORECASE):
+        return "claim"
+    return "summary"
+
+
+def _locate_chunk_span(paragraph: str, layout_lines: list[str], start_index: int) -> tuple[int, int, int]:
+    paragraph_lines = [line.strip() for line in paragraph.splitlines() if line.strip()]
+    if not paragraph_lines:
+        paragraph_lines = [paragraph.strip()]
+
+    normalized_layout = [line.strip() for line in layout_lines]
+    tokens = " ".join(part for part in paragraph_lines if part).split()
+    token_count = len(tokens)
+    if token_count == 0:
+        return 0, 0, start_index
+
+    for offset in range(start_index, len(normalized_layout)):
+        if not normalized_layout[offset]:
+            continue
+        collected: list[str] = []
+        end_offset = offset
+        while end_offset < len(normalized_layout) and len(" ".join(collected).split()) < token_count:
+            candidate = normalized_layout[end_offset]
+            if candidate:
+                collected.append(candidate)
+            end_offset += 1
+        if " ".join(collected).split() == tokens:
".join(collected).split() == tokens: + return offset + 1, end_offset, end_offset + return 0, 0, start_index + + def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle: raw = run_catdoc(source_path) cleaned = clean_text(raw) @@ -47,6 +136,8 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass layout_path = doc_out / "document.layout.json" tables_path = doc_out / "document.tables.json" figures_path = doc_out / "document.figures.json" + chunks_path = doc_out / "document.chunks.json" + chunks = _build_document_chunks(title, body, layout_body, tables) markdown_path.write_text(render_markdown(title, body, tables, figure_refs, related_assets), encoding="utf-8") write_json(layout_path, layout) @@ -68,6 +159,7 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass "related_assets": [asset.model_dump() for asset in related_assets], }, ) + write_json(chunks_path, {"chunks": [chunk.model_dump() for chunk in chunks]}) return DocumentBundle( document_id=slugify(title), @@ -80,9 +172,11 @@ def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_ass layout_path=_relative_to_root(layout_path, out_root), tables_path=_relative_to_root(tables_path, out_root), figures_path=_relative_to_root(figures_path, out_root), + chunks_path=_relative_to_root(chunks_path, out_root), bundle_path_kind="bundle_root_relative", table_count=len(tables), figure_reference_count=len(figure_refs), + chunk_count=len(chunks), ) diff --git a/src/doclift/schemas.py b/src/doclift/schemas.py index f3957a0..178364f 100755 --- a/src/doclift/schemas.py +++ b/src/doclift/schemas.py @@ -31,6 +31,16 @@ class FigureAsset(BaseModel): looks_like_figure: bool = False +class DocumentChunk(BaseModel): + chunk_id: str + role: str = "summary" + section: str = "" + line_start: int = 0 + line_end: int = 0 + text: str + confidence_hint: float = 0.75 + + class DocumentBundle(BaseModel): document_id: str title: str @@ -42,9 +52,11 @@ class DocumentBundle(BaseModel): layout_path: str tables_path: str figures_path: str + chunks_path: str bundle_path_kind: str = "bundle_root_relative" table_count: int = 0 figure_reference_count: int = 0 + chunk_count: int = 0 class ConversionReport(BaseModel): diff --git a/tests/test_convert.py b/tests/test_convert.py index 56d2455..8c7bdeb 100755 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -40,11 +40,18 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path, encoding="utf-8" ) ) + chunks_payload = json.loads( + (out_root / "documents" / "sample-lecture-1-example-legacy-document" / "document.chunks.json").read_text( + encoding="utf-8" + ) + ) assert manifest["document_count"] == 1 assert manifest["source_root"] == "src" assert manifest["documents"][0]["source_path"] == "sample.doc" assert manifest["documents"][0]["markdown_path"] == "documents/sample-lecture-1-example-legacy-document/document.md" + assert manifest["documents"][0]["chunks_path"] == "documents/sample-lecture-1-example-legacy-document/document.chunks.json" + assert manifest["documents"][0]["chunk_count"] == 1 assert conversion_report["summary"]["documents_with_tables"] == 1 assert conversion_report["summary"]["documents_with_figure_references"] == 1 assert figures_payload["source_path"] == "sample.doc" @@ -52,3 +59,8 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path, assert figures_payload["figure_references"] == ["Fig. 
5.1"] assert len(figures_payload["related_assets"]) == 1 assert figures_payload["related_assets"][0]["path"] == "Fig. 5.1.bmp" + assert len(chunks_payload["chunks"]) == 1 + assert chunks_payload["chunks"][0]["chunk_id"] == "lecture-1-example-legacy-document-c1" + assert chunks_payload["chunks"][0]["role"] == "summary" + assert chunks_payload["chunks"][0]["line_start"] >= 1 + assert chunks_payload["chunks"][0]["text"] == "See Fig. 5.1 and Table 1."