Add doclift claim tournament framework

This commit is contained in:
welsberr 2026-05-08 02:20:22 -04:00
parent 8201dd83ee
commit 54ed7568b6
12 changed files with 590 additions and 21 deletions

View File

@ -0,0 +1,30 @@
# Doclift Claim Tournament

This benchmark is a small evaluation harness for comparing multiple
doclift prose-claim extraction strategies before changing the default
GroundRecall import behavior.

Current tracks:

- `conservative`: prefers higher precision and sentence-level claims.
- `broad`: allows paragraph-level claims and shorter sentence candidates to
  improve recall.

Judge criteria:

- maximize F1 against the benchmark gold claims
- prefer higher recall when F1 ties
- penalize meta or identity-claim noise
- prefer predicted claim counts close to the gold-set size

Fixture location:

- `tests/fixtures/doclift_claim_eval/`

Primary entrypoint:

- `groundrecall.doclift_claim_tournament.evaluate_doclift_claim_tracks(...)`

This is intentionally small and deterministic. It is meant to support an
iterative tournament workflow, not to serve as a full evaluation platform by
itself.
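
A minimal usage sketch against the bundled fixture (run from the repository
root; the result keys match what `evaluate_doclift_claim_tracks` returns):

```python
from groundrecall.doclift_claim_tournament import evaluate_doclift_claim_tracks

# Score both tracks against the fixture bundle and its gold claims.
result = evaluate_doclift_claim_tracks(
    "tests/fixtures/doclift_claim_eval",
    "tests/fixtures/doclift_claim_eval/benchmark.json",
)

print(result["judge_summary"]["winner"])  # "conservative" or "broad"
for strategy, totals in result["judge_summary"]["tracks"].items():
    print(strategy, round(totals["f1"], 3), round(totals["recall"], 3))
```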

View File

@ -0,0 +1,175 @@
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from .groundrecall_source_adapters.doclift_bundle import DocliftBundleSourceAdapter
_TOKEN_RE = re.compile(r"[a-z0-9]+")
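# Substrings that mark meta/identity noise rather than substantive prose
# claims (e.g. the generic "<title> is a web_article in the imported doclift
# bundle" fallback); the judge penalizes tracks that emit them.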
_META_PATTERNS = (
"is a web_article in the imported doclift bundle",
"is a bibliography_topic in the imported doclift bundle",
"this essay has been transferred here",
)
@dataclass
class ClaimTrackScore:
strategy: str
predicted_claims: list[str]
gold_claims: list[str]
matches: int
precision: float
recall: float
f1: float
meta_noise: int
def as_dict(self) -> dict[str, Any]:
return {
"strategy": self.strategy,
"predicted_claims": list(self.predicted_claims),
"gold_claims": list(self.gold_claims),
"matches": self.matches,
"precision": self.precision,
"recall": self.recall,
"f1": self.f1,
"meta_noise": self.meta_noise,
}
def _normalize_tokens(text: str) -> set[str]:
return set(_TOKEN_RE.findall(text.lower()))
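# Jaccard similarity over the normalized token sets. For example, comparing
# "drift is random" with "drift is important" shares {"drift", "is"} out of
# four distinct tokens, giving an overlap of 0.5.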
def _claim_overlap(a: str, b: str) -> float:
left = _normalize_tokens(a)
right = _normalize_tokens(b)
if not left or not right:
return 0.0
return len(left & right) / len(left | right)
def _is_meta_noise(text: str) -> bool:
lowered = text.lower()
return any(pattern in lowered for pattern in _META_PATTERNS)
def _score_track(predicted_claims: list[str], gold_claims: list[str], strategy: str) -> ClaimTrackScore:
matched_gold: set[int] = set()
matches = 0
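# Greedy one-to-one matching in predicted order: each gold claim can be
# consumed at most once, and a pair only counts as a match when its token
# overlap reaches 0.34 (just over one third of the combined vocabulary).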
for predicted in predicted_claims:
best_index = None
best_score = 0.0
for index, gold in enumerate(gold_claims):
if index in matched_gold:
continue
overlap = _claim_overlap(predicted, gold)
if overlap > best_score:
best_score = overlap
best_index = index
if best_index is not None and best_score >= 0.34:
matched_gold.add(best_index)
matches += 1
precision = matches / len(predicted_claims) if predicted_claims else 0.0
recall = matches / len(gold_claims) if gold_claims else 0.0
f1 = (2 * precision * recall / (precision + recall)) if precision and recall else 0.0
meta_noise = sum(1 for claim in predicted_claims if _is_meta_noise(claim))
return ClaimTrackScore(
strategy=strategy,
predicted_claims=predicted_claims,
gold_claims=gold_claims,
matches=matches,
precision=precision,
recall=recall,
f1=f1,
meta_noise=meta_noise,
)
def _winner_key(score: ClaimTrackScore) -> tuple[float, float, float, float]:
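# Tuple sort key mirroring the judge criteria: higher f1 wins, recall breaks
# f1 ties, fewer meta-noise claims rank higher, and a predicted-claim count
# closer to the gold-set size breaks any remaining ties.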
return (
score.f1,
score.recall,
-float(score.meta_noise),
-abs(len(score.predicted_claims) - len(score.gold_claims)),
)
def evaluate_doclift_claim_tracks(bundle_root: str | Path, benchmark_path: str | Path) -> dict[str, Any]:
base = Path(bundle_root)
benchmark = json.loads(Path(benchmark_path).read_text(encoding="utf-8"))
manifest = json.loads((base / "manifest.json").read_text(encoding="utf-8"))
adapter = DocliftBundleSourceAdapter()
documents = {str(item.get("document_id")): item for item in manifest.get("documents", []) if isinstance(item, dict)}
per_document: list[dict[str, Any]] = []
aggregate: dict[str, dict[str, float]] = {
"conservative": {"matches": 0.0, "predicted": 0.0, "gold": 0.0, "meta_noise": 0.0},
"broad": {"matches": 0.0, "predicted": 0.0, "gold": 0.0, "meta_noise": 0.0},
}
for entry in benchmark.get("documents", []):
document_id = str(entry["document_id"])
document = documents[document_id]
gold_claims = [str(item).strip() for item in entry.get("gold_claims", []) if str(item).strip()]
track_scores = []
for strategy in ("conservative", "broad"):
predicted_claims = adapter.extract_document_claims(base, document, strategy=strategy, limit=6)
score = _score_track(predicted_claims, gold_claims, strategy)
track_scores.append(score)
aggregate[strategy]["matches"] += score.matches
aggregate[strategy]["predicted"] += len(score.predicted_claims)
aggregate[strategy]["gold"] += len(score.gold_claims)
aggregate[strategy]["meta_noise"] += score.meta_noise
winner = max(track_scores, key=_winner_key)
per_document.append(
{
"document_id": document_id,
"title": str(document.get("title") or ""),
"winner": winner.strategy,
"tracks": [score.as_dict() for score in track_scores],
}
)
judge_summary: dict[str, Any] = {"tracks": {}}
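# Micro-averaged track metrics: match/predicted/gold counts are pooled across
# all documents before precision, recall, and f1 are computed per track.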
for strategy, totals in aggregate.items():
precision = totals["matches"] / totals["predicted"] if totals["predicted"] else 0.0
recall = totals["matches"] / totals["gold"] if totals["gold"] else 0.0
f1 = (2 * precision * recall / (precision + recall)) if precision and recall else 0.0
judge_summary["tracks"][strategy] = {
"matches": int(totals["matches"]),
"predicted_claims": int(totals["predicted"]),
"gold_claims": int(totals["gold"]),
"precision": precision,
"recall": recall,
"f1": f1,
"meta_noise": int(totals["meta_noise"]),
}
judge_summary["winner"] = max(
judge_summary["tracks"].items(),
key=lambda item: (
item[1]["f1"],
item[1]["recall"],
-float(item[1]["meta_noise"]),
-abs(item[1]["predicted_claims"] - item[1]["gold_claims"]),
),
)[0]
judge_summary["criteria"] = [
"maximize f1 against gold claims",
"prefer higher recall when f1 ties",
"penalize meta/identity claim noise",
"prefer predicted claim counts close to gold-set size",
]
return {
"bundle_root": str(base),
"benchmark_path": str(benchmark_path),
"per_document": per_document,
"judge_summary": judge_summary,
}

View File

@ -1,6 +1,7 @@
from __future__ import annotations
import json
import re
from hashlib import sha256
from pathlib import Path
@ -10,10 +11,30 @@ from .base import DiscoveredImportSource, StructuredImportRows, register_source_
class DocliftBundleSourceAdapter:
name = "doclift_bundle"
_PROSE_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
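# Lines beginning with these prefixes are page boilerplate (blog chrome,
# share widgets, post navigation) rather than claim-bearing prose.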
_METADATA_PREFIXES = (
"posted by",
"share to ",
"email this",
"blogthis",
"labels:",
"post a comment",
"older post",
"newer post",
"subscribe to",
"copyright",
"[last update",
"this essay has been transferred here",
)
def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path:
if value is None:
return Path()
if isinstance(value, str) and not value.strip():
return Path()
path = Path(value)
if not str(path):
return Path()
if path.is_absolute():
return path
return base / path
@ -39,6 +60,131 @@ class DocliftBundleSourceAdapter:
return [chunk for chunk in payload if isinstance(chunk, dict)]
return []
def _load_markdown_text(self, base: Path, document: dict) -> str:
markdown_path = self._resolve_bundle_path(base, document.get("markdown_path", ""))
if not markdown_path.is_file():
return ""
return markdown_path.read_text(encoding="utf-8")
def _normalize_inline_text(self, value: str) -> str:
text = value.replace("\xa0", " ")
text = re.sub(r"\[[^\]]+\]\([^)]+\)", "", text)
text = re.sub(r"\[[^\]]+\]", "", text)
text = re.sub(r"\s+", " ", text)
return text.strip()
def _looks_like_metadata_line(self, value: str) -> bool:
lowered = value.strip().lower()
if not lowered:
return True
if any(lowered.startswith(prefix) for prefix in self._METADATA_PREFIXES):
return True
if lowered in {"home", "sandwalk", "comments", "recent comments", "loading..."}:
return True
if "property='og:" in lowered or lowered.startswith("http"):
return True
return False
def _is_claim_candidate(self, cleaned: str, *, title: str = "", strategy: str = "conservative") -> bool:
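# Track-specific gates: conservative keeps fuller sentences (70-360 chars),
# broad admits shorter candidates and paragraph-length spans (40-520 chars).
# Both reject boilerplate prefixes, bare title echoes, and anything under
# nine words.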
lowered = cleaned.lower()
normalized_title = self._normalize_inline_text(title).lower()
min_length = 70 if strategy == "conservative" else 40
if len(cleaned) < min_length:
return False
if strategy == "conservative" and len(cleaned) > 360:
return False
if strategy == "broad" and len(cleaned) > 520:
return False
if any(lowered.startswith(prefix) for prefix in self._METADATA_PREFIXES):
return False
if normalized_title and lowered == normalized_title:
return False
if cleaned.count(" ") < 8:
return False
return True
def _extract_claim_sentences_from_paragraphs(
self,
paragraphs: list[str],
*,
title: str = "",
limit: int = 4,
strategy: str = "conservative",
) -> list[str]:
claims: list[str] = []
seen: set[str] = set()
for paragraph in paragraphs:
normalized_paragraph = self._normalize_inline_text(paragraph)
if len(normalized_paragraph) < 80:
continue
if strategy == "broad":
paragraph_key = normalized_paragraph.lower()
if self._is_claim_candidate(normalized_paragraph, title=title, strategy=strategy) and paragraph_key not in seen:
seen.add(paragraph_key)
claims.append(normalized_paragraph)
if len(claims) >= limit:
return claims
for sentence in self._PROSE_SENTENCE_SPLIT.split(normalized_paragraph):
cleaned = self._normalize_inline_text(sentence)
lowered = cleaned.lower()
if not self._is_claim_candidate(cleaned, title=title, strategy=strategy):
continue
if lowered in seen:
continue
seen.add(lowered)
claims.append(cleaned)
if len(claims) >= limit:
return claims
return claims
def _extract_claim_sentences(self, markdown_text: str, *, title: str = "", limit: int = 4, strategy: str = "conservative") -> list[str]:
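# Assemble prose paragraphs from the markdown: blank lines close a paragraph,
# while headings, images, tables, metadata lines, and lines shorter than 40
# characters are skipped before sentence splitting.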
paragraphs: list[str] = []
current: list[str] = []
for raw_line in markdown_text.splitlines():
line = raw_line.strip()
if not line:
if current:
paragraphs.append(" ".join(current))
current = []
continue
if line.startswith("#") or line.startswith("![") or line.startswith("|"):
continue
if self._looks_like_metadata_line(line):
continue
if len(line) < 40:
continue
current.append(line)
if current:
paragraphs.append(" ".join(current))
if strategy == "broad":
broad_claims = self._extract_claim_sentences_from_paragraphs(
paragraphs,
title=title,
limit=max(limit * 2, limit),
strategy="broad",
)
if len(broad_claims) >= limit:
return broad_claims[:limit]
return broad_claims
return self._extract_claim_sentences_from_paragraphs(
paragraphs,
title=title,
limit=limit,
strategy="conservative",
)
def extract_document_claims(
self,
base: Path,
document: dict,
*,
strategy: str = "conservative",
limit: int = 4,
) -> list[str]:
markdown_text = self._load_markdown_text(base, document)
title = str(document.get("title") or "")
return self._extract_claim_sentences(markdown_text, title=title, limit=limit, strategy=strategy)
def discover(self, root: str | Path) -> list[DiscoveredImportSource]:
base = Path(root)
rows: list[DiscoveredImportSource] = []
@ -108,7 +254,7 @@ class DocliftBundleSourceAdapter:
artifact_id = artifact_by_path.get(str(relative_markdown), "")
figures_path = self._resolve_bundle_path(base, document.get("figures_path", ""))
figure_payload = {}
-if figures_path.exists():
if figures_path.is_file():
figure_payload = json.loads(figures_path.read_text(encoding="utf-8"))
source_path = str(figure_payload.get("source_path") or document.get("source_path") or relative_markdown)
source_path_kind = str(figure_payload.get("source_path_kind") or document.get("source_path_kind") or "source_root_relative")
@ -144,22 +290,7 @@ class DocliftBundleSourceAdapter:
"current_status": "draft",
}
)
-claim_rows.append(
-{
-"claim_id": f"clm_doclift_{index}",
-"import_id": context.import_id,
-"claim_text": f"{title} is a {document.get('document_kind', 'document')} in the imported doclift bundle.",
-"claim_kind": "summary",
-"source_observation_ids": [observation_id],
-"supporting_fragment_ids": [],
-"concept_ids": [concept_id],
-"contradicts_claim_ids": [],
-"supersedes_claim_ids": [],
-"confidence_hint": 0.85,
-"grounding_status": "grounded",
-"current_status": "triaged",
-}
-)
document_claim_ids: list[str] = []
for chunk_index, chunk in enumerate(self._load_chunks(base, document), start=1):
chunk_text = str(chunk.get("text") or "").strip()
if not chunk_text:
@ -209,9 +340,10 @@ class DocliftBundleSourceAdapter:
}
)
if chunk_role in {"claim", "summary"}:
claim_id = f"clm_doclift_{index}_{chunk_index}"
claim_rows.append(
{
"claim_id": f"clm_doclift_{index}_{chunk_index}",
"claim_id": claim_id,
"import_id": context.import_id,
"claim_text": chunk_text,
"claim_kind": "statement" if chunk_role == "claim" else "summary",
@ -225,6 +357,69 @@ class DocliftBundleSourceAdapter:
"current_status": "triaged",
}
)
document_claim_ids.append(claim_id)
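# Fallback ladder: if no chunk yielded a claim, derive conservative prose
# claims from the document markdown (web_article/document kinds only); if
# that still produces nothing, emit a single identity-summary claim so the
# document is never claim-free.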
if not document_claim_ids and str(document.get("document_kind") or "").strip() in {"web_article", "document"}:
for derived_index, claim_text in enumerate(self.extract_document_claims(base, document, strategy="conservative"), start=1):
derived_observation_id = f"obs_doclift_{index}_derived_{derived_index}"
claim_id = f"clm_doclift_{index}_derived_{derived_index}"
observation_rows.append(
{
"observation_id": derived_observation_id,
"import_id": context.import_id,
"artifact_id": artifact_id,
"role": "claim",
"text": claim_text,
"origin_path": relative_markdown,
"origin_section": title,
"line_start": 0,
"line_end": 0,
"source_url": source_path,
"metadata": {
"source_path_kind": source_path_kind,
"derived_from": "markdown_sentence",
},
"grounding_status": "grounded",
"support_kind": "direct_source",
"confidence_hint": 0.65,
"current_status": "draft",
}
)
claim_rows.append(
{
"claim_id": claim_id,
"import_id": context.import_id,
"claim_text": claim_text,
"claim_kind": "statement",
"source_observation_ids": [derived_observation_id],
"supporting_fragment_ids": [],
"concept_ids": [concept_id],
"contradicts_claim_ids": [],
"supersedes_claim_ids": [],
"confidence_hint": 0.65,
"grounding_status": "grounded",
"current_status": "triaged",
}
)
document_claim_ids.append(claim_id)
if not document_claim_ids:
fallback_claim_id = f"clm_doclift_{index}"
claim_rows.append(
{
"claim_id": fallback_claim_id,
"import_id": context.import_id,
"claim_text": f"{title} is a {document.get('document_kind', 'document')} in the imported doclift bundle.",
"claim_kind": "summary",
"source_observation_ids": [observation_id],
"supporting_fragment_ids": [],
"concept_ids": [concept_id],
"contradicts_claim_ids": [],
"supersedes_claim_ids": [],
"confidence_hint": 0.85,
"grounding_status": "grounded",
"current_status": "triaged",
}
)
document_claim_ids.append(fallback_claim_id)
if previous_concept_id is not None:
relation_rows.append(
{
@ -233,7 +428,7 @@ class DocliftBundleSourceAdapter:
"source_id": previous_concept_id,
"target_id": concept_id,
"relation_type": "references",
"evidence_ids": [f"clm_doclift_{index}"],
"evidence_ids": document_claim_ids[:1],
"current_status": "draft",
}
)

View File

@ -460,6 +460,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
concept_reviews.append(
{
"concept_id": concept.concept_id,
"label": concept.title,
"title": concept.title,
"status": concept.status,
"description": concept.description,

View File

@ -0,0 +1,18 @@
{
"documents": [
{
"document_id": "intro-essay",
"gold_claims": [
"Evolution is a change in the gene pool of a population over time.",
"Populations evolve, but individual organisms do not evolve during their lifetimes."
]
},
{
"document_id": "drift-essay",
"gold_claims": [
"Random genetic drift is a fundamental and important part of evolution.",
"Neutral and slightly deleterious alleles can be fixed in a population by random genetic drift."
]
}
]
}

View File

@ -0,0 +1,12 @@
{
"chunks": [
{
"chunk_id": "drift-essay-body-1",
"role": "body",
"section": "Drift Essay",
"text": "This essay has been transferred here from an old server that has been decommissioned. Random genetic drift is a fundamental and important part of evolution. Neutral and slightly deleterious alleles can be fixed in a population by random genetic drift.",
"line_start": 1,
"line_end": 5
}
]
}

View File

@ -0,0 +1,9 @@
# Drift Essay
This essay has been transferred here from an old server that has been decommissioned.
Random genetic drift is a fundamental and important part of evolution.
Neutral and slightly deleterious alleles can be fixed in a population by random genetic drift.
Posted by Example Author

View File

@ -0,0 +1,12 @@
{
"chunks": [
{
"chunk_id": "intro-essay-body-1",
"role": "body",
"section": "Intro Essay",
"text": "Evolution is a change in the gene pool of a population over time. Populations evolve. Individual organisms do not evolve, because they retain the same genes throughout their lives.",
"line_start": 1,
"line_end": 4
}
]
}

View File

@ -0,0 +1,9 @@
# Intro Essay
Introduction to Evolutionary Biology
Evolution is a change in the gene pool of a population over time.
Populations evolve. Individual organisms do not evolve, because they retain the same genes throughout their lives.
Posted by Example Author

View File

@ -0,0 +1,20 @@
{
"documents": [
{
"document_id": "intro-essay",
"title": "Intro Essay",
"document_kind": "web_article",
"output_dir": "documents/intro-essay",
"markdown_path": "documents/intro-essay/document.md",
"chunks_path": "documents/intro-essay/document.chunks.json"
},
{
"document_id": "drift-essay",
"title": "Drift Essay",
"document_kind": "web_article",
"output_dir": "documents/drift-essay",
"markdown_path": "documents/drift-essay/document.md",
"chunks_path": "documents/drift-essay/document.chunks.json"
}
]
}

View File

@ -0,0 +1,30 @@
from __future__ import annotations
from pathlib import Path
from groundrecall.doclift_claim_tournament import evaluate_doclift_claim_tracks
def _fixture_root() -> Path:
return Path(__file__).parent / "fixtures" / "doclift_claim_eval"
def test_doclift_claim_tournament_scores_two_tracks() -> None:
root = _fixture_root()
result = evaluate_doclift_claim_tracks(root, root / "benchmark.json")
assert result["judge_summary"]["winner"] in {"conservative", "broad"}
assert set(result["judge_summary"]["tracks"].keys()) == {"conservative", "broad"}
assert len(result["per_document"]) == 2
intro = next(item for item in result["per_document"] if item["document_id"] == "intro-essay")
assert intro["tracks"][0]["predicted_claims"]
assert intro["tracks"][1]["predicted_claims"]
def test_doclift_claim_tournament_broad_track_improves_recall_on_fixture() -> None:
root = _fixture_root()
result = evaluate_doclift_claim_tracks(root, root / "benchmark.json")
tracks = result["judge_summary"]["tracks"]
assert tracks["broad"]["recall"] >= tracks["conservative"]["recall"]
assert tracks["broad"]["matches"] >= tracks["conservative"]["matches"]

View File

@ -1,5 +1,6 @@
from __future__ import annotations
import json
from pathlib import Path
import shutil
@ -286,9 +287,66 @@ def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) ->
concept_ids = {item["concept_id"] for item in result.concepts}
assert "concept::lecture-1" in concept_ids
claim_ids = {item["claim_id"] for item in result.claims}
assert "clm_doclift_1" in claim_ids
assert "clm_doclift_1_1" in claim_ids
assert "clm_doclift_1" not in claim_ids
assert result.observations[0]["source_url"] == "legacy/lecture-1.doc"
assert len(result.fragments) == 2
assert result.fragments[0]["metadata"]["source_kind"] == "doclift_chunk"
-assert result.claims[1]["supporting_fragment_ids"] == ["frag_doclift_1_1"]
claim_by_id = {item["claim_id"]: item for item in result.claims}
assert claim_by_id["clm_doclift_1_1"]["supporting_fragment_ids"] == ["frag_doclift_1_1"]
def test_doclift_bundle_import_derives_claims_from_prose_when_chunks_are_body_only(tmp_path: Path) -> None:
root = tmp_path / "doclift_bundle_prose"
document_dir = root / "documents" / "essay-1"
document_dir.mkdir(parents=True)
(root / "manifest.json").write_text(
'{\n'
' "documents": [\n'
' {\n'
' "document_id": "essay-1",\n'
' "title": "Drift Essay",\n'
' "document_kind": "web_article",\n'
' "output_dir": "documents/essay-1",\n'
' "markdown_path": "documents/essay-1/document.md"\n'
' }\n'
' ]\n'
'}\n',
encoding="utf-8",
)
(document_dir / "document.md").write_text(
"\n".join(
[
"# Drift Essay",
"",
"Random genetic drift can dominate allele-frequency change in small populations.",
"This matters because many alleles are fixed or lost without any adaptive advantage.",
"",
"Posted by Example Author",
]
),
encoding="utf-8",
)
(document_dir / "document.chunks.json").write_text(
json.dumps(
{
"chunks": [
{
"chunk_id": "essay-1-body-1",
"role": "body",
"section": "Drift Essay",
"text": "Random genetic drift can dominate allele-frequency change in small populations.",
"line_start": 1,
"line_end": 2,
}
]
}
),
encoding="utf-8",
)
result = run_groundrecall_import(root, mode="quick", import_id="doclift-prose-test")
claim_texts = [item["claim_text"] for item in result.claims]
assert any("Random genetic drift can dominate allele-frequency change in small populations." in text for text in claim_texts)
assert not any(text == "Drift Essay is a web_article in the imported doclift bundle." for text in claim_texts)