diff --git a/docs/doclift-claim-tournament.md b/docs/doclift-claim-tournament.md
new file mode 100644
index 0000000..f4cc096
--- /dev/null
+++ b/docs/doclift-claim-tournament.md
@@ -0,0 +1,30 @@
+# Doclift Claim Tournament
+
+This is a small benchmark harness for comparing doclift prose-claim
+extraction strategies against each other before any change is made to
+the default GroundRecall import behavior.
+
+Current tracks:
+
+- `conservative`: prefers higher precision and sentence-level claims.
+- `broad`: allows paragraph-level claims and shorter sentence candidates to
+  improve recall.
+
+Judge criteria:
+
+- maximize F1 against the benchmark gold claims
+- prefer higher recall when F1 ties
+- penalize meta or identity-claim noise
+- prefer predicted claim counts close to the gold-set size
+
+Fixture location:
+
+- `tests/fixtures/doclift_claim_eval/`
+
+Primary entrypoint:
+
+- `groundrecall.doclift_claim_tournament.evaluate_doclift_claim_tracks(...)`
+
+The harness is intentionally small and deterministic. It is meant to support
+an iterative tournament workflow, not to serve as a full evaluation platform
+by itself.
diff --git a/src/groundrecall/doclift_claim_tournament.py b/src/groundrecall/doclift_claim_tournament.py
new file mode 100644
index 0000000..8e0226e
--- /dev/null
+++ b/src/groundrecall/doclift_claim_tournament.py
@@ -0,0 +1,175 @@
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from .groundrecall_source_adapters.doclift_bundle import DocliftBundleSourceAdapter
+
+
+_TOKEN_RE = re.compile(r"[a-z0-9]+")  # lowercase alphanumeric tokens used for overlap scoring
+_META_PATTERNS = (
+    "is a web_article in the imported doclift bundle",
+    "is a bibliography_topic in the imported doclift bundle",
+    "this essay has been transferred here",
+)
+
+
+@dataclass
+class ClaimTrackScore:
+    strategy: str
+    predicted_claims: list[str]
+    gold_claims: list[str]
+    matches: int
+    precision: float
+    recall: float
+    f1: float
+    meta_noise: int
+
+    def as_dict(self) -> dict[str, Any]:
+        return {
+            "strategy": self.strategy,
+            "predicted_claims": list(self.predicted_claims),
+            "gold_claims": list(self.gold_claims),
+            "matches": self.matches,
+            "precision": self.precision,
+            "recall": self.recall,
+            "f1": self.f1,
+            "meta_noise": self.meta_noise,
+        }
+
+
+def _normalize_tokens(text: str) -> set[str]:
+    return set(_TOKEN_RE.findall(text.lower()))
+
+
+def _claim_overlap(a: str, b: str) -> float:
+    left = _normalize_tokens(a)
+    right = _normalize_tokens(b)
+    if not left or not right:
+        return 0.0
+    return len(left & right) / len(left | right)
+
+
+def _is_meta_noise(text: str) -> bool:
+    lowered = text.lower()
+    return any(pattern in lowered for pattern in _META_PATTERNS)
+
+
+def _score_track(predicted_claims: list[str], gold_claims: list[str], strategy: str) -> ClaimTrackScore:
+    matched_gold: set[int] = set()
+    matches = 0
+    for predicted in predicted_claims:
+        best_index = None
+        best_score = 0.0
+        for index, gold in enumerate(gold_claims):
+            if index in matched_gold:
+                continue
+            overlap = _claim_overlap(predicted, gold)
+            if overlap > best_score:
+                best_score = overlap
+                best_index = index
+        if best_index is not None and best_score >= 0.34:  # greedy match at roughly one-third Jaccard overlap
+            matched_gold.add(best_index)
+            matches += 1
+
+    precision = matches / len(predicted_claims) if predicted_claims else 0.0
+    recall = matches / len(gold_claims) if gold_claims else 0.0
+    f1 = (2 * precision * recall / (precision + recall)) if precision and recall else 0.0
+    meta_noise = sum(1 for claim in predicted_claims if _is_meta_noise(claim))
+    return ClaimTrackScore(
+        strategy=strategy,
+        predicted_claims=predicted_claims,
+        gold_claims=gold_claims,
+        matches=matches,
+        precision=precision,
+        recall=recall,
+        f1=f1,
+        meta_noise=meta_noise,
+    )
+
+
+def _winner_key(score: ClaimTrackScore) -> tuple[float, float, float, float]:
+    return (  # ranked by f1, then recall, then less meta noise, then claim count closest to gold
+        score.f1,
+        score.recall,
+        -float(score.meta_noise),
+        -abs(len(score.predicted_claims) - len(score.gold_claims)),
+    )
+
+
+def evaluate_doclift_claim_tracks(bundle_root: str | Path, benchmark_path: str | Path) -> dict[str, Any]:
+    base = Path(bundle_root)
+    benchmark = json.loads(Path(benchmark_path).read_text(encoding="utf-8"))
+    manifest = json.loads((base / "manifest.json").read_text(encoding="utf-8"))
+    adapter = DocliftBundleSourceAdapter()
+    documents = {str(item.get("document_id")): item for item in manifest.get("documents", []) if isinstance(item, dict)}
+
+    per_document: list[dict[str, Any]] = []
+    aggregate: dict[str, dict[str, float]] = {
+        "conservative": {"matches": 0.0, "predicted": 0.0, "gold": 0.0, "meta_noise": 0.0},
+        "broad": {"matches": 0.0, "predicted": 0.0, "gold": 0.0, "meta_noise": 0.0},
+    }
+
+    for entry in benchmark.get("documents", []):
+        document_id = str(entry["document_id"])
+        document = documents[document_id]
+        gold_claims = [str(item).strip() for item in entry.get("gold_claims", []) if str(item).strip()]
+        track_scores = []
+        for strategy in ("conservative", "broad"):
+            predicted_claims = adapter.extract_document_claims(base, document, strategy=strategy, limit=6)
+            score = _score_track(predicted_claims, gold_claims, strategy)
+            track_scores.append(score)
+            aggregate[strategy]["matches"] += score.matches
+            aggregate[strategy]["predicted"] += len(score.predicted_claims)
+            aggregate[strategy]["gold"] += len(score.gold_claims)
+            aggregate[strategy]["meta_noise"] += score.meta_noise
+        winner = max(track_scores, key=_winner_key)
+        per_document.append(
+            {
+                "document_id": document_id,
+                "title": str(document.get("title") or ""),
+                "winner": winner.strategy,
+                "tracks": [score.as_dict() for score in track_scores],
+            }
+        )
+
+    judge_summary: dict[str, Any] = {"tracks": {}}
+    for strategy, totals in aggregate.items():
+        precision = totals["matches"] / totals["predicted"] if totals["predicted"] else 0.0
+        recall = totals["matches"] / totals["gold"] if totals["gold"] else 0.0
+        f1 = (2 * precision * recall / (precision + recall)) if precision and recall else 0.0
+        judge_summary["tracks"][strategy] = {
+            "matches": int(totals["matches"]),
+            "predicted_claims": int(totals["predicted"]),
+            "gold_claims": int(totals["gold"]),
+            "precision": precision,
+            "recall": recall,
+            "f1": f1,
+            "meta_noise": int(totals["meta_noise"]),
+        }
+
+    judge_summary["winner"] = max(
+        judge_summary["tracks"].items(),
+        key=lambda item: (
+            item[1]["f1"],
+            item[1]["recall"],
+            -float(item[1]["meta_noise"]),
+            -abs(item[1]["predicted_claims"] - item[1]["gold_claims"]),
+        ),
+    )[0]
+    judge_summary["criteria"] = [
+        "maximize f1 against gold claims",
+        "prefer higher recall when f1 ties",
+        "penalize meta/identity claim noise",
+        "prefer predicted claim counts close to gold-set size",
+    ]
+
+    return {
+        "bundle_root": str(base),
+        "benchmark_path": str(benchmark_path),
+        "per_document": per_document,
+        "judge_summary": judge_summary,
+    }
diff --git a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
index fde221c..1d71da3 100755
--- a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
+++ b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import re
 from hashlib import sha256
 from pathlib import Path
 
@@ -10,10 +11,30 @@ from .base import DiscoveredImportSource, StructuredImportRows, register_source_
 class DocliftBundleSourceAdapter:
     name = "doclift_bundle"
 
+    _PROSE_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")  # split after sentence-ending punctuation
+    _METADATA_PREFIXES = (
+        "posted by",
+        "share to ",
+        "email this",
+        "blogthis",
+        "labels:",
+        "post a comment",
+        "older post",
+        "newer post",
+        "subscribe to",
+        "copyright",
+        "[last update",
+        "this essay has been transferred here",
+    )
+
     def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path:
         if value is None:
             return Path()
+        if isinstance(value, str) and not value.strip():
+            return Path()
         path = Path(value)
+        if not str(path):
+            return Path()
         if path.is_absolute():
             return path
         return base / path
@@ -39,6 +60,131 @@ class DocliftBundleSourceAdapter:
             return [chunk for chunk in payload if isinstance(chunk, dict)]
         return []
 
+    def _load_markdown_text(self, base: Path, document: dict) -> str:
+        markdown_path = self._resolve_bundle_path(base, document.get("markdown_path", ""))
+        if not markdown_path.exists():
+            return ""
+        return markdown_path.read_text(encoding="utf-8")
+
+    def _normalize_inline_text(self, value: str) -> str:
+        text = value.replace("\xa0", " ")
+        text = re.sub(r"\[[^\]]+\]\([^)]+\)", "", text)
+        text = re.sub(r"\[[^\]]+\]", "", text)
+        text = re.sub(r"\s+", " ", text)
+        return text.strip()
+
+    def _looks_like_metadata_line(self, value: str) -> bool:
+        lowered = value.strip().lower()
+        if not lowered:
+            return True
+        if any(lowered.startswith(prefix) for prefix in self._METADATA_PREFIXES):
+            return True
+        if lowered in {"home", "sandwalk", "comments", "recent comments", "loading..."}:
+            return True
+        if "property='og:" in lowered or lowered.startswith("http"):
+            return True
+        return False
+
+    def _is_claim_candidate(self, cleaned: str, *, title: str = "", strategy: str = "conservative") -> bool:
+        lowered = cleaned.lower()
+        normalized_title = self._normalize_inline_text(title).lower()
+        min_length = 70 if strategy == "conservative" else 40
+        if len(cleaned) < min_length:
+            return False
+        if strategy == "conservative" and len(cleaned) > 360:
+            return False
+        if strategy == "broad" and len(cleaned) > 520:
+            return False
+        if any(lowered.startswith(prefix) for prefix in self._METADATA_PREFIXES):
+            return False
+        if normalized_title and lowered == normalized_title:
+            return False
+        if cleaned.count(" ") < 8:
+            return False
+        return True
+
+    def _extract_claim_sentences_from_paragraphs(
+        self,
+        paragraphs: list[str],
+        *,
+        title: str = "",
+        limit: int = 4,
+        strategy: str = "conservative",
+    ) -> list[str]:
+        claims: list[str] = []
+        seen: set[str] = set()
+        for paragraph in paragraphs:
+            normalized_paragraph = self._normalize_inline_text(paragraph)
+            if len(normalized_paragraph) < 80:
+                continue
+            if strategy == "broad":
+                paragraph_key = normalized_paragraph.lower()
+                if self._is_claim_candidate(normalized_paragraph, title=title, strategy=strategy) and paragraph_key not in seen:
+                    seen.add(paragraph_key)
+                    claims.append(normalized_paragraph)
+                    if len(claims) >= limit:
+                        return claims
+            for sentence in self._PROSE_SENTENCE_SPLIT.split(normalized_paragraph):
+                cleaned = self._normalize_inline_text(sentence)
+                lowered = cleaned.lower()
+                if not self._is_claim_candidate(cleaned, title=title, strategy=strategy):
+                    continue
+                if lowered in seen:
+                    continue
+                seen.add(lowered)
+                claims.append(cleaned)
+                if len(claims) >= limit:
+                    return claims
+        return claims
+
+    def _extract_claim_sentences(self, markdown_text: str, *, title: str = "", limit: int = 4, strategy: str = "conservative") -> list[str]:
+        paragraphs: list[str] = []
+        current: list[str] = []
+        for raw_line in markdown_text.splitlines():
+            line = raw_line.strip()
+            if not line:
+                if current:
+                    paragraphs.append(" ".join(current))
+                    current = []
+                continue
+            if line.startswith("#") or line.startswith("![") or line.startswith("|"):
+                continue
+            if self._looks_like_metadata_line(line):
+                continue
+            if len(line) < 40:
+                continue
+            current.append(line)
+        if current:
+            paragraphs.append(" ".join(current))
+        if strategy == "broad":
+            broad_claims = self._extract_claim_sentences_from_paragraphs(
+                paragraphs,
+                title=title,
+                limit=max(limit * 2, limit),
+                strategy="broad",
+            )
+            if len(broad_claims) >= limit:
+                return broad_claims[:limit]
+            return broad_claims
+        return self._extract_claim_sentences_from_paragraphs(
+            paragraphs,
+            title=title,
+            limit=limit,
+            strategy="conservative",
+        )
+
+    def extract_document_claims(
+        self,
+        base: Path,
+        document: dict,
+        *,
+        strategy: str = "conservative",
+        limit: int = 4,
+    ) -> list[str]:
+        markdown_text = self._load_markdown_text(base, document)
+        title = str(document.get("title") or "")
+        return self._extract_claim_sentences(markdown_text, title=title, limit=limit, strategy=strategy)
+
     def discover(self, root: str | Path) -> list[DiscoveredImportSource]:
         base = Path(root)
         rows: list[DiscoveredImportSource] = []
@@ -108,7 +254,7 @@ class DocliftBundleSourceAdapter:
             artifact_id = artifact_by_path.get(str(relative_markdown), "")
             figures_path = self._resolve_bundle_path(base, document.get("figures_path", ""))
             figure_payload = {}
-            if figures_path.exists():
+            if figures_path.is_file():
                 figure_payload = json.loads(figures_path.read_text(encoding="utf-8"))
             source_path = str(figure_payload.get("source_path") or document.get("source_path") or relative_markdown)
             source_path_kind = str(figure_payload.get("source_path_kind") or document.get("source_path_kind") or "source_root_relative")
@@ -144,22 +290,7 @@ class DocliftBundleSourceAdapter:
                     "current_status": "draft",
                 }
             )
-            claim_rows.append(
-                {
-                    "claim_id": f"clm_doclift_{index}",
-                    "import_id": context.import_id,
-                    "claim_text": f"{title} is a {document.get('document_kind', 'document')} in the imported doclift bundle.",
-                    "claim_kind": "summary",
-                    "source_observation_ids": [observation_id],
-                    "supporting_fragment_ids": [],
-                    "concept_ids": [concept_id],
-                    "contradicts_claim_ids": [],
-                    "supersedes_claim_ids": [],
-                    "confidence_hint": 0.85,
-                    "grounding_status": "grounded",
-                    "current_status": "triaged",
-                }
-            )
+            document_claim_ids: list[str] = []
             for chunk_index, chunk in enumerate(self._load_chunks(base, document), start=1):
                 chunk_text = str(chunk.get("text") or "").strip()
                 if not chunk_text:
@@ -209,9 +340,10 @@ class DocliftBundleSourceAdapter:
                     }
                 )
                 if chunk_role in {"claim", "summary"}:
+                    claim_id = f"clm_doclift_{index}_{chunk_index}"
                     claim_rows.append(
                         {
-                            "claim_id": f"clm_doclift_{index}_{chunk_index}",
+                            "claim_id": claim_id,
                             "import_id": context.import_id,
                             "claim_text": chunk_text,
                             "claim_kind": "statement" if chunk_role == "claim" else "summary",
@@ -225,6 +357,69 @@ class DocliftBundleSourceAdapter:
                             "current_status": "triaged",
                         }
                     )
+                    document_claim_ids.append(claim_id)
+            if not document_claim_ids and str(document.get("document_kind") or "").strip() in {"web_article", "document"}:
+                for derived_index, claim_text in enumerate(self.extract_document_claims(base, document, strategy="conservative"), start=1):
+                    derived_observation_id = f"obs_doclift_{index}_derived_{derived_index}"
+                    claim_id = f"clm_doclift_{index}_derived_{derived_index}"
+                    observation_rows.append(
+                        {
+                            "observation_id": derived_observation_id,
+                            "import_id": context.import_id,
+                            "artifact_id": artifact_id,
+                            "role": "claim",
+                            "text": claim_text,
+                            "origin_path": relative_markdown,
+                            "origin_section": title,
+                            "line_start": 0,
+                            "line_end": 0,
+                            "source_url": source_path,
+                            "metadata": {
+                                "source_path_kind": source_path_kind,
+                                "derived_from": "markdown_sentence",
+                            },
+                            "grounding_status": "grounded",
+                            "support_kind": "direct_source",
+                            "confidence_hint": 0.65,
+                            "current_status": "draft",
+                        }
+                    )
+                    claim_rows.append(
+                        {
+                            "claim_id": claim_id,
+                            "import_id": context.import_id,
+                            "claim_text": claim_text,
+                            "claim_kind": "statement",
+                            "source_observation_ids": [derived_observation_id],
+                            "supporting_fragment_ids": [],
+                            "concept_ids": [concept_id],
+                            "contradicts_claim_ids": [],
+                            "supersedes_claim_ids": [],
+                            "confidence_hint": 0.65,
+                            "grounding_status": "grounded",
+                            "current_status": "triaged",
+                        }
+                    )
+                    document_claim_ids.append(claim_id)
+            if not document_claim_ids:
+                fallback_claim_id = f"clm_doclift_{index}"
+                claim_rows.append(
+                    {
+                        "claim_id": fallback_claim_id,
+                        "import_id": context.import_id,
+                        "claim_text": f"{title} is a {document.get('document_kind', 'document')} in the imported doclift bundle.",
+                        "claim_kind": "summary",
+                        "source_observation_ids": [observation_id],
+                        "supporting_fragment_ids": [],
+                        "concept_ids": [concept_id],
+                        "contradicts_claim_ids": [],
+                        "supersedes_claim_ids": [],
+                        "confidence_hint": 0.85,
+                        "grounding_status": "grounded",
+                        "current_status": "triaged",
+                    }
+                )
+                document_claim_ids.append(fallback_claim_id)
             if previous_concept_id is not None:
                 relation_rows.append(
                     {
@@ -233,7 +428,7 @@ class DocliftBundleSourceAdapter:
                         "source_id": previous_concept_id,
                         "target_id": concept_id,
                         "relation_type": "references",
-                        "evidence_ids": [f"clm_doclift_{index}"],
+                        "evidence_ids": document_claim_ids[:1],
                         "current_status": "draft",
                     }
                 )
diff --git a/src/groundrecall/review_export.py b/src/groundrecall/review_export.py
index 23a97c0..09237be 100644
--- a/src/groundrecall/review_export.py
+++ b/src/groundrecall/review_export.py
@@ -460,6 +460,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
         concept_reviews.append(
             {
                 "concept_id": concept.concept_id,
+                "label": concept.title,
                 "title": concept.title,
                 "status": concept.status,
                 "description": concept.description,
diff --git a/tests/fixtures/doclift_claim_eval/benchmark.json b/tests/fixtures/doclift_claim_eval/benchmark.json
new file mode 100644
index 0000000..609a14d
--- /dev/null
+++ b/tests/fixtures/doclift_claim_eval/benchmark.json
@@ -0,0 +1,18 @@
+{
+  "documents": [
+    {
+      "document_id": "intro-essay",
+      "gold_claims": [
+        "Evolution is a change in the gene pool of a population over time.",
+        "Populations evolve, but individual organisms do not evolve during their lifetimes."
+      ]
+    },
+    {
+      "document_id": "drift-essay",
+      "gold_claims": [
+        "Random genetic drift is a fundamental and important part of evolution.",
+        "Neutral and slightly deleterious alleles can be fixed in a population by random genetic drift."
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/doclift_claim_eval/documents/drift-essay/document.chunks.json b/tests/fixtures/doclift_claim_eval/documents/drift-essay/document.chunks.json
new file mode 100644
index 0000000..bb4e21c
--- /dev/null
+++ b/tests/fixtures/doclift_claim_eval/documents/drift-essay/document.chunks.json
@@ -0,0 +1,12 @@
+{
+  "chunks": [
+    {
+      "chunk_id": "drift-essay-body-1",
+      "role": "body",
+      "section": "Drift Essay",
+      "text": "This essay has been transferred here from an old server that has been decommissioned. Random genetic drift is a fundamental and important part of evolution. Neutral and slightly deleterious alleles can be fixed in a population by random genetic drift.",
+      "line_start": 1,
+      "line_end": 5
+    }
+  ]
+}
diff --git a/tests/fixtures/doclift_claim_eval/documents/drift-essay/document.md b/tests/fixtures/doclift_claim_eval/documents/drift-essay/document.md
new file mode 100644
index 0000000..faba4f5
--- /dev/null
+++ b/tests/fixtures/doclift_claim_eval/documents/drift-essay/document.md
@@ -0,0 +1,9 @@
+# Drift Essay
+
+This essay has been transferred here from an old server that has been decommissioned.
+
+Random genetic drift is a fundamental and important part of evolution.
+
+Neutral and slightly deleterious alleles can be fixed in a population by random genetic drift.
+
+Posted by Example Author
diff --git a/tests/fixtures/doclift_claim_eval/documents/intro-essay/document.chunks.json b/tests/fixtures/doclift_claim_eval/documents/intro-essay/document.chunks.json
new file mode 100644
index 0000000..488655d
--- /dev/null
+++ b/tests/fixtures/doclift_claim_eval/documents/intro-essay/document.chunks.json
@@ -0,0 +1,12 @@
+{
+  "chunks": [
+    {
+      "chunk_id": "intro-essay-body-1",
+      "role": "body",
+      "section": "Intro Essay",
+      "text": "Evolution is a change in the gene pool of a population over time. Populations evolve. Individual organisms do not evolve, because they retain the same genes throughout their lives.",
+      "line_start": 1,
+      "line_end": 4
+    }
+  ]
+}
diff --git a/tests/fixtures/doclift_claim_eval/documents/intro-essay/document.md b/tests/fixtures/doclift_claim_eval/documents/intro-essay/document.md
new file mode 100644
index 0000000..533f478
--- /dev/null
+++ b/tests/fixtures/doclift_claim_eval/documents/intro-essay/document.md
@@ -0,0 +1,9 @@
+# Intro Essay
+
+Introduction to Evolutionary Biology
+
+Evolution is a change in the gene pool of a population over time.
+
+Populations evolve. Individual organisms do not evolve, because they retain the same genes throughout their lives.
+
+Posted by Example Author
diff --git a/tests/fixtures/doclift_claim_eval/manifest.json b/tests/fixtures/doclift_claim_eval/manifest.json
new file mode 100644
index 0000000..e9f68d7
--- /dev/null
+++ b/tests/fixtures/doclift_claim_eval/manifest.json
@@ -0,0 +1,20 @@
+{
+  "documents": [
+    {
+      "document_id": "intro-essay",
+      "title": "Intro Essay",
+      "document_kind": "web_article",
+      "output_dir": "documents/intro-essay",
+      "markdown_path": "documents/intro-essay/document.md",
+      "chunks_path": "documents/intro-essay/document.chunks.json"
+    },
+    {
+      "document_id": "drift-essay",
+      "title": "Drift Essay",
+      "document_kind": "web_article",
+      "output_dir": "documents/drift-essay",
+      "markdown_path": "documents/drift-essay/document.md",
+      "chunks_path": "documents/drift-essay/document.chunks.json"
+    }
+  ]
+}
diff --git a/tests/test_doclift_claim_tournament.py b/tests/test_doclift_claim_tournament.py
new file mode 100644
index 0000000..e1b4407
--- /dev/null
+++ b/tests/test_doclift_claim_tournament.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from groundrecall.doclift_claim_tournament import evaluate_doclift_claim_tracks
+
+
+def _fixture_root() -> Path:
+    return Path(__file__).parent / "fixtures" / "doclift_claim_eval"
+
+
+def test_doclift_claim_tournament_scores_two_tracks() -> None:
+    root = _fixture_root()
+    result = evaluate_doclift_claim_tracks(root, root / "benchmark.json")
+
+    assert result["judge_summary"]["winner"] in {"conservative", "broad"}
+    assert set(result["judge_summary"]["tracks"].keys()) == {"conservative", "broad"}
+    assert len(result["per_document"]) == 2
+    intro = next(item for item in result["per_document"] if item["document_id"] == "intro-essay")
+    assert intro["tracks"][0]["predicted_claims"]
+    assert intro["tracks"][1]["predicted_claims"]
+
+
+def test_doclift_claim_tournament_broad_track_improves_recall_on_fixture() -> None:
+    root = _fixture_root()
+    result = evaluate_doclift_claim_tracks(root, root / "benchmark.json")
+    tracks = result["judge_summary"]["tracks"]
+
+    assert tracks["broad"]["recall"] >= tracks["conservative"]["recall"]
+    assert tracks["broad"]["matches"] >= tracks["conservative"]["matches"]
diff --git a/tests/test_groundrecall_source_adapters.py b/tests/test_groundrecall_source_adapters.py
index 5bbd04c..a3dc2a9 100644
--- a/tests/test_groundrecall_source_adapters.py
+++ b/tests/test_groundrecall_source_adapters.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 from pathlib import Path
 
 import shutil
@@ -286,9 +287,66 @@ def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) ->
     concept_ids = {item["concept_id"] for item in result.concepts}
     assert "concept::lecture-1" in concept_ids
     claim_ids = {item["claim_id"] for item in result.claims}
-    assert "clm_doclift_1" in claim_ids
     assert "clm_doclift_1_1" in claim_ids
+    assert "clm_doclift_1" not in claim_ids
     assert result.observations[0]["source_url"] == "legacy/lecture-1.doc"
     assert len(result.fragments) == 2
     assert result.fragments[0]["metadata"]["source_kind"] == "doclift_chunk"
-    assert result.claims[1]["supporting_fragment_ids"] == ["frag_doclift_1_1"]
+    claim_by_id = {item["claim_id"]: item for item in result.claims}
+    assert claim_by_id["clm_doclift_1_1"]["supporting_fragment_ids"] == ["frag_doclift_1_1"]
+
+
+def test_doclift_bundle_import_derives_claims_from_prose_when_chunks_are_body_only(tmp_path: Path) -> None:
+    root = tmp_path / "doclift_bundle_prose"
+    document_dir = root / "documents" / "essay-1"
+    document_dir.mkdir(parents=True)
+    (root / "manifest.json").write_text(
+        '{\n'
+        '  "documents": [\n'
+        '    {\n'
+        '      "document_id": "essay-1",\n'
+        '      "title": "Drift Essay",\n'
+        '      "document_kind": "web_article",\n'
+        '      "output_dir": "documents/essay-1",\n'
+        '      "markdown_path": "documents/essay-1/document.md"\n'
+        '    }\n'
+        '  ]\n'
+        '}\n',
+        encoding="utf-8",
+    )
+    (document_dir / "document.md").write_text(
+        "\n".join(
+            [
+                "# Drift Essay",
+                "",
+                "Random genetic drift can dominate allele-frequency change in small populations.",
+                "This matters because many alleles are fixed or lost without any adaptive advantage.",
+                "",
+                "Posted by Example Author",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    (document_dir / "document.chunks.json").write_text(
+        json.dumps(
+            {
+                "chunks": [
+                    {
+                        "chunk_id": "essay-1-body-1",
+                        "role": "body",
+                        "section": "Drift Essay",
+                        "text": "Random genetic drift can dominate allele-frequency change in small populations.",
+                        "line_start": 1,
+                        "line_end": 2,
+                    }
+                ]
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    result = run_groundrecall_import(root, mode="quick", import_id="doclift-prose-test")
+    claim_texts = [item["claim_text"] for item in result.claims]
+
+    assert any("Random genetic drift can dominate allele-frequency change in small populations." in text for text in claim_texts)
+    assert not any(text == "Drift Essay is a web_article in the imported doclift bundle." for text in claim_texts)
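
For reviewers, a minimal sketch of driving the new harness by hand, mirroring what `tests/test_doclift_claim_tournament.py` exercises. The fixture path and result keys come from this patch; running it assumes a checkout where the `groundrecall` package on this branch is importable (e.g. an editable install) and that the working directory is the repository root:

```python
from pathlib import Path

from groundrecall.doclift_claim_tournament import evaluate_doclift_claim_tracks

# Score both extraction tracks against the checked-in fixture bundle.
root = Path("tests/fixtures/doclift_claim_eval")
result = evaluate_doclift_claim_tracks(root, root / "benchmark.json")

# Aggregate judge verdict, plus per-track precision/recall/F1 and meta noise.
print(result["judge_summary"]["winner"])
for strategy, totals in result["judge_summary"]["tracks"].items():
    print(strategy, totals["precision"], totals["recall"], totals["f1"], totals["meta_noise"])

# Per-document winners are kept for drill-down when a track loses overall.
for item in result["per_document"]:
    print(item["document_id"], item["winner"])
```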