Filter doclift blog trailer noise

2026-05-08 02:38:19 -04:00 · 2026-05-08 02:38:19 -04:00 · 779ebdb515
parent a9974110e2
commit 779ebdb515
2 changed files with 85 additions and 0 deletions
--- a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
+++ b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
@ -27,6 +27,18 @@ class DocliftBundleSourceAdapter:
        "[last update",
        "this essay has been transferred here",
    )
+    _TERMINAL_METADATA_PREFIXES = (
+        "posted by",
+        "comments",
+        "post a comment",
+        "newer post",
+        "older post",
+        "subscribe to",
+        "email this",
+        "blogthis",
+        "share to ",
+        "labels:",
+    )
    _CLAIM_CUES = (
        " is ",
        " are ",
@ -101,6 +113,16 @@ class DocliftBundleSourceAdapter:
            return True
        return False

+    def _is_terminal_metadata_line(self, value: str) -> bool:
+        lowered = value.strip().lower()
+        if not lowered:
+            return False
+        if any(lowered.startswith(prefix) for prefix in self._TERMINAL_METADATA_PREFIXES):
+            return True
+        if re.match(r"^\d+\s+comments\b", lowered):
+            return True
+        return False
+
    def _is_claim_candidate(self, cleaned: str, *, title: str = "", strategy: str = "conservative") -> bool:
        lowered = cleaned.lower()
        normalized_title = self._normalize_inline_text(title).lower()
@ -266,6 +288,10 @@ class DocliftBundleSourceAdapter:
                    paragraphs.append(" ".join(current))
                    current = []
                continue
+            if self._is_terminal_metadata_line(line):
+                if current:
+                    paragraphs.append(" ".join(current))
+                break
            if line.startswith("#") or line.startswith("![") or line.startswith("|"):
                continue
            if self._looks_like_metadata_line(line):
--- a/tests/test_groundrecall_source_adapters.py
+++ b/tests/test_groundrecall_source_adapters.py
@ -353,3 +353,62 @@ def test_doclift_bundle_import_derives_claims_from_prose_when_chunks_are_body_on
    derived_observations = [item for item in result.observations if item["observation_id"].startswith("obs_doclift_1_derived_")]
    assert derived_observations
    assert derived_observations[0]["metadata"]["claim_strategy"] in {"conservative", "balanced", "broad"}
+
+
+def test_doclift_bundle_import_ignores_blog_comment_trailers(tmp_path: Path) -> None:
+    root = tmp_path / "doclift_bundle_blog"
+    document_dir = root / "documents" / "blog-1"
+    document_dir.mkdir(parents=True)
+    (root / "manifest.json").write_text(
+        '{\n'
+        '  "documents": [\n'
+        '    {\n'
+        '      "document_id": "blog-1",\n'
+        '      "title": "Blog Essay",\n'
+        '      "document_kind": "web_article",\n'
+        '      "output_dir": "documents/blog-1",\n'
+        '      "markdown_path": "documents/blog-1/document.md"\n'
+        '    }\n'
+        '  ]\n'
+        '}\n',
+        encoding="utf-8",
+    )
+    (document_dir / "document.md").write_text(
+        "\n".join(
+                [
+                    "# Blog Essay",
+                    "",
+                    "Random genetic drift is a fundamental and important part of evolution because many population-level changes are not driven by selection alone.",
+                "",
+                "Posted by Example Author",
+                "",
+                "3 comments:",
+                "",
+                "A mistake can do this glory is very unlikely and opposite to the first instinct that biology is complexity from a thinking being.",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    (document_dir / "document.chunks.json").write_text(
+        json.dumps(
+            {
+                "chunks": [
+                        {
+                            "chunk_id": "blog-1-body-1",
+                            "role": "body",
+                            "section": "Blog Essay",
+                            "text": "Random genetic drift is a fundamental and important part of evolution because many population-level changes are not driven by selection alone.",
+                            "line_start": 1,
+                            "line_end": 2,
+                        }
+                ]
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    result = run_groundrecall_import(root, mode="quick", import_id="doclift-blog-test")
+    claim_texts = [item["claim_text"] for item in result.claims]
+
+    assert any("Random genetic drift is a fundamental and important part of evolution" in text for text in claim_texts)
+    assert not any("A mistake can do this glory" in text for text in claim_texts)