From 779ebdb5156db04f81311de4bfd6824de3431426 Mon Sep 17 00:00:00 2001
From: welsberr
Date: Fri, 8 May 2026 02:38:19 -0400
Subject: [PATCH] Filter doclift blog trailer noise

Stop collecting document paragraphs at the first blog trailer line
("Posted by ...", "3 comments:", share/navigation links) so that reader
comments below the post are not turned into claims.

---
 .../doclift_bundle.py                         | 36 ++++++++++
 tests/test_groundrecall_source_adapters.py    | 72 ++++++++++++++++++++
 2 files changed, 108 insertions(+)

diff --git a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
index ceb2692..5e272ce 100755
--- a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
+++ b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
@@ -27,6 +27,20 @@ class DocliftBundleSourceAdapter:
         "[last update",
         "this essay has been transferred here",
     )
+    # Lowercase line prefixes that mark the trailer of a blog post (byline,
+    # comment links, navigation, share widgets).
+    _TERMINAL_METADATA_PREFIXES = (
+        "posted by",
+        "comments",
+        "post a comment",
+        "newer post",
+        "older post",
+        "subscribe to",
+        "email this",
+        "blogthis",
+        "share to ",
+        "labels:",
+    )
     _CLAIM_CUES = (
         " is ",
         " are ",
@@ -101,6 +115,21 @@ class DocliftBundleSourceAdapter:
             return True
         return False
 
+    def _is_terminal_metadata_line(self, value: str) -> bool:
+        """Return True when *value* looks like the start of a blog post trailer.
+
+        A line matches when, after stripping and lowercasing, it begins with
+        one of ``_TERMINAL_METADATA_PREFIXES`` or with a comment counter such
+        as ``3 comments`` or ``1 comment``. Blank lines never match.
+        """
+        lowered = value.strip().lower()
+        if not lowered:
+            return False
+        # str.startswith accepts a tuple, so one call covers every prefix.
+        if lowered.startswith(self._TERMINAL_METADATA_PREFIXES):
+            return True
+        return re.match(r"^\d+\s+comments?\b", lowered) is not None
+
     def _is_claim_candidate(self, cleaned: str, *, title: str = "", strategy: str = "conservative") -> bool:
         lowered = cleaned.lower()
         normalized_title = self._normalize_inline_text(title).lower()
@@ -266,6 +295,13 @@ class DocliftBundleSourceAdapter:
                     paragraphs.append(" ".join(current))
                     current = []
                 continue
+            if self._is_terminal_metadata_line(line):
+                if current:
+                    paragraphs.append(" ".join(current))
+                    # Reset so any later flush of `current` cannot append
+                    # this paragraph a second time.
+                    current = []
+                break
             if line.startswith("#") or line.startswith("![") or line.startswith("|"):
                 continue
             if self._looks_like_metadata_line(line):
diff --git a/tests/test_groundrecall_source_adapters.py b/tests/test_groundrecall_source_adapters.py
index 7a8d347..0988e46 100644
--- a/tests/test_groundrecall_source_adapters.py
+++ b/tests/test_groundrecall_source_adapters.py
@@ -353,3 +353,75 @@ def test_doclift_bundle_import_derives_claims_from_prose_when_chunks_are_body_on
     derived_observations = [item for item in result.observations if item["observation_id"].startswith("obs_doclift_1_derived_")]
     assert derived_observations
     assert derived_observations[0]["metadata"]["claim_strategy"] in {"conservative", "balanced", "broad"}
+
+
+def test_doclift_bundle_import_ignores_blog_comment_trailers(tmp_path: Path) -> None:
+    """Prose after a blog byline / comment counter must not yield claims."""
+    root = tmp_path / "doclift_bundle_blog"
+    document_dir = root / "documents" / "blog-1"
+    document_dir.mkdir(parents=True)
+    essay = (
+        "Random genetic drift is a fundamental and important part of evolution "
+        "because many population-level changes are not driven by selection alone."
+    )
+    reader_comment = (
+        "A mistake can do this glory is very unlikely and opposite to the first "
+        "instinct that biology is complexity from a thinking being."
+    )
+    # Serialize with json.dumps (as for the chunks file below) rather than a
+    # hand-assembled JSON string.
+    (root / "manifest.json").write_text(
+        json.dumps(
+            {
+                "documents": [
+                    {
+                        "document_id": "blog-1",
+                        "title": "Blog Essay",
+                        "document_kind": "web_article",
+                        "output_dir": "documents/blog-1",
+                        "markdown_path": "documents/blog-1/document.md",
+                    }
+                ]
+            }
+        ),
+        encoding="utf-8",
+    )
+    (document_dir / "document.md").write_text(
+        "\n".join(
+            [
+                "# Blog Essay",
+                "",
+                essay,
+                "",
+                "Posted by Example Author",
+                "",
+                "3 comments:",
+                "",
+                reader_comment,
+            ]
+        ),
+        encoding="utf-8",
+    )
+    (document_dir / "document.chunks.json").write_text(
+        json.dumps(
+            {
+                "chunks": [
+                    {
+                        "chunk_id": "blog-1-body-1",
+                        "role": "body",
+                        "section": "Blog Essay",
+                        "text": essay,
+                        "line_start": 1,
+                        "line_end": 2,
+                    }
+                ]
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    result = run_groundrecall_import(root, mode="quick", import_id="doclift-blog-test")
+    claim_texts = [item["claim_text"] for item in result.claims]
+
+    assert any("Random genetic drift is a fundamental and important part of evolution" in text for text in claim_texts)
+    assert not any("A mistake can do this glory" in text for text in claim_texts)