Filter doclift blog trailer noise

This commit is contained in:
welsberr 2026-05-08 02:38:19 -04:00
parent a9974110e2
commit 779ebdb515
2 changed files with 85 additions and 0 deletions

View File

@@ -27,6 +27,18 @@ class DocliftBundleSourceAdapter:
"[last update",
"this essay has been transferred here",
)
# Prefixes that identify blog trailer lines (byline, comment header,
# post navigation, subscribe/share links, labels).
# _is_terminal_metadata_line compares these against the stripped,
# lower-cased line, so every entry must itself be lower-case.
# NOTE(review): "share to " keeps a trailing space, presumably so a target
# name must follow -- confirm before normalizing.
_TERMINAL_METADATA_PREFIXES = (
"posted by",
"comments",
"post a comment",
"newer post",
"older post",
"subscribe to",
"email this",
"blogthis",
"share to ",
"labels:",
)
_CLAIM_CUES = (
" is ",
" are ",
@@ -101,6 +113,16 @@ class DocliftBundleSourceAdapter:
return True
return False
def _is_terminal_metadata_line(self, value: str) -> bool:
lowered = value.strip().lower()
if not lowered:
return False
if any(lowered.startswith(prefix) for prefix in self._TERMINAL_METADATA_PREFIXES):
return True
if re.match(r"^\d+\s+comments\b", lowered):
return True
return False
def _is_claim_candidate(self, cleaned: str, *, title: str = "", strategy: str = "conservative") -> bool:
lowered = cleaned.lower()
normalized_title = self._normalize_inline_text(title).lower()
@@ -266,6 +288,10 @@ class DocliftBundleSourceAdapter:
paragraphs.append(" ".join(current))
current = []
continue
if self._is_terminal_metadata_line(line):
if current:
paragraphs.append(" ".join(current))
break
if line.startswith("#") or line.startswith("![") or line.startswith("|"):
continue
if self._looks_like_metadata_line(line):

View File

@@ -353,3 +353,62 @@ def test_doclift_bundle_import_derives_claims_from_prose_when_chunks_are_body_on
derived_observations = [item for item in result.observations if item["observation_id"].startswith("obs_doclift_1_derived_")]
assert derived_observations
assert derived_observations[0]["metadata"]["claim_strategy"] in {"conservative", "balanced", "broad"}
def test_doclift_bundle_import_ignores_blog_comment_trailers(tmp_path: Path) -> None:
    """Check that text after a blog trailer does not become a claim.

    Builds a one-document bundle whose markdown has an essay sentence, then
    "Posted by ..." and "3 comments:" lines, then a reader comment. The
    import must yield a claim from the essay sentence and none from the
    reader comment.
    """
    essay_sentence = "Random genetic drift is a fundamental and important part of evolution because many population-level changes are not driven by selection alone."
    reader_comment = "A mistake can do this glory is very unlikely and opposite to the first instinct that biology is complexity from a thinking being."

    bundle_root = tmp_path / "doclift_bundle_blog"
    blog_dir = bundle_root / "documents" / "blog-1"
    blog_dir.mkdir(parents=True)

    # Manifest kept as a literal so the bytes on disk stay exactly as authored.
    manifest_text = (
        '{\n'
        '  "documents": [\n'
        '    {\n'
        '      "document_id": "blog-1",\n'
        '      "title": "Blog Essay",\n'
        '      "document_kind": "web_article",\n'
        '      "output_dir": "documents/blog-1",\n'
        '      "markdown_path": "documents/blog-1/document.md"\n'
        '    }\n'
        '  ]\n'
        '}\n'
    )
    (bundle_root / "manifest.json").write_text(manifest_text, encoding="utf-8")

    markdown_lines = [
        "# Blog Essay",
        "",
        essay_sentence,
        "",
        "Posted by Example Author",
        "",
        "3 comments:",
        "",
        reader_comment,
    ]
    (blog_dir / "document.md").write_text("\n".join(markdown_lines), encoding="utf-8")

    # Only the essay body is chunked; the trailer exists solely in the markdown.
    chunk_payload = {
        "chunks": [
            {
                "chunk_id": "blog-1-body-1",
                "role": "body",
                "section": "Blog Essay",
                "text": essay_sentence,
                "line_start": 1,
                "line_end": 2,
            }
        ]
    }
    (blog_dir / "document.chunks.json").write_text(json.dumps(chunk_payload), encoding="utf-8")

    result = run_groundrecall_import(bundle_root, mode="quick", import_id="doclift-blog-test")
    claim_texts = [claim["claim_text"] for claim in result.claims]

    assert any("Random genetic drift is a fundamental and important part of evolution" in text for text in claim_texts)
    assert all("A mistake can do this glory" not in text for text in claim_texts)