Filter doclift blog trailer noise
This commit is contained in:
parent
a9974110e2
commit
779ebdb515
|
|
@ -27,6 +27,18 @@ class DocliftBundleSourceAdapter:
|
|||
"[last update",
|
||||
"this essay has been transferred here",
|
||||
)
|
||||
_TERMINAL_METADATA_PREFIXES = (
|
||||
"posted by",
|
||||
"comments",
|
||||
"post a comment",
|
||||
"newer post",
|
||||
"older post",
|
||||
"subscribe to",
|
||||
"email this",
|
||||
"blogthis",
|
||||
"share to ",
|
||||
"labels:",
|
||||
)
|
||||
_CLAIM_CUES = (
|
||||
" is ",
|
||||
" are ",
|
||||
|
|
@ -101,6 +113,16 @@ class DocliftBundleSourceAdapter:
|
|||
return True
|
||||
return False
|
||||
|
||||
def _is_terminal_metadata_line(self, value: str) -> bool:
|
||||
lowered = value.strip().lower()
|
||||
if not lowered:
|
||||
return False
|
||||
if any(lowered.startswith(prefix) for prefix in self._TERMINAL_METADATA_PREFIXES):
|
||||
return True
|
||||
if re.match(r"^\d+\s+comments\b", lowered):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_claim_candidate(self, cleaned: str, *, title: str = "", strategy: str = "conservative") -> bool:
|
||||
lowered = cleaned.lower()
|
||||
normalized_title = self._normalize_inline_text(title).lower()
|
||||
|
|
@ -266,6 +288,10 @@ class DocliftBundleSourceAdapter:
|
|||
paragraphs.append(" ".join(current))
|
||||
current = []
|
||||
continue
|
||||
if self._is_terminal_metadata_line(line):
|
||||
if current:
|
||||
paragraphs.append(" ".join(current))
|
||||
break
|
||||
if line.startswith("#") or line.startswith("![") or line.startswith("|"):
|
||||
continue
|
||||
if self._looks_like_metadata_line(line):
|
||||
|
|
|
|||
|
|
@ -353,3 +353,62 @@ def test_doclift_bundle_import_derives_claims_from_prose_when_chunks_are_body_on
|
|||
derived_observations = [item for item in result.observations if item["observation_id"].startswith("obs_doclift_1_derived_")]
|
||||
assert derived_observations
|
||||
assert derived_observations[0]["metadata"]["claim_strategy"] in {"conservative", "balanced", "broad"}
|
||||
|
||||
|
||||
def test_doclift_bundle_import_ignores_blog_comment_trailers(tmp_path: Path) -> None:
    """Text after a blog post's byline and comment heading must not yield claims.

    Builds a one-document bundle whose markdown holds an essay sentence,
    then a "Posted by" line, a "3 comments:" heading and a reader comment.
    The import must produce a claim from the essay sentence and none from
    the reader comment.
    """
    # Defined once: the markdown body and the body chunk must carry the same text.
    essay_sentence = (
        "Random genetic drift is a fundamental and important part of evolution "
        "because many population-level changes are not driven by selection alone."
    )
    reader_comment = (
        "A mistake can do this glory is very unlikely and opposite to the first "
        "instinct that biology is complexity from a thinking being."
    )

    root = tmp_path / "doclift_bundle_blog"
    document_dir = root / "documents" / "blog-1"
    document_dir.mkdir(parents=True)

    # Both JSON fixtures are serialized with json.dumps so neither can drift
    # into invalid hand-written JSON.
    manifest = {
        "documents": [
            {
                "document_id": "blog-1",
                "title": "Blog Essay",
                "document_kind": "web_article",
                "output_dir": "documents/blog-1",
                "markdown_path": "documents/blog-1/document.md",
            }
        ]
    }
    (root / "manifest.json").write_text(
        json.dumps(manifest, indent=2) + "\n",
        encoding="utf-8",
    )

    markdown_lines = [
        "# Blog Essay",
        "",
        essay_sentence,
        "",
        "Posted by Example Author",
        "",
        "3 comments:",
        "",
        reader_comment,
    ]
    (document_dir / "document.md").write_text("\n".join(markdown_lines), encoding="utf-8")

    chunks = {
        "chunks": [
            {
                "chunk_id": "blog-1-body-1",
                "role": "body",
                "section": "Blog Essay",
                "text": essay_sentence,
                "line_start": 1,
                "line_end": 2,
            }
        ]
    }
    (document_dir / "document.chunks.json").write_text(json.dumps(chunks), encoding="utf-8")

    result = run_groundrecall_import(root, mode="quick", import_id="doclift-blog-test")
    claim_texts = [item["claim_text"] for item in result.claims]

    assert any("Random genetic drift is a fundamental and important part of evolution" in text for text in claim_texts)
    assert not any("A mistake can do this glory" in text for text in claim_texts)
|
||||
|
|
|
|||
Loading…
Reference in New Issue