Filter doclift blog trailer noise
This commit is contained in:
parent
a9974110e2
commit
779ebdb515
|
|
@ -27,6 +27,18 @@ class DocliftBundleSourceAdapter:
|
||||||
"[last update",
|
"[last update",
|
||||||
"this essay has been transferred here",
|
"this essay has been transferred here",
|
||||||
)
|
)
|
||||||
|
_TERMINAL_METADATA_PREFIXES = (
|
||||||
|
"posted by",
|
||||||
|
"comments",
|
||||||
|
"post a comment",
|
||||||
|
"newer post",
|
||||||
|
"older post",
|
||||||
|
"subscribe to",
|
||||||
|
"email this",
|
||||||
|
"blogthis",
|
||||||
|
"share to ",
|
||||||
|
"labels:",
|
||||||
|
)
|
||||||
_CLAIM_CUES = (
|
_CLAIM_CUES = (
|
||||||
" is ",
|
" is ",
|
||||||
" are ",
|
" are ",
|
||||||
|
|
@ -101,6 +113,16 @@ class DocliftBundleSourceAdapter:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _is_terminal_metadata_line(self, value: str) -> bool:
|
||||||
|
lowered = value.strip().lower()
|
||||||
|
if not lowered:
|
||||||
|
return False
|
||||||
|
if any(lowered.startswith(prefix) for prefix in self._TERMINAL_METADATA_PREFIXES):
|
||||||
|
return True
|
||||||
|
if re.match(r"^\d+\s+comments\b", lowered):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def _is_claim_candidate(self, cleaned: str, *, title: str = "", strategy: str = "conservative") -> bool:
|
def _is_claim_candidate(self, cleaned: str, *, title: str = "", strategy: str = "conservative") -> bool:
|
||||||
lowered = cleaned.lower()
|
lowered = cleaned.lower()
|
||||||
normalized_title = self._normalize_inline_text(title).lower()
|
normalized_title = self._normalize_inline_text(title).lower()
|
||||||
|
|
@ -266,6 +288,10 @@ class DocliftBundleSourceAdapter:
|
||||||
paragraphs.append(" ".join(current))
|
paragraphs.append(" ".join(current))
|
||||||
current = []
|
current = []
|
||||||
continue
|
continue
|
||||||
|
if self._is_terminal_metadata_line(line):
|
||||||
|
if current:
|
||||||
|
paragraphs.append(" ".join(current))
|
||||||
|
break
|
||||||
if line.startswith("#") or line.startswith("![") or line.startswith("|"):
|
if line.startswith("#") or line.startswith("![") or line.startswith("|"):
|
||||||
continue
|
continue
|
||||||
if self._looks_like_metadata_line(line):
|
if self._looks_like_metadata_line(line):
|
||||||
|
|
|
||||||
|
|
@ -353,3 +353,62 @@ def test_doclift_bundle_import_derives_claims_from_prose_when_chunks_are_body_on
|
||||||
derived_observations = [item for item in result.observations if item["observation_id"].startswith("obs_doclift_1_derived_")]
|
derived_observations = [item for item in result.observations if item["observation_id"].startswith("obs_doclift_1_derived_")]
|
||||||
assert derived_observations
|
assert derived_observations
|
||||||
assert derived_observations[0]["metadata"]["claim_strategy"] in {"conservative", "balanced", "broad"}
|
assert derived_observations[0]["metadata"]["claim_strategy"] in {"conservative", "balanced", "broad"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_doclift_bundle_import_ignores_blog_comment_trailers(tmp_path: Path) -> None:
    """Reader-comment text after a blog trailer must not become a claim.

    Builds a one-document doclift bundle whose markdown holds an essay
    sentence, then a "Posted by" byline and a "3 comments:" header, then a
    commenter's sentence. After a quick import the essay sentence must
    appear among the claim texts and the commenter's sentence must not.
    """
    essay_sentence = "Random genetic drift is a fundamental and important part of evolution because many population-level changes are not driven by selection alone."
    comment_sentence = "A mistake can do this glory is very unlikely and opposite to the first instinct that biology is complexity from a thinking being."

    bundle_root = tmp_path / "doclift_bundle_blog"
    blog_dir = bundle_root / "documents" / "blog-1"
    blog_dir.mkdir(parents=True)

    # Manifest is written as literal text so the on-disk bytes are explicit.
    manifest_text = (
        '{\n'
        '  "documents": [\n'
        '    {\n'
        '      "document_id": "blog-1",\n'
        '      "title": "Blog Essay",\n'
        '      "document_kind": "web_article",\n'
        '      "output_dir": "documents/blog-1",\n'
        '      "markdown_path": "documents/blog-1/document.md"\n'
        '    }\n'
        '  ]\n'
        '}\n'
    )
    (bundle_root / "manifest.json").write_text(manifest_text, encoding="utf-8")

    # Essay body first, then the trailer lines, then the commenter's text.
    markdown_lines = [
        "# Blog Essay",
        "",
        essay_sentence,
        "",
        "Posted by Example Author",
        "",
        "3 comments:",
        "",
        comment_sentence,
    ]
    (blog_dir / "document.md").write_text("\n".join(markdown_lines), encoding="utf-8")

    # Only the essay sentence is present as a body chunk.
    chunk_payload = {
        "chunks": [
            {
                "chunk_id": "blog-1-body-1",
                "role": "body",
                "section": "Blog Essay",
                "text": essay_sentence,
                "line_start": 1,
                "line_end": 2,
            }
        ]
    }
    (blog_dir / "document.chunks.json").write_text(json.dumps(chunk_payload), encoding="utf-8")

    result = run_groundrecall_import(bundle_root, mode="quick", import_id="doclift-blog-test")
    claim_texts = [item["claim_text"] for item in result.claims]

    assert any("Random genetic drift is a fundamental and important part of evolution" in text for text in claim_texts)
    assert not any("A mistake can do this glory" in text for text in claim_texts)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue