Tighten doclift prose claim filtering

This commit is contained in:
welsberr 2026-05-08 02:40:38 -04:00
parent 779ebdb515
commit 0999ec35cd
1 changed file with 14 additions and 0 deletions

View File

@@ -54,6 +54,14 @@ class DocliftBundleSourceAdapter:
" by random genetic drift", " by random genetic drift",
" over time", " over time",
) )
# Sentence openers, written in lowercase, that disqualify a candidate claim:
# the claim filter returns False when the `lowered` form of the text starts
# with any of these prefixes.
# NOTE(review): entries must stay lowercase to match, assuming `lowered` is
# the lowercased text — confirm against where it is assigned.
_LEADIN_PREFIXES = (
"some popular accounts give the impression",
"perhaps they think",
"perhaps they don't think",
"i'm not sure how",
"this is the important point",
"it means that",
)
def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path: def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path:
if value is None: if value is None:
@@ -142,12 +150,18 @@ class DocliftBundleSourceAdapter:
return False return False
if any(lowered.startswith(prefix) for prefix in self._METADATA_PREFIXES): if any(lowered.startswith(prefix) for prefix in self._METADATA_PREFIXES):
return False return False
if "?" in cleaned:
return False
if any(lowered.startswith(prefix) for prefix in self._LEADIN_PREFIXES):
return False
if normalized_title and lowered == normalized_title: if normalized_title and lowered == normalized_title:
return False return False
if cleaned.count(" ") < 8: if cleaned.count(" ") < 8:
return False return False
if strategy in {"balanced", "conservative"} and cleaned[:1].islower(): if strategy in {"balanced", "conservative"} and cleaned[:1].islower():
return False return False
if not cleaned.endswith((".", "!", "?", '"')):
return False
return True return True
def _claim_priority(self, cleaned: str, *, strategy: str = "conservative") -> tuple[int, int]: def _claim_priority(self, cleaned: str, *, strategy: str = "conservative") -> tuple[int, int]: