Tighten doclift prose claim filtering
This commit is contained in:
parent
779ebdb515
commit
0999ec35cd
|
|
@ -54,6 +54,14 @@ class DocliftBundleSourceAdapter:
|
||||||
" by random genetic drift",
|
" by random genetic drift",
|
||||||
" over time",
|
" over time",
|
||||||
)
|
)
|
||||||
|
_LEADIN_PREFIXES = (
|
||||||
|
"some popular accounts give the impression",
|
||||||
|
"perhaps they think",
|
||||||
|
"perhaps they don't think",
|
||||||
|
"i'm not sure how",
|
||||||
|
"this is the important point",
|
||||||
|
"it means that",
|
||||||
|
)
|
||||||
|
|
||||||
def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path:
|
def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path:
|
||||||
if value is None:
|
if value is None:
|
||||||
|
|
@ -142,12 +150,18 @@ class DocliftBundleSourceAdapter:
|
||||||
return False
|
return False
|
||||||
if any(lowered.startswith(prefix) for prefix in self._METADATA_PREFIXES):
|
if any(lowered.startswith(prefix) for prefix in self._METADATA_PREFIXES):
|
||||||
return False
|
return False
|
||||||
|
if "?" in cleaned:
|
||||||
|
return False
|
||||||
|
if any(lowered.startswith(prefix) for prefix in self._LEADIN_PREFIXES):
|
||||||
|
return False
|
||||||
if normalized_title and lowered == normalized_title:
|
if normalized_title and lowered == normalized_title:
|
||||||
return False
|
return False
|
||||||
if cleaned.count(" ") < 8:
|
if cleaned.count(" ") < 8:
|
||||||
return False
|
return False
|
||||||
if strategy in {"balanced", "conservative"} and cleaned[:1].islower():
|
if strategy in {"balanced", "conservative"} and cleaned[:1].islower():
|
||||||
return False
|
return False
|
||||||
|
if not cleaned.endswith((".", "!", "?", '"')):
|
||||||
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _claim_priority(self, cleaned: str, *, strategy: str = "conservative") -> tuple[int, int]:
|
def _claim_priority(self, cleaned: str, *, strategy: str = "conservative") -> tuple[int, int]:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue