from __future__ import annotations
from dataclasses import dataclass, field
from hashlib import sha256
from html.parser import HTMLParser
from pathlib import Path
import re
from typing import Any
from .base import DiscoveredImportSource, StructuredImportRows, register_source_adapter
# Lower-case file suffixes treated as HTML artifacts.
# NOTE(review): presumably compared against Path.suffix during source
# discovery, so callers would need to lower-case the suffix first — confirm
# against the code that reads this set.
ARTIFACT_SUFFIXES = {".html", ".htm"}
# Case-insensitive, dot-matches-newline pattern with one lazy capture group.
# NOTE(review): this literal looks damaged. A single-quoted raw string cannot
# span several physical lines, so the statement does not parse as written.
# Presumably the markup that surrounded the (.*?) capture was lost from this
# copy of the file; restore the pattern from version control — TODO confirm.
ARTICLE_TITLE_RE = re.compile(r'
(.*?)
', re.I | re.S)
# Matches a "Posted ... by ..." byline (case-insensitive, dot matches newline)
# with two lazy groups: one after "Posted" and one after "by".
# NOTE(review): "(?P" must be followed by an angle-bracketed group name (or
# "=name" for a back-reference); "(?P.*?)" is neither, so re.compile() will
# raise re.error when this module is imported. Presumably the group names and
# any surrounding markup were lost from this copy; recover the names from the
# code that reads the match groups — TODO confirm.
BYLINE_RE = re.compile(
r'\s*Posted\s+(?P.*?)\s+by\s+(?P.*?)',
re.I | re.S,
)
# NOTE(review): the pattern here is the empty string, which matches at every
# position and defines no groups, so it cannot extract any comment metadata.
# Presumably the original pattern text was lost from this copy of the file;
# restore it from version control — TODO confirm.
COMMENT_META_RE = re.compile(
r'',
re.I | re.S,
)
# NOTE(review): empty pattern — it matches the empty string at every position
# and therefore cannot locate a comments section. Presumably the original
# pattern text was lost from this copy; restore it — TODO confirm.
COMMENTS_SECTION_RE = re.compile(r'', re.I | re.S)
def _strip_tags(text: str) -> str:
return re.sub(r"(?s)<[^>]+>", " ", text)
def _normalize_space(text: str) -> str:
text = text.replace("\r", "\n")
text = re.sub(r"[ \t]+\n", "\n", text)
text = re.sub(r"\n{3,}", "\n\n", text)
text = re.sub(r"[ \t]{2,}", " ", text)
return text.strip()
def _fragment_to_text(fragment: str) -> str:
fragment = re.sub(r"(?is)