"""Source adapter for the ``pandasthumb_mt`` corpus.

Discovers HTML pages under ``<root>/archives`` (or
``<root>/public_html/archives``), extracts one article plus its comments from
each page with regular expressions, and emits artifact and observation rows.

NOTE(review): the copy of this file that was reviewed had lost the HTML
markup inside its regular-expression literals.  The named groups
(``date``, ``author``, ``comment_id``) are required by the ``.group(...)``
calls below and are therefore certain; the surrounding tag and class names
are reconstructions and are flagged individually -- confirm them against real
archive pages before relying on this module.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from hashlib import sha256
from html.parser import HTMLParser
from pathlib import Path
import re
from typing import Any

from .base import DiscoveredImportSource, StructuredImportRows, register_source_adapter

ARTIFACT_SUFFIXES = {".html", ".htm"}

# NOTE(review): tag/class names in the six patterns below are placeholders
# (the original markup was not recoverable) -- TODO confirm against the site.

# Group 1: inner HTML of the article title element.
ARTICLE_TITLE_RE = re.compile(
    r'<h1[^>]*class="[^"]*entry-title[^"]*"[^>]*>(.*?)</h1>',
    re.I | re.S,
)

# "Posted <date> by <author>" byline.
BYLINE_RE = re.compile(
    r'<div[^>]*class="[^"]*asset-meta[^"]*"[^>]*>'
    r'\s*Posted\s+(?P<date>.*?)\s+by\s+<a\b[^>]*>(?P<author>.*?)</a>',
    re.I | re.S,
)

# "<author> · <date>" line of a single comment.
# NOTE(review): the author-before-date order is assumed -- TODO confirm.
COMMENT_META_RE = re.compile(
    r'<div[^>]*class="[^"]*comment-meta[^"]*"[^>]*>'
    r'\s*(?P<author>.*?)\s*·\s*(?P<date>.*?)</div>',
    re.I | re.S,
)

# Group 1: inner HTML of the comments section.
COMMENTS_SECTION_RE = re.compile(
    r'<section[^>]*id="comments"[^>]*>(.*?)</section>',
    re.I | re.S,
)

# Group 1: everything from the start of the article body to the end of the
# document (greedy); the comments section is trimmed off in _extract_article.
ARTICLE_BODY_RE = re.compile(
    r'<div[^>]*class="[^"]*entry-body[^"]*"[^>]*>(.*)',
    re.I | re.S,
)

# Opening tag of one comment; ``comment_id`` identifies the comment.
COMMENT_START_RE = re.compile(
    r'<div[^>]*\bid="comment-(?P<comment_id>[^"]+)"[^>]*>',
    re.I,
)

# Group 1: inner HTML of a comment body (greedy within one comment chunk).
COMMENT_BODY_RE = re.compile(
    r'<div[^>]*class="[^"]*comment-content[^"]*"[^>]*>(.*)</div>',
    re.I | re.S,
)

_SCRIPT_RE = re.compile(r"(?is)<script.*?</script>")
_STYLE_RE = re.compile(r"(?is)<style.*?</style>")
_LINK_RE = re.compile(r'(?is)<a\b[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)</a>')

# Ordered (pattern, replacement) pairs turning block-level markup into plain
# text layout.  The replacement strings are original.
# NOTE(review): the div and heading rules replace two open/close pairs whose
# tag names were lost -- TODO confirm.
_TAG_REPLACEMENTS: tuple[tuple[re.Pattern[str], str], ...] = (
    (re.compile(r"(?i)<br\s*/?>"), "\n"),
    (re.compile(r"(?i)</p>"), "\n\n"),
    (re.compile(r"(?i)<p\b[^>]*>"), ""),
    (re.compile(r"(?i)</div>"), "\n"),
    (re.compile(r"(?i)<div\b[^>]*>"), ""),
    (re.compile(r"(?i)</h[1-6]>"), "\n"),
    (re.compile(r"(?i)<h[1-6]\b[^>]*>"), ""),
    (re.compile(r"(?i)</blockquote>"), "\n"),
    (re.compile(r"(?i)<blockquote\b[^>]*>"), "\n> "),
    (re.compile(r"(?i)</li>"), "\n"),
    (re.compile(r"(?i)<li\b[^>]*>"), "\n- "),
    (re.compile(r"(?i)<ul\b[^>]*>|</ul>"), "\n"),
    (re.compile(r"(?i)<ol\b[^>]*>|</ol>"), "\n"),
)


def _strip_tags(text: str) -> str:
    """Replace every ``<...>`` tag in *text* with a single space."""
    return re.sub(r"(?s)<[^>]+>", " ", text)


def _normalize_space(text: str) -> str:
    """Tidy whitespace: CR becomes LF, trailing blanks and blank-line runs shrink."""
    text = text.replace("\r", "\n")
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"[ \t]{2,}", " ", text)
    return text.strip()


def _render_link(match: re.Match[str]) -> str:
    """Render an ``<a href>`` match as ``"label (url)"``."""
    label = _strip_tags(match.group(2)).strip()
    url = match.group(1).strip()
    return f"{label} ({url})".strip()


def _fragment_to_text(fragment: str) -> str:
    """Convert an HTML fragment to plain text.

    Script and style elements are dropped, links become ``label (url)``,
    block-level tags become line breaks (``> `` for blockquotes, ``- `` for
    list items), and any remaining tag becomes a space.
    """
    fragment = _SCRIPT_RE.sub(" ", fragment)
    fragment = _STYLE_RE.sub(" ", fragment)
    fragment = _LINK_RE.sub(_render_link, fragment)
    for pattern, replacement in _TAG_REPLACEMENTS:
        fragment = pattern.sub(replacement, fragment)
    fragment = _strip_tags(fragment)
    # This collapses every whitespace run containing a newline to a single
    # newline, so the blank lines inserted for </p> end up as plain line breaks.
    fragment = re.sub(r"\s*\n\s*", "\n", fragment)
    return _normalize_space(fragment.replace("\xa0", " "))


def _id_from_path(relative_path: str) -> str:
    """Return ``pt_`` plus the first 12 hex digits of the path's SHA-256."""
    return f"pt_{sha256(relative_path.encode('utf-8')).hexdigest()[:12]}"


def _site_root(root: Path) -> Path:
    """Return ``root/public_html`` when it holds an ``archives`` dir, else *root*."""
    candidate = root / "public_html"
    if (candidate / "archives").is_dir():
        return candidate
    return root


def _discover_html_files(site_root: Path) -> list[Path]:
    """List HTML files under ``site_root/archives`` in sorted order.

    Files named ``index.html`` (case-insensitive) are skipped.  Returns an
    empty list when the ``archives`` directory does not exist.
    """
    archives = site_root / "archives"
    if not archives.is_dir():
        return []
    return [
        path
        for path in sorted(archives.rglob("*"))
        if path.is_file()
        and path.suffix.lower() in ARTIFACT_SUFFIXES
        and path.name.lower() != "index.html"
    ]


def _extract_article(html_text: str, relative_path: str) -> dict[str, Any] | None:
    """Extract the article from one page.

    Returns ``None`` when either the title or the body cannot be located.
    ``author`` and ``published_at`` are empty strings when no byline matches.
    """
    title_match = ARTICLE_TITLE_RE.search(html_text)
    body_match = ARTICLE_BODY_RE.search(html_text)
    if title_match is None or body_match is None:
        return None

    meta_match = BYLINE_RE.search(html_text)
    comments_match = COMMENTS_SECTION_RE.search(html_text)

    body_html = body_match.group(1)
    if comments_match is not None:
        # Both offsets are document positions; subtracting the start of the
        # captured group gives the cut point inside body_html.  Trim only
        # when the comments section really begins inside the captured body
        # (the previous ``start() - end()`` form relied on a negative slice
        # index and truncated wrongly otherwise).
        cut = comments_match.start() - body_match.start(1)
        if 0 <= cut < len(body_html):
            body_html = body_html[:cut]

    return {
        "document_id": _id_from_path(relative_path),
        "title": _fragment_to_text(title_match.group(1)),
        "author": _fragment_to_text(meta_match.group("author")) if meta_match else "",
        "published_at": _fragment_to_text(meta_match.group("date")) if meta_match else "",
        "canonical_url": "/" + relative_path.lstrip("/"),
        "body_text": _fragment_to_text(body_html),
    }


def _extract_comments(
    html_text: str, relative_path: str, parent_document_id: str
) -> list[dict[str, Any]]:
    """Extract the comments from one page.

    The comments section is split at each comment opening tag; a chunk with
    no recognizable body is skipped.  Returns an empty list when the page has
    no comments section.
    """
    comments_match = COMMENTS_SECTION_RE.search(html_text)
    if comments_match is None:
        return []

    comments_html = comments_match.group(1)
    canonical_url = "/" + relative_path.lstrip("/")
    starts = list(COMMENT_START_RE.finditer(comments_html))
    rows: list[dict[str, Any]] = []
    for index, match in enumerate(starts):
        # A comment runs from its opening tag to the next comment's opening
        # tag, or to the end of the section for the last one.
        end = starts[index + 1].start() if index + 1 < len(starts) else len(comments_html)
        chunk = comments_html[match.end():end]
        body_match = COMMENT_BODY_RE.search(chunk)
        if body_match is None:
            continue
        meta_match = COMMENT_META_RE.search(chunk)
        comment_id = match.group("comment_id")
        rows.append(
            {
                "document_id": f"{parent_document_id}__comment_{comment_id}",
                "parent_document_id": parent_document_id,
                "comment_id": comment_id,
                "document_kind": "comment",
                "comment_author": _fragment_to_text(meta_match.group("author")) if meta_match else "",
                "comment_date": _fragment_to_text(meta_match.group("date")) if meta_match else "",
                "canonical_url": canonical_url,
                "body_text": _fragment_to_text(body_match.group(1)),
            }
        )
    return rows


def _article_rows(
    context: Any,
    source: DiscoveredImportSource,
    article: dict[str, Any],
    artifact_id: str,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Build the (artifact row, observation row) pair for an article."""
    artifact_row = {
        "artifact_id": artifact_id,
        "import_id": context.import_id,
        "artifact_kind": source.artifact_kind,
        "path": source.relative_path,
        "title": article["title"],
        "sha256": sha256(source.path.read_bytes()).hexdigest(),
        "created_at": context.imported_at,
        "metadata": {
            "corpus": "pandasthumb_mt",
            "document_kind": "article",
            "author": article["author"],
            "published_at": article["published_at"],
            "canonical_url": article["canonical_url"],
        },
        "current_status": "draft",
    }
    observation_row = {
        "observation_id": f"obs_{artifact_id}_body",
        "import_id": context.import_id,
        "artifact_id": artifact_id,
        "role": "summary",
        "text": article["body_text"],
        "origin_path": source.relative_path,
        "origin_section": article["title"],
        "line_start": 0,
        "line_end": 0,
        "source_url": article["canonical_url"],
        "metadata": {
            "corpus": "pandasthumb_mt",
            "document_kind": "article",
            "author": article["author"],
            "published_at": article["published_at"],
        },
        "grounding_status": "grounded",
        "support_kind": "direct_source",
        "confidence_hint": 0.75,
        "current_status": "draft",
    }
    return artifact_row, observation_row


def _comment_rows(
    context: Any,
    source: DiscoveredImportSource,
    article: dict[str, Any],
    artifact_id: str,
    comment: dict[str, Any],
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Build the (artifact row, observation row) pair for one comment."""
    comment_artifact_id = comment["document_id"]
    artifact_row = {
        "artifact_id": comment_artifact_id,
        "import_id": context.import_id,
        "artifact_kind": "pandasthumb_mt_comment",
        "path": source.relative_path,
        "title": f"{article['title']} comment {comment['comment_id']}",
        # A comment has no file of its own, so the digest is taken over
        # "<page path>#<comment id>" rather than over file bytes.
        "sha256": sha256(
            f"{source.relative_path}#{comment['comment_id']}".encode("utf-8")
        ).hexdigest(),
        "created_at": context.imported_at,
        "metadata": {
            "corpus": "pandasthumb_mt",
            "document_kind": "comment",
            "parent_document_id": artifact_id,
            "comment_id": comment["comment_id"],
            "comment_author": comment["comment_author"],
            "comment_date": comment["comment_date"],
            "canonical_url": comment["canonical_url"],
        },
        "current_status": "draft",
    }
    observation_row = {
        "observation_id": f"obs_{comment_artifact_id}_body",
        "import_id": context.import_id,
        "artifact_id": comment_artifact_id,
        "role": "summary",
        "text": comment["body_text"],
        "origin_path": source.relative_path,
        "origin_section": f"comment {comment['comment_id']}",
        "line_start": 0,
        "line_end": 0,
        "source_url": comment["canonical_url"],
        "metadata": {
            "corpus": "pandasthumb_mt",
            "document_kind": "comment",
            "parent_document_id": artifact_id,
            "comment_id": comment["comment_id"],
            "comment_author": comment["comment_author"],
            "comment_date": comment["comment_date"],
        },
        "grounding_status": "grounded",
        "support_kind": "direct_source",
        "confidence_hint": 0.7,
        "current_status": "draft",
    }
    return artifact_row, observation_row


class PandasThumbMtSourceAdapter:
    """Import adapter that turns ``pandasthumb_mt`` pages into structured rows."""

    name = "pandasthumb_mt"

    def detect(self, root: str | Path) -> bool:
        """Return True when the site root has an ``archives`` dir and an ``index.html``."""
        site_root = _site_root(Path(root))
        return (site_root / "archives").is_dir() and (site_root / "index.html").exists()

    def discover(self, root: str | Path) -> list[DiscoveredImportSource]:
        """Return one ``DiscoveredImportSource`` per archived HTML page."""
        site_root = _site_root(Path(root))
        return [
            DiscoveredImportSource(
                path=path,
                relative_path=path.relative_to(site_root).as_posix(),
                source_kind="pandasthumb_mt",
                artifact_kind="pandasthumb_mt_page",
                is_text=True,
                metadata={"corpus": "pandasthumb_mt"},
            )
            for path in _discover_html_files(site_root)
        ]

    def import_intent(self) -> str:
        """Return the import intent label for this adapter."""
        return "grounded_knowledge"

    def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None:
        """Build artifact and observation rows for every parsable source page.

        Pages from which no article can be extracted are skipped.  Each
        article and each of its comments contributes one artifact row and one
        observation row; fragment, claim, concept and relation rows are
        always empty.
        """
        artifact_rows: list[dict[str, Any]] = []
        observation_rows: list[dict[str, Any]] = []
        for source in sources:
            html_text = source.path.read_text(encoding="utf-8", errors="replace")
            article = _extract_article(html_text, source.relative_path)
            if article is None:
                continue
            artifact_id = _id_from_path(source.relative_path)
            artifact_row, observation_row = _article_rows(context, source, article, artifact_id)
            artifact_rows.append(artifact_row)
            observation_rows.append(observation_row)
            for comment in _extract_comments(html_text, source.relative_path, artifact_id):
                artifact_row, observation_row = _comment_rows(
                    context, source, article, artifact_id, comment
                )
                artifact_rows.append(artifact_row)
                observation_rows.append(observation_row)
        return StructuredImportRows(
            artifact_rows=artifact_rows,
            fragment_rows=[],
            observation_rows=observation_rows,
            claim_rows=[],
            concept_rows=[],
            relation_rows=[],
        )


register_source_adapter(PandasThumbMtSourceAdapter())