# GroundRecall/src/groundrecall/groundrecall_source_adapters/pandasthumb_mt.py
from __future__ import annotations
from dataclasses import dataclass, field
from hashlib import sha256
from html.parser import HTMLParser
from pathlib import Path
import re
from typing import Any
from .base import DiscoveredImportSource, StructuredImportRows, register_source_adapter
# File extensions treated as importable page artifacts during discovery.
ARTIFACT_SUFFIXES = {".html", ".htm"}

# Post title: inner HTML of the <h1 class="post-title"> heading.
ARTICLE_TITLE_RE = re.compile(r'<h1 class="post-title">(.*?)</h1>', re.I | re.S)

# Byline: "Posted <date> by <author>" inside the post-meta paragraph.
BYLINE_RE = re.compile(
    r'<p class="post-meta">\s*Posted\s+(?P<date>.*?)\s+by\s+<span class="post-author">(?P<author>.*?)</span>',
    re.I | re.S,
)

# Per-comment header: author span and date separated by &middot;.
COMMENT_META_RE = re.compile(
    r'<p class="comment-meta">\s*<span class="comment-author">(?P<author>.*?)</span>\s*&middot;\s*(?P<date>.*?)</p>',
    re.I | re.S,
)

# The whole comments container; group(1) is the section's inner HTML.
COMMENTS_SECTION_RE = re.compile(r'<section class="comments-section">(.*?)</section>', re.I | re.S)
def _strip_tags(text: str) -> str:
return re.sub(r"(?s)<[^>]+>", " ", text)
def _normalize_space(text: str) -> str:
text = text.replace("\r", "\n")
text = re.sub(r"[ \t]+\n", "\n", text)
text = re.sub(r"\n{3,}", "\n\n", text)
text = re.sub(r"[ \t]{2,}", " ", text)
return text.strip()
def _fragment_to_text(fragment: str) -> str:
    """Render an HTML fragment as plain text.

    Scripts/styles are dropped, anchors become "label (href)", block-level
    tags become newlines (lists as "- ", blockquotes as "> "), remaining
    tags are stripped, and whitespace is normalized.
    """
    # Remove script/style elements wholesale, including their contents.
    for dead_element in (r"(?is)<script\b.*?</script>", r"(?is)<style\b.*?</style>"):
        fragment = re.sub(dead_element, " ", fragment)

    # Inline each link as "label (href)" before tags are stripped.
    def _anchor_text(match: re.Match[str]) -> str:
        label = _strip_tags(match.group(2)).strip()
        return f"{label} ({match.group(1).strip()})".strip()

    fragment = re.sub(
        r'(?is)<a\b[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)</a>',
        _anchor_text,
        fragment,
    )
    # Map structural tags to plain-text layout; applied in this exact order.
    structural_rules = (
        (r"(?i)<br\s*/?>", "\n"),
        (r"(?i)</p\s*>", "\n\n"),
        (r"(?i)<p\b[^>]*>", ""),
        (r"(?i)</div\s*>", "\n"),
        (r"(?i)<div\b[^>]*>", ""),
        (r"(?i)</section\s*>", "\n"),
        (r"(?i)<section\b[^>]*>", ""),
        (r"(?i)</blockquote\s*>", "\n"),
        (r"(?i)<blockquote\b[^>]*>", "\n> "),
        (r"(?i)</li\s*>", "\n"),
        (r"(?i)<li\b[^>]*>", "\n- "),
        (r"(?i)<ul\b[^>]*>|</ul\s*>", "\n"),
        (r"(?i)<ol\b[^>]*>|</ol\s*>", "\n"),
    )
    for pattern, replacement in structural_rules:
        fragment = re.sub(pattern, replacement, fragment)
    fragment = _strip_tags(fragment)
    # Collapse whitespace hugging newlines, then do the final normalization
    # (non-breaking spaces become ordinary spaces first).
    fragment = re.sub(r"\s*\n\s*", "\n", fragment)
    return _normalize_space(fragment.replace("\xa0", " "))
def _id_from_path(relative_path: str) -> str:
return f"pt_{sha256(relative_path.encode('utf-8')).hexdigest()[:12]}"
def _site_root(root: Path) -> Path:
candidate = root / "public_html"
if (candidate / "archives").is_dir():
return candidate
return root
def _discover_html_files(site_root: Path) -> list[Path]:
    """List HTML files under archives/ in sorted order, skipping directory index pages.

    Returns an empty list when the archives directory does not exist.
    """
    archives_dir = site_root / "archives"
    if not archives_dir.is_dir():
        return []
    return [
        candidate
        for candidate in sorted(archives_dir.rglob("*"))
        if candidate.is_file()
        and candidate.suffix.lower() in ARTIFACT_SUFFIXES
        and candidate.name.lower() != "index.html"
    ]
def _extract_article(html_text: str, relative_path: str) -> dict[str, Any] | None:
    """Extract the article record (id, title, byline, body text) from one page.

    Returns None when the page does not look like a Movable Type post
    (missing the post-title heading or the post-body container). Byline
    fields fall back to empty strings when the post-meta paragraph is absent.
    """
    title_match = ARTICLE_TITLE_RE.search(html_text)
    # Greedy (.*) deliberately captures everything after the opening div;
    # the real end of the body is computed below from the comments section.
    body_match = re.search(r'<div class="post-body">(.*)', html_text, re.I | re.S)
    if title_match is None or body_match is None:
        return None
    meta_match = BYLINE_RE.search(html_text)
    comments_match = COMMENTS_SECTION_RE.search(html_text)
    # Slice html_text directly in absolute coordinates instead of the original
    # negative-index trick (body_html[: comments.start() - body.end()]), which
    # only worked because the greedy capture forced body_match.end() to be
    # len(html_text). Same result, but no hidden invariant.
    body_start = body_match.start(1)
    body_end = comments_match.start() if comments_match is not None else len(html_text)
    body_html = html_text[body_start:body_end]
    body_text = _fragment_to_text(body_html)
    title = _fragment_to_text(title_match.group(1))
    author = _fragment_to_text(meta_match.group("author")) if meta_match else ""
    published_at = _fragment_to_text(meta_match.group("date")) if meta_match else ""
    # Canonical URL is root-relative, derived from the on-disk path.
    canonical_url = "/" + relative_path.lstrip("/")
    return {
        "document_id": _id_from_path(relative_path),
        "title": title,
        "author": author,
        "published_at": published_at,
        "canonical_url": canonical_url,
        "body_text": body_text,
    }
def _extract_comments(html_text: str, relative_path: str, parent_document_id: str) -> list[dict[str, Any]]:
    """Parse the comments section into one row dict per comment div.

    Each comment chunk runs from its anchor div to the next comment's anchor
    (or the end of the section); chunks without a comment-body are skipped.
    """
    section_match = COMMENTS_SECTION_RE.search(html_text)
    if section_match is None:
        return []
    section_html = section_match.group(1)
    canonical_url = "/" + relative_path.lstrip("/")
    anchors = list(
        re.finditer(r'<div class="comment" id="comment-(?P<comment_id>\d+)">', section_html, re.I)
    )
    results: list[dict[str, Any]] = []
    # Pair every anchor with its successor to delimit each comment's chunk.
    for anchor, successor in zip(anchors, anchors[1:] + [None]):
        chunk_end = successor.start() if successor is not None else len(section_html)
        chunk = section_html[anchor.end():chunk_end]
        body_match = re.search(r'<div class="comment-body">(.*)</div>', chunk, re.I | re.S)
        if body_match is None:
            continue
        meta_match = COMMENT_META_RE.search(chunk)
        comment_id = anchor.group("comment_id")
        results.append(
            {
                "document_id": f"{parent_document_id}__comment_{comment_id}",
                "parent_document_id": parent_document_id,
                "comment_id": comment_id,
                "document_kind": "comment",
                "comment_author": _fragment_to_text(meta_match.group("author")) if meta_match else "",
                "comment_date": _fragment_to_text(meta_match.group("date")) if meta_match else "",
                "canonical_url": canonical_url,
                "body_text": _fragment_to_text(body_match.group(1)),
            }
        )
    return results
class PandasThumbMtSourceAdapter:
    """Source adapter for a Panda's Thumb Movable Type static-HTML dump.

    Discovery walks ``archives/`` under the site root; each non-index HTML
    page becomes one article artifact plus one artifact per parsed comment,
    each paired with a grounded observation row carrying the extracted text.
    """

    # Registry key for this adapter (see register_source_adapter at module bottom).
    name = "pandasthumb_mt"

    def detect(self, root: str | Path) -> bool:
        """Return True when *root* looks like an MT dump: archives/ dir plus index.html."""
        site_root = _site_root(Path(root))
        return (site_root / "archives").is_dir() and (site_root / "index.html").exists()

    def discover(self, root: str | Path) -> list[DiscoveredImportSource]:
        """Enumerate archive HTML pages as importable sources."""
        site_root = _site_root(Path(root))
        rows: list[DiscoveredImportSource] = []
        for path in _discover_html_files(site_root):
            rows.append(
                DiscoveredImportSource(
                    path=path,
                    # Stored relative to the site root, POSIX-style, so derived
                    # ids and canonical URLs are stable across machines.
                    relative_path=path.relative_to(site_root).as_posix(),
                    source_kind="pandasthumb_mt",
                    artifact_kind="pandasthumb_mt_page",
                    is_text=True,
                    metadata={"corpus": "pandasthumb_mt"},
                )
            )
        return rows

    def import_intent(self) -> str:
        """Declare how downstream processing should treat rows from this adapter."""
        return "grounded_knowledge"

    def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None:
        """Build artifact and observation rows for every parsable article page.

        Pages that do not match the article template are skipped silently.
        Claim/concept/relation rows are always empty for this adapter.
        """
        artifact_rows: list[dict[str, Any]] = []
        observation_rows: list[dict[str, Any]] = []
        claim_rows: list[dict[str, Any]] = []
        concept_rows: list[dict[str, Any]] = []
        relation_rows: list[dict[str, Any]] = []
        for source in sources:
            # errors="replace" keeps the import alive on mis-encoded legacy pages.
            html_text = source.path.read_text(encoding="utf-8", errors="replace")
            article = _extract_article(html_text, source.relative_path)
            if article is None:
                continue
            artifact_id = _id_from_path(source.relative_path)
            artifact_rows.append(
                {
                    "artifact_id": artifact_id,
                    "import_id": context.import_id,
                    "artifact_kind": source.artifact_kind,
                    "path": source.relative_path,
                    "title": article["title"],
                    # Hash of the raw on-disk bytes (not the decoded text above).
                    "sha256": sha256(source.path.read_bytes()).hexdigest(),
                    "created_at": context.imported_at,
                    "metadata": {
                        "corpus": "pandasthumb_mt",
                        "document_kind": "article",
                        "author": article["author"],
                        "published_at": article["published_at"],
                        "canonical_url": article["canonical_url"],
                    },
                    "current_status": "draft",
                }
            )
            observation_rows.append(
                {
                    "observation_id": f"obs_{artifact_id}_body",
                    "import_id": context.import_id,
                    "artifact_id": artifact_id,
                    "role": "summary",
                    "text": article["body_text"],
                    "origin_path": source.relative_path,
                    "origin_section": article["title"],
                    # Line anchors are meaningless for rendered HTML; 0/0 is a sentinel.
                    "line_start": 0,
                    "line_end": 0,
                    "source_url": article["canonical_url"],
                    "metadata": {
                        "corpus": "pandasthumb_mt",
                        "document_kind": "article",
                        "author": article["author"],
                        "published_at": article["published_at"],
                    },
                    "grounding_status": "grounded",
                    "support_kind": "direct_source",
                    "confidence_hint": 0.75,
                    "current_status": "draft",
                }
            )
            for comment in _extract_comments(html_text, source.relative_path, artifact_id):
                # Each comment becomes its own artifact, parented to the article.
                comment_artifact_id = comment["document_id"]
                artifact_rows.append(
                    {
                        "artifact_id": comment_artifact_id,
                        "import_id": context.import_id,
                        "artifact_kind": "pandasthumb_mt_comment",
                        "path": source.relative_path,
                        "title": f"{article['title']} comment {comment['comment_id']}",
                        # Comments have no standalone file, so hash a synthetic
                        # "<page path>#<comment id>" key instead of file bytes.
                        "sha256": sha256(
                            f"{source.relative_path}#{comment['comment_id']}".encode("utf-8")
                        ).hexdigest(),
                        "created_at": context.imported_at,
                        "metadata": {
                            "corpus": "pandasthumb_mt",
                            "document_kind": "comment",
                            "parent_document_id": artifact_id,
                            "comment_id": comment["comment_id"],
                            "comment_author": comment["comment_author"],
                            "comment_date": comment["comment_date"],
                            "canonical_url": comment["canonical_url"],
                        },
                        "current_status": "draft",
                    }
                )
                observation_rows.append(
                    {
                        "observation_id": f"obs_{comment_artifact_id}_body",
                        "import_id": context.import_id,
                        "artifact_id": comment_artifact_id,
                        "role": "summary",
                        "text": comment["body_text"],
                        "origin_path": source.relative_path,
                        "origin_section": f"comment {comment['comment_id']}",
                        "line_start": 0,
                        "line_end": 0,
                        "source_url": comment["canonical_url"],
                        "metadata": {
                            "corpus": "pandasthumb_mt",
                            "document_kind": "comment",
                            "parent_document_id": artifact_id,
                            "comment_id": comment["comment_id"],
                            "comment_author": comment["comment_author"],
                            "comment_date": comment["comment_date"],
                        },
                        "grounding_status": "grounded",
                        "support_kind": "direct_source",
                        # Comments get a slightly lower hint than article bodies (0.75).
                        "confidence_hint": 0.7,
                        "current_status": "draft",
                    }
                )
        return StructuredImportRows(
            artifact_rows=artifact_rows,
            fragment_rows=[],
            observation_rows=observation_rows,
            claim_rows=claim_rows,
            concept_rows=concept_rows,
            relation_rows=relation_rows,
        )
# Import-time side effect: registers this adapter with the shared registry
# so the import pipeline can discover it by its `name` attribute.
register_source_adapter(PandasThumbMtSourceAdapter())