# ThreeGate/tools/validate_common.py

#!/usr/bin/env python3
"""
Common helpers for ThreeGate validators.

Design goals:
- stdlib-only
- deterministic
- conservative: reject on ambiguity
"""

from __future__ import annotations

import hashlib
import os
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, List, Tuple


# Leading front-matter block: an opening "---" line at the very start of the
# text, the lazily captured content (group 1), then a closing "---" line.
# The inline (?s) lets "." span newlines; \A pins the match to offset 0, so
# only "\n" line endings and a newline-terminated closing fence are accepted.
FRONT_MATTER_RE = re.compile(r"(?s)\A---\n(.*?)\n---\n")

# Suspicious / forbidden patterns (case-insensitive) meant to catch:
# - instruction smuggling
# - runnable shell/code blocks
# - install/persistence advice
# - "ignore policy" prompt injection
#
# Each entry is a regex source string; FORBIDDEN_RE below holds the compiled
# forms in the same order. Patterns using ".*" are line-local because DOTALL
# is not enabled.
FORBIDDEN_PATTERNS = [
    # shell / command execution
    r"```(?:bash|sh|zsh|powershell|pwsh|cmd|fish)\b",
    r"\b(?:curl|wget)\b.*\|\s*(?:sh|bash|zsh)\b",
    r"\b(?:sudo|su)\b",
    r"\bchmod\s+\+x\b",
    # No leading \b here: "/" and "~" are non-word characters, so a leading
    # word boundary would only match when a word character immediately
    # precedes the path (e.g. "cat /etc/passwd" would be missed). The "." in
    # "~/.ssh" is escaped so it matches a literal dot only.
    r"(?:/etc/(?:passwd|shadow|sudoers)|~/\.ssh)\b",
    r"\b(?:ssh|scp|sftp)\b",

    # package installs / persistence
    r"\b(?:apt-get|apt|dnf|yum|pacman|apk|brew)\s+install\b",
    r"\bpip\s+install\b",
    r"\bnpm\s+(?:i|install)\b",
    r"\bgo\s+get\b",
    r"\bgem\s+install\b",
    r"\bconda\s+install\b",
    r"\bsystemctl\b",
    r"\bcron\b|\bcrontab\b",
    r"\binit\.d\b|\bsysv\b",

    # policy override / injection cues
    # One or more qualifiers, separated by any whitespace, so that phrasings
    # such as "ignore all previous instructions" (two qualifiers) or a line
    # break between the words are caught as well as the single-qualifier form.
    r"ignore\s+(?:(?:all|any|previous|prior)\s+)+(?:instructions|rules|policies)",
    r"\bsystem prompt\b|\bdeveloper message\b|\bhidden instructions\b",
    r"\bdo not mention\b.*\bpolicy\b",
    r"\bexfiltrat(?:e|ion)\b|\bdata exfil\b",
    r"\bbase64\b.*\bdecode\b",  # often used to smuggle payloads
]

# Compiled once at import time; index-aligned with FORBIDDEN_PATTERNS.
FORBIDDEN_RE = [re.compile(pat, re.IGNORECASE) for pat in FORBIDDEN_PATTERNS]


@dataclass(frozen=True)
class ValidationResult:
    """Immutable record of a validation outcome.

    ``frozen=True`` prevents rebinding the attributes after construction; the
    ``errors`` and ``warnings`` lists themselves remain mutable objects.
    """

    # Overall verdict flag (presumably True when `errors` is empty — confirm
    # against the validators that construct this).
    ok: bool
    # Error messages collected during validation.
    errors: List[str]
    # Warning messages collected during validation.
    warnings: List[str]


def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hexadecimal string."""
    return hashlib.sha256(data).hexdigest()


def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (whole seconds)."""
    now = datetime.now(timezone.utc)
    # timespec="seconds" drops the fractional part, leaving a "+00:00" offset
    # suffix that is then swapped for the compact "Z" designator.
    stamp = now.isoformat(timespec="seconds")
    return stamp.replace("+00:00", "Z")


def read_text(path: str, max_bytes: int = 5_000_000) -> str:
    """Read *path* as strict UTF-8 text, refusing anything over *max_bytes*.

    Args:
        path: Filesystem path of the file to read.
        max_bytes: Largest acceptable file size, in bytes.

    Returns:
        The decoded file contents.

    Raises:
        ValueError: If the file exceeds *max_bytes* or is not valid UTF-8.
        OSError: If the file cannot be opened or inspected.
    """
    with open(path, "rb") as f:
        # Check the size on the handle we actually read from, so the check
        # and the read refer to the same file even if the path is replaced.
        size = os.fstat(f.fileno()).st_size
        if size > max_bytes:
            raise ValueError(f"File too large for validator ({size} bytes > {max_bytes}).")
        # The reported size is only advisory: the file may grow after the
        # check, and some files (e.g. procfs entries, pipes) report size 0.
        # Read at most one byte past the limit so the cap is always enforced.
        data = f.read(max_bytes + 1)
    if len(data) > max_bytes:
        raise ValueError(f"File too large for validator (read more than {max_bytes} bytes).")
    # Strict UTF-8; reject if not UTF-8
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError as e:
        raise ValueError(f"File is not valid UTF-8 text: {e}") from e


def extract_front_matter(md: str) -> Tuple[Dict[str, str], str]:
    """
    Extract YAML-ish front matter.

    We intentionally implement a *very small* parser:
    - key: value
    - key: "value"
    - key: [a, b, c] (kept as raw string)
    - nested objects are not supported except as raw strings

    Blank lines and lines starting with "#" are skipped. Each remaining line
    is split on its first ":"; key and value are whitespace-trimmed and one
    pair of matching surrounding quotes is removed from the value.

    Returns:
        ``(front_matter, body)``. When no front-matter block is found at the
        start of *md*, returns ``({}, md)`` unchanged.

    Raises:
        ValueError: If a line has no ":", has an empty key, or repeats a key
            already seen (ambiguous input is rejected rather than guessed at).
    """
    m = FRONT_MATTER_RE.search(md)
    if not m:
        return {}, md
    body = md[m.end():]

    fm: Dict[str, str] = {}
    for raw_line in m.group(1).splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, sep, value = line.partition(":")
        if not sep:
            raise ValueError(f"Invalid front matter line (no ':'): {line}")
        key = key.strip()
        value = value.strip()
        if not key:
            raise ValueError(f"Invalid front matter line (empty key): {line}")
        # A repeated key is ambiguous (which value wins?); silently keeping
        # the last one would let two readers disagree about the metadata.
        if key in fm:
            raise ValueError(f"Duplicate front matter key: {key}")
        # Strip surrounding quotes if present. The length guard keeps a lone
        # quote character from being treated as both opener and closer.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        fm[key] = value
    return fm, body


def require_keys(fm: Dict[str, str], keys: List[str]) -> List[str]:
    """Return the entries of *keys* that are absent from *fm* or blank.

    A key counts as blank when its value is empty or whitespace-only. The
    result preserves the order of *keys*.
    """
    absent: List[str] = []
    for name in keys:
        # A missing key and a whitespace-only value are treated alike.
        if not fm.get(name, "").strip():
            absent.append(name)
    return absent


def find_forbidden(md: str) -> List[str]:
    """Scan *md* against every compiled forbidden pattern.

    Returns one message per pattern that matches, in pattern order, describing
    that pattern's first match together with up to 40 characters of context on
    each side (newlines shown as a literal backslash-n). An empty list means
    no pattern matched.
    """
    reports: List[str] = []
    for rx in FORBIDDEN_RE:
        match = rx.search(md)
        if match is None:
            continue
        lo = max(0, match.start() - 40)
        hi = match.end() + 40
        # Flatten newlines so each report stays on a single line.
        context = md[lo:hi].replace("\n", "\\n")
        reports.append(f"Forbidden pattern matched: /{rx.pattern}/ near '{context}'")
    return reports


def require_sections_in_order(body: str, required_h2: List[str]) -> List[str]:
    """
    Require exact H2 headings in order. Additional headings allowed, but required must exist.

    Args:
        body: Markdown text to inspect. Only lines that begin with "## " at
            column 0 count as H2 headings; each is compared after stripping
            surrounding whitespace.
        required_h2: Full heading lines (including the "## " prefix) that
            must appear, in this relative order.

    Returns:
        One "Missing required section heading" message per required heading
        that cannot be found after the previously matched one; an empty list
        when all are present in order.
    """
    errors: List[str] = []
    # Find all H2 headings
    headings = [line.strip() for line in body.splitlines() if line.startswith("## ")]
    cursor = 0
    for req in required_h2:
        try:
            found_at = headings.index(req, cursor)
        except ValueError:
            # Leave the cursor where it was: a heading that is absent must
            # not cause later headings that are present to be reported too.
            errors.append(f"Missing required section heading: {req}")
            continue
        cursor = found_at + 1
    return errors