#!/usr/bin/env python3
"""
Common helpers for ThreeGate validators.

Design goals:
- stdlib-only
- deterministic
- conservative: reject on ambiguity
"""

from __future__ import annotations

import hashlib
import os
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, List, Tuple

# Front matter must open on the very first line of the document and is closed
# by a line consisting of exactly `---`. Group 1 is the text between the fences.
FRONT_MATTER_RE = re.compile(r"(?s)\A---\n(.*?)\n---\n", re.MULTILINE)

# Suspicious / forbidden patterns (case-insensitive) meant to catch:
# - instruction smuggling
# - runnable shell/code blocks
# - install/persistence advice
# - “ignore policy” prompt injection
FORBIDDEN_PATTERNS = [
    # shell / command execution
    r"```(?:bash|sh|zsh|powershell|pwsh|cmd|fish)\b",
    r"\b(?:curl|wget)\b.*\|\s*(?:sh|bash|zsh)\b",
    r"\b(?:sudo|su)\b",
    r"\bchmod\s+\+x\b",
    # No leading \b here: `/` and `~` are non-word characters, so a leading
    # word boundary would only match when a word character directly precedes
    # the path, and " /etc/passwd" or "~/.ssh" would slip through unnoticed.
    r"(?:/etc/(?:passwd|shadow|sudoers)|~/\.ssh)\b",
    r"\b(?:ssh|scp|sftp)\b",
    # package installs / persistence
    r"\b(?:apt-get|apt|dnf|yum|pacman|apk|brew)\s+install\b",
    r"\bpip\s+install\b",
    r"\bnpm\s+(?:i|install)\b",
    r"\bgo\s+get\b",
    r"\bgem\s+install\b",
    r"\bconda\s+install\b",
    r"\bsystemctl\b",
    r"\bcron\b|\bcrontab\b",
    r"\binit\.d\b|\bsysv\b",
    # policy override / injection cues
    r"ignore (?:all|any|previous|prior) (?:instructions|rules|policies)",
    r"\bsystem prompt\b|\bdeveloper message\b|\bhidden instructions\b",
    r"\bdo not mention\b.*\bpolicy\b",
    r"\bexfiltrat(?:e|ion)\b|\bdata exfil\b",
    r"\bbase64\b.*\bdecode\b",  # often used to smuggle payloads
]

FORBIDDEN_RE = [re.compile(pat, re.IGNORECASE) for pat in FORBIDDEN_PATTERNS]


@dataclass(frozen=True)
class ValidationResult:
    """Immutable outcome of a validation run."""

    # Overall verdict.
    ok: bool
    # Human-readable error messages.
    errors: List[str]
    # Human-readable warning messages.
    warnings: List[str]


def sha256_bytes(data: bytes) -> str:
    """Return the lowercase hexadecimal SHA-256 digest of *data*."""
    return hashlib.sha256(data).hexdigest()


def utc_now_iso() -> str:
    """Return the current UTC time as ISO 8601, second precision, `Z` suffix."""
    now = datetime.now(timezone.utc).replace(microsecond=0)
    return now.isoformat().replace("+00:00", "Z")


def read_text(path: str, max_bytes: int = 5_000_000) -> str:
    """
    Read *path* and return its content decoded as strict UTF-8.

    Raises ValueError if the file is larger than *max_bytes* or is not valid
    UTF-8. Errors from the operating system (e.g. a missing file) propagate
    unchanged.
    """
    st = os.stat(path)
    if st.st_size > max_bytes:
        raise ValueError(f"File too large for validator ({st.st_size} bytes > {max_bytes}).")

    # The stat result can be stale (file still growing) or meaningless (special
    # files that report size 0), so also cap the read itself. Asking for one
    # byte more than the limit lets us tell "exactly at the limit" from "over".
    with open(path, "rb") as f:
        data = f.read(max_bytes + 1)
    if len(data) > max_bytes:
        raise ValueError(f"File too large for validator (more than {max_bytes} bytes read).")

    # Strict UTF-8; reject if not UTF-8
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError as e:
        raise ValueError(f"File is not valid UTF-8 text: {e}") from e


def extract_front_matter(md: str) -> Tuple[Dict[str, str], str]:
    """
    Extract YAML-ish front matter.

    We intentionally implement a *very small* parser:
    - key: value
    - key: "value"
    - key: [a, b, c]  (kept as raw string)
    - nested objects are not supported except as raw strings

    Returns a `(front_matter, body)` tuple. When the document does not start
    with a front matter block, the result is `({}, md)`. Blank lines and lines
    starting with `#` inside the block are skipped; if a key repeats, the last
    occurrence wins.

    Raises ValueError for a front matter line that contains no `:`.
    """
    m = FRONT_MATTER_RE.search(md)
    if not m:
        return {}, md

    fm_text = m.group(1)
    body = md[m.end():]

    fm: Dict[str, str] = {}
    for line in fm_text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if ":" not in line:
            raise ValueError(f"Invalid front matter line (no ':'): {line}")
        k, v = line.split(":", 1)
        k = k.strip()
        v = v.strip()
        # Strip surrounding quotes if present. The length guard keeps a value
        # that is a single lone quote character from being treated as a
        # matched pair and silently reduced to an empty string.
        if len(v) >= 2 and (
            (v.startswith('"') and v.endswith('"'))
            or (v.startswith("'") and v.endswith("'"))
        ):
            v = v[1:-1]
        fm[k] = v

    return fm, body


def require_keys(fm: Dict[str, str], keys: List[str]) -> List[str]:
    """Return the entries of *keys* that are absent from *fm* or blank."""
    return [k for k in keys if k not in fm or not fm[k].strip()]


def find_forbidden(md: str) -> List[str]:
    """
    Scan *md* against every pattern in FORBIDDEN_RE.

    Returns one message per pattern that matches, describing the pattern and
    the first match with up to 40 characters of context on either side
    (newlines shown as a literal backslash-n). Returns an empty list when
    nothing matches.
    """
    hits: List[str] = []
    for rx in FORBIDDEN_RE:
        m = rx.search(md)
        if m:
            snippet = md[max(0, m.start() - 40): m.end() + 40].replace("\n", "\\n")
            hits.append(f"Forbidden pattern matched: /{rx.pattern}/ near '{snippet}'")
    return hits


def require_sections_in_order(body: str, required_h2: List[str]) -> List[str]:
    """
    Require exact H2 headings in order.

    Additional headings allowed, but required must exist.

    Each entry of *required_h2* is compared against the full heading line
    (including the leading `## `) with surrounding whitespace stripped. A
    required heading that only occurs before an earlier required heading is
    reported as missing. Returns a list of error messages, empty when all
    required headings are present in order.
    """
    errors: List[str] = []

    # Find all H2 headings
    h2 = [line.strip() for line in body.splitlines() if line.startswith("## ")]

    idx = 0
    for req in required_h2:
        try:
            found = h2.index(req, idx)
        except ValueError:
            # Leave the cursor where it is: one missing heading must not cause
            # every later required heading to be reported as missing as well.
            errors.append(f"Missing required section heading: {req}")
            continue
        idx = found + 1

    return errors