ThreeGate/tools/validate_common.py

153 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""
Common helpers for ThreeGate validators.
Design goals:
- stdlib-only
- deterministic
- conservative: reject on ambiguity
"""
from __future__ import annotations
import hashlib
import os
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, List, Tuple
# Matches a front-matter block only at the very start of the text (\A):
# an opening "---" line, a lazily captured body (group 1), and a closing
# "---" line that must itself be newline-terminated.
# The inline (?s) lets "." span newlines. re.MULTILINE only alters "^"/"$",
# neither of which appears in this pattern.
FRONT_MATTER_RE = re.compile(
    r"(?s)\A---\n(.*?)\n---\n",
    flags=re.MULTILINE,
)
# Suspicious / forbidden patterns (case-insensitive) meant to catch:
# - instruction smuggling
# - runnable shell/code blocks
# - install/persistence advice
# - "ignore policy" prompt injection
#
# The order of this list is significant: find_forbidden() reports hits in
# list order, so append new patterns within their group rather than reordering.
FORBIDDEN_PATTERNS = [
    # shell / command execution
    r"```(?:bash|sh|shell|zsh|powershell|pwsh|cmd|fish)\b",
    r"\b(?:curl|wget)\b.*\|\s*(?:sh|bash|zsh)\b",
    r"\b(?:sudo|su)\b",
    r"\bchmod\s+\+x\b",
    # No leading \b here: "/" and "~" are non-word characters, so a leading \b
    # would only match when a word character immediately precedes the path and
    # would miss the common "cat /etc/passwd" or line-initial "~/.ssh" forms.
    r"(?:/etc/(?:passwd|shadow|sudoers)|~/\.ssh)\b",
    r"\b(?:ssh|scp|sftp)\b",
    # package installs / persistence
    r"\b(?:apt-get|apt|dnf|yum|pacman|apk|brew)\s+install\b",
    r"\bpip\s+install\b",
    r"\bnpm\s+(?:i|install)\b",
    r"\bgo\s+get\b",
    r"\bgem\s+install\b",
    r"\bconda\s+install\b",
    r"\bsystemctl\b",
    r"\bcron\b|\bcrontab\b",
    r"\binit\.d\b|\bsysv\b",
    # policy override / injection cues
    # One or more qualifiers, separated by any whitespace, so that
    # "ignore all previous instructions" is caught as well as the
    # single-qualifier forms.
    r"ignore\s+(?:(?:all|any|previous|prior)\s+)+(?:instructions|rules|policies)",
    r"\bsystem prompt\b|\bdeveloper message\b|\bhidden instructions\b",
    r"\bdo not mention\b.*\bpolicy\b",
    r"\bexfiltrat(?:e|ion)\b|\bdata exfil\b",
    r"\bbase64\b.*\bdecode\b",  # often used to smuggle payloads
]

# Compiled once at import time; every pattern is matched case-insensitively.
FORBIDDEN_RE = [re.compile(pat, re.IGNORECASE) for pat in FORBIDDEN_PATTERNS]
@dataclass(frozen=True)
class ValidationResult:
    """Immutable record of a validator's outcome.

    The dataclass is frozen, so fields cannot be reassigned after
    construction; the list objects themselves remain ordinary mutable lists.
    """

    # Overall verdict flag; how it relates to `errors` is decided by the code
    # that builds the result, not by this class.
    ok: bool
    # Human-readable error messages.
    errors: List[str]
    # Human-readable warning messages.
    warnings: List[str]
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    return hashlib.sha256(data).hexdigest()
def utc_now_iso() -> str:
    """Return the current UTC time as ISO-8601 with second precision and a "Z" suffix."""
    now = datetime.now(timezone.utc)
    # Drop sub-second precision so the rendered stamp has no fractional part.
    whole_seconds = now.replace(microsecond=0)
    stamp = whole_seconds.isoformat()
    # isoformat() renders UTC as "+00:00"; swap it for the compact "Z" form.
    return stamp.replace("+00:00", "Z")
def read_text(path: str, max_bytes: int = 5_000_000) -> str:
    """Read *path* as strict UTF-8 text, refusing files larger than *max_bytes*.

    Args:
        path: Filesystem path of the file to read.
        max_bytes: Maximum number of bytes the file may contain.

    Returns:
        The decoded file contents.

    Raises:
        ValueError: If the file exceeds *max_bytes* or is not valid UTF-8.
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as f:
        # Check the size of the handle we actually read from, not of the path:
        # a separate stat-then-open leaves a window in which the path can be
        # replaced or the file can grow.
        size = os.fstat(f.fileno()).st_size
        if size > max_bytes:
            raise ValueError(f"File too large for validator ({size} bytes > {max_bytes}).")
        # Bounded read: the reported size can be stale or zero (e.g. special
        # files), so never pull more than one byte past the limit into memory.
        data = f.read(max_bytes + 1)
    if len(data) > max_bytes:
        raise ValueError(f"File too large for validator (more than {max_bytes} bytes).")
    # Strict UTF-8; reject if not UTF-8
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError as e:
        raise ValueError(f"File is not valid UTF-8 text: {e}") from e
def extract_front_matter(md: str) -> Tuple[Dict[str, str], str]:
    """
    Extract YAML-ish front matter.
    We intentionally implement a *very small* parser:
    - key: value
    - key: "value"
    - key: [a, b, c] (kept as raw string)
    - nested objects are not supported except as raw strings

    Blank lines and lines starting with "#" inside the front matter are
    skipped. Each remaining line is split on its first ":".

    Returns:
        (front_matter, body). When the text does not start with a front-matter
        block, front_matter is {} and body is the input unchanged.

    Raises:
        ValueError: If a front-matter line has no ":", has an empty key, or
            repeats a key that was already seen (ambiguous input is rejected
            rather than resolved silently).
    """
    m = FRONT_MATTER_RE.search(md)
    if not m:
        return {}, md
    body = md[m.end():]
    fm: Dict[str, str] = {}
    for raw in m.group(1).splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        key, sep, value = line.partition(":")
        if not sep:
            raise ValueError(f"Invalid front matter line (no ':'): {line}")
        key = key.strip()
        value = value.strip()
        if not key:
            raise ValueError(f"Invalid front matter line (empty key): {line}")
        # Last-one-wins would let a later line silently override an earlier
        # one; treat a repeated key as ambiguous and reject it.
        if key in fm:
            raise ValueError(f"Duplicate front matter key: {key}")
        # Strip surrounding quotes if present. Require at least two characters
        # so a value that is a single quote character is not treated as an
        # opening/closing pair and reduced to an empty string.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]
        fm[key] = value
    return fm, body
def require_keys(fm: Dict[str, str], keys: List[str]) -> List[str]:
    """Return the entries of *keys* that are absent from *fm* or map to a blank value.

    A value counts as blank when it is empty after stripping whitespace.
    The result preserves the order of *keys*.
    """
    absent: List[str] = []
    for name in keys:
        if name not in fm:
            absent.append(name)
            continue
        if not fm[name].strip():
            absent.append(name)
    return absent
def find_forbidden(md: str) -> List[str]:
    """Scan *md* against every compiled forbidden pattern and describe each hit.

    At most one message is produced per pattern (for its first match), in the
    order of FORBIDDEN_RE. Each message names the pattern and quotes up to 40
    characters of context on either side of the match, with newlines shown as
    a literal backslash-n.
    """
    hits: List[str] = []
    for rx in FORBIDDEN_RE:
        match = rx.search(md)
        if match is None:
            continue
        lo = max(0, match.start() - 40)
        hi = match.end() + 40
        context = md[lo:hi].replace("\n", "\\n")
        hits.append(f"Forbidden pattern matched: /{rx.pattern}/ near '{context}'")
    return hits
def require_sections_in_order(body: str, required_h2: List[str]) -> List[str]:
    """
    Require exact H2 headings in order. Additional headings allowed, but required must exist.

    A line counts as an H2 heading when it starts with "## "; it is compared
    to the required entries after stripping surrounding whitespace.

    Args:
        body: Markdown text to inspect.
        required_h2: Exact heading lines (e.g. "## Summary") in required order.

    Returns:
        One error string per required heading that cannot be placed: either it
        does not occur at all, or it occurs only before an earlier required
        heading that was already matched. Empty when all are present in order.
    """
    errors: List[str] = []
    # Find all H2 headings
    h2 = [line.strip() for line in body.splitlines() if line.startswith("## ")]
    pos = 0  # index in h2 just past the most recently matched required heading
    for req in required_h2:
        try:
            found = h2.index(req, pos)
        except ValueError:
            # Leave pos untouched on failure so that one absent heading does
            # not cause every later required heading to be reported as well.
            if req in h2:
                errors.append(f"Required section heading out of order: {req}")
            else:
                errors.append(f"Missing required section heading: {req}")
            continue
        pos = found + 1
    return errors