153 lines
4.4 KiB
Python
153 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Common helpers for ThreeGate validators.
|
|
|
|
Design goals:
|
|
- stdlib-only
|
|
- deterministic
|
|
- conservative: reject on ambiguity
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Tuple
|
|
|
|
|
|
# Matches a front-matter block located at the very start of the text (\A):
# an opening "---" line, a lazily captured interior (group 1, which may span
# several lines because the inline (?s) flag lets "." match newlines), and a
# closing "---" line. Only "\n" line endings are matched, and the closing
# fence must itself be followed by "\n".
# NOTE(review): re.MULTILINE has no effect on this pattern, since it contains
# no ^ or $ anchors; it is kept as-is here.
FRONT_MATTER_RE = re.compile(r"(?s)\A---\n(.*?)\n---\n", re.MULTILINE)
|
|
|
|
# Suspicious / forbidden patterns (case-insensitive) meant to catch:
|
|
# - instruction smuggling
|
|
# - runnable shell/code blocks
|
|
# - install/persistence advice
|
|
# - “ignore policy” prompt injection
|
|
# Regex sources (compiled case-insensitively elsewhere in this module).
# "." does not match a newline in these patterns, so every ".*" is limited
# to a single line of the scanned text.
FORBIDDEN_PATTERNS = [
    # shell / command execution
    r"```(?:bash|sh|zsh|powershell|pwsh|cmd|fish)\b",
    r"\b(?:curl|wget)\b.*\|\s*(?:sh|bash|zsh)\b",
    r"\b(?:sudo|su)\b",
    r"\bchmod\s+\+x\b",
    # No leading \b on this one: "/" and "~" are non-word characters, so a
    # leading \b would only succeed when they directly follow a word
    # character and would miss the usual " /etc/passwd" or a line-initial
    # "~/.ssh". The dot is escaped so that it matches a literal "." only.
    r"(?:/etc/(?:passwd|shadow|sudoers)|~/\.ssh)\b",
    r"\b(?:ssh|scp|sftp)\b",

    # package installs / persistence
    r"\b(?:apt-get|apt|dnf|yum|pacman|apk|brew)\s+install\b",
    r"\bpip\s+install\b",
    r"\bnpm\s+(?:i|install)\b",
    r"\bgo\s+get\b",
    r"\bgem\s+install\b",
    r"\bconda\s+install\b",
    r"\bsystemctl\b",
    r"\bcron\b|\bcrontab\b",
    r"\binit\.d\b|\bsysv\b",

    # policy override / injection cues
    # One or more qualifiers separated by arbitrary whitespace, so that
    # "ignore all previous instructions" and line-wrapped or double-spaced
    # variants are caught as well as the single-qualifier form.
    r"ignore\s+(?:(?:all|any|previous|prior)\s+)+(?:instructions|rules|policies)",
    r"\bsystem prompt\b|\bdeveloper message\b|\bhidden instructions\b",
    r"\bdo not mention\b.*\bpolicy\b",
    r"\bexfiltrat(?:e|ion)\b|\bdata exfil\b",
    r"\bbase64\b.*\bdecode\b",  # often used to smuggle payloads
]
|
|
|
|
# Compiled, case-insensitive form of every FORBIDDEN_PATTERNS entry, kept in
# the same order as the source list.
FORBIDDEN_RE = [
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in FORBIDDEN_PATTERNS
]
|
|
|
|
|
|
@dataclass(frozen=True)
class ValidationResult:
    """Immutable record holding the outcome of a validation run.

    ``frozen=True`` makes assigning to a field after construction raise an
    error; the two lists themselves remain ordinary mutable lists.
    """

    # Overall verdict flag (presumably True when validation passed).
    ok: bool
    # Collected error messages.
    errors: List[str]
    # Collected warning messages.
    warnings: List[str]
|
|
|
|
|
|
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    return hashlib.sha256(data).hexdigest()
|
|
|
|
|
|
def utc_now_iso() -> str:
    """Return the current UTC time in ISO-8601 form, e.g. ``2024-01-31T12:00:00Z``.

    The value has whole-second precision (sub-second digits are dropped) and
    the ``+00:00`` offset is written as ``Z``.
    """
    now = datetime.now(timezone.utc)
    stamp = now.isoformat(timespec="seconds")
    return stamp.replace("+00:00", "Z")
|
|
|
|
|
|
def read_text(path: str, max_bytes: int = 5_000_000) -> str:
    """
    Read *path* as strict UTF-8 text, refusing anything larger than *max_bytes*.

    Args:
        path: File to read.
        max_bytes: Largest accepted size in bytes.

    Returns:
        The decoded file content.

    Raises:
        ValueError: If the file exceeds *max_bytes* or is not valid UTF-8.
        OSError: If the file cannot be opened or inspected.
    """
    with open(path, "rb") as f:
        # Inspect the descriptor we are about to read rather than the path,
        # so the size check and the read refer to the same file.
        size = os.fstat(f.fileno()).st_size
        if size > max_bytes:
            raise ValueError(f"File too large for validator ({size} bytes > {max_bytes}).")
        # Bounded read: the reported size can be smaller than what a read
        # yields (a file still being appended to, or a special file that
        # reports size 0), so never pull in more than one byte past the limit.
        data = f.read(max_bytes + 1)
    if len(data) > max_bytes:
        raise ValueError(f"File too large for validator (more than {max_bytes} bytes).")
    # Strict UTF-8; reject if not UTF-8
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError as e:
        raise ValueError(f"File is not valid UTF-8 text: {e}") from e
|
|
|
|
|
|
def extract_front_matter(md: str) -> Tuple[Dict[str, str], str]:
    """
    Extract YAML-ish front matter.

    We intentionally implement a *very small* parser:
    - key: value
    - key: "value"
    - key: [a, b, c]  (kept as raw string)
    - nested objects are not supported except as raw strings

    Blank lines and lines starting with '#' inside the block are skipped.
    One pair of matching surrounding quotes is removed from a value.

    Returns:
        (mapping, body). When FRONT_MATTER_RE does not match, the mapping is
        empty and body is *md* unchanged; otherwise body is the text after
        the matched block.

    Raises:
        ValueError: If a line has no ':', has an empty key, or repeats a key
            already seen in the block.
    """
    m = FRONT_MATTER_RE.search(md)
    if not m:
        return {}, md
    body = md[m.end():]

    fm: Dict[str, str] = {}
    for raw_line in m.group(1).splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, sep, value = line.partition(":")
        if not sep:
            raise ValueError(f"Invalid front matter line (no ':'): {line}")
        key = key.strip()
        value = value.strip()
        # Reject on ambiguity: a nameless entry or a repeated key would
        # otherwise be stored silently (the last duplicate winning).
        if not key:
            raise ValueError(f"Invalid front matter line (empty key): {line}")
        if key in fm:
            raise ValueError(f"Duplicate front matter key: {key}")
        # Strip surrounding quotes if present
        if (value.startswith('"') and value.endswith('"')) or (
            value.startswith("'") and value.endswith("'")
        ):
            value = value[1:-1]
        fm[key] = value
    return fm, body
|
|
|
|
|
|
def require_keys(fm: Dict[str, str], keys: List[str]) -> List[str]:
    """Return, in *keys* order, each key that is absent from *fm* or whose value is blank."""
    absent: List[str] = []
    for name in keys:
        # Present with a non-whitespace value: nothing to report.
        if name in fm and fm[name].strip():
            continue
        absent.append(name)
    return absent
|
|
|
|
|
|
def find_forbidden(md: str) -> List[str]:
    """
    Scan *md* with every compiled pattern in FORBIDDEN_RE.

    Returns one message per pattern that matches (first match only), in
    pattern order. Each message quotes the pattern and up to 40 characters of
    context on either side of the match, with newlines shown as a literal
    backslash-n.
    """

    def describe(rx, found) -> str:
        lo = max(0, found.start() - 40)
        hi = found.end() + 40
        context = md[lo:hi].replace("\n", "\\n")
        return f"Forbidden pattern matched: /{rx.pattern}/ near '{context}'"

    searched = ((rx, rx.search(md)) for rx in FORBIDDEN_RE)
    return [describe(rx, found) for rx, found in searched if found]
|
|
|
|
|
|
def require_sections_in_order(body: str, required_h2: List[str]) -> List[str]:
    """
    Require exact H2 headings in order. Additional headings allowed, but required must exist.

    A heading is any line of *body* that starts with "## "; it is compared
    after stripping surrounding whitespace. Each entry of *required_h2* must
    appear after the position where the previous required entry was found.

    Returns:
        One "Missing required section heading: ..." message per required
        heading that was not found from the current position onward (this
        includes a heading that only occurs earlier in the document). An
        empty list means all required headings are present in order.
    """
    errors: List[str] = []
    # Find all H2 headings
    headings = [line.strip() for line in body.splitlines() if line.startswith("## ")]
    pos = 0
    for req in required_h2:
        try:
            found_at = headings.index(req, pos)
        except ValueError:
            # Leave pos untouched: a missing heading must not consume the
            # rest of the document, otherwise every later required heading
            # would be reported as missing too.
            errors.append(f"Missing required section heading: {req}")
            continue
        pos = found_at + 1
    return errors
|