ThreeGate/tools/validate_tool_request.py

#!/usr/bin/env python3
"""
Validate a Tool Request against schemas/tool-request.schema.md (schema_version=1).

Supports two execution backends:
- backend=ERA   (default): command execution in microVM lane
- backend=monty: Python-subset execution in TOOL-EXEC-Lite lane

Usage:
  validate_tool_request.py /path/to/request.md

Exit codes:
  0 = valid
  2 = invalid
  3 = error
"""

from __future__ import annotations

import json
import re
import sys
from typing import List, Tuple
import keyword


from validate_common import (
    ValidationResult,
    extract_front_matter,
    find_forbidden,
    read_text,
    require_keys,
    require_sections_in_order,
)

# Front matter keys that every Tool Request must provide, regardless of backend.
# Checked with require_keys() in validate().
REQUIRED_KEYS_BASE = [
    "request_type",
    "schema_version",
    "request_id",
    "created_utc",
    "requested_by",
    "approved_by",
    "approved_utc",
    "purpose",
    "language",
    "network",
    "cpu_limit",
    "memory_limit_mb",
    "time_limit_sec",
]

# Additional optional key (recommended); validate() treats a missing or empty
# "backend" as ERA.
OPTIONAL_KEYS = ["backend"]

# H2 headings a backend=ERA request body must contain, in this order.
REQUIRED_H2_ERA = [
    "## Command",
    "## Input Files",
    "## Output Expectations",
    "## Risk Assessment",
]

# H2 headings a backend=monty request body must contain, in this order.
# "## Inputs (JSON)" is optional and therefore not listed here.
REQUIRED_H2_MONTY = [
    "## Code",
    "## Output Expectations",
    "## Risk Assessment",
]

# Strong rules for ERA: the command must be a single line and must not contain
# shell chaining/pipes/redirection. The pattern matches any of ; & | > < or a
# backtick, the sequence "$(", or any closing parenthesis ")".
# NOTE: the pattern contains no letters, so re.IGNORECASE has no effect here.
DANGEROUS_CMD_TOKENS = re.compile(r"[;&|><`]|(\$\()|(\)\s*)", re.IGNORECASE)

# Monty code guardrails (best-effort, not a substitute for Monty itself).
# Matches the listed names as whole words, case-insensitively; validate_monty()
# reports a hit as a warning, not an error.
FORBIDDEN_MONTY_CODE_TOKENS = re.compile(
    r"\b(import|open|exec|eval|compile|__import__|globals|locals|vars|dir|getattr|setattr|delattr)\b",
    re.IGNORECASE,
)

# ASCII-only Python identifier shape, used for "## Inputs (JSON)" keys.
# NOTE: "$" also matches just before a trailing newline, so .match() accepts
# "name\n"; use .fullmatch() when the whole string must be an identifier.
IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")

def extract_single_line_after_heading(body: str, heading: str) -> str:
    """
    Return the first non-blank line that follows *heading*, stripped.

    *heading* must equal a whole line of *body* exactly; the first such line
    is used. Returns "" when the heading is absent, or when the next H2
    heading (a line whose stripped text starts with "## ") or the end of the
    body is reached before any non-blank line.
    """
    rows = body.splitlines()
    if heading not in rows:
        return ""

    first_after = rows.index(heading) + 1
    for raw in rows[first_after:]:
        candidate = raw.strip()
        if candidate.startswith("## "):
            # Reached the next section without finding any content.
            return ""
        if candidate:
            return candidate
    return ""


def extract_section_text(body: str, heading: str) -> str:
    """
    Return the text between *heading* and the next H2 heading, stripped.

    *heading* must equal a whole line of *body* exactly; the first such line
    is used. The section ends at the next line that begins with "## " (no
    leading whitespace allowed) or at the end of the body. Returns "" when
    the heading is absent or the section holds only whitespace.
    """
    rows = body.splitlines()
    if heading not in rows:
        return ""

    start = rows.index(heading) + 1
    # Index of the next H2 line, or the end of the document if there is none.
    stop = next(
        (idx for idx in range(start, len(rows)) if rows[idx].startswith("## ")),
        len(rows),
    )
    return "\n".join(rows[start:stop]).strip()


def validate_era(body: str, errors: List[str]) -> None:
    """
    Validate the body of a backend=ERA Tool Request.

    Problems are appended to *errors*; nothing is returned. The checks are:
    - the headings in REQUIRED_H2_ERA are present in order
      (delegated to require_sections_in_order);
    - "## Command" contains exactly one non-blank line;
    - that line is not part of a fenced code block;
    - that line contains no token matched by DANGEROUS_CMD_TOKENS;
    - that line does not look like a package installation.
    """
    # Headings
    errors.extend(require_sections_in_order(body, REQUIRED_H2_ERA))

    # Command rules
    cmd = extract_single_line_after_heading(body, "## Command")
    if not cmd:
        errors.append("## Command must contain a single command line.")
        return

    # Enforce the single-line rule: only the first non-blank line is inspected
    # below, so any further lines would otherwise pass through unchecked.
    section = extract_section_text(body, "## Command")
    non_blank = [ln for ln in section.splitlines() if ln.strip()]
    if len(non_blank) > 1:
        errors.append(
            "## Command must contain exactly one command line "
            f"(found {len(non_blank)} non-blank lines)."
        )

    if cmd.startswith("```") or cmd.endswith("```"):
        errors.append("Command must be plain text, not a fenced code block.")
    if DANGEROUS_CMD_TOKENS.search(cmd):
        errors.append("Command contains forbidden shell metacharacters (chaining/pipes/redirection/subshell).")

    lc = cmd.lower()
    # Match apt/apt-get/aptitude as whole words: a bare substring test for
    # "apt" also rejects harmless words such as "capture" or "adapt".
    # "\bapt\b" still matches apt-get, apt-cache and /usr/bin/apt.
    uses_apt = re.search(r"\b(?:apt|apt-get|aptitude)\b", lc) is not None
    if "pip install" in lc or uses_apt or "npm install" in lc:
        errors.append("Command appears to install packages; installs are forbidden in TOOL-EXEC.")

def validate_monty(fm: dict, body: str, errors: List[str], warnings: List[str]) -> None:
    """
    Validate a backend=monty Tool Request.

    Problems are appended to *errors* and advisory findings to *warnings*;
    nothing is returned. The checks are:
    - the headings in REQUIRED_H2_MONTY are present in order
      (delegated to require_sections_in_order);
    - "## Code" is non-empty and contains no fenced code block markers;
    - "## Inputs (JSON)", when present, parses as a JSON object whose keys are
      ASCII Python identifiers and not keywords;
    - front matter has language=python and network=none;
    - names matched by FORBIDDEN_MONTY_CODE_TOKENS in the code produce a
      warning (best-effort guardrail only).

    *fm* is the parsed front matter mapping; its "language" and "network"
    values are read here.
    """
    # Headings
    errors.extend(require_sections_in_order(body, REQUIRED_H2_MONTY))

    # Monty requires code block text (not fenced)
    code = extract_section_text(body, "## Code")
    if not code:
        errors.append("## Code must contain Monty-executable Python-subset code.")
        return

    # For safety, discourage fenced blocks (users can still paste raw code)
    if "```" in code:
        errors.append("Monty code must be plain text, not fenced code blocks.")

    # Inputs JSON is optional but if present must parse as object
    inputs_json = extract_section_text(body, "## Inputs (JSON)")
    if inputs_json:
        try:
            # Keep the try body to the parse itself. JSONDecodeError is a
            # ValueError; RecursionError covers pathologically nested input.
            obj = json.loads(inputs_json)
        except (ValueError, RecursionError) as e:
            errors.append(f"Invalid JSON in ## Inputs (JSON): {e}")
        else:
            if not isinstance(obj, dict):
                errors.append("## Inputs (JSON) must be a JSON object/dict.")
            else:
                # JSON object keys are always strings, so no type check is
                # needed. fullmatch() is required because the "$" in IDENT_RE
                # would let .match() accept a key ending in a newline.
                bad_keys = [
                    k for k in obj
                    if not IDENT_RE.fullmatch(k) or keyword.iskeyword(k)
                ]
                if bad_keys:
                    # Show keys with control characters via repr() so the
                    # message stays on one line; other keys print as before.
                    shown = [k if k.isprintable() else repr(k) for k in bad_keys]
                    errors.append(
                        "## Inputs (JSON) keys must be valid Python identifiers and not keywords. "
                        f"Invalid keys: {', '.join(shown)}"
                    )

    # Backend-policy constraints
    lang = fm.get("language", "").strip().lower()
    if lang != "python":
        errors.append("backend=monty requires language=python.")

    net = fm.get("network", "").strip().lower()
    if net != "none":
        errors.append("backend=monty requires network=none.")

    # Best-effort code guardrails: these names should not appear in pure-compute Monty lane
    # (Monty itself blocks many capabilities; this is defense-in-depth and discourages risky patterns.)
    if FORBIDDEN_MONTY_CODE_TOKENS.search(code):
        warnings.append(
            "Monty code contains potentially risky builtins/names (import/open/exec/eval/etc). "
            "Monty may block these, but review intent carefully."
        )


def validate(path: str) -> ValidationResult:
    """
    Validate the Tool Request file at *path* and return the outcome.

    Front matter is checked first (required keys, request type, schema
    version, approval gate, language, network, backend). The body is then
    checked by the validator for the selected backend, and finally the whole
    document is scanned with find_forbidden(). The result is ok only when no
    errors were collected; warnings do not affect ok.
    """
    problems: List[str] = []
    notes: List[str] = []

    document = read_text(path)
    fm, body = extract_front_matter(document)

    # Required front matter keys.
    absent = require_keys(fm, REQUIRED_KEYS_BASE)
    if absent:
        problems.append(f"Missing required front matter keys: {', '.join(absent)}")

    # Fixed-value keys.
    request_type = fm.get("request_type")
    if request_type != "tool_request":
        problems.append(f"request_type must be 'tool_request' (got: {request_type!r})")

    schema_version = fm.get("schema_version")
    if schema_version != "1":
        problems.append(f"schema_version must be '1' (got: {schema_version!r})")

    # Human approval gate: both fields must be present and non-empty.
    if not (fm.get("approved_by") and fm.get("approved_utc")):
        problems.append("Tool Request must include approved_by and approved_utc (human approval gate).")

    # Shell interpreters are not acceptable as the request language.
    shell_names = ("shell", "bash", "sh", "zsh", "powershell", "pwsh", "cmd")
    language = fm.get("language", "").strip().lower()
    if language in shell_names:
        problems.append("language must not be a shell. Use a supported language runtime only.")

    # Network policy is restricted to two values.
    network = fm.get("network", "").strip().lower()
    if network not in ("none", "allowlist"):
        problems.append("network must be 'none' or 'allowlist'.")

    # Backend selection (ERA when omitted or empty) and backend-specific checks.
    backend = (fm.get("backend") or "ERA").strip().lower()
    if backend == "monty":
        validate_monty(fm, body, problems, notes)
    elif backend == "era":
        validate_era(body, problems)
    else:
        problems.append("backend must be 'ERA' or 'monty' (default ERA if omitted).")

    # Forbidden content scan over the whole document, front matter included.
    hits = find_forbidden(document)
    if hits:
        problems.extend(hits)

    return ValidationResult(ok=not problems, errors=problems, warnings=notes)


def main() -> int:
    """
    Command-line entry point; returns the process exit code.

    Expects exactly one argument, the path of the request file. Returns 0
    and prints ACCEPT when the request is valid, 2 and prints REJECT when it
    is invalid, and 3 on a usage error or when validation raises. Errors and
    warnings are written to stderr; the verdict is written to stdout.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        print(__doc__.strip(), file=sys.stderr)
        return 3

    try:
        res = validate(args[0])
    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 3

    # Errors are only reported for a rejected request; warnings always are.
    if not res.ok:
        for problem in res.errors:
            print(f"ERROR: {problem}", file=sys.stderr)
    for note in res.warnings:
        print(f"WARNING: {note}", file=sys.stderr)

    if res.ok:
        print("ACCEPT")
        return 0
    print("REJECT")
    return 2


# Run as a script: exit with the code returned by main() (0, 2 or 3).
if __name__ == "__main__":
    raise SystemExit(main())