# Identity, bind settings, and descriptive labels for this node.
node:
  host_id: 'p40-box'
  display_name: 'Dual P40 + CPU Fallback'
  # listen_host and address are both loopback, so as configured the node
  # is only reachable from the local machine.
  listen_host: '127.0.0.1'
  listen_port: 8891
  address: '127.0.0.1'
  # Descriptive key/value labels. How the consumer interprets them is not
  # visible in this file.
  labels:
    topology: 'singlebox'
    runtime: 'llama.cpp'
    gpu0: 'Tesla P40'
    gpu1: 'Tesla P40'
    cpu: 'Ryzen 5600G'

# Control-plane connection settings for this node.
control_plane:
  base_url: 'http://127.0.0.1:8800'
  # Placeholder value: replace before real use, and avoid committing a
  # real key to version control.
  node_api_key: 'change-me-node-key'
  # Presumably seconds, going by the `_s` suffix; confirm with the consumer.
  heartbeat_interval_s: 5

# Declared inventory: model directories, CPU/RAM figures, and capability
# flags for this node.
inventory:
  model_roots:
    # Placeholder path: point this at the actual model directory.
    - '/path/to/models'
  cpu_threads: 12
  ram_gb: 128
  # Both the CPU and CUDA capability flags are set to true.
  capabilities:
    cpu: true
    cuda: true

# Managed runtimes are disabled. Each service entry visible in this file
# sets `launcher: "external"`.
# NOTE(review): presumably the two settings go together; confirm with the
# consumer before enabling this.
managed_runtimes:
  enabled: false

# Service entries declared for this node, one list item per endpoint.
services:
  # Chat service on device gpu0, reached at local port 18091.
  - service_id: 'p40-box/chat/gpu0-primary'
    kind: 'chat'
    endpoint: 'http://127.0.0.1:18091'
    runtime:
      engine: 'llama.cpp'
      launcher: 'external'
      device: 'gpu0'
    assets:
      - asset_id: 'Qwen2.5-14B-Instruct-1M-Q5_K_M'
        loaded: true
    # state and observed are static values written in this file.
    # NOTE(review): whether the consumer refreshes them at runtime is not
    # visible here.
    state:
      health: 'healthy'
      load_state: 'loaded'
      accept_requests: true
    observed:
      p50_latency_ms: 1200
      tokens_per_sec: 24

  # Chat service on device gpu1, reached at local port 18092.
  - service_id: 'p40-box/chat/gpu1-secondary'
    kind: 'chat'
    endpoint: 'http://127.0.0.1:18092'
    runtime:
      engine: 'llama.cpp'
      launcher: 'external'
      device: 'gpu1'
    assets:
      - asset_id: 'Qwen3.5-9B-Q5_K_M'
        loaded: true
    # state and observed are static values written in this file.
    # NOTE(review): whether the consumer refreshes them at runtime is not
    # visible here.
    state:
      health: 'healthy'
      load_state: 'loaded'
      accept_requests: true
    observed:
      p50_latency_ms: 1000
      tokens_per_sec: 30

  # Chat service on device cpu, reached at local port 18093. Its observed
  # figures are the slowest of the entries visible in this file
  # (p50 4500 ms, 7 tokens/sec).
  - service_id: 'p40-box/chat/cpu-fallback'
    kind: 'chat'
    endpoint: 'http://127.0.0.1:18093'
    runtime:
      engine: 'llama.cpp'
      launcher: 'external'
      device: 'cpu'
    assets:
      - asset_id: 'rocket-3b.Q5_K_M'
        loaded: true
    # state and observed are static values written in this file.
    # NOTE(review): whether the consumer refreshes them at runtime is not
    # visible here.
    state:
      health: 'healthy'
      load_state: 'loaded'
      accept_requests: true
    observed:
      p50_latency_ms: 4500
      tokens_per_sec: 7