---
# Node configuration for host "p40-box": three llama.cpp chat services
# (gpu0, gpu1, cpu) described to a control plane at 127.0.0.1:8800.
#
# NOTE(review): this file was restored to block style from a version that
# had been collapsed onto one line. The nesting below is reconstructed
# from key order; confirm it against the consumer's schema.

# Identity and listen address of this node.
node:
  host_id: "p40-box"
  display_name: "Dual P40 + CPU Fallback"
  listen_host: "127.0.0.1"
  listen_port: 8891
  address: "127.0.0.1"
  # Free-form descriptive labels for the hardware/runtime on this host.
  labels:
    topology: "singlebox"
    runtime: "llama.cpp"
    gpu0: "Tesla P40"
    gpu1: "Tesla P40"
    cpu: "Ryzen 5600G"

# Control-plane endpoint this node talks to.
control_plane:
  base_url: "http://127.0.0.1:8800"
  # Placeholder value: replace before use and keep real keys out of VCS.
  node_api_key: "change-me-node-key"
  heartbeat_interval_s: 5

# Local resources advertised by this node.
inventory:
  model_roots:
    - "/path/to/models"
  cpu_threads: 12
  ram_gb: 128
  # NOTE(review): assumed to nest under inventory -- TODO confirm.
  capabilities:
    cpu: true
    cuda: true

managed_runtimes:
  enabled: false

# One entry per externally launched llama.cpp server.
services:
  - service_id: "p40-box/chat/gpu0-primary"
    kind: "chat"
    endpoint: "http://127.0.0.1:18091"
    # NOTE(review): device assumed to belong to runtime -- TODO confirm.
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "gpu0"
    assets:
      - asset_id: "Qwen2.5-14B-Instruct-1M-Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    # NOTE(review): observed assumed to be a sibling of state -- confirm.
    observed:
      p50_latency_ms: 1200
      tokens_per_sec: 24

  - service_id: "p40-box/chat/gpu1-secondary"
    kind: "chat"
    endpoint: "http://127.0.0.1:18092"
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "gpu1"
    assets:
      - asset_id: "Qwen3.5-9B-Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    observed:
      p50_latency_ms: 1000
      tokens_per_sec: 30

  - service_id: "p40-box/chat/cpu-fallback"
    kind: "chat"
    endpoint: "http://127.0.0.1:18093"
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "cpu"
    assets:
      - asset_id: "rocket-3b.Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    observed:
      p50_latency_ms: 4500
      tokens_per_sec: 7