# File-viewer metadata captured with this copy: 85 lines, 1.8 KiB, YAML.
# Identity and listen settings for this node.
# NOTE(review): nesting was reconstructed from a whitespace-stripped copy;
# confirm the hierarchy against the consumer's schema.
node:
  # The same prefix appears in every service_id under `services`.
  host_id: "p40-box"
  display_name: "Dual P40 + CPU Fallback"
  # Loopback only: the listener is not reachable from other hosts.
  listen_host: "127.0.0.1"
  listen_port: 8891
  address: "127.0.0.1"
  # Free-form descriptive labels. The gpu0/gpu1/cpu keys use the same names
  # as the `runtime.device` values of the services in this file.
  labels:
    topology: "singlebox"
    runtime: "llama.cpp"
    gpu0: "Tesla P40"
    gpu1: "Tesla P40"
    cpu: "Ryzen 5600G"

# Connection settings for the control plane.
# NOTE(review): nesting was reconstructed from a whitespace-stripped copy;
# confirm the hierarchy against the consumer's schema.
control_plane:
  base_url: "http://127.0.0.1:8800"
  # Placeholder value: replace before deployment and keep the real key out
  # of version control.
  node_api_key: "change-me-node-key"
  # Presumably seconds (per the `_s` suffix) -- TODO confirm.
  heartbeat_interval_s: 5

# Hardware and model inventory declared for this node.
# NOTE(review): nesting was reconstructed from a whitespace-stripped copy;
# confirm the hierarchy against the consumer's schema.
inventory:
  # List of model directories. The single entry looks like a placeholder
  # path -- replace it with the real location.
  model_roots:
    - "/path/to/models"
  cpu_threads: 12
  ram_gb: 128
  # Backends declared as available on this node.
  capabilities:
    cpu: true
    cuda: true

# Managed runtimes are switched off in this file; every service below
# declares `launcher: "external"`.
managed_runtimes:
  enabled: false

# Chat services declared on this node: one per GPU plus a CPU fallback.
# Each entry has its own loopback endpoint port (18091-18093).
# NOTE(review): nesting was reconstructed from a whitespace-stripped copy.
# In particular, `loaded` is assumed to belong to the asset entry rather
# than to the service itself -- confirm against the consumer's schema.
services:
  # First P40 (gpu0).
  - service_id: "p40-box/chat/gpu0-primary"
    kind: "chat"
    endpoint: "http://127.0.0.1:18091"
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "gpu0"
    assets:
      - asset_id: "Qwen2.5-14B-Instruct-1M-Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    # Static figures written into this file, not live measurements.
    observed:
      p50_latency_ms: 1200
      tokens_per_sec: 24

  # Second P40 (gpu1).
  - service_id: "p40-box/chat/gpu1-secondary"
    kind: "chat"
    endpoint: "http://127.0.0.1:18092"
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "gpu1"
    assets:
      - asset_id: "Qwen3.5-9B-Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    observed:
      p50_latency_ms: 1000
      tokens_per_sec: 30

  # CPU fallback: highest latency and lowest throughput of the three.
  - service_id: "p40-box/chat/cpu-fallback"
    kind: "chat"
    endpoint: "http://127.0.0.1:18093"
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "cpu"
    assets:
      - asset_id: "rocket-3b.Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    observed:
      p50_latency_ms: 4500
      tokens_per_sec: 7