# GenieHive/configs/node.singlebox.p40-triple.example.yaml
---
# Single-box node registration: two Tesla P40 GPUs plus a CPU fallback,
# each serving chat via an externally-launched llama.cpp endpoint.
# NOTE(review): indentation was reconstructed from a flattened paste —
# confirm the top-level grouping (node / control_plane / inventory /
# managed_runtimes / services as siblings) against the consumer's schema.

# Identity and listen address this node registers with the control plane.
node:
  host_id: "p40-box"
  display_name: "Dual P40 + CPU Fallback"
  listen_host: "127.0.0.1"
  listen_port: 8891
  address: "127.0.0.1"
  # Free-form hardware/topology labels for scheduling and display.
  labels:
    topology: "singlebox"
    runtime: "llama.cpp"
    gpu0: "Tesla P40"
    gpu1: "Tesla P40"
    cpu: "Ryzen 5600G"

# How this node reaches the control plane and authenticates to it.
control_plane:
  base_url: "http://127.0.0.1:8800"
  node_api_key: "change-me-node-key"  # placeholder — replace before deploying
  heartbeat_interval_s: 5

# Local hardware/model inventory advertised to the control plane.
inventory:
  model_roots:
    - "/path/to/models"
  cpu_threads: 12
  ram_gb: 128
  capabilities:
    cpu: true
    cuda: true

# Disabled: every service below uses launcher "external", so nothing here
# is started or supervised by the node itself.
managed_runtimes:
  enabled: false

services:
  # Primary chat service on GPU 0 — largest model, best quality.
  - service_id: "p40-box/chat/gpu0-primary"
    kind: "chat"
    endpoint: "http://127.0.0.1:18091"
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "gpu0"
    assets:
      - asset_id: "Qwen2.5-14B-Instruct-1M-Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    # Seed performance figures; presumably overwritten by live telemetry.
    observed:
      p50_latency_ms: 1200
      tokens_per_sec: 24

  # Secondary chat service on GPU 1 — smaller model, lower latency.
  - service_id: "p40-box/chat/gpu1-secondary"
    kind: "chat"
    endpoint: "http://127.0.0.1:18092"
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "gpu1"
    assets:
      - asset_id: "Qwen3.5-9B-Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    observed:
      p50_latency_ms: 1000
      tokens_per_sec: 30

  # CPU-only fallback — tiny model, kept available if both GPUs are busy.
  - service_id: "p40-box/chat/cpu-fallback"
    kind: "chat"
    endpoint: "http://127.0.0.1:18093"
    runtime:
      engine: "llama.cpp"
      launcher: "external"
      device: "cpu"
    assets:
      - asset_id: "rocket-3b.Q5_K_M"
        loaded: true
    state:
      health: "healthy"
      load_state: "loaded"
      accept_requests: true
    observed:
      p50_latency_ms: 4500
      tokens_per_sec: 7