Expanded node agent llama-server parameter coverage.

This commit is contained in:
welsberr 2026-03-16 22:53:55 -04:00
parent 0226f7526d
commit 11cab90efa
6 changed files with 117 additions and 8 deletions

View File

@@ -22,11 +22,12 @@ models:
     path: "/models/SomePlannerModel.Q5_K_M.gguf"
     roles: ["planner"]
     default_ctx: 8192
+    # Common llama-server options can be configured as structured fields:
+    ctx_size: 8192
+    gpu_layers: 60
+    threads: 8
+    batch_size: 1024
+    flash_attn: true
+    # Keep server_args for less common passthrough flags.
     server_args:
-      # Examples (llama.cpp flags differ by build/version):
-      # c: 8192
-      # n_gpu_layers: 60
-      # threads: 8
-      # parallel: 1
-      # keep: true
-      c: 8192
+      parallel: 1

View File

@@ -22,6 +22,41 @@ models:
- `path`: exact GGUF file to load
- `roles`: role labels this model can satisfy when the node registers with a gateway
## Common llama-server options as structured config
For common runtime tuning, the node agent supports structured model fields instead of requiring everything to go
through raw `server_args`.
Supported structured fields:
- `ctx_size`
- `batch_size`
- `ubatch_size`
- `threads`
- `threads_batch`
- `gpu_layers`
- `main_gpu`
- `tensor_split`
- `flash_attn`
- `alias`
Example:
```yaml
models:
  - model_id: "planner-gguf"
    path: "/models/SomePlannerModel.Q5_K_M.gguf"
    roles: ["planner"]
    ctx_size: 8192
    gpu_layers: 60
    threads: 8
    batch_size: 1024
    flash_attn: true
    server_args:
      parallel: 1
```
`server_args` is still supported as an escape hatch for less common `llama-server` flags.
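To make the merge order concrete, here is a minimal sketch of how a key that is set both as a structured field and in `server_args` resolves, following the precedence implemented by `CudaAdapter.build_server_args` in this commit (structured fields are written first, then `server_args` is layered on top):

```python
# Precedence sketch: the passthrough dict wins for a duplicated key,
# because build_server_args applies server_args last via dict.update().
from rolemesh_node_agent.adapters.cuda import CudaAdapter
from rolemesh_node_agent.config import ModelEntry

entry = ModelEntry(
    model_id="planner-gguf",
    path="/models/SomePlannerModel.Q5_K_M.gguf",
    roles=["planner"],
    ctx_size=8192,
    server_args={"ctx-size": 4096},  # duplicates the structured ctx_size
)
assert CudaAdapter().build_server_args(entry)["ctx-size"] == 4096
```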
## Persistent server model
For each GPU device, the node agent starts a dedicated `llama-server` process, pinned via

View File

@@ -12,6 +12,7 @@ from dataclasses import dataclass
from typing import Any, Dict, List
from .base import DeviceMetrics, DeviceRef, RuntimeAdapter
from ..config import ModelEntry
def _find_free_port() -> int:
@@ -117,6 +118,31 @@ class CudaAdapter(RuntimeAdapter):
            metrics.append(m)
        return metrics
    def build_server_args(self, model: ModelEntry) -> Dict[str, Any]:
        args: Dict[str, Any] = {"ctx-size": model.ctx_size or model.default_ctx}
        if model.batch_size is not None:
            args["batch-size"] = model.batch_size
        if model.ubatch_size is not None:
            args["ubatch-size"] = model.ubatch_size
        if model.threads is not None:
            args["threads"] = model.threads
        if model.threads_batch is not None:
            args["threads-batch"] = model.threads_batch
        if model.gpu_layers is not None:
            args["gpu-layers"] = model.gpu_layers
        if model.main_gpu is not None:
            args["main-gpu"] = model.main_gpu
        if model.tensor_split:
            args["tensor-split"] = ",".join(f"{v:g}" for v in model.tensor_split)
        if model.flash_attn is not None:
            args["flash-attn"] = "on" if model.flash_attn else "off"
        if model.alias:
            args["alias"] = model.alias
        args.update(model.server_args or {})
        return args
    async def ensure_server(self, device: DeviceRef, *, model_path: str, model_id: str, server_args: Dict[str, Any]) -> str:
        if device.backend != "cuda" or device.kind != "gpu":
            raise ValueError("CudaAdapter can only manage cuda gpu devices.")

View File

@@ -11,6 +11,16 @@ class ModelEntry(BaseModel):
    path: Path
    roles: List[str] = Field(default_factory=list)
    default_ctx: int = 8192
    ctx_size: Optional[int] = None
    batch_size: Optional[int] = None
    ubatch_size: Optional[int] = None
    threads: Optional[int] = None
    threads_batch: Optional[int] = None
    gpu_layers: Optional[int] = None
    main_gpu: Optional[int] = None
    tensor_split: Optional[List[float]] = None
    flash_attn: Optional[bool] = None
    alias: Optional[str] = None
    server_args: Dict[str, Any] = Field(default_factory=dict)
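Because `ModelEntry` is a pydantic model, the new optional fields validate and coerce directly from parsed YAML. A minimal sketch, assuming PyYAML is used for parsing (the loader itself is outside this diff):

```python
import yaml  # assumption: PyYAML available for this sketch

from rolemesh_node_agent.config import ModelEntry

raw = yaml.safe_load("""
model_id: planner-gguf
path: /models/SomePlannerModel.Q5_K_M.gguf
roles: [planner]
ctx_size: 8192
gpu_layers: 60
flash_attn: true
server_args:
  parallel: 1
""")
entry = ModelEntry(**raw)
assert entry.gpu_layers == 60
assert entry.flash_attn is True
assert entry.server_args == {"parallel": 1}
assert entry.default_ctx == 8192  # untouched default
```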

View File

@@ -175,7 +175,7 @@ def create_app(cfg: NodeAgentConfig) -> FastAPI:
                 device,
                 model_path=str(model_entry.path),
                 model_id=model_entry.model_id,
-                server_args=model_entry.server_args,
+                server_args=app.state.cuda.build_server_args(model_entry),
             )
         except ServerStartupError as e:
             return _error(str(e), code="server_startup_error", status_code=503)

View File

@@ -6,6 +6,7 @@ from pathlib import Path
import httpx
from rolemesh_node_agent.adapters.base import DeviceMetrics, DeviceRef
from rolemesh_node_agent.adapters.cuda import CudaAdapter
from rolemesh_node_agent.config import ModelEntry, NodeAgentConfig
from rolemesh_node_agent.main import _merge_scheduler_metrics, _select_device
from rolemesh_node_agent.scheduler import AdmissionError, DeviceQueue
@@ -266,6 +267,42 @@ def test_chat_completions_uses_selected_device_not_first_device(tmp_path):
    asyncio.run(app.state.http.aclose())
def test_cuda_adapter_build_server_args_from_structured_fields(tmp_path):
    model_path = tmp_path / "model.gguf"
    model_path.write_bytes(b"GGUF")
    adapter = CudaAdapter()
    model = ModelEntry(
        model_id="planner-gguf",
        path=model_path,
        roles=["planner"],
        ctx_size=4096,
        batch_size=1024,
        ubatch_size=256,
        threads=8,
        threads_batch=4,
        gpu_layers=999,
        main_gpu=1,
        tensor_split=[3, 1],
        flash_attn=True,
        alias="planner",
        server_args={"parallel": 2},
    )
    args = adapter.build_server_args(model)
    assert args["ctx-size"] == 4096
    assert args["batch-size"] == 1024
    assert args["ubatch-size"] == 256
    assert args["threads"] == 8
    assert args["threads-batch"] == 4
    assert args["gpu-layers"] == 999
    assert args["main-gpu"] == 1
    assert args["tensor-split"] == "3,1"
    assert args["flash-attn"] == "on"
    assert args["alias"] == "planner"
    assert args["parallel"] == 2
def test_merge_scheduler_metrics_overlays_queue_state():
    device = DeviceRef(kind="gpu", backend="cuda", id="gpu:0")
    metric = DeviceMetrics(device=device)