From 11cab90efaf446537a6a28caf45cdd09826ba171 Mon Sep 17 00:00:00 2001 From: welsberr Date: Mon, 16 Mar 2026 22:53:55 -0400 Subject: [PATCH] Expanded node agent llama-server parameters coverage. --- configs/node_agent.example.yaml | 15 +++++----- docs/NODE_AGENT.md | 35 ++++++++++++++++++++++ src/rolemesh_node_agent/adapters/cuda.py | 26 +++++++++++++++++ src/rolemesh_node_agent/config.py | 10 +++++++ src/rolemesh_node_agent/main.py | 2 +- tests/test_node_agent.py | 37 ++++++++++++++++++++++++ 6 files changed, 117 insertions(+), 8 deletions(-) diff --git a/configs/node_agent.example.yaml b/configs/node_agent.example.yaml index 96e2211..11b7517 100644 --- a/configs/node_agent.example.yaml +++ b/configs/node_agent.example.yaml @@ -22,11 +22,12 @@ models: path: "/models/SomePlannerModel.Q5_K_M.gguf" roles: ["planner"] default_ctx: 8192 + # Common llama-server options can be configured as structured fields: + ctx_size: 8192 + gpu_layers: 60 + threads: 8 + batch_size: 1024 + flash_attn: true + # Keep server_args for less common passthrough flags. server_args: - # Examples (llama.cpp flags differ by build/version): - # c: 8192 - # n_gpu_layers: 60 - # threads: 8 - # parallel: 1 - # keep: true - c: 8192 + parallel: 1 diff --git a/docs/NODE_AGENT.md b/docs/NODE_AGENT.md index 0006da5..979adb4 100644 --- a/docs/NODE_AGENT.md +++ b/docs/NODE_AGENT.md @@ -22,6 +22,41 @@ models: - `path`: exact GGUF file to load - `roles`: role labels this model can satisfy when the node registers with a gateway +## Common llama-server options as structured config + +For common runtime tuning, the node agent supports structured model fields instead of requiring everything to go +through raw `server_args`. 
+ +Supported structured fields: +- `ctx_size` +- `batch_size` +- `ubatch_size` +- `threads` +- `threads_batch` +- `gpu_layers` +- `main_gpu` +- `tensor_split` +- `flash_attn` +- `alias` + +Example: + +```yaml +models: + - model_id: "planner-gguf" + path: "/models/SomePlannerModel.Q5_K_M.gguf" + roles: ["planner"] + ctx_size: 8192 + gpu_layers: 60 + threads: 8 + batch_size: 1024 + flash_attn: true + server_args: + parallel: 1 +``` + +`server_args` is still supported as an escape hatch for less common `llama-server` flags. + ## Persistent server model For each GPU device, the node agent starts a dedicated `llama-server` process, pinned via diff --git a/src/rolemesh_node_agent/adapters/cuda.py b/src/rolemesh_node_agent/adapters/cuda.py index b8f0e7b..d8fb166 100644 --- a/src/rolemesh_node_agent/adapters/cuda.py +++ b/src/rolemesh_node_agent/adapters/cuda.py @@ -12,6 +12,7 @@ from dataclasses import dataclass from typing import Any, Dict, List from .base import DeviceMetrics, DeviceRef, RuntimeAdapter +from ..config import ModelEntry def _find_free_port() -> int: @@ -117,6 +118,31 @@ class CudaAdapter(RuntimeAdapter): metrics.append(m) return metrics + def build_server_args(self, model: ModelEntry) -> Dict[str, Any]: + # ctx_size=0 is a real llama-server setting (context length loaded from the model), so only None falls back to default_ctx. + args: Dict[str, Any] = {"ctx-size": model.default_ctx if model.ctx_size is None else model.ctx_size} + if model.batch_size is not None: + args["batch-size"] = model.batch_size + if model.ubatch_size is not None: + args["ubatch-size"] = model.ubatch_size + if model.threads is not None: + args["threads"] = model.threads + if model.threads_batch is not None: + args["threads-batch"] = model.threads_batch + if model.gpu_layers is not None: + args["gpu-layers"] = model.gpu_layers + if model.main_gpu is not None: + args["main-gpu"] = model.main_gpu + if model.tensor_split: + args["tensor-split"] = ",".join(f"{v:g}" for v in model.tensor_split) + if model.flash_attn is not None: + args["flash-attn"] = "on" if model.flash_attn else "off" + if model.alias: + args["alias"] = model.alias + + 
args.update(model.server_args or {}) + return args + async def ensure_server(self, device: DeviceRef, *, model_path: str, model_id: str, server_args: Dict[str, Any]) -> str: if device.backend != "cuda" or device.kind != "gpu": raise ValueError("CudaAdapter can only manage cuda gpu devices.") diff --git a/src/rolemesh_node_agent/config.py b/src/rolemesh_node_agent/config.py index a36242f..ae5bba7 100644 --- a/src/rolemesh_node_agent/config.py +++ b/src/rolemesh_node_agent/config.py @@ -11,6 +11,16 @@ class ModelEntry(BaseModel): path: Path roles: List[str] = Field(default_factory=list) default_ctx: int = 8192 + ctx_size: Optional[int] = None + batch_size: Optional[int] = None + ubatch_size: Optional[int] = None + threads: Optional[int] = None + threads_batch: Optional[int] = None + gpu_layers: Optional[int] = None + main_gpu: Optional[int] = None + tensor_split: Optional[List[float]] = None + flash_attn: Optional[bool] = None + alias: Optional[str] = None server_args: Dict[str, Any] = Field(default_factory=dict) diff --git a/src/rolemesh_node_agent/main.py b/src/rolemesh_node_agent/main.py index 3735ee7..ae3fdb6 100644 --- a/src/rolemesh_node_agent/main.py +++ b/src/rolemesh_node_agent/main.py @@ -175,7 +175,7 @@ def create_app(cfg: NodeAgentConfig) -> FastAPI: device, model_path=str(model_entry.path), model_id=model_entry.model_id, - server_args=model_entry.server_args, + server_args=app.state.cuda.build_server_args(model_entry), ) except ServerStartupError as e: return _error(str(e), code="server_startup_error", status_code=503) diff --git a/tests/test_node_agent.py b/tests/test_node_agent.py index 9236cab..a624c9a 100644 --- a/tests/test_node_agent.py +++ b/tests/test_node_agent.py @@ -6,6 +6,7 @@ from pathlib import Path import httpx from rolemesh_node_agent.adapters.base import DeviceMetrics, DeviceRef +from rolemesh_node_agent.adapters.cuda import CudaAdapter from rolemesh_node_agent.config import ModelEntry, NodeAgentConfig from rolemesh_node_agent.main 
import _merge_scheduler_metrics, _select_device from rolemesh_node_agent.scheduler import AdmissionError, DeviceQueue @@ -266,6 +267,42 @@ def test_chat_completions_uses_selected_device_not_first_device(tmp_path): asyncio.run(app.state.http.aclose()) +def test_cuda_adapter_build_server_args_from_structured_fields(tmp_path): + model_path = tmp_path / "model.gguf" + model_path.write_bytes(b"GGUF") + adapter = CudaAdapter() + model = ModelEntry( + model_id="planner-gguf", + path=model_path, + roles=["planner"], + ctx_size=4096, + batch_size=1024, + ubatch_size=256, + threads=8, + threads_batch=4, + gpu_layers=999, + main_gpu=1, + tensor_split=[3, 1], + flash_attn=True, + alias="planner", + server_args={"parallel": 2}, + ) + + args = adapter.build_server_args(model) + + assert args["ctx-size"] == 4096 + assert args["batch-size"] == 1024 + assert args["ubatch-size"] == 256 + assert args["threads"] == 8 + assert args["threads-batch"] == 4 + assert args["gpu-layers"] == 999 + assert args["main-gpu"] == 1 + assert args["tensor-split"] == "3,1" + assert args["flash-attn"] == "on" + assert args["alias"] == "planner" + assert args["parallel"] == 2 + + def test_merge_scheduler_metrics_overlays_queue_state(): device = DeviceRef(kind="gpu", backend="cuda", id="gpu:0") metric = DeviceMetrics(device=device)