Expanded node agent llama-server parameter coverage.

parent 0226f7526d
commit 11cab90efa
@@ -22,11 +22,12 @@ models:
     path: "/models/SomePlannerModel.Q5_K_M.gguf"
     roles: ["planner"]
     default_ctx: 8192
+    # Common llama-server options can be configured as structured fields:
+    ctx_size: 8192
+    gpu_layers: 60
+    threads: 8
+    batch_size: 1024
+    flash_attn: true
+    # Keep server_args for less common passthrough flags.
     server_args:
-      # Examples (llama.cpp flags differ by build/version):
-      # c: 8192
-      # n_gpu_layers: 60
-      # threads: 8
-      # parallel: 1
-      # keep: true
-      c: 8192
+      parallel: 1
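For orientation, the entry above maps to the following flag dictionary once it passes through `CudaAdapter.build_server_args` (added later in this commit). This is derived by reading that method's code, not captured output:

```python
# Flags produced by CudaAdapter.build_server_args for the entry above,
# in the order the method inserts them. Keys use llama-server flag
# spelling; server_args is merged last.
{
    "ctx-size": 8192,     # from ctx_size (default_ctx is only a fallback)
    "batch-size": 1024,
    "threads": 8,
    "gpu-layers": 60,
    "flash-attn": "on",   # booleans normalize to "on"/"off"
    "parallel": 1,        # passed through from server_args
}
```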
@@ -22,6 +22,41 @@ models:
 - `path`: exact GGUF file to load
 - `roles`: role labels this model can satisfy when the node registers with a gateway

+## Common llama-server options as structured config
+
+For common runtime tuning, the node agent supports structured model fields instead of requiring everything to go
+through raw `server_args`.
+
+Supported structured fields:
+- `ctx_size`
+- `batch_size`
+- `ubatch_size`
+- `threads`
+- `threads_batch`
+- `gpu_layers`
+- `main_gpu`
+- `tensor_split`
+- `flash_attn`
+- `alias`
+
+Example:
+
+```yaml
+models:
+  - model_id: "planner-gguf"
+    path: "/models/SomePlannerModel.Q5_K_M.gguf"
+    roles: ["planner"]
+    ctx_size: 8192
+    gpu_layers: 60
+    threads: 8
+    batch_size: 1024
+    flash_attn: true
+    server_args:
+      parallel: 1
+```
+
+`server_args` is still supported as an escape hatch for less common `llama-server` flags.
+
 ## Persistent server model

 For each GPU device, the node agent starts a dedicated `llama-server` process, pinned via
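A behavioral detail worth calling out from the adapter change below: `server_args` is merged after the structured fields, so on a key collision the raw flag wins. A minimal sketch of that merge order, using plain dicts rather than project code:

```python
# Merge order inside build_server_args: structured fields are written
# first, then server_args; the last write wins on a key collision.
structured = {"ctx-size": 8192, "threads": 8}   # from structured fields
raw = {"ctx-size": 2048, "parallel": 1}         # from server_args
structured.update(raw)                          # server_args merged last
assert structured == {"ctx-size": 2048, "threads": 8, "parallel": 1}
```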
@@ -12,6 +12,7 @@ from dataclasses import dataclass
 from typing import Any, Dict, List

 from .base import DeviceMetrics, DeviceRef, RuntimeAdapter
+from ..config import ModelEntry


 def _find_free_port() -> int:
@@ -117,6 +118,31 @@ class CudaAdapter(RuntimeAdapter):
             metrics.append(m)
         return metrics

+    def build_server_args(self, model: ModelEntry) -> Dict[str, Any]:
+        args: Dict[str, Any] = {"ctx-size": model.ctx_size or model.default_ctx}
+
+        if model.batch_size is not None:
+            args["batch-size"] = model.batch_size
+        if model.ubatch_size is not None:
+            args["ubatch-size"] = model.ubatch_size
+        if model.threads is not None:
+            args["threads"] = model.threads
+        if model.threads_batch is not None:
+            args["threads-batch"] = model.threads_batch
+        if model.gpu_layers is not None:
+            args["gpu-layers"] = model.gpu_layers
+        if model.main_gpu is not None:
+            args["main-gpu"] = model.main_gpu
+        if model.tensor_split:
+            args["tensor-split"] = ",".join(f"{v:g}" for v in model.tensor_split)
+        if model.flash_attn is not None:
+            args["flash-attn"] = "on" if model.flash_attn else "off"
+        if model.alias:
+            args["alias"] = model.alias
+
+        args.update(model.server_args or {})
+        return args
+
     async def ensure_server(self, device: DeviceRef, *, model_path: str, model_id: str, server_args: Dict[str, Any]) -> str:
         if device.backend != "cuda" or device.kind != "gpu":
             raise ValueError("CudaAdapter can only manage cuda gpu devices.")
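The diff stops at the flag dictionary; how it becomes an argv for `llama-server` is not shown in this commit. A hypothetical rendering step (the helper name and its exact behavior are assumptions, not part of this change) might look like:

```python
from typing import Any, Dict, List

def render_flags(args: Dict[str, Any]) -> List[str]:
    # Hypothetical helper, not part of this commit: turn the flag dict from
    # build_server_args into argv tokens, e.g. {"ctx-size": 8192} ->
    # ["--ctx-size", "8192"]. Values are already CLI-ready here: flash-attn
    # is "on"/"off" and tensor-split is a comma-joined string.
    argv: List[str] = []
    for key, value in args.items():
        argv.extend([f"--{key}", str(value)])
    return argv
```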
@@ -11,6 +11,16 @@ class ModelEntry(BaseModel):
     path: Path
     roles: List[str] = Field(default_factory=list)
     default_ctx: int = 8192
+    ctx_size: Optional[int] = None
+    batch_size: Optional[int] = None
+    ubatch_size: Optional[int] = None
+    threads: Optional[int] = None
+    threads_batch: Optional[int] = None
+    gpu_layers: Optional[int] = None
+    main_gpu: Optional[int] = None
+    tensor_split: Optional[List[float]] = None
+    flash_attn: Optional[bool] = None
+    alias: Optional[str] = None
     server_args: Dict[str, Any] = Field(default_factory=dict)
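A minimal sketch of how a YAML entry deserializes into the new fields (assumes pydantic v2's `model_validate`; on v1 the equivalent is `parse_obj`; nothing here is part of the commit):

```python
# Unset structured fields default to None, so build_server_args can
# distinguish "not configured" from an explicit value like 0.
entry = ModelEntry.model_validate({
    "model_id": "planner-gguf",
    "path": "/models/SomePlannerModel.Q5_K_M.gguf",
    "roles": ["planner"],
    "ctx_size": 8192,
    "flash_attn": True,
    "server_args": {"parallel": 1},
})
assert entry.ctx_size == 8192
assert entry.gpu_layers is None   # left unset above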
@@ -175,7 +175,7 @@ def create_app(cfg: NodeAgentConfig) -> FastAPI:
                 device,
                 model_path=str(model_entry.path),
                 model_id=model_entry.model_id,
-                server_args=model_entry.server_args,
+                server_args=app.state.cuda.build_server_args(model_entry),
             )
         except ServerStartupError as e:
             return _error(str(e), code="server_startup_error", status_code=503)
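A note on this one-line change: the call site no longer forwards the raw `server_args` dict. Every launch now flows through `build_server_args`, so structured fields and passthrough flags are merged in one place while `ensure_server` keeps its existing `server_args: Dict[str, Any]` signature.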
@@ -6,6 +6,7 @@ from pathlib import Path
 import httpx

 from rolemesh_node_agent.adapters.base import DeviceMetrics, DeviceRef
+from rolemesh_node_agent.adapters.cuda import CudaAdapter
 from rolemesh_node_agent.config import ModelEntry, NodeAgentConfig
 from rolemesh_node_agent.main import _merge_scheduler_metrics, _select_device
 from rolemesh_node_agent.scheduler import AdmissionError, DeviceQueue
@@ -266,6 +267,42 @@ def test_chat_completions_uses_selected_device_not_first_device(tmp_path):
     asyncio.run(app.state.http.aclose())


+def test_cuda_adapter_build_server_args_from_structured_fields(tmp_path):
+    model_path = tmp_path / "model.gguf"
+    model_path.write_bytes(b"GGUF")
+    adapter = CudaAdapter()
+    model = ModelEntry(
+        model_id="planner-gguf",
+        path=model_path,
+        roles=["planner"],
+        ctx_size=4096,
+        batch_size=1024,
+        ubatch_size=256,
+        threads=8,
+        threads_batch=4,
+        gpu_layers=999,
+        main_gpu=1,
+        tensor_split=[3, 1],
+        flash_attn=True,
+        alias="planner",
+        server_args={"parallel": 2},
+    )
+
+    args = adapter.build_server_args(model)
+
+    assert args["ctx-size"] == 4096
+    assert args["batch-size"] == 1024
+    assert args["ubatch-size"] == 256
+    assert args["threads"] == 8
+    assert args["threads-batch"] == 4
+    assert args["gpu-layers"] == 999
+    assert args["main-gpu"] == 1
+    assert args["tensor-split"] == "3,1"
+    assert args["flash-attn"] == "on"
+    assert args["alias"] == "planner"
+    assert args["parallel"] == 2
+
+
 def test_merge_scheduler_metrics_overlays_queue_state():
     device = DeviceRef(kind="gpu", backend="cuda", id="gpu:0")
     metric = DeviceMetrics(device=device)
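One subtlety the `tensor-split` assertion pins down: pydantic coerces the ints `[3, 1]` to floats for `Optional[List[float]]`, and the `f"{v:g}"` formatting in `build_server_args` drops the trailing `.0`, so the joined value is `"3,1"` rather than `"3.0,1.0"`.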