Node agent and docs revisions.

This commit is contained in:
welsberr 2026-03-16 10:59:21 -04:00
parent 0b7d16b718
commit 5befa6d7f6
9 changed files with 284 additions and 14 deletions

.gitignore vendored
View File

@@ -14,3 +14,5 @@ build/
state/registry.json
configs/models.yaml
tmp-codex/
codex-resume.sh
codex*

View File

@@ -93,6 +93,59 @@ curl -sS -X POST http://127.0.0.1:8000/v1/chat/completions \
If you prefer the provided example file, copy `configs/models.example.yaml` and adjust the `proxy_url` values.
## Known Good Inference Backends
The gateway is designed to work with any backend that exposes OpenAI-compatible `GET /v1/models` and
`POST /v1/chat/completions` endpoints. The following backends have been exercised successfully in this repository.
### Ollama
- Verified directly against `http://127.0.0.1:11434`
- Verified through RoleMesh Gateway proxy routing
- Tested with model `dolphin3:latest`
Example upstream:
```yaml
models:
planner:
type: proxy
openai_model_name: planner
proxy_url: http://127.0.0.1:11434
defaults:
model: dolphin3:latest
```
Note: when proxying to Ollama's OpenAI-compatible API, the upstream Ollama model name still needs to be supplied.
One simple pattern is to set it in `defaults.model` and let the gateway inject it.
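For concreteness, here is a minimal client sketch of that pattern: the client names only the role, and the gateway injects the upstream Ollama model name. It assumes the gateway from the earlier `curl` example is listening on `127.0.0.1:8000`.
```python
# Minimal sketch: call the gateway with the role name only; the gateway
# injects defaults.model (dolphin3:latest) when proxying to Ollama.
# Assumes the gateway is on 127.0.0.1:8000 as in the curl example above.
import json
import urllib.request

payload = {
    "model": "planner",  # role name, not the Ollama model name
    "messages": [{"role": "user", "content": "Say hello."}],
}
req = urllib.request.Request(
    "http://127.0.0.1:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=60) as resp:
    body = json.loads(resp.read())
print(body["choices"][0]["message"]["content"])
```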
### Llamafile
- Verified directly with the newer `llamafile` runner in `tmp-codex/llamafile`
- Verified through RoleMesh Gateway proxy routing
- Verified role switching between two live backends
- Tested successfully with:
- `phi-2.Q5_K_M.llamafile`
- `rocket-3b.Q5_K_M.llamafile`
Example launch:
```bash
./llamafile --server -m /path/to/model.gguf --host 127.0.0.1 --port 8011 --nobrowser
```
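Once the server is up, a quick sanity check against its OpenAI-compatible surface (a sketch, assuming the port 8011 used above) is:
```python
# List the models exposed by the llamafile server started above (port 8011).
import json
import urllib.request

with urllib.request.urlopen("http://127.0.0.1:8011/v1/models", timeout=5) as resp:
    models = json.loads(resp.read())
print([m["id"] for m in models.get("data", [])])
```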
### llama.cpp / llama-server
- Verified live through the RoleMesh Node Agent on NVIDIA GPUs
- Tested with `/home/netuser/bin/llama.cpp/build/bin/llama-server`
- Tested model load and model switching on Tesla P40 GPUs
- Tested successfully with:
- `gemma-2b-it-q8_0.gguf`
- `Mistral-7B-Instruct-v0.3-Q5_K_M.gguf`
The node agent now waits for `llama-server` readiness during model load or model switch before proxying the first
request, which avoids transient "Loading model" failures on cold start.
## Multi-host (node registration)
If you want machines to host backends and “register” them dynamically, run a tiny node agent on each backend host
@@ -110,6 +163,8 @@ This repository is a **preliminary scaffold**:
- Registration and load-selection are implemented (basic round-robin).
- API-key auth for clients and nodes is available.
- Persistence is basic JSON-backed state, not a full service registry.
- Gateway proxying has been exercised live with Ollama and `llamafile`.
- Node-agent managed inference has been exercised live with `llama-server` on CUDA hardware.
## License

View File

@@ -10,6 +10,8 @@ dispatcher_roles: ["planner", "coder"]
heartbeat_interval_sec: 5
llama_server_bin: "llama-server"
llama_server_startup_timeout_s: 30
llama_server_probe_interval_s: 0.5
model_roots:
- "/models"

View File

@@ -13,6 +13,9 @@ For each GPU device, the node agent starts a dedicated `llama-server` process, p
environment variables (e.g. `CUDA_VISIBLE_DEVICES=0` for `gpu:0`) and bound to `127.0.0.1:<port>`.
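In Python terms, that launch pattern looks roughly like the following sketch (the model path and port are illustrative, not the adapter's actual defaults):
```python
# Rough sketch of the per-GPU launch described above: pin the process to one
# GPU via CUDA_VISIBLE_DEVICES and bind it to a loopback port.
import os
import subprocess

env = dict(os.environ, CUDA_VISIBLE_DEVICES="0")  # pin to gpu:0
proc = subprocess.Popen(
    [
        "llama-server",
        "-m", "/models/gemma-2b-it-q8_0.gguf",  # illustrative model path
        "--host", "127.0.0.1",
        "--port", "8011",  # illustrative port
    ],
    env=env,
)
```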
Model switching is handled by **restart** in the scaffold.
The agent now waits for the replacement `llama-server` to report readiness before proxying the first request.
If startup or switching takes too long, the request fails with a `503` instead of passing through a transient upstream
"Loading model" error.
## Backends
@@ -31,6 +34,20 @@ pip install -e .
rolemesh-node-agent --config configs/node_agent.example.yaml
```
### Startup timing guards
Two config knobs control how long the node agent waits for a managed `llama-server` to become ready:
```yaml
llama_server_startup_timeout_s: 30.0
llama_server_probe_interval_s: 0.5
```
- `llama_server_startup_timeout_s`: maximum time to wait for a newly started or switched model
- `llama_server_probe_interval_s`: polling interval for readiness checks
The readiness probe checks the managed server's local `GET /health` and `GET /v1/models` endpoints.
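The same check can be reproduced by hand; the sketch below mirrors the agent's probe loop using the two knobs above:
```python
# Manual readiness wait mirroring the agent's probe: poll /health until it
# reports {"status": "ok"} or the timeout elapses.
import json
import time
import urllib.error
import urllib.request

def wait_ready(base_url: str, timeout_s: float = 30.0, interval_s: float = 0.5) -> bool:
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(base_url + "/health", timeout=2.0) as resp:
                if json.loads(resp.read()).get("status") == "ok":
                    return True
        except (urllib.error.URLError, OSError, ValueError):
            pass
        time.sleep(interval_s)
    return False

print(wait_ready("http://127.0.0.1:8011"))  # port is illustrative
```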
## Registering
If `dispatcher_base_url` is set in the node-agent config, the node agent will periodically call:

View File

@@ -1,13 +1,15 @@
from __future__ import annotations
import asyncio
import json
import os
import shlex
import socket
import subprocess
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List
from .base import DeviceMetrics, DeviceRef, RuntimeAdapter
@@ -56,6 +58,10 @@ class _ServerProc:
started_at: float
class ServerStartupError(RuntimeError):
pass
class CudaAdapter(RuntimeAdapter):
"""
CUDA adapter that manages one persistent ggml-org/llama.cpp 'llama-server' per GPU.
@@ -74,8 +80,15 @@ class CudaAdapter(RuntimeAdapter):
backend_name = "cuda"
def __init__(self, llama_server_bin: str = "llama-server") -> None:
def __init__(
self,
llama_server_bin: str = "llama-server",
startup_timeout_s: float = 30.0,
probe_interval_s: float = 0.5,
) -> None:
self._bin = llama_server_bin
self._startup_timeout_s = startup_timeout_s
self._probe_interval_s = probe_interval_s
self._servers: Dict[str, _ServerProc] = {} # key: device.id
self._lock = asyncio.Lock()
@@ -160,9 +173,67 @@ class CudaAdapter(RuntimeAdapter):
started_at=time.time(),
)
# TODO: replace with a real readiness probe (e.g., GET /health on llama-server)
await asyncio.sleep(0.25)
return f"http://127.0.0.1:{port}"
base_url = f"http://127.0.0.1:{port}"
try:
await self._wait_for_server_ready(base_url, proc=proc)
except Exception:
if proc.poll() is None:
proc.terminate()
try:
proc.wait(timeout=5)
except Exception:
proc.kill()
self._servers.pop(device.id, None)
raise
return base_url
async def _wait_for_server_ready(self, base_url: str, *, proc: subprocess.Popen) -> None:
deadline = time.monotonic() + self._startup_timeout_s
last_error = "server did not become ready"
while time.monotonic() < deadline:
if proc.poll() is not None:
raise ServerStartupError(
f"llama-server exited before becoming ready (exit code {proc.returncode})."
)
try:
await asyncio.to_thread(self._probe_server_ready, base_url)
return
except ServerStartupError as exc:
last_error = str(exc)
await asyncio.sleep(self._probe_interval_s)
raise ServerStartupError(
f"Timed out after {self._startup_timeout_s:.1f}s waiting for llama-server readiness: {last_error}"
)
def _probe_server_ready(self, base_url: str) -> None:
for path in ("/health", "/v1/models"):
url = base_url.rstrip("/") + path
try:
with urllib.request.urlopen(url, timeout=2.0) as response:
if response.status >= 400:
continue
body = response.read()
except (urllib.error.URLError, TimeoutError, OSError):
continue
if path == "/health":
try:
payload = json.loads(body.decode("utf-8"))
except Exception:
payload = None
if isinstance(payload, dict) and payload.get("status") == "ok":
return
continue
if path == "/v1/models":
try:
payload = json.loads(body.decode("utf-8"))
except Exception:
payload = None
if isinstance(payload, dict) and isinstance(payload.get("data"), list):
return
raise ServerStartupError("llama-server probe did not return a ready response.")
async def shutdown(self) -> None:
async with self._lock:

View File

@@ -38,3 +38,5 @@ class NodeAgentConfig(BaseModel):
# llama-server binary name/path
llama_server_bin: str = "llama-server"
llama_server_startup_timeout_s: float = 30.0
llama_server_probe_interval_s: float = 0.5

View File

@@ -10,7 +10,7 @@ from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from rolemesh_gateway.upstream import UpstreamClient, UpstreamError # reuse gateway client
from .adapters.cuda import CudaAdapter
from .adapters.cuda import CudaAdapter, ServerStartupError
from .adapters.base import DeviceRef
from .config import NodeAgentConfig
from .inventory import discover_gguf_models
@@ -28,7 +28,11 @@ def create_app(cfg: NodeAgentConfig) -> FastAPI:
timeout=httpx.Timeout(connect=5.0, read=3600.0, write=30.0, pool=30.0)
)
upstream = UpstreamClient(client=http)
cuda = CudaAdapter(llama_server_bin=cfg.llama_server_bin)
cuda = CudaAdapter(
llama_server_bin=cfg.llama_server_bin,
startup_timeout_s=cfg.llama_server_startup_timeout_s,
probe_interval_s=cfg.llama_server_probe_interval_s,
)
@asynccontextmanager
async def lifespan(app: FastAPI):
@@ -102,12 +106,15 @@ def create_app(cfg: NodeAgentConfig) -> FastAPI:
return _error("No CUDA GPUs discovered on this node.", code="no_device", status_code=503)
device = devices[0]
try:
base_url = await app.state.cuda.ensure_server(
device,
model_path=str(model_entry.path),
model_id=model_entry.model_id,
server_args=model_entry.server_args,
)
except ServerStartupError as e:
return _error(str(e), code="server_startup_error", status_code=503)
upstream = app.state.upstream
try:

View File

@@ -0,0 +1,81 @@
from __future__ import annotations
import asyncio
from rolemesh_node_agent.adapters.base import DeviceRef
from rolemesh_node_agent.adapters.cuda import CudaAdapter, ServerStartupError
def test_probe_server_ready_accepts_health_response(monkeypatch):
adapter = CudaAdapter()
class _Response:
def __init__(self, status: int, body: bytes):
self.status = status
self._body = body
def read(self):
return self._body
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return None
def fake_urlopen(url, timeout):
return _Response(200, b'{"status":"ok"}')
monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
adapter._probe_server_ready("http://127.0.0.1:1234")
def test_wait_for_server_ready_times_out_when_probe_never_succeeds(monkeypatch):
adapter = CudaAdapter(startup_timeout_s=0.0, probe_interval_s=0.0)
class _Proc:
returncode = None
def poll(self):
return None
try:
asyncio.run(adapter._wait_for_server_ready("http://127.0.0.1:1234", proc=_Proc()))
except ServerStartupError as exc:
assert "Timed out" in str(exc)
else:
raise AssertionError("expected ServerStartupError")
def test_ensure_server_reuses_existing_matching_process():
adapter = CudaAdapter()
device = DeviceRef(kind="gpu", backend="cuda", id="gpu:0")
class _Proc:
def poll(self):
return None
adapter._servers[device.id] = type(
"_SP",
(),
{
"device": device,
"model_id": "planner",
"model_path": "/tmp/model.gguf",
"port": 1234,
"proc": _Proc(),
"started_at": 1.0,
},
)()
base_url = asyncio.run(
adapter.ensure_server(
device,
model_path="/tmp/model.gguf",
model_id="planner",
server_args={},
)
)
assert base_url == "http://127.0.0.1:1234"

View File

@@ -119,3 +119,36 @@ def test_chat_completions_routes_to_local_server_and_streams(tmp_path):
assert "data: first" in stream_response.text
assert calls["stream_base_url"] == "http://127.0.0.1:9100"
asyncio.run(app.state.http.aclose())
def test_chat_completions_returns_503_when_server_startup_fails(tmp_path):
from rolemesh_node_agent.main import create_app
from rolemesh_node_agent.adapters.cuda import ServerStartupError
cfg = _node_config(tmp_path)
app = create_app(cfg)
async def fake_discover_devices():
return [DeviceRef(kind="gpu", backend="cuda", id="gpu:0")]
async def fake_ensure_server(device, *, model_path, model_id, server_args):
raise ServerStartupError("Timed out waiting for model load")
app.state.cuda.discover_devices = fake_discover_devices
app.state.cuda.ensure_server = fake_ensure_server
response = asyncio.run(
_request(
app,
"POST",
"/v1/chat/completions",
json={
"model": "planner-gguf",
"messages": [{"role": "user", "content": "hello"}],
},
)
)
assert response.status_code == 503
assert response.json()["error"]["code"] == "server_startup_error"
asyncio.run(app.state.http.aclose())