Initial commit

This commit is contained in:
welberr 2026-04-07 13:17:28 -04:00
parent dabbebd3ba
commit b9270df3e8
60 changed files with 4021 additions and 224 deletions

230
.gitignore vendored
View File

@ -1,229 +1,13 @@
# ---> Python
# Byte-compiled / optimized / DLL files
.pytest_cache/
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
.venv/
.benchmarks/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# ---> Emacs
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
# ---> Rust
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
state/*.sqlite3
state/*.db
state/*.log
.DS_Store

33
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,33 @@
# Contributing
GenieHive is still early-stage infrastructure code. Keep changes small, explicit, and easy to verify.
## Setup
```bash
cd /home/netuser/bin/geniehive
python -m venv .venv
. .venv/bin/activate
pip install -e '.[dev]'
```
## Common Checks
```bash
make test
make smoke
```
## Guidelines
- Prefer narrowly scoped patches over broad rewrites.
- Keep the control-plane and node-agent contracts in sync.
- Add or update tests with behavior changes.
- Do not commit local runtime state from `state/`.
- Do not commit benchmark artifacts or cache directories.
## Runtime Notes
- Example configs under `configs/` are meant to stay runnable.
- Scripts under `scripts/` should remain usable as operator entrypoints, not just test helpers.
- If a startup dependency can race in practice, prefer self-healing behavior over one-shot initialization.
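A minimal sketch of that guideline, assuming a hypothetical `retry_startup` helper rather than existing project code:
```python
# Self-healing startup sketch: retry a flaky startup dependency instead of
# failing the whole process on the first miss. Names and defaults here are
# illustrative, not part of the GenieHive codebase.
import time

def retry_startup(action, attempts: int = 30, delay_s: float = 2.0):
    last_error = None
    for _ in range(attempts):
        try:
            return action()
        except Exception as exc:  # e.g. the control plane is not reachable yet
            last_error = exc
            time.sleep(delay_s)
    raise RuntimeError("startup dependency never became ready") from last_error
```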

13
Makefile Normal file
View File

@ -0,0 +1,13 @@
PYTHON ?= python
PYTEST ?= pytest
.PHONY: test smoke health
test:
$(PYTEST) -q
smoke:
$(PYTEST) -q tests/test_smoke.py
health:
bash scripts/check_singlebox_health.sh

README.md
View File

@ -1,3 +1,60 @@
# GenieHive
GenieHive is a generative AI router. It starts by presenting an OpenAI API-compatible endpoint to clients and routes their requests across one or more nodes that register their running servers with the control host. From running multiple LLMs on a single host to running them across a distributed cluster, GenieHive aims to make local AI easier to actually use.
GenieHive is a local-first control plane for heterogeneous generative AI services running across one or more hosts.
V1 scope:
- chat completions
- embeddings
- transcription
Core goals:
- register hosts and services
- track health, inventory, and observed performance
- expose a stable client-facing API
- support direct model addressing and higher-level role addressing
- route requests to healthy loaded services first
Repository layout:
- `docs/architecture.md`: system overview and v1 scope
- `docs/roadmap.md`: current milestones and near-term priorities
- `docs/schemas.md`: canonical data models
- `docs/deployment.md`: intended deployment approach
- `docs/demo.md`: first end-to-end control-plus-node demo flow
- `docs/llm_demo.md`: detailed master/peer/client LLM demo runbook
- `docs/reverse_proxy.md`: safer external exposure patterns
- `configs/`: example control-plane, node, and role configs
- `scripts/`: small launch and inspection helpers
- `src/geniehive_control/`: control-plane package
- `src/geniehive_node/`: node-agent package
There is now a documented single-machine path as well as the cluster-oriented path, so GenieHive can be exercised as a useful local router even without multiple hosts.
This repository is intended as the clean successor to narrower local gateway experiments. OpenAI-compatible routing remains important, but it is treated as one client facade within a broader cluster control-plane design.
## Development
Local development setup:
```bash
cd /home/netuser/bin/geniehive
python -m venv .venv
. .venv/bin/activate
pip install -e '.[dev]'
```
Common commands:
```bash
make test
make smoke
make health
```
Repository conventions:
- local runtime state lives under `state/` and should not be committed
- example configs under `configs/` should remain runnable
- operator scripts under `scripts/` are part of the supported workflow

configs/control.example.yaml
View File

@ -0,0 +1,18 @@
server:
host: "127.0.0.1"
port: 8800
auth:
client_api_keys:
- "change-me-client-key"
node_api_keys:
- "change-me-node-key"
storage:
sqlite_path: "state/geniehive.sqlite3"
roles_path: "configs/roles.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

configs/control.singlebox.example.yaml
View File

@ -0,0 +1,18 @@
server:
host: "127.0.0.1"
port: 8800
auth:
client_api_keys:
- "change-me-client-key"
node_api_keys:
- "change-me-node-key"
storage:
sqlite_path: "state/geniehive-singlebox.sqlite3"
roles_path: "configs/roles.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

configs/control.singlebox.p40.example.yaml
View File

@ -0,0 +1,18 @@
server:
host: "127.0.0.1"
port: 8800
auth:
client_api_keys:
- "change-me-client-key"
node_api_keys:
- "change-me-node-key"
storage:
sqlite_path: "state/geniehive-p40.sqlite3"
roles_path: "configs/roles.singlebox.p40.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

56
configs/node.example.yaml Normal file
View File

@ -0,0 +1,56 @@
node:
host_id: "atlas-01"
display_name: "Atlas GPU Box"
listen_host: "127.0.0.1"
listen_port: 8891
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots:
- "/path/to/models"
cpu_threads: 24
ram_gb: 128
capabilities:
cuda: true
rocm: false
metal: false
managed_runtimes:
enabled: true
llama_server_bin: "/path/to/llama-server"
services:
- service_id: "atlas-01/chat/qwen3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llama.cpp"
launcher: "managed"
assets:
- asset_id: "qwen3-8b-q4km"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 900
tokens_per_sec: 40
- service_id: "atlas-01/embeddings/bge-small"
kind: "embeddings"
endpoint: "http://127.0.0.1:18092"
runtime:
engine: "llama.cpp"
launcher: "managed"
assets:
- asset_id: "bge-small-en"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true

configs/node.singlebox.llamacpp.example.yaml
View File

@ -0,0 +1,43 @@
node:
host_id: "singlebox-llamacpp"
display_name: "SingleBox llama.cpp"
listen_host: "127.0.0.1"
listen_port: 8891
address: "127.0.0.1"
labels:
topology: "singlebox"
runtime: "llama.cpp"
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots:
- "/path/to/models"
cpu_threads: 24
ram_gb: 64
capabilities:
cpu: true
cuda: true
managed_runtimes:
enabled: false
services:
- service_id: "singlebox/chat/qwen3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llama.cpp"
launcher: "external"
assets:
- asset_id: "qwen3-8b-q4_k_m"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 900

configs/node.singlebox.llamafile.example.yaml
View File

@ -0,0 +1,43 @@
node:
host_id: "singlebox-llamafile"
display_name: "SingleBox llamafile"
listen_host: "127.0.0.1"
listen_port: 8891
address: "127.0.0.1"
labels:
topology: "singlebox"
runtime: "llamafile"
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots:
- "/path/to/models"
cpu_threads: 24
ram_gb: 64
capabilities:
cpu: true
cuda: true
managed_runtimes:
enabled: false
services:
- service_id: "singlebox/chat/qwen3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llamafile"
launcher: "external"
assets:
- asset_id: "qwen3-8b-q4_k_m"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 900

configs/node.singlebox.ollama.example.yaml
View File

@ -0,0 +1,58 @@
node:
host_id: "singlebox-ollama"
display_name: "SingleBox Ollama"
listen_host: "127.0.0.1"
listen_port: 8891
address: "127.0.0.1"
labels:
topology: "singlebox"
runtime: "ollama"
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots: []
cpu_threads: 24
ram_gb: 64
capabilities:
cpu: true
cuda: false
managed_runtimes:
enabled: false
services:
- service_id: "singlebox/chat/qwen3"
kind: "chat"
endpoint: "http://127.0.0.1:11434"
runtime:
engine: "ollama"
launcher: "external"
assets:
- asset_id: "qwen3"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 900
- service_id: "singlebox/embeddings/nomic-embed-text"
kind: "embeddings"
endpoint: "http://127.0.0.1:11434"
runtime:
engine: "ollama"
launcher: "external"
assets:
- asset_id: "nomic-embed-text"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 150

configs/node.singlebox.p40-triple.example.yaml
View File

@ -0,0 +1,84 @@
node:
host_id: "p40-box"
display_name: "Dual P40 + CPU Fallback"
listen_host: "127.0.0.1"
listen_port: 8891
address: "127.0.0.1"
labels:
topology: "singlebox"
runtime: "llama.cpp"
gpu0: "Tesla P40"
gpu1: "Tesla P40"
cpu: "Ryzen 5600G"
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots:
- "/path/to/models"
cpu_threads: 12
ram_gb: 128
capabilities:
cpu: true
cuda: true
managed_runtimes:
enabled: false
services:
- service_id: "p40-box/chat/gpu0-primary"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llama.cpp"
launcher: "external"
device: "gpu0"
assets:
- asset_id: "Qwen2.5-14B-Instruct-1M-Q5_K_M"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 1200
tokens_per_sec: 24
- service_id: "p40-box/chat/gpu1-secondary"
kind: "chat"
endpoint: "http://127.0.0.1:18092"
runtime:
engine: "llama.cpp"
launcher: "external"
device: "gpu1"
assets:
- asset_id: "Qwen3.5-9B-Q5_K_M"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 1000
tokens_per_sec: 30
- service_id: "p40-box/chat/cpu-fallback"
kind: "chat"
endpoint: "http://127.0.0.1:18093"
runtime:
engine: "llama.cpp"
launcher: "external"
device: "cpu"
assets:
- asset_id: "rocket-3b.Q5_K_M"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 4500
tokens_per_sec: 7

configs/roles.example.yaml
View File

@ -0,0 +1,22 @@
roles:
- role_id: "mentor"
display_name: "Mentor"
operation: "chat"
modality: "text"
prompt_policy:
system_prompt: "Guide the user without taking over the task."
routing_policy:
preferred_families: ["Qwen3", "Mistral"]
min_context: 8192
- role_id: "embedder"
display_name: "Embedder"
operation: "embeddings"
modality: "text"
routing_policy:
require_loaded: true
- role_id: "transcriber"
display_name: "Transcriber"
operation: "transcription"
modality: "audio"

configs/roles.singlebox.p40.example.yaml
View File

@ -0,0 +1,33 @@
roles:
- role_id: "mentor"
display_name: "Mentor"
description: "Primary high-quality reasoning/chat route"
operation: "chat"
modality: "text"
prompt_policy:
system_prompt: "Be concise, helpful, and technically accurate."
routing_policy:
preferred_families: ["qwen2.5-14b", "qwen2.5"]
require_loaded: true
- role_id: "general_assistant"
display_name: "General Assistant"
description: "Secondary fast chat route"
operation: "chat"
modality: "text"
prompt_policy:
system_prompt: "Answer clearly and directly."
routing_policy:
preferred_families: ["qwen3.5-9b", "qwen3.5"]
require_loaded: true
- role_id: "background_summarizer"
display_name: "Background Summarizer"
description: "Slow fallback route for low-priority work"
operation: "chat"
modality: "text"
prompt_policy:
system_prompt: "Summarize briefly and conservatively."
routing_policy:
preferred_families: ["rocket-3b", "rocket"]
require_loaded: true

194
docs/architecture.md Normal file
View File

@ -0,0 +1,194 @@
# GenieHive Architecture
Status: proposed v1 architecture
Drafted: 2026-04-05
## Repo Name
Chosen name: `GenieHive`
Why this name:
- suggestive: "genie" implies generative AI services, "hive" implies a cooperating cluster
- accessible: easy to say, remember, and explain
- whimsical enough to feel like a project name rather than a dry infrastructure label
Tradeoff:
- `GenieHive` is less search-distinct than `Geniewarren` because `hive` is a common product metaphor
## Mission
GenieHive is a local-first control plane for heterogeneous generative AI services running across one or more hosts.
It should:
- register hosts and their available services
- expose a stable client-facing API
- track health, capacity, and observed performance
- support direct model addressing and higher-level role addressing
- route requests to healthy loaded services first
- optionally coordinate loading or swapping when policy allows
- remain practical for a small self-hosted deployment with two hosts
## Non-Goals For V1
Out of scope initially:
- peer-to-peer consensus
- autonomous global model swapping across many nodes
- full WAN zero-trust platform engineering
- image and TTS generation orchestration
- distributed vector database management
- billing or multi-tenant quota accounting
## Architectural Position
GenieHive is not just an OpenAI-compatible gateway.
It is a control plane with these layers:
1. Control API
- authoritative registry
- routing and scheduling
- role catalog
- operator inspection
2. Node Agent
- host discovery
- service discovery
- telemetry reporting
- optional local process management
3. Provider Adapters
- OpenAI-compatible chat backends
- OpenAI-compatible embedding backends
- transcription backends
- future adapters for image and speech synthesis
4. Client Facades
- OpenAI-compatible facade for completions and embeddings
- operator API for topology, health, and inventory
## Core Concepts
### Host
A physical or virtual machine participating in the cluster.
### Service
A concrete callable capability on a host. Examples:
- chat completion endpoint
- embedding endpoint
- transcription endpoint
### Asset
A model weight, model name, application, or runtime target that a service can serve.
### Role
A reusable task profile that describes how requests should be fulfilled. A role is policy, not a concrete model.
### Route Resolution
Request handling order:
1. If the requested `model` matches a currently loaded and healthy concrete asset or service alias, route directly.
2. Otherwise, if the requested `model` matches a known role, resolve the role to the best eligible service.
3. Otherwise, fail clearly.
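A minimal sketch of that order, using illustrative registry helpers (`find_loaded_healthy_service`, `get_role`, `resolve_role`) rather than the actual registry API:
```python
# Resolution-order sketch: direct alias first, then role, then a clear failure.
def resolve(requested_model: str, registry) -> dict:
    # 1. Direct match against a loaded, healthy asset or service alias.
    service = registry.find_loaded_healthy_service(requested_model)
    if service is not None:
        return {"route_type": "direct", "service": service}
    # 2. Otherwise, try to treat the requested name as a role.
    role = registry.get_role(requested_model)
    if role is not None:
        service = registry.resolve_role(role)
        if service is not None:
            return {"route_type": "role", "service": service}
    # 3. Otherwise, fail clearly rather than guessing.
    raise LookupError(f"no route for '{requested_model}'")
```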
## V1 Capability Scope
V1 supports only:
- chat completions
- embeddings
- transcription
## Topology
Recommended initial topology:
- 1 control plane
- 2 node agents
- 1 or more clients
- LAN-first deployment
- API key auth in v1
- VPN or mTLS in v1.5
## API Families
### Client API
- `GET /v1/models`
- `POST /v1/chat/completions`
- `POST /v1/embeddings`
- `POST /v1/audio/transcriptions`
`GET /v1/models` should expose enough metadata for programmatic clients to make routing decisions about what GenieHive can handle cheaply, especially for lower-complexity offloaded work. That metadata should include direct assets, service-backed aliases, role aliases, operation kind, health, loaded status, and observed performance hints.
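As an illustration only, a single catalog entry might carry metadata shaped like the hints used by `scripts/demo_client_agent.py`; this is not a frozen schema, and the values below are made up:
```python
# Illustrative /v1/models entry for a role alias.
example_entry = {
    "id": "mentor",
    "object": "model",
    "geniehive": {
        "route_type": "role",               # role alias vs direct asset/service alias
        "operation": "chat",
        "loaded_target_count": 1,           # healthy, loaded services behind this alias
        "best_p50_latency_ms": 1200,
        "offload_hint": {"suitability": "good_for_low_complexity"},
    },
}
```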
### Operator API
- `GET /v1/cluster/hosts`
- `GET /v1/cluster/services`
- `GET /v1/cluster/roles`
- `GET /v1/cluster/health`
- `GET /v1/cluster/routes/resolve?model=...`
### Node API
- `POST /v1/nodes/register`
- `POST /v1/nodes/heartbeat`
- `GET /v1/node/inventory`
- `POST /v1/node/services/refresh`
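A hedged sketch of how a node agent might exercise the node-facing endpoints above; the payload fields are illustrative, while the paths, header name, and five-second interval come from this repository:
```python
# Register once, then heartbeat on the configured interval.
import time
import httpx

BASE_URL = "http://127.0.0.1:8800"
HEADERS = {"X-GenieHive-Node-Key": "change-me-node-key"}

with httpx.Client(base_url=BASE_URL, headers=HEADERS, timeout=10.0) as client:
    client.post("/v1/nodes/register", json={"host_id": "atlas-01"}).raise_for_status()
    while True:
        client.post("/v1/nodes/heartbeat", json={"host_id": "atlas-01"}).raise_for_status()
        time.sleep(5)  # heartbeat_interval_s in the example node configs
```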
## Data Store
V1 should use SQLite for durable state.
## Routing Rules
### Direct Model Resolution
If a request names a concrete asset alias or service alias:
- prefer loaded and healthy services
- choose the lowest-cost healthy target if multiple matches exist
- fail clearly if all matches are unhealthy
### Role Resolution
If direct resolution fails, treat the requested name as a role.
Role resolution should filter by:
- operation kind
- modality
- health
- auth and exposure compatibility
- minimum context or memory requirements
- preferred model families
Then rank by:
- already loaded
- recent health
- expected latency
- queue pressure
- operator priority
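A hedged sketch of that filter-then-rank policy; the service dictionaries mirror `docs/schemas.md`, while `min_context_ok`, `queue_depth`, and `operator_priority` are illustrative fields rather than the shipped scheduler:
```python
# Filter to eligible services, then rank: loaded first, then lower latency,
# then lighter queues, then operator priority.
def rank_role_candidates(role: dict, services: list[dict]) -> list[dict]:
    def eligible(svc: dict) -> bool:
        return (
            svc["kind"] == role["operation"]
            and svc["state"]["health"] == "healthy"
            and svc.get("min_context_ok", True)
        )

    def score(svc: dict) -> tuple:
        return (
            svc["state"]["load_state"] == "loaded",
            -svc.get("observed", {}).get("p50_latency_ms", 10**9),
            -svc.get("queue_depth", 0),
            svc.get("operator_priority", 0),
        )

    return sorted((s for s in services if eligible(s)), key=score, reverse=True)
```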
## First Implementation Sequence
1. Create the repo skeleton and docs.
2. Implement SQLite-backed registry models.
3. Implement node registration and heartbeat.
4. Implement operator inspection endpoints.
5. Implement client-facing chat routing.
6. Add embeddings routing.
7. Add transcription routing.
8. Add truthful readiness and health reporting.
9. Add role catalog and role-based resolution.
10. Add optional managed local runtime support.

61
docs/demo.md Normal file
View File

@ -0,0 +1,61 @@
# GenieHive Demo
This is the first end-to-end demo path for GenieHive using the example configs already in the repo.
## Goal
Bring up:
- one control plane
- one node agent
- one route-resolution check
The node should auto-register with the control plane on startup and then send periodic heartbeats.
## 1. Start the control plane
From the repo root:
```bash
bash scripts/run_control.sh
```
This uses:
- `configs/control.example.yaml`
- `configs/roles.example.yaml`
## 2. Start the node agent
In another shell:
```bash
bash scripts/run_node.sh
```
This uses:
- `configs/node.example.yaml`
## 3. Inspect the cluster
In another shell:
```bash
bash scripts/demo_inspect.sh
```
That script checks:
- client-facing model metadata
- cluster health
- registered hosts
- registered services
- loaded roles
- route resolution for `mentor`
## Notes
- The example configs use API keys; the inspection script sends the example client key.
- The example node config assumes the underlying model-serving endpoints already exist. The current demo proves control-plane registration and routing metadata, not full inference proxying yet.
- The control plane stores state in `state/geniehive.sqlite3` by default.

48
docs/deployment.md Normal file
View File

@ -0,0 +1,48 @@
# GenieHive Deployment
## Initial Deployment Target
V1 should be easy to deploy on a small self-hosted setup:
- 1 control plane
- 2 node agents
- private LAN or VPN
- API-key auth first
## Binding Guidance
Defaults should be conservative:
- control plane binds to localhost by default during development
- node agents bind to localhost unless remote registration is needed
- managed inference runtimes should stay node-local unless there is a specific reason to expose them
## Security Baseline
Required in v1:
- client API keys
- node registration keys
- clear separation between client-facing and node-facing credentials
Planned after v1:
- mTLS between control plane and nodes
- scoped client tokens
## Persistence
Use SQLite first for:
- host registry
- service registry
- role catalog
- recent health and benchmark samples
## Startup Order
1. Start the control plane.
2. Start node agents.
3. Confirm registration and heartbeat visibility.
4. Confirm client API readiness.
5. Exercise chat, embeddings, and transcription paths.

676
docs/llm_demo.md Normal file
View File

@ -0,0 +1,676 @@
# GenieHive LLM Demo
This runbook covers the first practical GenieHive LLM demo with three roles:
- master: the GenieHive control plane
- peer: a GenieHive node agent attached to one or more local LLM servers
- client: a demo client agent or Codex using GenieHive as the API front door
## Current Readiness
GenieHive is ready for a first live chat demo now.
What works in GenieHive already:
- node registration
- heartbeat
- role-aware route resolution
- `GET /v1/models`
- `POST /v1/chat/completions`
- `POST /v1/embeddings`
What GenieHive does not do yet:
- launch upstream LLM servers for you automatically
- provide `POST /v1/audio/transcriptions`
- maintain advanced benchmark history or queue-aware scheduling
For the first demo, treat GenieHive as a metadata-rich router over already-running local servers.
## Topologies
### Smallest Demo
Run everything on one host:
- control plane on `127.0.0.1:8800`
- node agent on `127.0.0.1:8891`
- one or more upstream model servers on local ports
This is also the recommended setup for users who do not have a cluster. GenieHive still provides value as:
- a local router
- a metadata-rich local model catalog
- a role-to-model indirection layer
- a common front door for client tools
### Two-Host Demo
- master host runs GenieHive control plane
- peer host runs GenieHive node agent and one or more local LLM servers
- client runs anywhere that can reach the master
## Master Instructions
On the control-plane host:
1. Create a repo-local Python environment if you want isolation.
2. Start GenieHive control:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control.sh
```
3. Confirm health:
```bash
curl -sS http://127.0.0.1:8800/health
```
Expected result:
- JSON containing `{"status":"ok"}`
4. Keep note of the example client and node keys from `configs/control.example.yaml`.
### Single-Box Shortcut
If you are running control and node on the same machine, use:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_singlebox.sh
```
For your P40 host, repo-provided external bind helpers now exist:
LAN:
```bash
bash scripts/run_control_p40_lan.sh
```
ZeroTier:
```bash
bash scripts/run_control_p40_zerotier.sh
```
Both use the P40-specific control config and only change the bind interface.
## Peer Instructions
On each peer host you need:
- one or more local LLM servers already running
- one GenieHive node config that points at those servers
- the control-plane base URL and node API key
For a single-machine setup, the peer is simply another process on the same host.
The node agent should advertise upstream server roots, not endpoint suffixes. For example:
- good: `http://127.0.0.1:11434`
- good: `http://127.0.0.1:18091`
- not good: `http://127.0.0.1:11434/v1/chat/completions`
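If you are unsure whether a configured endpoint is already a root, a tiny check like this (an illustrative helper, not part of the node agent) shows the intent:
```python
# Reduce a configured endpoint to its server root; the control plane is
# expected to append the OpenAI-compatible paths itself when proxying.
from urllib.parse import urlsplit

def endpoint_root(url: str) -> str:
    parts = urlsplit(url)
    return f"{parts.scheme}://{parts.netloc}"

assert endpoint_root("http://127.0.0.1:11434/v1/chat/completions") == "http://127.0.0.1:11434"
```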
### Option A: Ollama
Use this when you want the lowest-friction chat and embeddings demo.
1. Start Ollama if it is not already running:
```bash
ollama serve
```
2. Pull the model or models you want:
```bash
ollama pull qwen3
ollama pull nomic-embed-text
```
3. Example peer service config:
```yaml
services:
- service_id: "peer1/chat/qwen3"
kind: "chat"
endpoint: "http://127.0.0.1:11434"
runtime:
engine: "ollama"
launcher: "external"
assets:
- asset_id: "qwen3"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
- service_id: "peer1/embeddings/nomic-embed-text"
kind: "embeddings"
endpoint: "http://127.0.0.1:11434"
runtime:
engine: "ollama"
launcher: "external"
assets:
- asset_id: "nomic-embed-text"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
```
4. Start the node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.ollama.example.yaml
```
### Option B: llama.cpp
Use this when you want direct GGUF serving with `llama-server`.
1. Start a chat server:
```bash
llama-server -m /path/to/model.gguf --host 127.0.0.1 --port 18091
```
2. Example peer service config:
```yaml
services:
- service_id: "peer1/chat/qwen3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llama.cpp"
launcher: "external"
assets:
- asset_id: "qwen3-8b-q4_k_m"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
```
Then start the node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamacpp.example.yaml
```
Note:
- The official `llama.cpp` docs clearly show OpenAI-compatible chat serving.
- For embeddings, some `llama.cpp` builds document non-OpenAI embedding endpoints such as `/embedding`, so GenieHive's current `POST /v1/embeddings` path is safest with Ollama or vLLM unless you have verified your specific build.
### Option C: llamafile
Use this when you want a single-file local server built around llama.cpp.
1. Start a chat server:
```bash
./your-model.llamafile --server --host 127.0.0.1 --port 18091 --nobrowser
```
2. Example peer service config:
```yaml
services:
- service_id: "peer1/chat/llamafile-qwen3"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llamafile"
launcher: "external"
assets:
- asset_id: "qwen3-8b-q4_k_m"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
```
Then start the node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamafile.example.yaml
```
### Option D: vLLM
Use this when you want a more server-oriented OpenAI-compatible stack and you have the hardware budget for it.
1. Start the server:
```bash
vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
```
2. Example peer service config:
```yaml
services:
- service_id: "peer1/chat/llama3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:8000"
runtime:
engine: "vllm"
launcher: "external"
assets:
- asset_id: "NousResearch/Meta-Llama-3-8B-Instruct"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
- service_id: "peer1/embeddings/bge-base"
kind: "embeddings"
endpoint: "http://127.0.0.1:8001"
runtime:
engine: "vllm"
launcher: "external"
assets:
- asset_id: "BAAI/bge-base-en-v1.5"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
```
## Minimal Node Config Pattern
For a real peer host, the fields you most likely need to edit in `configs/node.example.yaml` are:
- `node.host_id`
- `node.display_name`
- `node.address`
- `control_plane.base_url`
- `control_plane.node_api_key`
- `inventory.capabilities`
- `services`
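If you want a quick sanity check before starting the node agent, a small PyYAML read (PyYAML is already a project dependency; the dotted paths below follow the field list above) can confirm the values you edited:
```python
# Print the most commonly edited node-config fields from a copied config file.
import yaml

with open("configs/node.example.yaml") as fh:
    cfg = yaml.safe_load(fh)

for dotted in ("node.host_id", "control_plane.base_url", "control_plane.node_api_key"):
    value = cfg
    for key in dotted.split("."):
        value = value[key]
    print(f"{dotted} = {value}")
```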
## Client Instructions
You now have two simple ways to exercise GenieHive as a client.
### Option 1: Inspect and call it manually
List models:
```bash
curl -sS http://127.0.0.1:8800/v1/models \
-H 'X-Api-Key: change-me-client-key'
```
Chat using a role:
```bash
curl -sS http://127.0.0.1:8800/v1/chat/completions \
-H 'Content-Type: application/json' \
-H 'X-Api-Key: change-me-client-key' \
-d '{
"model": "mentor",
"messages": [{"role":"user","content":"Give me a 2-sentence summary of why SQLite is useful here."}]
}'
```
Embeddings using a direct embedding asset:
```bash
curl -sS http://127.0.0.1:8800/v1/embeddings \
-H 'Content-Type: application/json' \
-H 'X-Api-Key: change-me-client-key' \
-d '{
"model": "nomic-embed-text",
"input": "GenieHive is a local-first control plane."
}'
```
### Option 2: Use the demo client agent
Run:
```bash
cd /home/netuser/bin/geniehive
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--task "Summarize the current GenieHive demo in three bullets."
```
That script will:
- read `GET /v1/models`
- choose a chat-capable model automatically if you do not specify one
- prefer entries GenieHive marks as suitable for lower-complexity offload
- submit a chat request and print the answer
If you want to force a specific route:
```bash
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--model mentor \
--task "State what host and route type you would expect for this demo."
```
## Codex-As-Client
For Codex or another agentic client, the intended pattern is:
1. Read `GET /v1/models`.
2. Filter for `geniehive.operation == "chat"`.
3. Prefer:
- `geniehive.offload_hint.suitability == "good_for_low_complexity"`
- `geniehive.loaded_target_count > 0` for role entries
- lower `best_p50_latency_ms`
4. Send lower-complexity requests to GenieHive.
5. Keep higher-complexity, high-context, or high-risk tasks local unless the catalog indicates a better remote fit.
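A hedged sketch of that selection pattern, using only the catalog fields named above:
```python
# Pick a chat-capable catalog entry, preferring good low-complexity offload
# hints, role entries with loaded targets, and lower observed latency.
def pick_offload_model(models: list[dict]) -> str | None:
    chat = [m for m in models if m.get("geniehive", {}).get("operation") == "chat"]

    def key(m: dict) -> tuple:
        g = m["geniehive"]
        good = g.get("offload_hint", {}).get("suitability") == "good_for_low_complexity"
        loaded = g.get("loaded_target_count", 0) > 0
        latency = g.get("best_p50_latency_ms") or float("inf")
        return (good, loaded, -latency)

    return max(chat, key=key)["id"] if chat else None
```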
## Good First Live Demo
If you want the safest first success path:
- control plane on one host
- node agent on the same host
- Ollama upstream with one chat model
- role alias `mentor`
- demo client agent calling `mentor`
That avoids GGUF-specific launch tuning while still exercising the full GenieHive master/peer/client path.
## Single-Machine End-to-End Example
### Ollama-backed single box
1. Start Ollama:
```bash
ollama serve
```
2. Pull models:
```bash
ollama pull qwen3
ollama pull nomic-embed-text
```
3. Start GenieHive control:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_singlebox.sh
```
4. Start GenieHive node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.ollama.example.yaml
```
5. Inspect:
```bash
bash scripts/demo_inspect.sh
```
6. Run the client agent:
```bash
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--task "Explain in three bullets what GenieHive is doing in this single-machine demo."
```
### llama.cpp-backed single box
1. Start the local server:
```bash
llama-server -m /path/to/model.gguf --host 127.0.0.1 --port 18091
```
2. Start GenieHive control:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_singlebox.sh
```
3. Start GenieHive node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamacpp.example.yaml
```
4. Run the client agent:
```bash
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--task "Summarize why a single-machine GenieHive setup can still be useful."
```
## Host-Specific Note: Dual Tesla P40 + 128 GB RAM
For a machine with:
- `2 x Nvidia Tesla P40`
- `AMD Ryzen 5600G`
- `128 GB RAM`
the most practical first GenieHive layout is:
- one chat model on `GPU0`
- one chat or utility model on `GPU1`
- one slower fallback chat model on CPU
This is now sketched in:
- `configs/node.singlebox.p40-triple.example.yaml`
- `configs/control.singlebox.p40.example.yaml`
- `configs/roles.singlebox.p40.example.yaml`
- `scripts/start_p40_triple_llamacpp.sh`
- `scripts/launch_p40_triple.sh`
- `scripts/p40_triple_gpu0.sh`
- `scripts/p40_triple_gpu1.sh`
- `scripts/p40_triple_cpu.sh`
The current concrete defaults use models already present under `/home/netuser/bin/models/llm`:
- `GPU0`: `Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf`
- `GPU1`: `Qwen3.5-9B-Q5_K_M.gguf`
- `CPU`: `rocket-3b.Q5_K_M.gguf`
### Why this layout works
- each P40 has enough VRAM for a quantized 7B to 14B model comfortably
- 128 GB RAM is enough to hold a separate CPU-served fallback model without much trouble
- the CPU route will be much slower, but it is still useful for low-priority offload or fallback handling
### Suggested role usage
- `mentor` or primary chat role -> `GPU0`
- `general_assistant` or alternate chat role -> `GPU1`
- `fallback_writer` or `background_summarizer` -> CPU route
The repo now includes a host-specific role catalog with exactly that intent.
### Launch pattern
1. Review the default model paths and print the launch commands:
```bash
cd /home/netuser/bin/geniehive
bash scripts/start_p40_triple_llamacpp.sh
```
If the defaults look good, you do not need to edit them before trying the first run.
If `tmux` is available, you can also launch the three processes detached:
```bash
cd /home/netuser/bin/geniehive
bash scripts/launch_p40_triple.sh
```
Then inspect pane state without binding your current terminal to the session:
```bash
bash scripts/tmux_session_status.sh
```
That status helper checks whether the session exists and whether each pane's launcher process is still running or has already exited. If `tmux` is not installed, the combined launcher prints the three helper commands instead.
2. Start the three `llama-server` processes in separate shells.
3. Start GenieHive control:
```bash
bash scripts/run_control_singlebox.sh configs/control.singlebox.p40.example.yaml
```
4. Start GenieHive node with the host-specific config:
```bash
bash scripts/run_node_singlebox.sh configs/node.singlebox.p40-triple.example.yaml
```
5. Inspect the catalog:
```bash
bash scripts/demo_inspect.sh
```
If something is not coming up cleanly, run:
```bash
bash scripts/check_singlebox_health.sh
```
That checks:
- `GPU0` upstream health
- `GPU1` upstream health
- CPU fallback upstream health
- GenieHive control health
- GenieHive node health
- authenticated cluster and model-catalog endpoints
6. Exercise the chat path:
```bash
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--model mentor \
--task "State which route should be preferred for low-latency chat and which should be the slow fallback."
```
### Practical expectations
- `GPU0` and `GPU1` should be the preferred targets for normal chat work
- the CPU route should mostly be treated as fallback or low-priority background work
- GenieHive metadata should make that visible to clients through latency and offload hints
### Containerized Qwen3.5 probe
If the host-installed `llama-server` is too old for `Qwen3.5`, but the NVIDIA Container Toolkit is installed, you can test a newer CUDA-enabled `llama.cpp` without changing the host CUDA stack:
```bash
cd /home/netuser/bin/geniehive
bash scripts/test_qwen35_server_cuda_container.sh
```
Useful overrides:
```bash
GPU_INDEX=1 PORT=19092 bash scripts/test_qwen35_server_cuda_container.sh
MODEL_PATH=/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf bash scripts/test_qwen35_server_cuda_container.sh
```
That probe uses the official `ghcr.io/ggml-org/llama.cpp:server-cuda` image. If it loads the model and starts serving, then the remaining blocker is your host `llama.cpp` install, not GPU compatibility.
## External Client Access
For your current host addresses:
- LAN: `192.168.40.207`
- ZeroTier: `172.24.50.65`
The cleanest rule is:
- keep upstream model servers on `127.0.0.1`
- keep the GenieHive node on `127.0.0.1` unless you specifically need remote node access
- expose only the GenieHive control plane to LAN or ZeroTier clients
That gives remote clients a single stable endpoint without exposing the underlying model servers directly.
### LAN bind
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_p40_lan.sh
```
Remote client example:
```bash
python scripts/demo_client_agent.py \
--base-url http://192.168.40.207:8800 \
--api-key change-me-client-key \
--model mentor \
--task "Briefly describe the preferred and fallback routes on this host."
```
### ZeroTier bind
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_p40_zerotier.sh
```
Remote client example:
```bash
python scripts/demo_client_agent.py \
--base-url http://172.24.50.65:8800 \
--api-key change-me-client-key \
--model mentor \
--task "Briefly describe the preferred and fallback routes on this host."
```
### Security note
Prefer ZeroTier over general LAN exposure when possible. In both cases:
- do not expose the upstream `llama-server` ports
- keep the client API key enabled
- if you later open this beyond trusted networks, add a reverse proxy or VPN-only boundary rather than binding GenieHive broadly
### Role meanings for this host
- `mentor` should bias toward the `GPU0` Qwen2.5 14B route
- `general_assistant` should bias toward the `GPU1` Qwen3.5 9B route
- `background_summarizer` should bias toward the CPU Rocket 3B fallback route

94
docs/reverse_proxy.md Normal file
View File

@ -0,0 +1,94 @@
# GenieHive Reverse Proxy
For external clients, a reverse proxy is cleaner than binding GenieHive directly to every interface.
Recommended pattern:
- keep upstream model servers on `127.0.0.1`
- keep GenieHive node on `127.0.0.1`
- keep GenieHive control on `127.0.0.1`
- expose only the reverse proxy on LAN or ZeroTier
## Caddy Example
Config file:
```caddy
192.168.40.207:8080 {
reverse_proxy 127.0.0.1:8800
}
```
ZeroTier variant:
```caddy
172.24.50.65:8080 {
reverse_proxy 127.0.0.1:8800
}
```
Advantages:
- simple config
- easy to move to TLS later
- good default operational behavior
## Nginx Example
Server block:
```nginx
server {
listen 192.168.40.207:8080;
server_name _;
location / {
proxy_pass http://127.0.0.1:8800;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}
```
ZeroTier variant:
```nginx
server {
listen 172.24.50.65:8080;
server_name _;
location / {
proxy_pass http://127.0.0.1:8800;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}
```
## Operational Recommendation
For your current host, the cleanest shape is:
1. GenieHive control on `127.0.0.1:8800`
2. reverse proxy on either:
- `192.168.40.207:8080`
- `172.24.50.65:8080`
3. clients talk only to the reverse proxy
## Client Example
```bash
python scripts/demo_client_agent.py \
--base-url http://172.24.50.65:8080 \
--api-key change-me-client-key \
--model mentor \
--task "Describe the preferred and fallback routes on this host."
```
## Security Note
The API key is still required. The reverse proxy improves exposure hygiene, but it is not a substitute for network trust boundaries.

34
docs/roadmap.md Normal file
View File

@ -0,0 +1,34 @@
# GenieHive Roadmap
## Completed Foundations
- control-plane registry with SQLite persistence
- node registration and heartbeat
- role catalog and route resolution
- client-facing `GET /v1/models`
- client-facing `POST /v1/chat/completions`
- client-facing `POST /v1/embeddings`
- first control-plus-node demo flow
## Immediate Next Milestones
1. Run and document the first live LLM demo against real upstream servers.
2. Validate the `GET /v1/models` metadata as a Codex-friendly offload catalog for lower-complexity tasks.
3. Add `POST /v1/audio/transcriptions`.
4. Add a richer node metrics model for queue depth, current load, and observed performance over time.
5. Add a stronger operator/client distinction in the public metadata and auth surfaces.
## LLM Demo Note
The project is now ready for a first live LLM demo using GenieHive as:
- master: control plane
- peer: one or more node agents with pre-existing local LLM servers
- client: a small demo agent or Codex configured against GenieHive
The current live-demo priority is chat-first. Embeddings are also wired in GenieHive, but upstream compatibility differs across local servers, so the safest first demo matrix is:
- Ollama for chat and embeddings
- vLLM for chat and embeddings
- llama.cpp for chat
- llamafile for chat

128
docs/schemas.md Normal file
View File

@ -0,0 +1,128 @@
# GenieHive Schemas
These are canonical logical schemas for v1. They are documentation first, not final implementation code.
## Host
```yaml
host:
host_id: "atlas-01"
display_name: "Atlas GPU Box"
address: "192.168.1.101"
labels:
site: "home-lab"
class: "gpu"
capabilities:
cuda: true
rocm: false
metal: false
resources:
cpu_threads: 24
ram_gb: 128
gpus:
- gpu_id: "cuda:0"
name: "RTX 4090"
vram_gb: 24
auth:
node_key_id: "nk_atlas_01"
status:
state: "online"
last_seen: "2026-04-05T15:30:00Z"
```
## Service
```yaml
service:
service_id: "atlas-01/chat/qwen3-8b"
host_id: "atlas-01"
kind: "chat"
protocol: "openai"
endpoint: "http://192.168.1.101:18091"
runtime:
engine: "llama.cpp"
launcher: "managed"
assets:
- asset_id: "qwen3-8b-q4km"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 920
p95_latency_ms: 1900
tokens_per_sec: 42
```
## Asset
```yaml
asset:
asset_id: "qwen3-8b-q4km"
family: "Qwen3-8B"
modality: "text"
operation: "chat"
format: "gguf"
locator:
kind: "path"
value: "/models/qwen3-8b/qwen3-8b-q4_k_m.gguf"
metadata:
quant: "Q4_K_M"
ctx_train: 32768
```
## Role Profile
```yaml
role:
role_id: "mentor"
display_name: "Mentor"
description: "Guidance-oriented instructional reasoning"
modality: "text"
operation: "chat"
prompt_policy:
system_prompt: "You guide without doing the user's work for them."
user_template: "{{ user_input }}"
routing_policy:
preferred_families: ["Qwen3", "Mistral"]
preferred_labels: ["instruction", "stable"]
min_context: 8192
require_loaded: false
fallback_roles: ["general_assistant"]
```
## Health Sample
```yaml
health_sample:
sample_id: "hs_01"
target_type: "service"
target_id: "atlas-01/chat/qwen3-8b"
observed_at: "2026-04-05T15:30:00Z"
status: "healthy"
checks:
http_ok: true
models_ok: true
auth_ok: true
metrics:
queue_depth: 1
in_flight: 1
mem_used_gb: 18.4
```
## Benchmark Sample
```yaml
benchmark_sample:
benchmark_id: "bench_01"
service_id: "atlas-01/chat/qwen3-8b"
asset_id: "qwen3-8b-q4km"
observed_at: "2026-04-05T15:25:00Z"
workload: "chat.short_reasoning"
results:
prompt_tokens: 512
completion_tokens: 256
ttft_ms: 780
tokens_per_sec: 44
```

28
pyproject.toml Normal file
View File

@ -0,0 +1,28 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "geniehive"
version = "0.1.0"
description = "Local-first control plane for heterogeneous generative AI services"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"fastapi>=0.110",
"httpx>=0.27",
"pydantic>=2.6",
"pyyaml>=6.0.1",
"uvicorn>=0.29",
]
[project.optional-dependencies]
dev = [
"pytest>=8.0",
]
[tool.setuptools]
package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]

scripts/check_singlebox_health.sh
View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -euo pipefail
check() {
local name="$1"
local url="$2"
if curl -fsS "$url" >/dev/null 2>&1; then
printf '[ok] %s -> %s\n' "$name" "$url"
else
printf '[fail] %s -> %s\n' "$name" "$url"
fi
}
echo "GenieHive single-box health check"
echo
check "gpu0 upstream" "http://127.0.0.1:18091/health"
check "gpu1 upstream" "http://127.0.0.1:18092/health"
check "cpu upstream" "http://127.0.0.1:18093/health"
check "control plane" "http://127.0.0.1:8800/health"
check "node agent" "http://127.0.0.1:8891/health"
echo
echo "Authenticated GenieHive checks"
echo
if curl -fsS http://127.0.0.1:8800/v1/cluster/health -H 'X-Api-Key: change-me-client-key' >/dev/null 2>&1; then
echo "[ok] cluster health endpoint"
else
echo "[fail] cluster health endpoint"
fi
if curl -fsS http://127.0.0.1:8800/v1/models -H 'X-Api-Key: change-me-client-key' >/dev/null 2>&1; then
echo "[ok] model catalog endpoint"
else
echo "[fail] model catalog endpoint"
fi

scripts/demo_client_agent.py
View File

@ -0,0 +1,92 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
from typing import Any
import httpx
def fetch_models(client: httpx.Client, base_url: str, api_key: str) -> list[dict[str, Any]]:
response = client.get(
f"{base_url.rstrip('/')}/v1/models",
headers={"X-Api-Key": api_key},
)
response.raise_for_status()
return response.json().get("data", [])
def choose_chat_model(models: list[dict[str, Any]]) -> str:
candidates = []
for item in models:
meta = item.get("geniehive", {})
if meta.get("operation") != "chat":
continue
offload = meta.get("offload_hint", {})
route_type = meta.get("route_type")
suitability = offload.get("suitability", "")
latency = meta.get("best_p50_latency_ms")
if latency is None:
latency = meta.get("observed", {}).get("p50_latency_ms")
latency_score = float(latency) if latency is not None else float("inf")
role_preference = 1 if route_type == "role" else 0
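# Rank by offload suitability first, then prefer role aliases, then lower observed latency; the model id is only a deterministic tiebreaker.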
suitability_rank = {
"good_for_low_complexity": 3,
"usable_for_background_tasks": 2,
"available_but_slow": 1,
"cold_only": 0,
}.get(suitability, 0)
candidates.append((suitability_rank, role_preference, -latency_score, item["id"]))
if not candidates:
raise SystemExit("No chat-capable models were advertised by GenieHive.")
return max(candidates)[3]
def run_task(base_url: str, api_key: str, model: str, task: str) -> dict[str, Any]:
with httpx.Client(timeout=120.0) as client:
response = client.post(
f"{base_url.rstrip('/')}/v1/chat/completions",
headers={
"X-Api-Key": api_key,
"Content-Type": "application/json",
},
json={
"model": model,
"messages": [
{"role": "system", "content": "You are a concise demo client agent."},
{"role": "user", "content": task},
],
},
)
response.raise_for_status()
return response.json()
def main() -> None:
parser = argparse.ArgumentParser(description="Exercise GenieHive as a small client agent.")
parser.add_argument("--base-url", required=True, help="GenieHive control-plane base URL")
parser.add_argument("--api-key", required=True, help="GenieHive client API key")
parser.add_argument("--model", help="Explicit chat model or role alias to use")
parser.add_argument("--task", help="Task text to send")
parser.add_argument("--list-models", action="store_true", help="List advertised models and exit")
args = parser.parse_args()
with httpx.Client(timeout=30.0) as client:
models = fetch_models(client, args.base_url, args.api_key)
if args.list_models:
print(json.dumps(models, indent=2))
return
if not args.task:
raise SystemExit("--task is required unless --list-models is used.")
model = args.model or choose_chat_model(models)
print(f"Using model: {model}")
result = run_task(args.base_url, args.api_key, model, args.task)
print(json.dumps(result, indent=2))
if __name__ == "__main__":
main()

18
scripts/demo_inspect.sh Normal file
View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
BASE_URL="${GENIEHIVE_CONTROL_BASE_URL:-http://127.0.0.1:8800}"
CLIENT_KEY="${GENIEHIVE_CLIENT_KEY:-change-me-client-key}"
curl -sS "$BASE_URL/v1/models" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/health" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/hosts" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/services" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/roles" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/routes/resolve?model=mentor" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'

scripts/launch_p40_triple.sh
View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
SESSION="${GENIEHIVE_TMUX_SESSION:-geniehive-p40}"
STATUS_CMD="$ROOT/scripts/tmux_session_status.sh"
GPU0_CMD="$ROOT/scripts/p40_triple_gpu0.sh"
GPU1_CMD="$ROOT/scripts/p40_triple_gpu1.sh"
CPU_CMD="$ROOT/scripts/p40_triple_cpu.sh"
if command -v tmux >/dev/null 2>&1; then
if tmux has-session -t "$SESSION" 2>/dev/null; then
echo "tmux session already exists: $SESSION"
echo "Inspect panes with: bash '$STATUS_CMD' '$SESSION'"
exit 1
fi
tmux new-session -d -s "$SESSION" "cd '$ROOT' && bash '$GPU0_CMD'"
tmux split-window -h -t "$SESSION:0" "cd '$ROOT' && bash '$GPU1_CMD'"
tmux split-window -v -t "$SESSION:0" "cd '$ROOT' && bash '$CPU_CMD'"
tmux set-option -t "$SESSION:0" remain-on-exit on >/dev/null
tmux select-pane -t "$SESSION:0.0" -T gpu0 >/dev/null
tmux select-pane -t "$SESSION:0.1" -T gpu1 >/dev/null
tmux select-pane -t "$SESSION:0.2" -T cpu >/dev/null
tmux select-layout -t "$SESSION" tiled >/dev/null
echo "Started tmux session: $SESSION"
echo "Inspect panes with: bash '$STATUS_CMD' '$SESSION'"
echo "Attach manually only if needed: tmux attach -t $SESSION"
exit 0
fi
echo "tmux not found. Run these in three shells:"
echo
echo "bash '$GPU0_CMD'"
echo "bash '$GPU1_CMD'"
echo "bash '$CPU_CMD'"

scripts/p40_triple_cpu.sh
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
MODEL_CPU="${MODEL_CPU:-/home/netuser/bin/models/llm/rocket-3b.Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
exec "$LLAMA_SERVER_BIN" -m "$MODEL_CPU" --host 127.0.0.1 --port 18093 -ngl 0 -t 12

scripts/p40_triple_gpu0.sh
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
MODEL_GPU0="${MODEL_GPU0:-/home/netuser/bin/models/llm/Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
exec env CUDA_VISIBLE_DEVICES=0 "$LLAMA_SERVER_BIN" -m "$MODEL_GPU0" --host 127.0.0.1 --port 18091

scripts/p40_triple_gpu1.sh
View File

@ -0,0 +1,32 @@
#!/usr/bin/env bash
set -euo pipefail
MODEL_GPU1="${MODEL_GPU1:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
HOST="${GPU1_HOST:-127.0.0.1}"
PORT="${GPU1_PORT:-18092}"
CTX_SIZE="${GPU1_CTX_SIZE:-4096}"
NGL="${GPU1_NGL:-999}"
GPU_INDEX="${GPU1_INDEX:-1}"
USE_CONTAINER="${GPU1_USE_CONTAINER:-0}"
CONTAINER_IMAGE="${GPU1_CONTAINER_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"
if [[ "${USE_CONTAINER}" == "1" ]]; then
exec docker run --rm --gpus all \
--network host \
-e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
-v "$(dirname "${MODEL_GPU1}"):/models:ro" \
"${CONTAINER_IMAGE}" \
-m "/models/$(basename "${MODEL_GPU1}")" \
-ngl "${NGL}" \
--ctx-size "${CTX_SIZE}" \
--host "${HOST}" \
--port "${PORT}"
fi
exec env CUDA_VISIBLE_DEVICES="${GPU_INDEX}" "$LLAMA_SERVER_BIN" \
-m "$MODEL_GPU1" \
-ngl "${NGL}" \
--ctx-size "${CTX_SIZE}" \
--host "${HOST}" \
--port "${PORT}"

11
scripts/run_control.sh Normal file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
export GENIEHIVE_CONTROL_CONFIG="$ROOT/configs/control.example.yaml"
export GENIEHIVE_ROLES_CONFIG="$ROOT/configs/roles.example.yaml"
export PYTHONPATH="$ROOT/src"
exec python -m uvicorn geniehive_control.main:app --host 127.0.0.1 --port 8800

scripts/run_control_p40_lan.sh
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
export GENIEHIVE_BIND_HOST="${GENIEHIVE_BIND_HOST:-192.168.40.207}"
export GENIEHIVE_BIND_PORT="${GENIEHIVE_BIND_PORT:-8800}"
exec bash "$ROOT/scripts/run_control_singlebox.sh" "$ROOT/configs/control.singlebox.p40.example.yaml"

scripts/run_control_p40_zerotier.sh
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
export GENIEHIVE_BIND_HOST="${GENIEHIVE_BIND_HOST:-172.24.50.65}"
export GENIEHIVE_BIND_PORT="${GENIEHIVE_BIND_PORT:-8800}"
exec bash "$ROOT/scripts/run_control_singlebox.sh" "$ROOT/configs/control.singlebox.p40.example.yaml"

scripts/run_control_singlebox.sh
View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
CONTROL_CONFIG="${1:-$ROOT/configs/control.singlebox.example.yaml}"
export GENIEHIVE_CONTROL_CONFIG="$CONTROL_CONFIG"
if [[ -z "${GENIEHIVE_ROLES_CONFIG:-}" ]]; then
export GENIEHIVE_ROLES_CONFIG="$ROOT/configs/roles.example.yaml"
fi
export PYTHONPATH="$ROOT/src"
HOST="${GENIEHIVE_BIND_HOST:-127.0.0.1}"
PORT="${GENIEHIVE_BIND_PORT:-8800}"
exec python -m uvicorn geniehive_control.main:app --host "$HOST" --port "$PORT"

10
scripts/run_node.sh Normal file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
export GENIEHIVE_NODE_CONFIG="$ROOT/configs/node.example.yaml"
export PYTHONPATH="$ROOT/src"
exec python -m uvicorn geniehive_node.main:app --host 127.0.0.1 --port 8891

scripts/run_node_singlebox.sh
View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
NODE_CONFIG="${1:-$ROOT/configs/node.singlebox.ollama.example.yaml}"
export GENIEHIVE_NODE_CONFIG="$NODE_CONFIG"
export PYTHONPATH="$ROOT/src"
HOST="${GENIEHIVE_NODE_BIND_HOST:-127.0.0.1}"
PORT="${GENIEHIVE_NODE_BIND_PORT:-8891}"
exec python -m uvicorn geniehive_node.main:app --host "$HOST" --port "$PORT"

scripts/start_p40_triple_llamacpp.sh
View File

@ -0,0 +1,35 @@
#!/usr/bin/env bash
set -euo pipefail
# Example launcher pattern for:
# - GPU0 chat model on :18091
# - GPU1 chat model on :18092
# - CPU fallback chat model on :18093
#
# Defaults are based on models already present under /home/netuser/bin/models/llm.
# Override them via env vars if you want different weights.
MODEL_GPU0="${MODEL_GPU0:-/home/netuser/bin/models/llm/Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf}"
MODEL_GPU1="${MODEL_GPU1:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
MODEL_CPU="${MODEL_CPU:-/home/netuser/bin/models/llm/rocket-3b.Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
echo "Start these in separate shells or tmux panes."
echo "Helper scripts are available too:"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_gpu0.sh"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_gpu1.sh"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_cpu.sh"
echo
echo "Or try the combined launcher:"
echo "bash /home/netuser/bin/geniehive/scripts/launch_p40_triple.sh"
echo
echo "Equivalent raw commands:"
echo
echo "CUDA_VISIBLE_DEVICES=0 \"$LLAMA_SERVER_BIN\" -m \"$MODEL_GPU0\" --host 127.0.0.1 --port 18091"
echo
echo "CUDA_VISIBLE_DEVICES=1 \"$LLAMA_SERVER_BIN\" -m \"$MODEL_GPU1\" --host 127.0.0.1 --port 18092"
echo
echo "\"$LLAMA_SERVER_BIN\" -m \"$MODEL_CPU\" --host 127.0.0.1 --port 18093 -ngl 0 -t 12"

scripts/test_qwen35_server_cuda_container.sh
View File

@ -0,0 +1,34 @@
#!/usr/bin/env bash
set -euo pipefail
IMAGE="${IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"
MODEL_PATH="${MODEL_PATH:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
GPU_INDEX="${GPU_INDEX:-0}"
CTX_SIZE="${CTX_SIZE:-512}"
PORT="${PORT:-19091}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-90}"
if [[ ! -f "${MODEL_PATH}" ]]; then
echo "Model not found: ${MODEL_PATH}" >&2
exit 1
fi
echo "Image: ${IMAGE}"
echo "Model: ${MODEL_PATH}"
echo "GPU: ${GPU_INDEX}"
echo "Port: ${PORT}"
echo "Timeout: ${TIMEOUT_SECONDS}s"
echo
echo "This probe is successful if llama-server loads the model and begins serving."
echo "A timeout exit after successful startup is acceptable for this test."
echo
timeout "${TIMEOUT_SECONDS}"s docker run --rm --gpus all \
-e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
-v "$(dirname "${MODEL_PATH}"):/models:ro" \
"${IMAGE}" \
-m "/models/$(basename "${MODEL_PATH}")" \
-ngl 999 \
--ctx-size "${CTX_SIZE}" \
--host 127.0.0.1 \
--port "${PORT}"

38
scripts/tmux_session_status.sh Executable file
View File

@ -0,0 +1,38 @@
#!/usr/bin/env bash
set -euo pipefail
SESSION="${1:-${GENIEHIVE_TMUX_SESSION:-geniehive-p40}}"
if ! command -v tmux >/dev/null 2>&1; then
echo "tmux not found"
exit 127
fi
if ! tmux has-session -t "$SESSION" 2>/dev/null; then
echo "tmux session not found: $SESSION"
exit 1
fi
printf 'tmux session: %s\n' "$SESSION"
printf '%-6s %-8s %-10s %-8s %s\n' "pane" "title" "state" "status" "command"
live_count=0
while IFS=$'\t' read -r pane_id pane_title pane_pid pane_dead pane_dead_status pane_start_command; do
state="exited"
status="$pane_dead_status"
if [[ "$pane_dead" == "0" ]] && kill -0 "$pane_pid" 2>/dev/null; then
state="running"
status="-"
live_count=$((live_count + 1))
fi
printf '%-6s %-8s %-10s %-8s %s\n' "$pane_id" "${pane_title:--}" "$state" "${status:--}" "$pane_start_command"
done < <(
tmux list-panes -t "$SESSION" -F "#{pane_index}\t#{pane_title}\t#{pane_pid}\t#{pane_dead}\t#{pane_dead_status}\t#{pane_start_command}"
)
if [[ "$live_count" -eq 0 ]]; then
echo
echo "No pane processes are still running."
exit 2
fi

View File

@ -0,0 +1,2 @@
"""GenieHive control-plane package."""

View File

@ -0,0 +1,25 @@
from __future__ import annotations
from fastapi import HTTPException, Request, status
def _check_key(request: Request, allowed_keys: list[str], header_name: str) -> None:
if not allowed_keys:
return
provided = request.headers.get(header_name)
if provided in allowed_keys:
return
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="unauthorized",
)
def require_client_auth(request: Request) -> None:
cfg = request.app.state.cfg
_check_key(request, cfg.auth.client_api_keys, "X-Api-Key")
def require_node_auth(request: Request) -> None:
cfg = request.app.state.cfg
_check_key(request, cfg.auth.node_api_keys, "X-GenieHive-Node-Key")
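For reference, a client call against these checks could look like the sketch below; the base URL and "client-key" are placeholder values assumed to match auth.client_api_keys in the control config.
# Example only: base URL and key are assumed values matching the control config.
import httpx

resp = httpx.get(
    "http://127.0.0.1:8800/v1/models",
    headers={"X-Api-Key": "client-key"},  # node endpoints expect X-GenieHive-Node-Key instead
    timeout=10.0,
)
resp.raise_for_status()
print(resp.json())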

View File

@ -0,0 +1,74 @@
from __future__ import annotations
from typing import Any
from .registry import Registry
from .routing import choose_upstream_model_id
from .upstream import UpstreamClient
class ProxyError(RuntimeError):
def __init__(self, message: str, *, status_code: int) -> None:
super().__init__(message)
self.status_code = status_code
def _strip_reasoning_fields(payload: Any) -> Any:
if isinstance(payload, list):
return [_strip_reasoning_fields(item) for item in payload]
if not isinstance(payload, dict):
return payload
cleaned: dict[str, Any] = {}
for key, value in payload.items():
if key in {"reasoning_content", "reasoning"}:
continue
cleaned[key] = _strip_reasoning_fields(value)
return cleaned
async def proxy_chat_completion(
body: dict[str, Any],
*,
registry: Registry,
upstream: UpstreamClient,
) -> Any:
requested_model = body.get("model")
if not requested_model:
raise ProxyError("Missing 'model' in request body.", status_code=400)
resolved = registry.resolve_route(requested_model, kind="chat")
if resolved is None:
raise ProxyError(f"Unknown model or role '{requested_model}'.", status_code=404)
service = resolved.get("service")
if service is None:
raise ProxyError(f"No healthy chat target available for '{requested_model}'.", status_code=503)
upstream_body = dict(body)
upstream_body["model"] = choose_upstream_model_id(requested_model, service)
response = await upstream.chat_completions(service["endpoint"], upstream_body)
return _strip_reasoning_fields(response)
async def proxy_embeddings(
body: dict[str, Any],
*,
registry: Registry,
upstream: UpstreamClient,
) -> Any:
requested_model = body.get("model")
if not requested_model:
raise ProxyError("Missing 'model' in request body.", status_code=400)
resolved = registry.resolve_route(requested_model, kind="embeddings")
if resolved is None:
raise ProxyError(f"Unknown model or role '{requested_model}'.", status_code=404)
service = resolved.get("service")
if service is None:
raise ProxyError(f"No healthy embeddings target available for '{requested_model}'.", status_code=503)
upstream_body = dict(body)
upstream_body["model"] = choose_upstream_model_id(requested_model, service)
return await upstream.embeddings(service["endpoint"], upstream_body)
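As a quick illustration of the reasoning scrub: _strip_reasoning_fields drops reasoning and reasoning_content keys at any depth before the response is returned to the client. A minimal sketch:
# Minimal sketch exercising the internal helper above.
from geniehive_control.chat import _strip_reasoning_fields

raw = {
    "choices": [
        {
            "message": {"role": "assistant", "content": "hi", "reasoning_content": "hidden"},
            "reasoning": {"tokens": 12},
        }
    ]
}
assert _strip_reasoning_fields(raw) == {
    "choices": [{"message": {"role": "assistant", "content": "hi"}}]
}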

View File

@ -0,0 +1,40 @@
from __future__ import annotations
from pathlib import Path
import yaml
from pydantic import BaseModel, Field
class ServerConfig(BaseModel):
host: str = "127.0.0.1"
port: int = 8800
class AuthConfig(BaseModel):
client_api_keys: list[str] = Field(default_factory=list)
node_api_keys: list[str] = Field(default_factory=list)
class StorageConfig(BaseModel):
sqlite_path: str = "state/geniehive.sqlite3"
class RoutingConfig(BaseModel):
default_strategy: str = "loaded_first"
health_stale_after_s: float = 30.0
class ControlConfig(BaseModel):
server: ServerConfig = Field(default_factory=ServerConfig)
auth: AuthConfig = Field(default_factory=AuthConfig)
storage: StorageConfig = Field(default_factory=StorageConfig)
routing: RoutingConfig = Field(default_factory=RoutingConfig)
roles_path: str | None = None
def load_config(path: str | Path) -> ControlConfig:
raw = yaml.safe_load(Path(path).read_text()) or {}
if not isinstance(raw, dict):
raise ValueError("Control config must be a YAML mapping.")
return ControlConfig.model_validate(raw)
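A minimal sketch of loading this config from YAML; the file name and key values are examples only, and omitted sections keep their defaults.
# Sketch: write a small control config and load it; paths and keys are examples.
from pathlib import Path
from geniehive_control.config import load_config

Path("control.example.yaml").write_text(
    "server:\n"
    "  host: 127.0.0.1\n"
    "  port: 8800\n"
    "auth:\n"
    "  client_api_keys: ['client-key']\n"
    "  node_api_keys: ['node-key']\n"
)
cfg = load_config("control.example.yaml")
assert cfg.server.port == 8800
assert cfg.auth.client_api_keys == ["client-key"]
assert cfg.routing.default_strategy == "loaded_first"  # defaults still apply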

View File

@ -0,0 +1,127 @@
from __future__ import annotations
import os
from pathlib import Path
from fastapi import Depends, FastAPI, Request
from fastapi.responses import JSONResponse
from .auth import require_client_auth, require_node_auth
from .chat import ProxyError, proxy_chat_completion, proxy_embeddings
from .config import ControlConfig, load_config
from .models import HostHeartbeat, HostRegistration
from .roles import load_role_catalog
from .registry import Registry
from .upstream import UpstreamClient, UpstreamError
def create_app(
config_path: str | Path | None = None,
*,
upstream_client: UpstreamClient | None = None,
) -> FastAPI:
cfg_path = config_path or os.environ.get("GENIEHIVE_CONTROL_CONFIG")
cfg = load_config(cfg_path) if cfg_path else ControlConfig()
registry = Registry(cfg.storage.sqlite_path)
roles_path = cfg.roles_path or os.environ.get("GENIEHIVE_ROLES_CONFIG")
if roles_path:
registry.upsert_roles(load_role_catalog(roles_path).roles)
upstream = upstream_client or UpstreamClient()
app = FastAPI(title="GenieHive Control", version="0.1.0")
app.state.cfg = cfg
app.state.registry = registry
app.state.upstream = upstream
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.post("/v1/nodes/register")
async def register_node(request: Request, _=Depends(require_node_auth)) -> dict:
payload = await request.json()
reg = HostRegistration.model_validate(payload)
host = request.app.state.registry.register_host(reg)
return {"status": "ok", "host": host}
@app.post("/v1/nodes/heartbeat")
async def heartbeat_node(request: Request, _=Depends(require_node_auth)):
payload = await request.json()
hb = HostHeartbeat.model_validate(payload)
host = request.app.state.registry.heartbeat_host(hb)
if host is None:
return JSONResponse(status_code=404, content={"error": "unknown_host", "host_id": hb.host_id})
return {"status": "ok", "host": host}
@app.get("/v1/cluster/hosts")
async def list_hosts(request: Request, _=Depends(require_client_auth)) -> dict:
return {"object": "list", "data": request.app.state.registry.list_hosts()}
@app.get("/v1/models")
async def list_models(request: Request, _=Depends(require_client_auth)) -> dict:
return {"object": "list", "data": request.app.state.registry.list_client_models()}
@app.post("/v1/chat/completions")
async def chat_completions(request: Request, _=Depends(require_client_auth)):
body = await request.json()
try:
return await proxy_chat_completion(
body,
registry=request.app.state.registry,
upstream=request.app.state.upstream,
)
except ProxyError as exc:
return JSONResponse(
status_code=exc.status_code,
content={"error": {"message": str(exc), "type": "geniehive_error", "code": "chat_proxy_error"}},
)
except UpstreamError as exc:
return JSONResponse(
status_code=exc.status_code or 502,
content={"error": {"message": str(exc), "type": "geniehive_error", "code": "upstream_error"}},
)
@app.post("/v1/embeddings")
async def embeddings(request: Request, _=Depends(require_client_auth)):
body = await request.json()
try:
return await proxy_embeddings(
body,
registry=request.app.state.registry,
upstream=request.app.state.upstream,
)
except ProxyError as exc:
return JSONResponse(
status_code=exc.status_code,
content={"error": {"message": str(exc), "type": "geniehive_error", "code": "embeddings_proxy_error"}},
)
except UpstreamError as exc:
return JSONResponse(
status_code=exc.status_code or 502,
content={"error": {"message": str(exc), "type": "geniehive_error", "code": "upstream_error"}},
)
@app.get("/v1/cluster/services")
async def list_services(request: Request, _=Depends(require_client_auth)) -> dict:
return {"object": "list", "data": request.app.state.registry.list_services()}
@app.get("/v1/cluster/roles")
async def list_roles(request: Request, _=Depends(require_client_auth)) -> dict:
return {"object": "list", "data": request.app.state.registry.list_roles()}
@app.get("/v1/cluster/health")
async def cluster_health(request: Request, _=Depends(require_client_auth)) -> dict:
cfg: ControlConfig = request.app.state.cfg
return request.app.state.registry.cluster_health(cfg.routing.health_stale_after_s)
@app.get("/v1/cluster/routes/resolve")
async def resolve_route(model: str, request: Request, kind: str | None = None, _=Depends(require_client_auth)) -> dict:
resolved = request.app.state.registry.resolve_route(model, kind=kind)
if resolved is None:
return JSONResponse(status_code=404, content={"error": "no_route", "model": model, "kind": kind})
return {"status": "ok", "resolution": resolved}
return app
app = create_app()
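A minimal smoke exercise of the app via FastAPI's TestClient, assuming GENIEHIVE_CONTROL_CONFIG is unset: the default ControlConfig leaves both key lists empty, so the auth dependencies pass every request.
# Sketch: default config means empty key lists, so auth is effectively disabled here.
# The default Registry path state/geniehive.sqlite3 is created under the current directory.
from fastapi.testclient import TestClient
from geniehive_control.main import create_app

client = TestClient(create_app())
assert client.get("/health").json() == {"status": "ok"}
models = client.get("/v1/models").json()
assert models["object"] == "list"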

View File

@ -0,0 +1,90 @@
from __future__ import annotations
from typing import Any, Literal
from pydantic import BaseModel, Field
class ServiceAsset(BaseModel):
asset_id: str
loaded: bool = False
class ServiceRuntime(BaseModel):
engine: str | None = None
launcher: str | None = None
class ServiceState(BaseModel):
health: str | None = None
load_state: str | None = None
accept_requests: bool = True
class ServiceObserved(BaseModel):
p50_latency_ms: float | None = None
p95_latency_ms: float | None = None
tokens_per_sec: float | None = None
queue_depth: int | None = None
in_flight: int | None = None
class RegisteredService(BaseModel):
service_id: str
host_id: str
kind: Literal["chat", "embeddings", "transcription"]
protocol: str = "openai"
endpoint: str
runtime: ServiceRuntime = Field(default_factory=ServiceRuntime)
assets: list[ServiceAsset] = Field(default_factory=list)
state: ServiceState = Field(default_factory=ServiceState)
observed: ServiceObserved = Field(default_factory=ServiceObserved)
class HostStatus(BaseModel):
state: str = "online"
last_seen: float | None = None
class HostRegistration(BaseModel):
host_id: str
display_name: str | None = None
address: str
labels: dict[str, str] = Field(default_factory=dict)
capabilities: dict[str, Any] = Field(default_factory=dict)
resources: dict[str, Any] = Field(default_factory=dict)
services: list[RegisteredService] = Field(default_factory=list)
class HostHeartbeat(BaseModel):
host_id: str
status: HostStatus = Field(default_factory=HostStatus)
metrics: dict[str, Any] = Field(default_factory=dict)
services: list[RegisteredService] = Field(default_factory=list)
class PromptPolicy(BaseModel):
system_prompt: str | None = None
user_template: str | None = None
class RoutingPolicy(BaseModel):
preferred_families: list[str] = Field(default_factory=list)
preferred_labels: list[str] = Field(default_factory=list)
min_context: int | None = None
require_loaded: bool = False
fallback_roles: list[str] = Field(default_factory=list)
class RoleProfile(BaseModel):
role_id: str
display_name: str | None = None
description: str | None = None
operation: Literal["chat", "embeddings", "transcription"]
modality: str
prompt_policy: PromptPolicy = Field(default_factory=PromptPolicy)
routing_policy: RoutingPolicy = Field(default_factory=RoutingPolicy)
class RoleCatalog(BaseModel):
roles: list[RoleProfile] = Field(default_factory=list)

View File

@ -0,0 +1,464 @@
from __future__ import annotations
import json
import sqlite3
import time
from pathlib import Path
from .models import HostHeartbeat, HostRegistration, RegisteredService, RoleProfile
def _json_dumps(value: object) -> str:
return json.dumps(value, sort_keys=True)
class Registry:
def __init__(self, db_path: str | Path) -> None:
self.db_path = Path(db_path)
self.db_path.parent.mkdir(parents=True, exist_ok=True)
self._init_db()
def _connect(self) -> sqlite3.Connection:
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row
return conn
def _init_db(self) -> None:
with self._connect() as conn:
conn.executescript(
"""
CREATE TABLE IF NOT EXISTS hosts (
host_id TEXT PRIMARY KEY,
display_name TEXT,
address TEXT NOT NULL,
labels_json TEXT NOT NULL,
capabilities_json TEXT NOT NULL,
resources_json TEXT NOT NULL,
status_state TEXT NOT NULL DEFAULT 'online',
last_seen REAL NOT NULL,
metrics_json TEXT NOT NULL DEFAULT '{}'
);
CREATE TABLE IF NOT EXISTS services (
service_id TEXT PRIMARY KEY,
host_id TEXT NOT NULL,
kind TEXT NOT NULL,
protocol TEXT NOT NULL,
endpoint TEXT NOT NULL,
runtime_json TEXT NOT NULL,
assets_json TEXT NOT NULL,
state_json TEXT NOT NULL,
observed_json TEXT NOT NULL,
updated_at REAL NOT NULL,
FOREIGN KEY(host_id) REFERENCES hosts(host_id)
);
CREATE TABLE IF NOT EXISTS roles (
role_id TEXT PRIMARY KEY,
display_name TEXT,
description TEXT,
operation TEXT NOT NULL,
modality TEXT NOT NULL,
prompt_policy_json TEXT NOT NULL,
routing_policy_json TEXT NOT NULL,
updated_at REAL NOT NULL
);
"""
)
def register_host(self, reg: HostRegistration) -> dict:
now = time.time()
with self._connect() as conn:
conn.execute(
"""
INSERT INTO hosts (
host_id, display_name, address, labels_json, capabilities_json,
resources_json, status_state, last_seen, metrics_json
)
VALUES (?, ?, ?, ?, ?, ?, 'online', ?, '{}')
ON CONFLICT(host_id) DO UPDATE SET
display_name=excluded.display_name,
address=excluded.address,
labels_json=excluded.labels_json,
capabilities_json=excluded.capabilities_json,
resources_json=excluded.resources_json,
status_state='online',
last_seen=excluded.last_seen
""",
(
reg.host_id,
reg.display_name,
reg.address,
_json_dumps(reg.labels),
_json_dumps(reg.capabilities),
_json_dumps(reg.resources),
now,
),
)
self._replace_services(conn, reg.host_id, reg.services, now)
return self.get_host(reg.host_id)
def heartbeat_host(self, hb: HostHeartbeat) -> dict | None:
now = time.time()
with self._connect() as conn:
cur = conn.execute(
"SELECT host_id FROM hosts WHERE host_id = ?",
(hb.host_id,),
)
if cur.fetchone() is None:
return None
conn.execute(
"""
UPDATE hosts
SET status_state = ?, last_seen = ?, metrics_json = ?
WHERE host_id = ?
""",
(
hb.status.state,
now,
_json_dumps(hb.metrics),
hb.host_id,
),
)
if hb.services:
self._replace_services(conn, hb.host_id, hb.services, now)
return self.get_host(hb.host_id)
def _replace_services(
self,
conn: sqlite3.Connection,
host_id: str,
services: list[RegisteredService],
now: float,
) -> None:
conn.execute("DELETE FROM services WHERE host_id = ?", (host_id,))
for service in services:
conn.execute(
"""
INSERT INTO services (
service_id, host_id, kind, protocol, endpoint,
runtime_json, assets_json, state_json, observed_json, updated_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
service.service_id,
host_id,
service.kind,
service.protocol,
service.endpoint,
_json_dumps(service.runtime.model_dump()),
_json_dumps([asset.model_dump() for asset in service.assets]),
_json_dumps(service.state.model_dump()),
_json_dumps(service.observed.model_dump()),
now,
),
)
def get_host(self, host_id: str) -> dict | None:
with self._connect() as conn:
row = conn.execute("SELECT * FROM hosts WHERE host_id = ?", (host_id,)).fetchone()
if row is None:
return None
return self._host_row_to_dict(row)
def upsert_roles(self, roles: list[RoleProfile]) -> list[dict]:
now = time.time()
with self._connect() as conn:
for role in roles:
conn.execute(
"""
INSERT INTO roles (
role_id, display_name, description, operation, modality,
prompt_policy_json, routing_policy_json, updated_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(role_id) DO UPDATE SET
display_name=excluded.display_name,
description=excluded.description,
operation=excluded.operation,
modality=excluded.modality,
prompt_policy_json=excluded.prompt_policy_json,
routing_policy_json=excluded.routing_policy_json,
updated_at=excluded.updated_at
""",
(
role.role_id,
role.display_name,
role.description,
role.operation,
role.modality,
_json_dumps(role.prompt_policy.model_dump()),
_json_dumps(role.routing_policy.model_dump()),
now,
),
)
return self.list_roles()
def get_role(self, role_id: str) -> dict | None:
with self._connect() as conn:
row = conn.execute("SELECT * FROM roles WHERE role_id = ?", (role_id,)).fetchone()
if row is None:
return None
return self._role_row_to_dict(row)
def list_roles(self) -> list[dict]:
with self._connect() as conn:
rows = conn.execute("SELECT * FROM roles ORDER BY role_id").fetchall()
return [self._role_row_to_dict(row) for row in rows]
def list_hosts(self) -> list[dict]:
with self._connect() as conn:
rows = conn.execute("SELECT * FROM hosts ORDER BY host_id").fetchall()
return [self._host_row_to_dict(row) for row in rows]
def list_services(self) -> list[dict]:
with self._connect() as conn:
rows = conn.execute("SELECT * FROM services ORDER BY host_id, service_id").fetchall()
return [self._service_row_to_dict(row) for row in rows]
def list_client_models(self) -> list[dict]:
services = self.list_services()
roles = self.list_roles()
items: list[dict] = []
for service in services:
if not service["state"].get("accept_requests", True):
continue
if service["state"].get("health") != "healthy":
continue
item = {
"id": service["service_id"],
"object": "model",
"owned_by": service["host_id"],
"geniehive": self._service_metadata(service),
}
items.append(item)
for asset in service["assets"]:
asset_id = asset.get("asset_id")
if not asset_id:
continue
items.append(
{
"id": asset_id,
"object": "model",
"owned_by": service["host_id"],
"geniehive": self._service_metadata(service) | {"route_type": "asset", "asset_id": asset_id},
}
)
for role in roles:
matching_services = [
service
for service in services
if service["kind"] == role["operation"]
and service["state"].get("accept_requests", True)
and service["state"].get("health") == "healthy"
]
loaded_count = sum(1 for service in matching_services if any(asset.get("loaded") for asset in service["assets"]))
latencies = [
service["observed"].get("p50_latency_ms")
for service in matching_services
if service["observed"].get("p50_latency_ms") is not None
]
best_latency_ms = min(latencies) if latencies else None
items.append(
{
"id": role["role_id"],
"object": "model",
"owned_by": "geniehive-role",
"geniehive": {
"route_type": "role",
"role_id": role["role_id"],
"display_name": role["display_name"],
"operation": role["operation"],
"modality": role["modality"],
"healthy_target_count": len(matching_services),
"loaded_target_count": loaded_count,
"best_p50_latency_ms": best_latency_ms,
"offload_hint": self._offload_hint(
operation=role["operation"],
loaded_count=loaded_count,
best_latency_ms=best_latency_ms,
),
"routing_policy": role["routing_policy"],
},
}
)
deduped: dict[str, dict] = {}
for item in items:
deduped[item["id"]] = item
return [deduped[key] for key in sorted(deduped)]
def resolve_route(self, requested_model: str, *, kind: str | None = None) -> dict | None:
direct = self._resolve_direct(requested_model, kind=kind)
if direct is not None:
return {"match_type": "direct", **direct}
role = self.get_role(requested_model)
if role is None:
return None
matched_kind = kind or role["operation"]
candidates = [
service
for service in self.list_services()
if service["kind"] == matched_kind
and service["state"].get("accept_requests", True)
and service["state"].get("health") == "healthy"
]
if not candidates:
return {"match_type": "role", "role": role, "service": None}
preferred_families = [family.lower() for family in role["routing_policy"].get("preferred_families", [])]
def score(service: dict) -> tuple[int, int, float, str]:
loaded = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
family_match = 0
if preferred_families:
asset_names = " ".join(asset.get("asset_id", "") for asset in service["assets"]).lower()
family_match = 1 if any(family in asset_names for family in preferred_families) else 0
latency = service["observed"].get("p50_latency_ms")
latency_score = float(latency) if latency is not None else float("inf")
return (family_match, loaded, -latency_score, service["service_id"])
if role["routing_policy"].get("require_loaded"):
loaded_candidates = [service for service in candidates if any(asset.get("loaded") for asset in service["assets"])]
if loaded_candidates:
candidates = loaded_candidates
service = max(candidates, key=score)
return {"match_type": "role", "role": role, "service": service}
def _resolve_direct(self, requested_model: str, *, kind: str | None = None) -> dict | None:
candidates = []
for service in self.list_services():
if kind is not None and service["kind"] != kind:
continue
if not service["state"].get("accept_requests", True):
continue
if service["state"].get("health") != "healthy":
continue
asset_ids = {asset.get("asset_id") for asset in service["assets"]}
if service["service_id"] == requested_model or requested_model in asset_ids:
candidates.append(service)
if not candidates:
return None
def score(service: dict) -> tuple[int, float, str]:
loaded = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
latency = service["observed"].get("p50_latency_ms")
latency_score = float(latency) if latency is not None else float("inf")
return (loaded, -latency_score, service["service_id"])
service = max(candidates, key=score)
return {"service": service}
def cluster_health(self, stale_after_s: float) -> dict:
hosts = self.list_hosts()
services = self.list_services()
now = time.time()
online = 0
stale = 0
for host in hosts:
is_stale = (now - host["status"]["last_seen"]) > stale_after_s
if is_stale:
stale += 1
elif host["status"]["state"] == "online":
online += 1
healthy_services = sum(1 for service in services if service["state"].get("health") == "healthy")
return {
"status": "ok",
"host_count": len(hosts),
"online_host_count": online,
"stale_host_count": stale,
"service_count": len(services),
"healthy_service_count": healthy_services,
}
@staticmethod
def _offload_hint(*, operation: str, loaded_count: int, best_latency_ms: float | None) -> dict:
if loaded_count <= 0:
suitability = "cold_only"
elif best_latency_ms is not None and best_latency_ms <= 1500:
suitability = "good_for_low_complexity"
elif best_latency_ms is not None and best_latency_ms <= 4000:
suitability = "usable_for_background_tasks"
else:
suitability = "available_but_slow"
return {
"operation": operation,
"suitability": suitability,
"recommended_for": "lower-complexity offload" if operation == "chat" else f"{operation} offload",
"inference_basis": {
"loaded_target_count": loaded_count,
"best_p50_latency_ms": best_latency_ms,
},
}
def _service_metadata(self, service: dict) -> dict:
lat = service["observed"].get("p50_latency_ms")
loaded_count = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
return {
"route_type": "service",
"service_id": service["service_id"],
"host_id": service["host_id"],
"operation": service["kind"],
"protocol": service["protocol"],
"endpoint": service["endpoint"],
"health": service["state"].get("health"),
"loaded_asset_count": loaded_count,
"assets": service["assets"],
"runtime": service["runtime"],
"observed": service["observed"],
"offload_hint": self._offload_hint(
operation=service["kind"],
loaded_count=loaded_count,
best_latency_ms=lat,
),
}
@staticmethod
def _host_row_to_dict(row: sqlite3.Row) -> dict:
return {
"host_id": row["host_id"],
"display_name": row["display_name"],
"address": row["address"],
"labels": json.loads(row["labels_json"]),
"capabilities": json.loads(row["capabilities_json"]),
"resources": json.loads(row["resources_json"]),
"status": {
"state": row["status_state"],
"last_seen": row["last_seen"],
},
"metrics": json.loads(row["metrics_json"]),
}
@staticmethod
def _service_row_to_dict(row: sqlite3.Row) -> dict:
return {
"service_id": row["service_id"],
"host_id": row["host_id"],
"kind": row["kind"],
"protocol": row["protocol"],
"endpoint": row["endpoint"],
"runtime": json.loads(row["runtime_json"]),
"assets": json.loads(row["assets_json"]),
"state": json.loads(row["state_json"]),
"observed": json.loads(row["observed_json"]),
"updated_at": row["updated_at"],
}
@staticmethod
def _role_row_to_dict(row: sqlite3.Row) -> dict:
return {
"role_id": row["role_id"],
"display_name": row["display_name"],
"description": row["description"],
"operation": row["operation"],
"modality": row["modality"],
"prompt_policy": json.loads(row["prompt_policy_json"]),
"routing_policy": json.loads(row["routing_policy_json"]),
"updated_at": row["updated_at"],
}
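A compact sketch of the role-routing preference implemented above: given two healthy chat services, resolve_route picks the one whose asset is loaded and whose observed latency is lower. Ids and the sqlite path are example values.
# Sketch: the loaded, lower-latency service wins the role route.
from geniehive_control.models import HostRegistration, RegisteredService, RoleProfile
from geniehive_control.registry import Registry

registry = Registry("state/demo.sqlite3")  # example path; creates state/ if missing
registry.register_host(HostRegistration(
    host_id="atlas-01",
    address="192.168.1.101",
    services=[
        RegisteredService(
            service_id="atlas-01/chat/loaded", host_id="atlas-01", kind="chat",
            endpoint="http://192.168.1.101:18091",
            assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
            state={"health": "healthy"}, observed={"p50_latency_ms": 900},
        ),
        RegisteredService(
            service_id="atlas-01/chat/cold", host_id="atlas-01", kind="chat",
            endpoint="http://192.168.1.101:18092",
            assets=[{"asset_id": "qwen3-8b-q4km", "loaded": False}],
            state={"health": "healthy"}, observed={},
        ),
    ],
))
registry.upsert_roles([RoleProfile(role_id="mentor", operation="chat", modality="text")])
resolved = registry.resolve_route("mentor")
assert resolved["match_type"] == "role"
assert resolved["service"]["service_id"] == "atlas-01/chat/loaded"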

View File

@ -0,0 +1,14 @@
from __future__ import annotations
from pathlib import Path
import yaml
from .models import RoleCatalog
def load_role_catalog(path: str | Path) -> RoleCatalog:
raw = yaml.safe_load(Path(path).read_text()) or {}
if not isinstance(raw, dict):
raise ValueError("Role catalog must be a YAML mapping.")
return RoleCatalog.model_validate(raw)

View File

@ -0,0 +1,17 @@
from __future__ import annotations
from typing import Any
def choose_upstream_model_id(requested_model: str, service: dict[str, Any]) -> str:
assets = service.get("assets", [])
asset_ids = [asset.get("asset_id") for asset in assets if asset.get("asset_id")]
if requested_model in asset_ids:
return requested_model
loaded_assets = [asset.get("asset_id") for asset in assets if asset.get("loaded") and asset.get("asset_id")]
if loaded_assets:
return loaded_assets[0]
if asset_ids:
return asset_ids[0]
return requested_model
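Illustrating the selection order above: an exact asset match wins, then the first loaded asset, then the first known asset, otherwise the requested id passes through unchanged.
# Sketch of choose_upstream_model_id with example asset ids.
from geniehive_control.routing import choose_upstream_model_id

service = {
    "assets": [
        {"asset_id": "qwen3-8b-q4km", "loaded": True},
        {"asset_id": "qwen3-14b-q4km", "loaded": False},
    ]
}
assert choose_upstream_model_id("qwen3-14b-q4km", service) == "qwen3-14b-q4km"  # exact match
assert choose_upstream_model_id("mentor", service) == "qwen3-8b-q4km"  # first loaded asset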

View File

@ -0,0 +1,68 @@
from __future__ import annotations
from typing import Any, Protocol
import httpx
class UpstreamError(RuntimeError):
def __init__(self, message: str, *, status_code: int | None = None) -> None:
super().__init__(message)
self.status_code = status_code
class AsyncPoster(Protocol):
async def post(self, url: str, *, json: dict[str, Any], headers: dict[str, str] | None = None) -> object:
...
class UpstreamClient:
def __init__(self, client: AsyncPoster | None = None) -> None:
self._owns_client = client is None
self._client = client or httpx.AsyncClient(
timeout=httpx.Timeout(connect=10.0, read=600.0, write=60.0, pool=60.0)
)
async def chat_completions(
self,
base_url: str,
body: dict[str, Any],
*,
headers: dict[str, str] | None = None,
) -> Any:
url = base_url.rstrip("/") + "/v1/chat/completions"
response = await self._client.post(url, json=body, headers=headers)
status_code = getattr(response, "status_code", 200)
if status_code >= 400:
text = getattr(response, "text", "")
raise UpstreamError(
text or f"upstream error from {url}",
status_code=status_code,
)
if hasattr(response, "json"):
return response.json()
return response
async def embeddings(
self,
base_url: str,
body: dict[str, Any],
*,
headers: dict[str, str] | None = None,
) -> Any:
url = base_url.rstrip("/") + "/v1/embeddings"
response = await self._client.post(url, json=body, headers=headers)
status_code = getattr(response, "status_code", 200)
if status_code >= 400:
text = getattr(response, "text", "")
raise UpstreamError(
text or f"upstream error from {url}",
status_code=status_code,
)
if hasattr(response, "json"):
return response.json()
return response
async def aclose(self) -> None:
if self._owns_client and isinstance(self._client, httpx.AsyncClient):
await self._client.aclose()

View File

@ -0,0 +1,2 @@
"""GenieHive node-agent package."""

View File

@ -0,0 +1,68 @@
from __future__ import annotations
from pathlib import Path
from typing import Literal
import yaml
from pydantic import BaseModel, Field
ServiceKind = Literal["chat", "embeddings", "transcription"]
class NodeConfigBlock(BaseModel):
host_id: str = "node-1"
display_name: str | None = None
listen_host: str = "127.0.0.1"
listen_port: int = 8891
address: str | None = None
labels: dict[str, str] = Field(default_factory=dict)
class ControlPlaneConfig(BaseModel):
base_url: str | None = None
node_api_key: str | None = None
heartbeat_interval_s: float = 5.0
class InventoryConfig(BaseModel):
model_roots: list[str] = Field(default_factory=list)
cpu_threads: int | None = None
ram_gb: float | None = None
capabilities: dict[str, bool] = Field(default_factory=dict)
class ManagedRuntimesConfig(BaseModel):
enabled: bool = False
llama_server_bin: str | None = None
class NodeServiceAssetConfig(BaseModel):
asset_id: str
loaded: bool = False
class NodeServiceConfig(BaseModel):
service_id: str
kind: ServiceKind
protocol: str = "openai"
endpoint: str | None = None
runtime: dict[str, str] = Field(default_factory=dict)
assets: list[NodeServiceAssetConfig] = Field(default_factory=list)
state: dict[str, object] = Field(default_factory=dict)
observed: dict[str, object] = Field(default_factory=dict)
class NodeConfig(BaseModel):
node: NodeConfigBlock = Field(default_factory=NodeConfigBlock)
control_plane: ControlPlaneConfig = Field(default_factory=ControlPlaneConfig)
inventory: InventoryConfig = Field(default_factory=InventoryConfig)
managed_runtimes: ManagedRuntimesConfig = Field(default_factory=ManagedRuntimesConfig)
services: list[NodeServiceConfig] = Field(default_factory=list)
def load_config(path: str | Path) -> NodeConfig:
raw = yaml.safe_load(Path(path).read_text()) or {}
if not isinstance(raw, dict):
raise ValueError("Node config must be a YAML mapping.")
return NodeConfig.model_validate(raw)

View File

@ -0,0 +1,85 @@
from __future__ import annotations
from pathlib import Path
import time
from .config import NodeConfig
from .models import NodeInventory
def discover_model_files(roots: list[str]) -> list[dict[str, object]]:
discovered: list[dict[str, object]] = []
for root in roots:
path = Path(root)
if not path.exists():
continue
for model_path in sorted(path.rglob("*.gguf")):
discovered.append(
{
"path": str(model_path),
"name": model_path.name,
"size_bytes": model_path.stat().st_size,
}
)
return discovered
def build_inventory(cfg: NodeConfig) -> NodeInventory:
address = cfg.node.address or cfg.node.listen_host
resources: dict[str, object] = {}
if cfg.inventory.cpu_threads is not None:
resources["cpu_threads"] = cfg.inventory.cpu_threads
if cfg.inventory.ram_gb is not None:
resources["ram_gb"] = cfg.inventory.ram_gb
resources["discovered_models"] = discover_model_files(cfg.inventory.model_roots)
services: list[dict] = []
for service in cfg.services:
endpoint = service.endpoint or f"http://{cfg.node.listen_host}:{cfg.node.listen_port}"
services.append(
{
"service_id": service.service_id,
"host_id": cfg.node.host_id,
"kind": service.kind,
"protocol": service.protocol,
"endpoint": endpoint,
"runtime": service.runtime,
"assets": [asset.model_dump() for asset in service.assets],
"state": service.state,
"observed": service.observed,
}
)
return NodeInventory(
host_id=cfg.node.host_id,
display_name=cfg.node.display_name,
address=address,
labels=cfg.node.labels,
capabilities=cfg.inventory.capabilities,
resources=resources,
services=services,
)
def build_registration_payload(cfg: NodeConfig) -> dict:
inventory = build_inventory(cfg)
return inventory.model_dump()
def build_heartbeat_payload(cfg: NodeConfig) -> dict:
inventory = build_inventory(cfg)
healthy_service_count = sum(
1 for service in inventory.services if service.get("state", {}).get("health") == "healthy"
)
return {
"host_id": inventory.host_id,
"status": {
"state": "online",
"last_seen": time.time(),
},
"metrics": {
"service_count": len(inventory.services),
"healthy_service_count": healthy_service_count,
"discovered_model_count": len(inventory.resources.get("discovered_models", [])),
},
}
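A minimal sketch of the payload shapes, using an in-memory NodeConfig with defaults instead of a YAML file:
# Sketch: defaults give host_id "node-1", 127.0.0.1:8891, and no services.
from geniehive_node.config import NodeConfig
from geniehive_node.inventory import build_heartbeat_payload, build_registration_payload

cfg = NodeConfig()
registration = build_registration_payload(cfg)
heartbeat = build_heartbeat_payload(cfg)
assert registration["host_id"] == "node-1"
assert registration["address"] == "127.0.0.1"
assert heartbeat["metrics"]["service_count"] == 0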

View File

@ -0,0 +1,62 @@
from __future__ import annotations
import asyncio
from contextlib import asynccontextmanager, suppress
import os
from pathlib import Path
from fastapi import FastAPI
from .config import NodeConfig, load_config
from .inventory import build_inventory, build_registration_payload
from .sync import ControlPlaneClient
def create_app(
config_path: str | Path | None = None,
*,
sync_enabled: bool = True,
control_client: ControlPlaneClient | None = None,
) -> FastAPI:
cfg_path = config_path or os.environ.get("GENIEHIVE_NODE_CONFIG")
cfg = load_config(cfg_path) if cfg_path else NodeConfig()
sync_client = control_client or ControlPlaneClient(cfg)
@asynccontextmanager
async def lifespan(app: FastAPI):
heartbeat_task: asyncio.Task[None] | None = None
stop_event = asyncio.Event()
if sync_enabled and sync_client.enabled:
with suppress(Exception):
await sync_client.register_once()
heartbeat_task = asyncio.create_task(sync_client.heartbeat_loop(stop_event))
try:
yield
finally:
if heartbeat_task is not None:
stop_event.set()
heartbeat_task.cancel()
with suppress(asyncio.CancelledError):
await heartbeat_task
await sync_client.aclose()
app = FastAPI(title="GenieHive Node", version="0.1.0", lifespan=lifespan)
app.state.cfg = cfg
app.state.control_client = sync_client
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.get("/v1/node/inventory")
async def inventory() -> dict:
return build_inventory(cfg).model_dump()
@app.get("/v1/node/registration")
async def registration() -> dict:
return build_registration_payload(cfg)
return app
app = create_app()
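A quick sketch running the node app without control-plane sync, assuming GENIEHIVE_NODE_CONFIG is unset so the defaults apply:
# Sketch: no sync, default config; the lifespan runs inside the TestClient context.
from fastapi.testclient import TestClient
from geniehive_node.main import create_app

with TestClient(create_app(sync_enabled=False)) as client:
    assert client.get("/health").json() == {"status": "ok"}
    inv = client.get("/v1/node/inventory").json()
    assert inv["host_id"] == "node-1"  # default host_id from NodeConfig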

View File

@ -0,0 +1,14 @@
from __future__ import annotations
from pydantic import BaseModel, Field
class NodeInventory(BaseModel):
host_id: str
display_name: str | None = None
address: str
labels: dict[str, str] = Field(default_factory=dict)
capabilities: dict[str, bool] = Field(default_factory=dict)
resources: dict[str, object] = Field(default_factory=dict)
services: list[dict] = Field(default_factory=list)

View File

@ -0,0 +1,84 @@
from __future__ import annotations
import asyncio
from contextlib import suppress
from typing import Protocol
import httpx
from .config import NodeConfig
from .inventory import build_heartbeat_payload, build_registration_payload
class AsyncPoster(Protocol):
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> object:
...
class ControlPlaneClient:
def __init__(self, cfg: NodeConfig, http_client: AsyncPoster | None = None) -> None:
self.cfg = cfg
self._owns_client = http_client is None
self._registered = False
self._http = http_client or httpx.AsyncClient(
timeout=httpx.Timeout(connect=5.0, read=30.0, write=30.0, pool=30.0)
)
@property
def enabled(self) -> bool:
return bool(self.cfg.control_plane.base_url)
def _headers(self) -> dict[str, str]:
headers: dict[str, str] = {}
if self.cfg.control_plane.node_api_key:
headers["X-GenieHive-Node-Key"] = self.cfg.control_plane.node_api_key
return headers
async def register_once(self) -> None:
if not self.enabled:
return
url = str(self.cfg.control_plane.base_url).rstrip("/") + "/v1/nodes/register"
response = await self._http.post(
url,
json=build_registration_payload(self.cfg),
headers=self._headers(),
)
if isinstance(response, httpx.Response):
response.raise_for_status()
self._registered = True
async def heartbeat_once(self) -> None:
if not self.enabled:
return
if not self._registered:
await self.register_once()
url = str(self.cfg.control_plane.base_url).rstrip("/") + "/v1/nodes/heartbeat"
response = await self._http.post(
url,
json=build_heartbeat_payload(self.cfg),
headers=self._headers(),
)
if isinstance(response, httpx.Response):
if response.status_code == 404:
self._registered = False
await self.register_once()
response = await self._http.post(
url,
json=build_heartbeat_payload(self.cfg),
headers=self._headers(),
)
response.raise_for_status()
async def heartbeat_loop(self, stop_event: asyncio.Event) -> None:
interval = max(self.cfg.control_plane.heartbeat_interval_s, 0.1)
while not stop_event.is_set():
with suppress(Exception):
await self.heartbeat_once()
try:
await asyncio.wait_for(stop_event.wait(), timeout=interval)
except asyncio.TimeoutError:
continue
async def aclose(self) -> None:
if self._owns_client and isinstance(self._http, httpx.AsyncClient):
await self._http.aclose()

9
tests/conftest.py Normal file
View File

@ -0,0 +1,9 @@
from pathlib import Path
import sys
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))

224
tests/test_control_chat.py Normal file
View File

@ -0,0 +1,224 @@
import asyncio
from pathlib import Path
from geniehive_control.chat import ProxyError, proxy_chat_completion, proxy_embeddings
from geniehive_control.models import HostRegistration, RegisteredService, RoleProfile
from geniehive_control.registry import Registry
from geniehive_control.upstream import UpstreamClient
class _FakeResponse:
def __init__(self, payload: dict, status_code: int = 200) -> None:
self._payload = payload
self.status_code = status_code
self.text = str(payload)
def json(self) -> dict:
return self._payload
class _FakePoster:
def __init__(self) -> None:
self.calls: list[dict] = []
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> _FakeResponse:
self.calls.append({"url": url, "json": json, "headers": headers or {}})
return _FakeResponse({"ok": True, "echo_model": json["model"]})
def _build_registry(tmp_path: Path) -> Registry:
registry = Registry(tmp_path / "geniehive.sqlite3")
registry.register_host(
HostRegistration(
host_id="atlas-01",
address="192.168.1.101",
services=[
RegisteredService(
service_id="atlas-01/chat/qwen3-8b",
host_id="atlas-01",
kind="chat",
endpoint="http://192.168.1.101:18091",
assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 900},
),
RegisteredService(
service_id="atlas-01/embeddings/bge-small",
host_id="atlas-01",
kind="embeddings",
endpoint="http://192.168.1.101:18092",
assets=[{"asset_id": "bge-small-en", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 120},
)
],
)
)
registry.upsert_roles(
[
RoleProfile(
role_id="mentor",
display_name="Mentor",
operation="chat",
modality="text",
routing_policy={"preferred_families": ["qwen3"]},
),
RoleProfile(
role_id="embedder",
display_name="Embedder",
operation="embeddings",
modality="text",
routing_policy={"require_loaded": True},
)
]
)
return registry
def test_proxy_chat_completion_rewrites_role_to_loaded_asset(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
fake = _FakePoster()
upstream = UpstreamClient(client=fake)
async def run() -> dict:
return await proxy_chat_completion(
{
"model": "mentor",
"messages": [{"role": "user", "content": "hello"}],
},
registry=registry,
upstream=upstream,
)
result = asyncio.run(run())
assert result["ok"] is True
assert result["echo_model"] == "qwen3-8b-q4km"
assert fake.calls[0]["url"] == "http://192.168.1.101:18091/v1/chat/completions"
assert fake.calls[0]["json"]["model"] == "qwen3-8b-q4km"
def test_proxy_chat_completion_preserves_direct_asset_match(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
fake = _FakePoster()
upstream = UpstreamClient(client=fake)
async def run() -> dict:
return await proxy_chat_completion(
{
"model": "qwen3-8b-q4km",
"messages": [{"role": "user", "content": "hello"}],
},
registry=registry,
upstream=upstream,
)
result = asyncio.run(run())
assert result["echo_model"] == "qwen3-8b-q4km"
def test_proxy_chat_completion_strips_reasoning_fields(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
class _ReasoningPoster:
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> _FakeResponse:
return _FakeResponse(
{
"object": "chat.completion",
"model": json["model"],
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "GPU1 route is live.",
"reasoning_content": "hidden chain of thought",
},
"reasoning": {"tokens": 42},
}
],
}
)
upstream = UpstreamClient(client=_ReasoningPoster())
async def run() -> dict:
return await proxy_chat_completion(
{
"model": "mentor",
"messages": [{"role": "user", "content": "hello"}],
},
registry=registry,
upstream=upstream,
)
result = asyncio.run(run())
choice = result["choices"][0]
assert choice["message"]["content"] == "GPU1 route is live."
assert "reasoning_content" not in choice["message"]
assert "reasoning" not in choice
def test_proxy_chat_completion_fails_for_unknown_model(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
upstream = UpstreamClient(client=_FakePoster())
async def run() -> None:
await proxy_chat_completion(
{
"model": "unknown-model",
"messages": [{"role": "user", "content": "hello"}],
},
registry=registry,
upstream=upstream,
)
try:
asyncio.run(run())
except ProxyError as exc:
assert exc.status_code == 404
else:
raise AssertionError("expected ChatProxyError")
def test_proxy_embeddings_rewrites_role_to_loaded_asset(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
fake = _FakePoster()
upstream = UpstreamClient(client=fake)
async def run() -> dict:
return await proxy_embeddings(
{
"model": "embedder",
"input": "hello",
},
registry=registry,
upstream=upstream,
)
result = asyncio.run(run())
assert result["ok"] is True
assert result["echo_model"] == "bge-small-en"
assert fake.calls[0]["url"] == "http://192.168.1.101:18092/v1/embeddings"
assert fake.calls[0]["json"]["model"] == "bge-small-en"
def test_proxy_embeddings_fails_for_unknown_model(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
upstream = UpstreamClient(client=_FakePoster())
async def run() -> None:
await proxy_embeddings(
{
"model": "unknown-embedder",
"input": "hello",
},
registry=registry,
upstream=upstream,
)
try:
asyncio.run(run())
except ProxyError as exc:
assert exc.status_code == 404
else:
raise AssertionError("expected ProxyError")

View File

@ -0,0 +1,152 @@
from pathlib import Path
from geniehive_control.main import create_app
from geniehive_control.models import HostHeartbeat, HostRegistration, RegisteredService, RoleProfile
from geniehive_control.registry import Registry
def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
db_path = tmp_path / "geniehive.sqlite3"
registry = Registry(db_path)
host = registry.register_host(
HostRegistration(
host_id="atlas-01",
display_name="Atlas GPU Box",
address="192.168.1.101",
labels={"site": "home-lab"},
capabilities={"cuda": True},
resources={"cpu_threads": 24},
services=[
RegisteredService(
service_id="atlas-01/chat/qwen3-8b",
host_id="atlas-01",
kind="chat",
protocol="openai",
endpoint="http://192.168.1.101:18091",
runtime={"engine": "llama.cpp", "launcher": "managed"},
assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 900, "tokens_per_sec": 40},
)
],
)
)
assert host is not None
assert host["host_id"] == "atlas-01"
updated = registry.heartbeat_host(
HostHeartbeat(
host_id="atlas-01",
status={"state": "online"},
metrics={"gpu_utilization_pct": 77},
)
)
assert updated is not None
assert updated["metrics"]["gpu_utilization_pct"] == 77
hosts = registry.list_hosts()
services = registry.list_services()
health = registry.cluster_health(stale_after_s=30)
assert len(hosts) == 1
assert len(services) == 1
assert services[0]["service_id"] == "atlas-01/chat/qwen3-8b"
assert services[0]["state"]["health"] == "healthy"
assert health["host_count"] == 1
assert health["healthy_service_count"] == 1
def test_registry_persists_roles_and_resolves_direct_and_role_routes(tmp_path: Path) -> None:
db_path = tmp_path / "geniehive.sqlite3"
registry = Registry(db_path)
registry.register_host(
HostRegistration(
host_id="atlas-01",
address="192.168.1.101",
services=[
RegisteredService(
service_id="atlas-01/chat/qwen3-8b",
host_id="atlas-01",
kind="chat",
endpoint="http://192.168.1.101:18091",
assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 900},
),
RegisteredService(
service_id="atlas-01/embeddings/bge-small",
host_id="atlas-01",
kind="embeddings",
endpoint="http://192.168.1.101:18092",
assets=[{"asset_id": "bge-small-en", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 120},
),
],
)
)
registry.upsert_roles(
[
RoleProfile(
role_id="mentor",
display_name="Mentor",
operation="chat",
modality="text",
routing_policy={"preferred_families": ["qwen3"]},
),
RoleProfile(
role_id="embedder",
display_name="Embedder",
operation="embeddings",
modality="text",
routing_policy={"require_loaded": True},
),
]
)
roles = registry.list_roles()
assert len(roles) == 2
assert roles[0]["role_id"] == "embedder"
direct = registry.resolve_route("qwen3-8b-q4km")
assert direct is not None
assert direct["match_type"] == "direct"
assert direct["service"]["service_id"] == "atlas-01/chat/qwen3-8b"
by_role = registry.resolve_route("mentor")
assert by_role is not None
assert by_role["match_type"] == "role"
assert by_role["role"]["role_id"] == "mentor"
assert by_role["service"]["service_id"] == "atlas-01/chat/qwen3-8b"
embed_role = registry.resolve_route("embedder")
assert embed_role is not None
assert embed_role["service"]["service_id"] == "atlas-01/embeddings/bge-small"
models = registry.list_client_models()
ids = {item["id"] for item in models}
assert "atlas-01/chat/qwen3-8b" in ids
assert "qwen3-8b-q4km" in ids
assert "mentor" in ids
mentor = next(item for item in models if item["id"] == "mentor")
assert mentor["geniehive"]["route_type"] == "role"
assert mentor["geniehive"]["offload_hint"]["suitability"] == "good_for_low_complexity"
asset = next(item for item in models if item["id"] == "qwen3-8b-q4km")
assert asset["geniehive"]["route_type"] == "asset"
assert asset["geniehive"]["offload_hint"]["recommended_for"] == "lower-complexity offload"
def test_control_app_exposes_expected_routes() -> None:
app = create_app()
paths = {route.path for route in app.routes}
assert "/health" in paths
assert "/v1/models" in paths
assert "/v1/nodes/register" in paths
assert "/v1/nodes/heartbeat" in paths
assert "/v1/cluster/hosts" in paths
assert "/v1/cluster/services" in paths
assert "/v1/cluster/roles" in paths
assert "/v1/cluster/health" in paths
assert "/v1/cluster/routes/resolve" in paths

104
tests/test_demo_flow.py Normal file
View File

@ -0,0 +1,104 @@
from pathlib import Path
from geniehive_control.main import create_app as create_control_app
from geniehive_control.models import HostHeartbeat, HostRegistration
from geniehive_node.config import load_config as load_node_config
from geniehive_node.inventory import build_heartbeat_payload, build_registration_payload
def _write_demo_files(tmp_path: Path) -> tuple[Path, Path, Path]:
models_dir = tmp_path / "models"
models_dir.mkdir()
(models_dir / "qwen3-demo.gguf").write_bytes(b"demo")
roles_path = tmp_path / "roles.yaml"
roles_path.write_text(
"\n".join(
[
"roles:",
' - role_id: "mentor"',
' display_name: "Mentor"',
' operation: "chat"',
' modality: "text"',
" routing_policy:",
' preferred_families: ["qwen3"]',
]
)
)
control_path = tmp_path / "control.yaml"
control_path.write_text(
"\n".join(
[
"auth:",
" client_api_keys:",
' - "client-key"',
" node_api_keys:",
' - "node-key"',
"storage:",
f' sqlite_path: "{tmp_path / "state.sqlite3"}"',
f'roles_path: "{roles_path}"',
]
)
)
node_path = tmp_path / "node.yaml"
node_path.write_text(
"\n".join(
[
"node:",
' host_id: "atlas-01"',
' display_name: "Atlas GPU Box"',
' listen_host: "127.0.0.1"',
" listen_port: 8891",
' address: "192.168.1.101"',
"control_plane:",
' base_url: "http://127.0.0.1:8800"',
' node_api_key: "node-key"',
"inventory:",
f' model_roots:\n - "{models_dir}"',
" capabilities:",
" cuda: true",
"services:",
' - service_id: "atlas-01/chat/qwen3-8b"',
' kind: "chat"',
' endpoint: "http://127.0.0.1:18091"',
" assets:",
' - asset_id: "qwen3-8b-q4km"',
" loaded: true",
" state:",
' health: "healthy"',
' load_state: "loaded"',
" accept_requests: true",
" observed:",
" p50_latency_ms: 900",
]
)
)
return control_path, node_path, roles_path
def test_demo_flow_registers_node_and_resolves_role(tmp_path: Path) -> None:
control_path, node_path, _ = _write_demo_files(tmp_path)
control_app = create_control_app(control_path)
registry = control_app.state.registry
node_cfg = load_node_config(node_path)
registration = build_registration_payload(node_cfg)
heartbeat = build_heartbeat_payload(node_cfg)
host = registry.register_host(HostRegistration.model_validate(registration))
assert host["host_id"] == "atlas-01"
updated = registry.heartbeat_host(HostHeartbeat.model_validate(heartbeat))
assert updated is not None
assert updated["metrics"]["service_count"] == 1
roles = registry.list_roles()
assert len(roles) == 1
assert roles[0]["role_id"] == "mentor"
resolved = registry.resolve_route("mentor")
assert resolved is not None
assert resolved["match_type"] == "role"
assert resolved["service"]["service_id"] == "atlas-01/chat/qwen3-8b"

View File

@ -0,0 +1,108 @@
import asyncio
from pathlib import Path
from geniehive_node.config import load_config
from geniehive_node.inventory import build_heartbeat_payload, build_inventory, build_registration_payload
from geniehive_node.main import create_app
from geniehive_node.sync import ControlPlaneClient
def _write_node_config(tmp_path: Path) -> Path:
models_dir = tmp_path / "models"
models_dir.mkdir()
(models_dir / "demo.gguf").write_bytes(b"gguf-demo")
cfg_path = tmp_path / "node.yaml"
cfg_path.write_text(
"\n".join(
[
"node:",
' host_id: "atlas-01"',
' display_name: "Atlas GPU Box"',
' listen_host: "127.0.0.1"',
" listen_port: 8891",
' address: "192.168.1.101"',
" labels:",
' site: "home-lab"',
"inventory:",
f' model_roots:\n - "{models_dir}"',
" cpu_threads: 24",
" ram_gb: 128",
" capabilities:",
" cuda: true",
"services:",
' - service_id: "atlas-01/chat/qwen3-8b"',
' kind: "chat"',
' endpoint: "http://127.0.0.1:18091"',
" runtime:",
' engine: "llama.cpp"',
' launcher: "managed"',
" assets:",
' - asset_id: "qwen3-8b-q4km"',
" loaded: true",
" state:",
' health: "healthy"',
' load_state: "loaded"',
" accept_requests: true",
]
)
)
return cfg_path
def test_build_inventory_and_registration_payload(tmp_path: Path) -> None:
cfg = load_config(_write_node_config(tmp_path))
inventory = build_inventory(cfg)
payload = build_registration_payload(cfg)
heartbeat = build_heartbeat_payload(cfg)
assert inventory.host_id == "atlas-01"
assert inventory.address == "192.168.1.101"
assert inventory.capabilities["cuda"] is True
assert inventory.resources["cpu_threads"] == 24
assert len(inventory.resources["discovered_models"]) == 1
assert inventory.services[0]["host_id"] == "atlas-01"
assert inventory.services[0]["service_id"] == "atlas-01/chat/qwen3-8b"
assert payload["services"][0]["kind"] == "chat"
assert heartbeat["host_id"] == "atlas-01"
assert heartbeat["metrics"]["service_count"] == 1
assert heartbeat["metrics"]["healthy_service_count"] == 1
def test_node_app_exposes_inventory_routes(tmp_path: Path) -> None:
app = create_app(_write_node_config(tmp_path), sync_enabled=False)
paths = {route.path for route in app.routes}
assert "/health" in paths
assert "/v1/node/inventory" in paths
assert "/v1/node/registration" in paths
class _FakePoster:
def __init__(self) -> None:
self.calls: list[dict] = []
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> object:
self.calls.append({"url": url, "json": json, "headers": headers or {}})
return object()
def test_control_plane_client_posts_register_and_heartbeat(tmp_path: Path) -> None:
cfg_path = _write_node_config(tmp_path)
cfg = load_config(cfg_path)
cfg.control_plane.base_url = "http://127.0.0.1:8800"
cfg.control_plane.node_api_key = "node-key"
fake = _FakePoster()
client = ControlPlaneClient(cfg, http_client=fake)
async def run() -> None:
await client.register_once()
await client.heartbeat_once()
asyncio.run(run())
assert len(fake.calls) == 2
assert fake.calls[0]["url"] == "http://127.0.0.1:8800/v1/nodes/register"
assert fake.calls[0]["headers"]["X-GenieHive-Node-Key"] == "node-key"
assert fake.calls[0]["json"]["host_id"] == "atlas-01"
assert fake.calls[1]["url"] == "http://127.0.0.1:8800/v1/nodes/heartbeat"
assert fake.calls[1]["json"]["metrics"]["service_count"] == 1

10
tests/test_smoke.py Normal file
View File

@ -0,0 +1,10 @@
from geniehive_control.main import create_app as create_control_app
from geniehive_node.main import create_app as create_node_app
def test_control_app_title() -> None:
assert create_control_app().title == "GenieHive Control"
def test_node_app_title() -> None:
assert create_node_app().title == "GenieHive Node"