Initial commit
This commit is contained in:
parent
dabbebd3ba
commit
b9270df3e8
|
|
@ -1,229 +1,13 @@
|
||||||
# ---> Python
|
.pytest_cache/
|
||||||
# Byte-compiled / optimized / DLL files
|
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
*$py.class
|
|
||||||
|
|
||||||
# C extensions
|
|
||||||
*.so
|
|
||||||
|
|
||||||
# Distribution / packaging
|
|
||||||
.Python
|
|
||||||
build/
|
|
||||||
develop-eggs/
|
|
||||||
dist/
|
|
||||||
downloads/
|
|
||||||
eggs/
|
|
||||||
.eggs/
|
|
||||||
lib/
|
|
||||||
lib64/
|
|
||||||
parts/
|
|
||||||
sdist/
|
|
||||||
var/
|
|
||||||
wheels/
|
|
||||||
share/python-wheels/
|
|
||||||
*.egg-info/
|
*.egg-info/
|
||||||
.installed.cfg
|
|
||||||
*.egg
|
|
||||||
MANIFEST
|
|
||||||
|
|
||||||
# PyInstaller
|
|
||||||
# Usually these files are written by a python script from a template
|
|
||||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
||||||
*.manifest
|
|
||||||
*.spec
|
|
||||||
|
|
||||||
# Installer logs
|
|
||||||
pip-log.txt
|
|
||||||
pip-delete-this-directory.txt
|
|
||||||
|
|
||||||
# Unit test / coverage reports
|
|
||||||
htmlcov/
|
|
||||||
.tox/
|
|
||||||
.nox/
|
|
||||||
.coverage
|
.coverage
|
||||||
.coverage.*
|
.venv/
|
||||||
.cache
|
.benchmarks/
|
||||||
nosetests.xml
|
|
||||||
coverage.xml
|
|
||||||
*.cover
|
|
||||||
*.py,cover
|
|
||||||
.hypothesis/
|
|
||||||
.pytest_cache/
|
|
||||||
cover/
|
|
||||||
|
|
||||||
# Translations
|
state/*.sqlite3
|
||||||
*.mo
|
state/*.db
|
||||||
*.pot
|
state/*.log
|
||||||
|
|
||||||
# Django stuff:
|
|
||||||
*.log
|
|
||||||
local_settings.py
|
|
||||||
db.sqlite3
|
|
||||||
db.sqlite3-journal
|
|
||||||
|
|
||||||
# Flask stuff:
|
|
||||||
instance/
|
|
||||||
.webassets-cache
|
|
||||||
|
|
||||||
# Scrapy stuff:
|
|
||||||
.scrapy
|
|
||||||
|
|
||||||
# Sphinx documentation
|
|
||||||
docs/_build/
|
|
||||||
|
|
||||||
# PyBuilder
|
|
||||||
.pybuilder/
|
|
||||||
target/
|
|
||||||
|
|
||||||
# Jupyter Notebook
|
|
||||||
.ipynb_checkpoints
|
|
||||||
|
|
||||||
# IPython
|
|
||||||
profile_default/
|
|
||||||
ipython_config.py
|
|
||||||
|
|
||||||
# pyenv
|
|
||||||
# For a library or package, you might want to ignore these files since the code is
|
|
||||||
# intended to run in multiple environments; otherwise, check them in:
|
|
||||||
# .python-version
|
|
||||||
|
|
||||||
# pipenv
|
|
||||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
||||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
||||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
||||||
# install all needed dependencies.
|
|
||||||
#Pipfile.lock
|
|
||||||
|
|
||||||
# poetry
|
|
||||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
||||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
||||||
# commonly ignored for libraries.
|
|
||||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
||||||
#poetry.lock
|
|
||||||
|
|
||||||
# pdm
|
|
||||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
||||||
#pdm.lock
|
|
||||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
||||||
# in version control.
|
|
||||||
# https://pdm.fming.dev/#use-with-ide
|
|
||||||
.pdm.toml
|
|
||||||
|
|
||||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
||||||
__pypackages__/
|
|
||||||
|
|
||||||
# Celery stuff
|
|
||||||
celerybeat-schedule
|
|
||||||
celerybeat.pid
|
|
||||||
|
|
||||||
# SageMath parsed files
|
|
||||||
*.sage.py
|
|
||||||
|
|
||||||
# Environments
|
|
||||||
.env
|
|
||||||
.venv
|
|
||||||
env/
|
|
||||||
venv/
|
|
||||||
ENV/
|
|
||||||
env.bak/
|
|
||||||
venv.bak/
|
|
||||||
|
|
||||||
# Spyder project settings
|
|
||||||
.spyderproject
|
|
||||||
.spyproject
|
|
||||||
|
|
||||||
# Rope project settings
|
|
||||||
.ropeproject
|
|
||||||
|
|
||||||
# mkdocs documentation
|
|
||||||
/site
|
|
||||||
|
|
||||||
# mypy
|
|
||||||
.mypy_cache/
|
|
||||||
.dmypy.json
|
|
||||||
dmypy.json
|
|
||||||
|
|
||||||
# Pyre type checker
|
|
||||||
.pyre/
|
|
||||||
|
|
||||||
# pytype static type analyzer
|
|
||||||
.pytype/
|
|
||||||
|
|
||||||
# Cython debug symbols
|
|
||||||
cython_debug/
|
|
||||||
|
|
||||||
# PyCharm
|
|
||||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
||||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
||||||
#.idea/
|
|
||||||
|
|
||||||
# ---> Emacs
|
|
||||||
# -*- mode: gitignore; -*-
|
|
||||||
*~
|
|
||||||
\#*\#
|
|
||||||
/.emacs.desktop
|
|
||||||
/.emacs.desktop.lock
|
|
||||||
*.elc
|
|
||||||
auto-save-list
|
|
||||||
tramp
|
|
||||||
.\#*
|
|
||||||
|
|
||||||
# Org-mode
|
|
||||||
.org-id-locations
|
|
||||||
*_archive
|
|
||||||
|
|
||||||
# flymake-mode
|
|
||||||
*_flymake.*
|
|
||||||
|
|
||||||
# eshell files
|
|
||||||
/eshell/history
|
|
||||||
/eshell/lastdir
|
|
||||||
|
|
||||||
# elpa packages
|
|
||||||
/elpa/
|
|
||||||
|
|
||||||
# reftex files
|
|
||||||
*.rel
|
|
||||||
|
|
||||||
# AUCTeX auto folder
|
|
||||||
/auto/
|
|
||||||
|
|
||||||
# cask packages
|
|
||||||
.cask/
|
|
||||||
dist/
|
|
||||||
|
|
||||||
# Flycheck
|
|
||||||
flycheck_*.el
|
|
||||||
|
|
||||||
# server auth directory
|
|
||||||
/server/
|
|
||||||
|
|
||||||
# projectiles files
|
|
||||||
.projectile
|
|
||||||
|
|
||||||
# directory configuration
|
|
||||||
.dir-locals.el
|
|
||||||
|
|
||||||
# network security
|
|
||||||
/network-security.data
|
|
||||||
|
|
||||||
|
|
||||||
# ---> Rust
|
|
||||||
# Generated by Cargo
|
|
||||||
# will have compiled files and executables
|
|
||||||
debug/
|
|
||||||
target/
|
|
||||||
|
|
||||||
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
|
||||||
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
|
||||||
Cargo.lock
|
|
||||||
|
|
||||||
# These are backup files generated by rustfmt
|
|
||||||
**/*.rs.bk
|
|
||||||
|
|
||||||
# MSVC Windows builds of rustc generate these, which store debugging information
|
|
||||||
*.pdb
|
|
||||||
|
|
||||||
|
.DS_Store
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,33 @@
|
||||||
|
# Contributing
|
||||||
|
|
||||||
|
GenieHive is still early-stage infrastructure code. Keep changes small, explicit, and easy to verify.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
python -m venv .venv
|
||||||
|
. .venv/bin/activate
|
||||||
|
pip install -e '.[dev]'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Checks
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make test
|
||||||
|
make smoke
|
||||||
|
```
|
||||||
|
|
||||||
|
## Guidelines
|
||||||
|
|
||||||
|
- Prefer narrowly scoped patches over broad rewrites.
|
||||||
|
- Keep the control-plane and node-agent contracts in sync.
|
||||||
|
- Add or update tests with behavior changes.
|
||||||
|
- Do not commit local runtime state from `state/`.
|
||||||
|
- Do not commit benchmark artifacts or cache directories.
|
||||||
|
|
||||||
|
## Runtime Notes
|
||||||
|
|
||||||
|
- Example configs under `configs/` are meant to stay runnable.
|
||||||
|
- Scripts under `scripts/` should remain usable as operator entrypoints, not just test helpers.
|
||||||
|
- If a startup dependency can race in practice, prefer self-healing behavior over one-shot initialization.
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
PYTHON ?= python
|
||||||
|
PYTEST ?= pytest
|
||||||
|
|
||||||
|
.PHONY: test smoke health
|
||||||
|
|
||||||
|
test:
|
||||||
|
$(PYTEST) -q
|
||||||
|
|
||||||
|
smoke:
|
||||||
|
$(PYTEST) -q tests/test_smoke.py
|
||||||
|
|
||||||
|
health:
|
||||||
|
bash scripts/check_singlebox_health.sh
|
||||||
59
README.md
59
README.md
|
|
@ -1,3 +1,60 @@
|
||||||
# GenieHive
|
# GenieHive
|
||||||
|
|
||||||
GenieHive is a generative AI router, starting with presenting an OpenAI API-compatible endpoint for clients to interact with, while their requests are routed appropriately among one or more nodes that register running servers with the control host. From running multiple LLMs on a single host to doing that across a distributed cluster, GenieHive aims to make it easier to actually use local AI.
|
GenieHive is a local-first control plane for heterogeneous generative AI services running across one or more hosts.
|
||||||
|
|
||||||
|
V1 scope:
|
||||||
|
|
||||||
|
- chat completions
|
||||||
|
- embeddings
|
||||||
|
- transcription
|
||||||
|
|
||||||
|
Core goals:
|
||||||
|
|
||||||
|
- register hosts and services
|
||||||
|
- track health, inventory, and observed performance
|
||||||
|
- expose a stable client-facing API
|
||||||
|
- support direct model addressing and higher-level role addressing
|
||||||
|
- route requests to healthy loaded services first
|
||||||
|
|
||||||
|
Repository layout:
|
||||||
|
|
||||||
|
- `docs/architecture.md`: system overview and v1 scope
|
||||||
|
- `docs/roadmap.md`: current milestones and near-term priorities
|
||||||
|
- `docs/schemas.md`: canonical data models
|
||||||
|
- `docs/deployment.md`: intended deployment approach
|
||||||
|
- `docs/demo.md`: first end-to-end control-plus-node demo flow
|
||||||
|
- `docs/llm_demo.md`: detailed master/peer/client LLM demo runbook
|
||||||
|
- `docs/reverse_proxy.md`: safer external exposure patterns
|
||||||
|
- `configs/`: example control-plane, node, and role configs
|
||||||
|
- `scripts/`: small launch and inspection helpers
|
||||||
|
- `src/geniehive_control/`: control-plane package
|
||||||
|
- `src/geniehive_node/`: node-agent package
|
||||||
|
|
||||||
|
There is now a documented single-machine path as well as the cluster-oriented path, so GenieHive can be exercised as a useful local router even without multiple hosts.
|
||||||
|
|
||||||
|
This repository is intended as the clean successor to narrower local gateway experiments. OpenAI-compatible routing remains important, but it is treated as one client facade within a broader cluster control-plane design.
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
Local development setup:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
python -m venv .venv
|
||||||
|
. .venv/bin/activate
|
||||||
|
pip install -e '.[dev]'
|
||||||
|
```
|
||||||
|
|
||||||
|
Common commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make test
|
||||||
|
make smoke
|
||||||
|
make health
|
||||||
|
```
|
||||||
|
|
||||||
|
Repository conventions:
|
||||||
|
|
||||||
|
- local runtime state lives under `state/` and should not be committed
|
||||||
|
- example configs under `configs/` should remain runnable
|
||||||
|
- operator scripts under `scripts/` are part of the supported workflow
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
server:
|
||||||
|
host: "127.0.0.1"
|
||||||
|
port: 8800
|
||||||
|
|
||||||
|
auth:
|
||||||
|
client_api_keys:
|
||||||
|
- "change-me-client-key"
|
||||||
|
node_api_keys:
|
||||||
|
- "change-me-node-key"
|
||||||
|
|
||||||
|
storage:
|
||||||
|
sqlite_path: "state/geniehive.sqlite3"
|
||||||
|
|
||||||
|
roles_path: "configs/roles.example.yaml"
|
||||||
|
|
||||||
|
routing:
|
||||||
|
default_strategy: "loaded_first"
|
||||||
|
health_stale_after_s: 30
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
server:
|
||||||
|
host: "127.0.0.1"
|
||||||
|
port: 8800
|
||||||
|
|
||||||
|
auth:
|
||||||
|
client_api_keys:
|
||||||
|
- "change-me-client-key"
|
||||||
|
node_api_keys:
|
||||||
|
- "change-me-node-key"
|
||||||
|
|
||||||
|
storage:
|
||||||
|
sqlite_path: "state/geniehive-singlebox.sqlite3"
|
||||||
|
|
||||||
|
roles_path: "configs/roles.example.yaml"
|
||||||
|
|
||||||
|
routing:
|
||||||
|
default_strategy: "loaded_first"
|
||||||
|
health_stale_after_s: 30
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
server:
|
||||||
|
host: "127.0.0.1"
|
||||||
|
port: 8800
|
||||||
|
|
||||||
|
auth:
|
||||||
|
client_api_keys:
|
||||||
|
- "change-me-client-key"
|
||||||
|
node_api_keys:
|
||||||
|
- "change-me-node-key"
|
||||||
|
|
||||||
|
storage:
|
||||||
|
sqlite_path: "state/geniehive-p40.sqlite3"
|
||||||
|
|
||||||
|
roles_path: "configs/roles.singlebox.p40.example.yaml"
|
||||||
|
|
||||||
|
routing:
|
||||||
|
default_strategy: "loaded_first"
|
||||||
|
health_stale_after_s: 30
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
node:
|
||||||
|
host_id: "atlas-01"
|
||||||
|
display_name: "Atlas GPU Box"
|
||||||
|
listen_host: "127.0.0.1"
|
||||||
|
listen_port: 8891
|
||||||
|
|
||||||
|
control_plane:
|
||||||
|
base_url: "http://127.0.0.1:8800"
|
||||||
|
node_api_key: "change-me-node-key"
|
||||||
|
heartbeat_interval_s: 5
|
||||||
|
|
||||||
|
inventory:
|
||||||
|
model_roots:
|
||||||
|
- "/path/to/models"
|
||||||
|
cpu_threads: 24
|
||||||
|
ram_gb: 128
|
||||||
|
capabilities:
|
||||||
|
cuda: true
|
||||||
|
rocm: false
|
||||||
|
metal: false
|
||||||
|
|
||||||
|
managed_runtimes:
|
||||||
|
enabled: true
|
||||||
|
llama_server_bin: "/path/to/llama-server"
|
||||||
|
|
||||||
|
services:
|
||||||
|
- service_id: "atlas-01/chat/qwen3-8b"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:18091"
|
||||||
|
runtime:
|
||||||
|
engine: "llama.cpp"
|
||||||
|
launcher: "managed"
|
||||||
|
assets:
|
||||||
|
- asset_id: "qwen3-8b-q4km"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 900
|
||||||
|
tokens_per_sec: 40
|
||||||
|
|
||||||
|
- service_id: "atlas-01/embeddings/bge-small"
|
||||||
|
kind: "embeddings"
|
||||||
|
endpoint: "http://127.0.0.1:18092"
|
||||||
|
runtime:
|
||||||
|
engine: "llama.cpp"
|
||||||
|
launcher: "managed"
|
||||||
|
assets:
|
||||||
|
- asset_id: "bge-small-en"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
node:
|
||||||
|
host_id: "singlebox-llamacpp"
|
||||||
|
display_name: "SingleBox llama.cpp"
|
||||||
|
listen_host: "127.0.0.1"
|
||||||
|
listen_port: 8891
|
||||||
|
address: "127.0.0.1"
|
||||||
|
labels:
|
||||||
|
topology: "singlebox"
|
||||||
|
runtime: "llama.cpp"
|
||||||
|
|
||||||
|
control_plane:
|
||||||
|
base_url: "http://127.0.0.1:8800"
|
||||||
|
node_api_key: "change-me-node-key"
|
||||||
|
heartbeat_interval_s: 5
|
||||||
|
|
||||||
|
inventory:
|
||||||
|
model_roots:
|
||||||
|
- "/path/to/models"
|
||||||
|
cpu_threads: 24
|
||||||
|
ram_gb: 64
|
||||||
|
capabilities:
|
||||||
|
cpu: true
|
||||||
|
cuda: true
|
||||||
|
|
||||||
|
managed_runtimes:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
services:
|
||||||
|
- service_id: "singlebox/chat/qwen3-8b"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:18091"
|
||||||
|
runtime:
|
||||||
|
engine: "llama.cpp"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "qwen3-8b-q4_k_m"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 900
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
node:
|
||||||
|
host_id: "singlebox-llamafile"
|
||||||
|
display_name: "SingleBox llamafile"
|
||||||
|
listen_host: "127.0.0.1"
|
||||||
|
listen_port: 8891
|
||||||
|
address: "127.0.0.1"
|
||||||
|
labels:
|
||||||
|
topology: "singlebox"
|
||||||
|
runtime: "llamafile"
|
||||||
|
|
||||||
|
control_plane:
|
||||||
|
base_url: "http://127.0.0.1:8800"
|
||||||
|
node_api_key: "change-me-node-key"
|
||||||
|
heartbeat_interval_s: 5
|
||||||
|
|
||||||
|
inventory:
|
||||||
|
model_roots:
|
||||||
|
- "/path/to/models"
|
||||||
|
cpu_threads: 24
|
||||||
|
ram_gb: 64
|
||||||
|
capabilities:
|
||||||
|
cpu: true
|
||||||
|
cuda: true
|
||||||
|
|
||||||
|
managed_runtimes:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
services:
|
||||||
|
- service_id: "singlebox/chat/qwen3-8b"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:18091"
|
||||||
|
runtime:
|
||||||
|
engine: "llamafile"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "qwen3-8b-q4_k_m"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 900
|
||||||
|
|
@ -0,0 +1,58 @@
|
||||||
|
node:
|
||||||
|
host_id: "singlebox-ollama"
|
||||||
|
display_name: "SingleBox Ollama"
|
||||||
|
listen_host: "127.0.0.1"
|
||||||
|
listen_port: 8891
|
||||||
|
address: "127.0.0.1"
|
||||||
|
labels:
|
||||||
|
topology: "singlebox"
|
||||||
|
runtime: "ollama"
|
||||||
|
|
||||||
|
control_plane:
|
||||||
|
base_url: "http://127.0.0.1:8800"
|
||||||
|
node_api_key: "change-me-node-key"
|
||||||
|
heartbeat_interval_s: 5
|
||||||
|
|
||||||
|
inventory:
|
||||||
|
model_roots: []
|
||||||
|
cpu_threads: 24
|
||||||
|
ram_gb: 64
|
||||||
|
capabilities:
|
||||||
|
cpu: true
|
||||||
|
cuda: false
|
||||||
|
|
||||||
|
managed_runtimes:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
services:
|
||||||
|
- service_id: "singlebox/chat/qwen3"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:11434"
|
||||||
|
runtime:
|
||||||
|
engine: "ollama"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "qwen3"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 900
|
||||||
|
|
||||||
|
- service_id: "singlebox/embeddings/nomic-embed-text"
|
||||||
|
kind: "embeddings"
|
||||||
|
endpoint: "http://127.0.0.1:11434"
|
||||||
|
runtime:
|
||||||
|
engine: "ollama"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "nomic-embed-text"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 150
|
||||||
|
|
@ -0,0 +1,84 @@
|
||||||
|
node:
|
||||||
|
host_id: "p40-box"
|
||||||
|
display_name: "Dual P40 + CPU Fallback"
|
||||||
|
listen_host: "127.0.0.1"
|
||||||
|
listen_port: 8891
|
||||||
|
address: "127.0.0.1"
|
||||||
|
labels:
|
||||||
|
topology: "singlebox"
|
||||||
|
runtime: "llama.cpp"
|
||||||
|
gpu0: "Tesla P40"
|
||||||
|
gpu1: "Tesla P40"
|
||||||
|
cpu: "Ryzen 5600G"
|
||||||
|
|
||||||
|
control_plane:
|
||||||
|
base_url: "http://127.0.0.1:8800"
|
||||||
|
node_api_key: "change-me-node-key"
|
||||||
|
heartbeat_interval_s: 5
|
||||||
|
|
||||||
|
inventory:
|
||||||
|
model_roots:
|
||||||
|
- "/path/to/models"
|
||||||
|
cpu_threads: 12
|
||||||
|
ram_gb: 128
|
||||||
|
capabilities:
|
||||||
|
cpu: true
|
||||||
|
cuda: true
|
||||||
|
|
||||||
|
managed_runtimes:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
services:
|
||||||
|
- service_id: "p40-box/chat/gpu0-primary"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:18091"
|
||||||
|
runtime:
|
||||||
|
engine: "llama.cpp"
|
||||||
|
launcher: "external"
|
||||||
|
device: "gpu0"
|
||||||
|
assets:
|
||||||
|
- asset_id: "Qwen2.5-14B-Instruct-1M-Q5_K_M"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 1200
|
||||||
|
tokens_per_sec: 24
|
||||||
|
|
||||||
|
- service_id: "p40-box/chat/gpu1-secondary"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:18092"
|
||||||
|
runtime:
|
||||||
|
engine: "llama.cpp"
|
||||||
|
launcher: "external"
|
||||||
|
device: "gpu1"
|
||||||
|
assets:
|
||||||
|
- asset_id: "Qwen3.5-9B-Q5_K_M"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 1000
|
||||||
|
tokens_per_sec: 30
|
||||||
|
|
||||||
|
- service_id: "p40-box/chat/cpu-fallback"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:18093"
|
||||||
|
runtime:
|
||||||
|
engine: "llama.cpp"
|
||||||
|
launcher: "external"
|
||||||
|
device: "cpu"
|
||||||
|
assets:
|
||||||
|
- asset_id: "rocket-3b.Q5_K_M"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 4500
|
||||||
|
tokens_per_sec: 7
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
roles:
|
||||||
|
- role_id: "mentor"
|
||||||
|
display_name: "Mentor"
|
||||||
|
operation: "chat"
|
||||||
|
modality: "text"
|
||||||
|
prompt_policy:
|
||||||
|
system_prompt: "Guide the user without taking over the task."
|
||||||
|
routing_policy:
|
||||||
|
preferred_families: ["Qwen3", "Mistral"]
|
||||||
|
min_context: 8192
|
||||||
|
|
||||||
|
- role_id: "embedder"
|
||||||
|
display_name: "Embedder"
|
||||||
|
operation: "embeddings"
|
||||||
|
modality: "text"
|
||||||
|
routing_policy:
|
||||||
|
require_loaded: true
|
||||||
|
|
||||||
|
- role_id: "transcriber"
|
||||||
|
display_name: "Transcriber"
|
||||||
|
operation: "transcription"
|
||||||
|
modality: "audio"
|
||||||
|
|
@ -0,0 +1,33 @@
|
||||||
|
roles:
|
||||||
|
- role_id: "mentor"
|
||||||
|
display_name: "Mentor"
|
||||||
|
description: "Primary high-quality reasoning/chat route"
|
||||||
|
operation: "chat"
|
||||||
|
modality: "text"
|
||||||
|
prompt_policy:
|
||||||
|
system_prompt: "Be concise, helpful, and technically accurate."
|
||||||
|
routing_policy:
|
||||||
|
preferred_families: ["qwen2.5-14b", "qwen2.5"]
|
||||||
|
require_loaded: true
|
||||||
|
|
||||||
|
- role_id: "general_assistant"
|
||||||
|
display_name: "General Assistant"
|
||||||
|
description: "Secondary fast chat route"
|
||||||
|
operation: "chat"
|
||||||
|
modality: "text"
|
||||||
|
prompt_policy:
|
||||||
|
system_prompt: "Answer clearly and directly."
|
||||||
|
routing_policy:
|
||||||
|
preferred_families: ["qwen3.5-9b", "qwen3.5"]
|
||||||
|
require_loaded: true
|
||||||
|
|
||||||
|
- role_id: "background_summarizer"
|
||||||
|
display_name: "Background Summarizer"
|
||||||
|
description: "Slow fallback route for low-priority work"
|
||||||
|
operation: "chat"
|
||||||
|
modality: "text"
|
||||||
|
prompt_policy:
|
||||||
|
system_prompt: "Summarize briefly and conservatively."
|
||||||
|
routing_policy:
|
||||||
|
preferred_families: ["rocket-3b", "rocket"]
|
||||||
|
require_loaded: true
|
||||||
|
|
@ -0,0 +1,194 @@
|
||||||
|
# GenieHive Architecture
|
||||||
|
|
||||||
|
Status: proposed v1 architecture
|
||||||
|
Drafted: 2026-04-05
|
||||||
|
|
||||||
|
## Repo Name
|
||||||
|
|
||||||
|
Chosen name: `GenieHive`
|
||||||
|
|
||||||
|
Why this name:
|
||||||
|
|
||||||
|
- suggestive: "genie" implies generative AI services, "hive" implies a cooperating cluster
|
||||||
|
- accessible: easy to say, remember, and explain
|
||||||
|
- whimsical enough to feel like a project name rather than a dry infrastructure label
|
||||||
|
|
||||||
|
Tradeoff:
|
||||||
|
|
||||||
|
- `GenieHive` is less search-distinct than `Geniewarren` because `hive` is a common product metaphor
|
||||||
|
|
||||||
|
## Mission
|
||||||
|
|
||||||
|
GenieHive is a local-first control plane for heterogeneous generative AI services running across one or more hosts.
|
||||||
|
|
||||||
|
It should:
|
||||||
|
|
||||||
|
- register hosts and their available services
|
||||||
|
- expose a stable client-facing API
|
||||||
|
- track health, capacity, and observed performance
|
||||||
|
- support direct model addressing and higher-level role addressing
|
||||||
|
- route requests to healthy loaded services first
|
||||||
|
- optionally coordinate loading or swapping when policy allows
|
||||||
|
- remain practical for a small self-hosted deployment with two hosts
|
||||||
|
|
||||||
|
## Non-Goals For V1
|
||||||
|
|
||||||
|
Out of scope initially:
|
||||||
|
|
||||||
|
- peer-to-peer consensus
|
||||||
|
- autonomous global model swapping across many nodes
|
||||||
|
- full WAN zero-trust platform engineering
|
||||||
|
- image and TTS generation orchestration
|
||||||
|
- distributed vector database management
|
||||||
|
- billing or multi-tenant quota accounting
|
||||||
|
|
||||||
|
## Architectural Position
|
||||||
|
|
||||||
|
GenieHive is not just an OpenAI-compatible gateway.
|
||||||
|
|
||||||
|
It is a control plane with these layers:
|
||||||
|
|
||||||
|
1. Control API
|
||||||
|
- authoritative registry
|
||||||
|
- routing and scheduling
|
||||||
|
- role catalog
|
||||||
|
- operator inspection
|
||||||
|
|
||||||
|
2. Node Agent
|
||||||
|
- host discovery
|
||||||
|
- service discovery
|
||||||
|
- telemetry reporting
|
||||||
|
- optional local process management
|
||||||
|
|
||||||
|
3. Provider Adapters
|
||||||
|
- OpenAI-compatible chat backends
|
||||||
|
- OpenAI-compatible embedding backends
|
||||||
|
- transcription backends
|
||||||
|
- future adapters for image and speech synthesis
|
||||||
|
|
||||||
|
4. Client Facades
|
||||||
|
- OpenAI-compatible facade for completions and embeddings
|
||||||
|
- operator API for topology, health, and inventory
|
||||||
|
|
||||||
|
## Core Concepts
|
||||||
|
|
||||||
|
### Host
|
||||||
|
|
||||||
|
A physical or virtual machine participating in the cluster.
|
||||||
|
|
||||||
|
### Service
|
||||||
|
|
||||||
|
A concrete callable capability on a host. Examples:
|
||||||
|
|
||||||
|
- chat completion endpoint
|
||||||
|
- embedding endpoint
|
||||||
|
- transcription endpoint
|
||||||
|
|
||||||
|
### Asset
|
||||||
|
|
||||||
|
A model weight, model name, application, or runtime target that a service can serve.
|
||||||
|
|
||||||
|
### Role
|
||||||
|
|
||||||
|
A reusable task profile that describes how requests should be fulfilled. A role is policy, not a concrete model.
|
||||||
|
|
||||||
|
### Route Resolution
|
||||||
|
|
||||||
|
Request handling order:
|
||||||
|
|
||||||
|
1. If the requested `model` matches a currently loaded and healthy concrete asset or service alias, route directly.
|
||||||
|
2. Otherwise, if the requested `model` matches a known role, resolve the role to the best eligible service.
|
||||||
|
3. Otherwise, fail clearly.
|
||||||
|
|
||||||
|
## V1 Capability Scope
|
||||||
|
|
||||||
|
V1 supports only:
|
||||||
|
|
||||||
|
- chat completions
|
||||||
|
- embeddings
|
||||||
|
- transcription
|
||||||
|
|
||||||
|
## Topology
|
||||||
|
|
||||||
|
Recommended initial topology:
|
||||||
|
|
||||||
|
- 1 control plane
|
||||||
|
- 2 node agents
|
||||||
|
- 1 or more clients
|
||||||
|
- LAN-first deployment
|
||||||
|
- API key auth in v1
|
||||||
|
- VPN or mTLS in v1.5
|
||||||
|
|
||||||
|
## API Families
|
||||||
|
|
||||||
|
### Client API
|
||||||
|
|
||||||
|
- `GET /v1/models`
|
||||||
|
- `POST /v1/chat/completions`
|
||||||
|
- `POST /v1/embeddings`
|
||||||
|
- `POST /v1/audio/transcriptions`
|
||||||
|
|
||||||
|
`GET /v1/models` should expose enough metadata for programmatic clients to make routing decisions about what GenieHive can handle cheaply, especially for lower-complexity offloaded work. That metadata should include direct assets, service-backed aliases, role aliases, operation kind, health, loaded status, and observed performance hints.
|
||||||
|
|
||||||
|
### Operator API
|
||||||
|
|
||||||
|
- `GET /v1/cluster/hosts`
|
||||||
|
- `GET /v1/cluster/services`
|
||||||
|
- `GET /v1/cluster/roles`
|
||||||
|
- `GET /v1/cluster/health`
|
||||||
|
- `GET /v1/cluster/routes/resolve?model=...`
|
||||||
|
|
||||||
|
### Node API
|
||||||
|
|
||||||
|
- `POST /v1/nodes/register`
|
||||||
|
- `POST /v1/nodes/heartbeat`
|
||||||
|
- `GET /v1/node/inventory`
|
||||||
|
- `POST /v1/node/services/refresh`
|
||||||
|
|
||||||
|
## Data Store
|
||||||
|
|
||||||
|
V1 should use SQLite for durable state.
|
||||||
|
|
||||||
|
## Routing Rules
|
||||||
|
|
||||||
|
### Direct Model Resolution
|
||||||
|
|
||||||
|
If a request names a concrete asset alias or service alias:
|
||||||
|
|
||||||
|
- prefer loaded and healthy services
|
||||||
|
- choose the lowest-cost healthy target if multiple matches exist
|
||||||
|
- fail clearly if all matches are unhealthy
|
||||||
|
|
||||||
|
### Role Resolution
|
||||||
|
|
||||||
|
If direct resolution fails, treat the requested name as a role.
|
||||||
|
|
||||||
|
Role resolution should filter by:
|
||||||
|
|
||||||
|
- operation kind
|
||||||
|
- modality
|
||||||
|
- health
|
||||||
|
- auth and exposure compatibility
|
||||||
|
- minimum context or memory requirements
|
||||||
|
- preferred model families
|
||||||
|
|
||||||
|
Then rank by:
|
||||||
|
|
||||||
|
- already loaded
|
||||||
|
- recent health
|
||||||
|
- expected latency
|
||||||
|
- queue pressure
|
||||||
|
- operator priority
|
||||||
|
|
||||||
|
## First Implementation Sequence
|
||||||
|
|
||||||
|
1. Create the repo skeleton and docs.
|
||||||
|
2. Implement SQLite-backed registry models.
|
||||||
|
3. Implement node registration and heartbeat.
|
||||||
|
4. Implement operator inspection endpoints.
|
||||||
|
5. Implement client-facing chat routing.
|
||||||
|
6. Add embeddings routing.
|
||||||
|
7. Add transcription routing.
|
||||||
|
8. Add truthful readiness and health reporting.
|
||||||
|
9. Add role catalog and role-based resolution.
|
||||||
|
10. Add optional managed local runtime support.
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
# GenieHive Demo
|
||||||
|
|
||||||
|
This is the first end-to-end demo path for GenieHive using the example configs already in the repo.
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Bring up:
|
||||||
|
|
||||||
|
- one control plane
|
||||||
|
- one node agent
|
||||||
|
- one route-resolution check
|
||||||
|
|
||||||
|
The node should auto-register with the control plane on startup and then send periodic heartbeats.
|
||||||
|
|
||||||
|
## 1. Start the control plane
|
||||||
|
|
||||||
|
From the repo root:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/run_control.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
This uses:
|
||||||
|
|
||||||
|
- `configs/control.example.yaml`
|
||||||
|
- `configs/roles.example.yaml`
|
||||||
|
|
||||||
|
## 2. Start the node agent
|
||||||
|
|
||||||
|
In another shell:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/run_node.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
This uses:
|
||||||
|
|
||||||
|
- `configs/node.example.yaml`
|
||||||
|
|
||||||
|
## 3. Inspect the cluster
|
||||||
|
|
||||||
|
In another shell:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/demo_inspect.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
That script checks:
|
||||||
|
|
||||||
|
- client-facing model metadata
|
||||||
|
- cluster health
|
||||||
|
- registered hosts
|
||||||
|
- registered services
|
||||||
|
- loaded roles
|
||||||
|
- route resolution for `mentor`
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The example configs use API keys; the inspection script sends the example client key.
|
||||||
|
- The example node config assumes the underlying model-serving endpoints already exist. The current demo proves control-plane registration and routing metadata, not full inference proxying yet.
|
||||||
|
- The control plane stores state in `state/geniehive.sqlite3` by default.
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
# GenieHive Deployment
|
||||||
|
|
||||||
|
## Initial Deployment Target
|
||||||
|
|
||||||
|
V1 should be easy to deploy on a small self-hosted setup:
|
||||||
|
|
||||||
|
- 1 control plane
|
||||||
|
- 2 node agents
|
||||||
|
- private LAN or VPN
|
||||||
|
- API-key auth first
|
||||||
|
|
||||||
|
## Binding Guidance
|
||||||
|
|
||||||
|
Defaults should be conservative:
|
||||||
|
|
||||||
|
- control plane binds to localhost by default during development
|
||||||
|
- node agents bind to localhost unless remote registration is needed
|
||||||
|
- managed inference runtimes should stay node-local unless there is a specific reason to expose them
|
||||||
|
|
||||||
|
## Security Baseline
|
||||||
|
|
||||||
|
Required in v1:
|
||||||
|
|
||||||
|
- client API keys
|
||||||
|
- node registration keys
|
||||||
|
- clear separation between client-facing and node-facing credentials
|
||||||
|
|
||||||
|
Planned after v1:
|
||||||
|
|
||||||
|
- mTLS between control plane and nodes
|
||||||
|
- scoped client tokens
|
||||||
|
|
||||||
|
## Persistence
|
||||||
|
|
||||||
|
Use SQLite first for:
|
||||||
|
|
||||||
|
- host registry
|
||||||
|
- service registry
|
||||||
|
- role catalog
|
||||||
|
- recent health and benchmark samples
|
||||||
|
|
||||||
|
## Startup Order
|
||||||
|
|
||||||
|
1. Start the control plane.
|
||||||
|
2. Start node agents.
|
||||||
|
3. Confirm registration and heartbeat visibility.
|
||||||
|
4. Confirm client API readiness.
|
||||||
|
5. Exercise chat, embeddings, and transcription paths.
|
||||||
|
|
@ -0,0 +1,676 @@
|
||||||
|
# GenieHive LLM Demo
|
||||||
|
|
||||||
|
This runbook covers the first practical GenieHive LLM demo with three roles:
|
||||||
|
|
||||||
|
- master: the GenieHive control plane
|
||||||
|
- peer: a GenieHive node agent attached to one or more local LLM servers
|
||||||
|
- client: a demo client agent or Codex using GenieHive as the API front door
|
||||||
|
|
||||||
|
## Current Readiness
|
||||||
|
|
||||||
|
GenieHive is ready for a first live chat demo now.
|
||||||
|
|
||||||
|
What works in GenieHive already:
|
||||||
|
|
||||||
|
- node registration
|
||||||
|
- heartbeat
|
||||||
|
- role-aware route resolution
|
||||||
|
- `GET /v1/models`
|
||||||
|
- `POST /v1/chat/completions`
|
||||||
|
- `POST /v1/embeddings`
|
||||||
|
|
||||||
|
What GenieHive does not do yet:
|
||||||
|
|
||||||
|
- launch upstream LLM servers for you automatically
|
||||||
|
- provide `POST /v1/audio/transcriptions`
|
||||||
|
- maintain advanced benchmark history or queue-aware scheduling
|
||||||
|
|
||||||
|
For the first demo, treat GenieHive as a metadata-rich router over already-running local servers.
|
||||||
|
|
||||||
|
## Topologies
|
||||||
|
|
||||||
|
### Smallest Demo
|
||||||
|
|
||||||
|
Run everything on one host:
|
||||||
|
|
||||||
|
- control plane on `127.0.0.1:8800`
|
||||||
|
- node agent on `127.0.0.1:8891`
|
||||||
|
- one or more upstream model servers on local ports
|
||||||
|
|
||||||
|
This is also the recommended setup for users who do not have a cluster. GenieHive still provides value as:
|
||||||
|
|
||||||
|
- a local router
|
||||||
|
- a metadata-rich local model catalog
|
||||||
|
- a role-to-model indirection layer
|
||||||
|
- a common front door for client tools
|
||||||
|
|
||||||
|
### Two-Host Demo
|
||||||
|
|
||||||
|
- master host runs GenieHive control plane
|
||||||
|
- peer host runs GenieHive node agent and one or more local LLM servers
|
||||||
|
- client runs anywhere that can reach the master
|
||||||
|
|
||||||
|
## Master Instructions
|
||||||
|
|
||||||
|
On the control-plane host:
|
||||||
|
|
||||||
|
1. Create a repo-local Python environment if you want isolation.
|
||||||
|
2. Start GenieHive control:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_control.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Confirm health:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -sS http://127.0.0.1:8800/health
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected result:
|
||||||
|
|
||||||
|
- JSON containing `{"status":"ok"}`
|
||||||
|
|
||||||
|
4. Keep note of the example client and node keys from `configs/control.example.yaml`.
|
||||||
|
|
||||||
|
### Single-Box Shortcut
|
||||||
|
|
||||||
|
If you are running control and node on the same machine, use:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_control_singlebox.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
For the P40 host, the repo now provides external bind helper scripts:
|
||||||
|
|
||||||
|
LAN:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/run_control_p40_lan.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
ZeroTier:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/run_control_p40_zerotier.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Both use the P40-specific control config and only change the bind interface.
|
||||||
|
|
||||||
|
## Peer Instructions
|
||||||
|
|
||||||
|
On each peer host you need:
|
||||||
|
|
||||||
|
- one or more local LLM servers already running
|
||||||
|
- one GenieHive node config that points at those servers
|
||||||
|
- the control-plane base URL and node API key
|
||||||
|
|
||||||
|
For a single-machine setup, the peer is simply another process on the same host.
|
||||||
|
|
||||||
|
The node agent should advertise upstream server roots, not endpoint suffixes. For example:
|
||||||
|
|
||||||
|
- good: `http://127.0.0.1:11434`
|
||||||
|
- good: `http://127.0.0.1:18091`
|
||||||
|
- not good: `http://127.0.0.1:11434/v1/chat/completions`
|
||||||
|
|
||||||
|
### Option A: Ollama
|
||||||
|
|
||||||
|
Use this when you want the lowest-friction chat and embeddings demo.
|
||||||
|
|
||||||
|
1. Start Ollama if it is not already running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ollama serve
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Pull the model or models you want:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ollama pull qwen3
|
||||||
|
ollama pull nomic-embed-text
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Example peer service config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
- service_id: "peer1/chat/qwen3"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:11434"
|
||||||
|
runtime:
|
||||||
|
engine: "ollama"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "qwen3"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
|
||||||
|
- service_id: "peer1/embeddings/nomic-embed-text"
|
||||||
|
kind: "embeddings"
|
||||||
|
endpoint: "http://127.0.0.1:11434"
|
||||||
|
runtime:
|
||||||
|
engine: "ollama"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "nomic-embed-text"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Start the node:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_node_singlebox.sh configs/node.singlebox.ollama.example.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option B: llama.cpp
|
||||||
|
|
||||||
|
Use this when you want direct GGUF serving with `llama-server`.
|
||||||
|
|
||||||
|
1. Start a chat server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-server -m /path/to/model.gguf --host 127.0.0.1 --port 18091
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Example peer service config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
- service_id: "peer1/chat/qwen3-8b"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:18091"
|
||||||
|
runtime:
|
||||||
|
engine: "llama.cpp"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "qwen3-8b-q4_k_m"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
```
|
||||||
|
|
||||||
|
Then start the node:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamacpp.example.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
Note:
|
||||||
|
|
||||||
|
- The official `llama.cpp` docs clearly show OpenAI-compatible chat serving.
|
||||||
|
- For embeddings, some `llama.cpp` builds document non-OpenAI embedding endpoints such as `/embedding`, so GenieHive’s current `POST /v1/embeddings` path is safest with Ollama or vLLM unless you have verified your specific build.
|
||||||
|
|
||||||
|
### Option C: llamafile
|
||||||
|
|
||||||
|
Use this when you want a single-file local server built around llama.cpp.
|
||||||
|
|
||||||
|
1. Start a chat server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./your-model.llamafile --server --host 127.0.0.1 --port 18091 --nobrowser
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Example peer service config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
- service_id: "peer1/chat/llamafile-qwen3"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:18091"
|
||||||
|
runtime:
|
||||||
|
engine: "llamafile"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "qwen3-8b-q4_k_m"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
```
|
||||||
|
|
||||||
|
Then start the node:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamafile.example.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option D: vLLM
|
||||||
|
|
||||||
|
Use this when you want a more server-oriented OpenAI-compatible stack and you have the hardware budget for it.
|
||||||
|
|
||||||
|
1. Start the server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Example peer service config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
- service_id: "peer1/chat/llama3-8b"
|
||||||
|
kind: "chat"
|
||||||
|
endpoint: "http://127.0.0.1:8000"
|
||||||
|
runtime:
|
||||||
|
engine: "vllm"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "NousResearch/Meta-Llama-3-8B-Instruct"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
|
||||||
|
- service_id: "peer1/embeddings/bge-base"
|
||||||
|
kind: "embeddings"
|
||||||
|
endpoint: "http://127.0.0.1:8001"
|
||||||
|
runtime:
|
||||||
|
engine: "vllm"
|
||||||
|
launcher: "external"
|
||||||
|
assets:
|
||||||
|
- asset_id: "BAAI/bge-base-en-v1.5"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
```
|
||||||
|
|
||||||
|
## Minimal Node Config Pattern
|
||||||
|
|
||||||
|
For a real peer host, the fields you most likely need to edit in `configs/node.example.yaml` are:
|
||||||
|
|
||||||
|
- `node.host_id`
|
||||||
|
- `node.display_name`
|
||||||
|
- `node.address`
|
||||||
|
- `control_plane.base_url`
|
||||||
|
- `control_plane.node_api_key`
|
||||||
|
- `inventory.capabilities`
|
||||||
|
- `services`
|
||||||
|
|
||||||
|
## Client Instructions
|
||||||
|
|
||||||
|
You now have two simple ways to exercise GenieHive as a client.
|
||||||
|
|
||||||
|
### Option 1: Inspect and call it manually
|
||||||
|
|
||||||
|
List models:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -sS http://127.0.0.1:8800/v1/models \
|
||||||
|
-H 'X-Api-Key: change-me-client-key'
|
||||||
|
```
|
||||||
|
|
||||||
|
Chat using a role:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -sS http://127.0.0.1:8800/v1/chat/completions \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-H 'X-Api-Key: change-me-client-key' \
|
||||||
|
-d '{
|
||||||
|
"model": "mentor",
|
||||||
|
"messages": [{"role":"user","content":"Give me a 2-sentence summary of why SQLite is useful here."}]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Embeddings using a direct embedding asset:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -sS http://127.0.0.1:8800/v1/embeddings \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-H 'X-Api-Key: change-me-client-key' \
|
||||||
|
-d '{
|
||||||
|
"model": "nomic-embed-text",
|
||||||
|
"input": "GenieHive is a local-first control plane."
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 2: Use the demo client agent
|
||||||
|
|
||||||
|
Run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
python scripts/demo_client_agent.py \
|
||||||
|
--base-url http://127.0.0.1:8800 \
|
||||||
|
--api-key change-me-client-key \
|
||||||
|
--task "Summarize the current GenieHive demo in three bullets."
|
||||||
|
```
|
||||||
|
|
||||||
|
That script will:
|
||||||
|
|
||||||
|
- read `GET /v1/models`
|
||||||
|
- choose a chat-capable model automatically if you do not specify one
|
||||||
|
- prefer entries GenieHive marks as suitable for lower-complexity offload
|
||||||
|
- submit a chat request and print the answer
|
||||||
|
|
||||||
|
If you want to force a specific route:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/demo_client_agent.py \
|
||||||
|
--base-url http://127.0.0.1:8800 \
|
||||||
|
--api-key change-me-client-key \
|
||||||
|
--model mentor \
|
||||||
|
--task "State what host and route type you would expect for this demo."
|
||||||
|
```
|
||||||
|
|
||||||
|
## Codex-As-Client
|
||||||
|
|
||||||
|
For Codex or another agentic client, the intended pattern is:
|
||||||
|
|
||||||
|
1. Read `GET /v1/models`.
|
||||||
|
2. Filter for `geniehive.operation == "chat"`.
|
||||||
|
3. Prefer:
|
||||||
|
- `geniehive.offload_hint.suitability == "good_for_low_complexity"`
|
||||||
|
- `geniehive.loaded_target_count > 0` for role entries
|
||||||
|
- lower `best_p50_latency_ms`
|
||||||
|
4. Send lower-complexity requests to GenieHive.
|
||||||
|
5. Keep higher-complexity, high-context, or high-risk tasks local unless the catalog indicates a better remote fit.
|
||||||
|
|
||||||
|
## Good First Live Demo
|
||||||
|
|
||||||
|
If you want the safest first success path:
|
||||||
|
|
||||||
|
- control plane on one host
|
||||||
|
- node agent on the same host
|
||||||
|
- Ollama upstream with one chat model
|
||||||
|
- role alias `mentor`
|
||||||
|
- demo client agent calling `mentor`
|
||||||
|
|
||||||
|
That avoids GGUF-specific launch tuning while still exercising the full GenieHive master/peer/client path.
|
||||||
|
|
||||||
|
## Single-Machine End-to-End Example
|
||||||
|
|
||||||
|
### Ollama-backed single box
|
||||||
|
|
||||||
|
1. Start Ollama:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ollama serve
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Pull models:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ollama pull qwen3
|
||||||
|
ollama pull nomic-embed-text
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Start GenieHive control:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_control_singlebox.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Start GenieHive node:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_node_singlebox.sh configs/node.singlebox.ollama.example.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Inspect:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/demo_inspect.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Run the client agent:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/demo_client_agent.py \
|
||||||
|
--base-url http://127.0.0.1:8800 \
|
||||||
|
--api-key change-me-client-key \
|
||||||
|
--task "Explain in three bullets what GenieHive is doing in this single-machine demo."
|
||||||
|
```
|
||||||
|
|
||||||
|
### llama.cpp-backed single box
|
||||||
|
|
||||||
|
1. Start the local server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-server -m /path/to/model.gguf --host 127.0.0.1 --port 18091
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Start GenieHive control:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_control_singlebox.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Start GenieHive node:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamacpp.example.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Run the client agent:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/demo_client_agent.py \
|
||||||
|
--base-url http://127.0.0.1:8800 \
|
||||||
|
--api-key change-me-client-key \
|
||||||
|
--task "Summarize why a single-machine GenieHive setup can still be useful."
|
||||||
|
```
|
||||||
|
|
||||||
|
## Host-Specific Note: Dual Tesla P40 + 128 GB RAM
|
||||||
|
|
||||||
|
For a machine with:
|
||||||
|
|
||||||
|
- `2 x Nvidia Tesla P40`
|
||||||
|
- `AMD Ryzen 5600G`
|
||||||
|
- `128 GB RAM`
|
||||||
|
|
||||||
|
the most practical first GenieHive layout is:
|
||||||
|
|
||||||
|
- one chat model on `GPU0`
|
||||||
|
- one chat or utility model on `GPU1`
|
||||||
|
- one slower fallback chat model on CPU
|
||||||
|
|
||||||
|
This is now sketched in:
|
||||||
|
|
||||||
|
- `configs/node.singlebox.p40-triple.example.yaml`
|
||||||
|
- `configs/control.singlebox.p40.example.yaml`
|
||||||
|
- `configs/roles.singlebox.p40.example.yaml`
|
||||||
|
- `scripts/start_p40_triple_llamacpp.sh`
|
||||||
|
- `scripts/launch_p40_triple.sh`
|
||||||
|
- `scripts/p40_triple_gpu0.sh`
|
||||||
|
- `scripts/p40_triple_gpu1.sh`
|
||||||
|
- `scripts/p40_triple_cpu.sh`
|
||||||
|
|
||||||
|
The current concrete defaults use models already present under `/home/netuser/bin/models/llm`:
|
||||||
|
|
||||||
|
- `GPU0`: `Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf`
|
||||||
|
- `GPU1`: `Qwen3.5-9B-Q5_K_M.gguf`
|
||||||
|
- `CPU`: `rocket-3b.Q5_K_M.gguf`
|
||||||
|
|
||||||
|
### Why this layout works
|
||||||
|
|
||||||
|
- each P40 has enough VRAM for a quantized 7B to 14B model comfortably
|
||||||
|
- 128 GB RAM is enough to hold a separate CPU-served fallback model without much trouble
|
||||||
|
- the CPU route will be much slower, but it is still useful for low-priority offload or fallback handling
|
||||||
|
|
||||||
|
### Suggested role usage
|
||||||
|
|
||||||
|
- `mentor` or primary chat role -> `GPU0`
|
||||||
|
- `general_assistant` or alternate chat role -> `GPU1`
|
||||||
|
- `fallback_writer` or `background_summarizer` -> CPU route
|
||||||
|
|
||||||
|
The repo now includes a host-specific role catalog with exactly that intent.
|
||||||
|
|
||||||
|
### Launch pattern
|
||||||
|
|
||||||
|
1. Review the model paths (edit the script only if the defaults are wrong for your host), then run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/start_p40_triple_llamacpp.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
If the defaults look good, you do not need to edit them before trying the first run.
|
||||||
|
|
||||||
|
If `tmux` is available, you can also launch the three processes detached:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/launch_p40_triple.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Then inspect pane state without binding your current terminal to the session:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/tmux_session_status.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
That status helper checks whether the session exists and whether each pane's launcher process is still running or has already exited. If `tmux` is not installed, the combined launcher prints the three helper commands instead.
|
||||||
|
|
||||||
|
2. Start the three `llama-server` processes in separate shells.
|
||||||
|
|
||||||
|
3. Start GenieHive control:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/run_control_singlebox.sh configs/control.singlebox.p40.example.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Start GenieHive node with the host-specific config:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/run_node_singlebox.sh configs/node.singlebox.p40-triple.example.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Inspect the catalog:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/demo_inspect.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
If something is not coming up cleanly, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/check_singlebox_health.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
That checks:
|
||||||
|
|
||||||
|
- `GPU0` upstream health
|
||||||
|
- `GPU1` upstream health
|
||||||
|
- CPU fallback upstream health
|
||||||
|
- GenieHive control health
|
||||||
|
- GenieHive node health
|
||||||
|
- authenticated cluster and model-catalog endpoints
|
||||||
|
|
||||||
|
6. Exercise the chat path:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/demo_client_agent.py \
|
||||||
|
--base-url http://127.0.0.1:8800 \
|
||||||
|
--api-key change-me-client-key \
|
||||||
|
--model mentor \
|
||||||
|
--task "State which route should be preferred for low-latency chat and which should be the slow fallback."
|
||||||
|
```
|
||||||
|
|
||||||
|
### Practical expectations
|
||||||
|
|
||||||
|
- `GPU0` and `GPU1` should be the preferred targets for normal chat work
|
||||||
|
- the CPU route should mostly be treated as fallback or low-priority background work
|
||||||
|
- GenieHive metadata should make that visible to clients through latency and offload hints
|
||||||
|
|
||||||
|
### Containerized Qwen3.5 probe
|
||||||
|
|
||||||
|
If the host-installed `llama-server` is too old for `Qwen3.5`, but the NVIDIA Container Toolkit is installed, you can test a newer CUDA-enabled `llama.cpp` without changing the host CUDA stack:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/test_qwen35_server_cuda_container.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Useful overrides:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
GPU_INDEX=1 PORT=19092 bash scripts/test_qwen35_server_cuda_container.sh
|
||||||
|
MODEL_PATH=/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf bash scripts/test_qwen35_server_cuda_container.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
That probe uses the official `ghcr.io/ggml-org/llama.cpp:server-cuda` image. If it loads the model and starts serving, then the remaining blocker is your host `llama.cpp` install, not GPU compatibility.
|
||||||
|
|
||||||
|
## External Client Access
|
||||||
|
|
||||||
|
For your current host addresses:
|
||||||
|
|
||||||
|
- LAN: `192.168.40.207`
|
||||||
|
- ZeroTier: `172.24.50.65`
|
||||||
|
|
||||||
|
The cleanest rule is:
|
||||||
|
|
||||||
|
- keep upstream model servers on `127.0.0.1`
|
||||||
|
- keep the GenieHive node on `127.0.0.1` unless you specifically need remote node access
|
||||||
|
- expose only the GenieHive control plane to LAN or ZeroTier clients
|
||||||
|
|
||||||
|
That gives remote clients a single stable endpoint without exposing the underlying model servers directly.
|
||||||
|
|
||||||
|
### LAN bind
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_control_p40_lan.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Remote client example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/demo_client_agent.py \
|
||||||
|
--base-url http://192.168.40.207:8800 \
|
||||||
|
--api-key change-me-client-key \
|
||||||
|
--model mentor \
|
||||||
|
--task "Briefly describe the preferred and fallback routes on this host."
|
||||||
|
```
|
||||||
|
|
||||||
|
### ZeroTier bind
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/netuser/bin/geniehive
|
||||||
|
bash scripts/run_control_p40_zerotier.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Remote client example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/demo_client_agent.py \
|
||||||
|
--base-url http://172.24.50.65:8800 \
|
||||||
|
--api-key change-me-client-key \
|
||||||
|
--model mentor \
|
||||||
|
--task "Briefly describe the preferred and fallback routes on this host."
|
||||||
|
```
|
||||||
|
|
||||||
|
### Security note
|
||||||
|
|
||||||
|
Prefer ZeroTier over general LAN exposure when possible. In both cases:
|
||||||
|
|
||||||
|
- do not expose the upstream `llama-server` ports
|
||||||
|
- keep the client API key enabled
|
||||||
|
- if you later open this beyond trusted networks, add a reverse proxy or VPN-only boundary rather than binding GenieHive broadly
|
||||||
|
|
||||||
|
### Role meanings for this host
|
||||||
|
|
||||||
|
- `mentor` should bias toward the `GPU0` Qwen2.5 14B route
|
||||||
|
- `general_assistant` should bias toward the `GPU1` Qwen3.5 9B route
|
||||||
|
- `background_summarizer` should bias toward the CPU Rocket 3B fallback route
|
||||||
|
|
@ -0,0 +1,94 @@
|
||||||
|
# GenieHive Reverse Proxy
|
||||||
|
|
||||||
|
For external clients, a reverse proxy is cleaner than binding GenieHive directly to every interface.
|
||||||
|
|
||||||
|
Recommended pattern:
|
||||||
|
|
||||||
|
- keep upstream model servers on `127.0.0.1`
|
||||||
|
- keep GenieHive node on `127.0.0.1`
|
||||||
|
- keep GenieHive control on `127.0.0.1`
|
||||||
|
- expose only the reverse proxy on LAN or ZeroTier
|
||||||
|
|
||||||
|
## Caddy Example
|
||||||
|
|
||||||
|
Config file:
|
||||||
|
|
||||||
|
```caddy
|
||||||
|
192.168.40.207:8080 {
|
||||||
|
reverse_proxy 127.0.0.1:8800
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
ZeroTier variant:
|
||||||
|
|
||||||
|
```caddy
|
||||||
|
172.24.50.65:8080 {
|
||||||
|
reverse_proxy 127.0.0.1:8800
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Advantages:
|
||||||
|
|
||||||
|
- simple config
|
||||||
|
- easy to move to TLS later
|
||||||
|
- good default operational behavior
|
||||||
|
|
||||||
|
## Nginx Example
|
||||||
|
|
||||||
|
Server block:
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
server {
|
||||||
|
listen 192.168.40.207:8080;
|
||||||
|
server_name _;
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_pass http://127.0.0.1:8800;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
ZeroTier variant:
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
server {
|
||||||
|
listen 172.24.50.65:8080;
|
||||||
|
server_name _;
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_pass http://127.0.0.1:8800;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Operational Recommendation
|
||||||
|
|
||||||
|
For your current host, the cleanest shape is:
|
||||||
|
|
||||||
|
1. GenieHive control on `127.0.0.1:8800`
|
||||||
|
2. reverse proxy on either:
|
||||||
|
- `192.168.40.207:8080`
|
||||||
|
- `172.24.50.65:8080`
|
||||||
|
3. clients talk only to the reverse proxy
|
||||||
|
|
||||||
|
## Client Example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/demo_client_agent.py \
|
||||||
|
--base-url http://172.24.50.65:8080 \
|
||||||
|
--api-key change-me-client-key \
|
||||||
|
--model mentor \
|
||||||
|
--task "Describe the preferred and fallback routes on this host."
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Note
|
||||||
|
|
||||||
|
The API key is still required. The reverse proxy improves exposure hygiene, but it is not a substitute for network trust boundaries.
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
# GenieHive Roadmap
|
||||||
|
|
||||||
|
## Completed Foundations
|
||||||
|
|
||||||
|
- control-plane registry with SQLite persistence
|
||||||
|
- node registration and heartbeat
|
||||||
|
- role catalog and route resolution
|
||||||
|
- client-facing `GET /v1/models`
|
||||||
|
- client-facing `POST /v1/chat/completions`
|
||||||
|
- client-facing `POST /v1/embeddings`
|
||||||
|
- first control-plus-node demo flow
|
||||||
|
|
||||||
|
## Immediate Next Milestones
|
||||||
|
|
||||||
|
1. Run and document the first live LLM demo against real upstream servers.
|
||||||
|
2. Validate the `GET /v1/models` metadata as a Codex-friendly offload catalog for lower-complexity tasks.
|
||||||
|
3. Add `POST /v1/audio/transcriptions`.
|
||||||
|
4. Add a richer node metrics model for queue depth, current load, and observed performance over time.
|
||||||
|
5. Add a stronger operator/client distinction in the public metadata and auth surfaces.
|
||||||
|
|
||||||
|
## LLM Demo Note
|
||||||
|
|
||||||
|
The project is now ready for a first live LLM demo using GenieHive as:
|
||||||
|
|
||||||
|
- master: control plane
|
||||||
|
- peer: one or more node agents with pre-existing local LLM servers
|
||||||
|
- client: a small demo agent or Codex configured against GenieHive
|
||||||
|
|
||||||
|
The current live-demo priority is chat-first. Embeddings are also wired in GenieHive, but upstream compatibility differs across local servers, so the safest first demo matrix is:
|
||||||
|
|
||||||
|
- Ollama for chat and embeddings
|
||||||
|
- vLLM for chat and embeddings
|
||||||
|
- llama.cpp for chat
|
||||||
|
- llamafile for chat
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
# GenieHive Schemas
|
||||||
|
|
||||||
|
These are canonical logical schemas for v1. They are documentation first, not final implementation code.
|
||||||
|
|
||||||
|
## Host
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
host:
|
||||||
|
host_id: "atlas-01"
|
||||||
|
display_name: "Atlas GPU Box"
|
||||||
|
address: "192.168.1.101"
|
||||||
|
labels:
|
||||||
|
site: "home-lab"
|
||||||
|
class: "gpu"
|
||||||
|
capabilities:
|
||||||
|
cuda: true
|
||||||
|
rocm: false
|
||||||
|
metal: false
|
||||||
|
resources:
|
||||||
|
cpu_threads: 24
|
||||||
|
ram_gb: 128
|
||||||
|
gpus:
|
||||||
|
- gpu_id: "cuda:0"
|
||||||
|
name: "RTX 4090"
|
||||||
|
vram_gb: 24
|
||||||
|
auth:
|
||||||
|
node_key_id: "nk_atlas_01"
|
||||||
|
status:
|
||||||
|
state: "online"
|
||||||
|
last_seen: "2026-04-05T15:30:00Z"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Service
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
service:
|
||||||
|
service_id: "atlas-01/chat/qwen3-8b"
|
||||||
|
host_id: "atlas-01"
|
||||||
|
kind: "chat"
|
||||||
|
protocol: "openai"
|
||||||
|
endpoint: "http://192.168.1.101:18091"
|
||||||
|
runtime:
|
||||||
|
engine: "llama.cpp"
|
||||||
|
launcher: "managed"
|
||||||
|
assets:
|
||||||
|
- asset_id: "qwen3-8b-q4km"
|
||||||
|
loaded: true
|
||||||
|
state:
|
||||||
|
health: "healthy"
|
||||||
|
load_state: "loaded"
|
||||||
|
accept_requests: true
|
||||||
|
observed:
|
||||||
|
p50_latency_ms: 920
|
||||||
|
p95_latency_ms: 1900
|
||||||
|
tokens_per_sec: 42
|
||||||
|
```
|
||||||
|
|
||||||
|
## Asset
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
asset:
|
||||||
|
asset_id: "qwen3-8b-q4km"
|
||||||
|
family: "Qwen3-8B"
|
||||||
|
modality: "text"
|
||||||
|
operation: "chat"
|
||||||
|
format: "gguf"
|
||||||
|
locator:
|
||||||
|
kind: "path"
|
||||||
|
value: "/models/qwen3-8b/qwen3-8b-q4_k_m.gguf"
|
||||||
|
metadata:
|
||||||
|
quant: "Q4_K_M"
|
||||||
|
ctx_train: 32768
|
||||||
|
```
|
||||||
|
|
||||||
|
## Role Profile
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
role:
|
||||||
|
role_id: "mentor"
|
||||||
|
display_name: "Mentor"
|
||||||
|
description: "Guidance-oriented instructional reasoning"
|
||||||
|
modality: "text"
|
||||||
|
operation: "chat"
|
||||||
|
prompt_policy:
|
||||||
|
system_prompt: "You guide without doing the user's work for them."
|
||||||
|
user_template: "{{ user_input }}"
|
||||||
|
routing_policy:
|
||||||
|
preferred_families: ["Qwen3", "Mistral"]
|
||||||
|
preferred_labels: ["instruction", "stable"]
|
||||||
|
min_context: 8192
|
||||||
|
require_loaded: false
|
||||||
|
fallback_roles: ["general_assistant"]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Health Sample
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
health_sample:
|
||||||
|
sample_id: "hs_01"
|
||||||
|
target_type: "service"
|
||||||
|
target_id: "atlas-01/chat/qwen3-8b"
|
||||||
|
observed_at: "2026-04-05T15:30:00Z"
|
||||||
|
status: "healthy"
|
||||||
|
checks:
|
||||||
|
http_ok: true
|
||||||
|
models_ok: true
|
||||||
|
auth_ok: true
|
||||||
|
metrics:
|
||||||
|
queue_depth: 1
|
||||||
|
in_flight: 1
|
||||||
|
mem_used_gb: 18.4
|
||||||
|
```
|
||||||
|
|
||||||
|
## Benchmark Sample
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
benchmark_sample:
|
||||||
|
benchmark_id: "bench_01"
|
||||||
|
service_id: "atlas-01/chat/qwen3-8b"
|
||||||
|
asset_id: "qwen3-8b-q4km"
|
||||||
|
observed_at: "2026-04-05T15:25:00Z"
|
||||||
|
workload: "chat.short_reasoning"
|
||||||
|
results:
|
||||||
|
prompt_tokens: 512
|
||||||
|
completion_tokens: 256
|
||||||
|
ttft_ms: 780
|
||||||
|
tokens_per_sec: 44
|
||||||
|
```
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
# Build backend: plain setuptools via PEP 517.
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "geniehive"
version = "0.1.0"
description = "Local-first control plane for heterogeneous generative AI services"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "fastapi>=0.110",
    "httpx>=0.27",
    "pydantic>=2.6",
    "pyyaml>=6.0.1",
    "uvicorn>=0.29",
]

[project.optional-dependencies]
dev = [
    "pytest>=8.0",
]

# src-layout: importable packages live under src/.
[tool.setuptools]
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Single-box smoke check: probe every local GenieHive endpoint and print
# one [ok]/[fail] line per target. The script always exits 0; read the
# output to spot failures.
#
# Base URL and client key follow the same env-override convention as the
# other demo scripts instead of being hard-coded twice.
CONTROL_BASE_URL="${GENIEHIVE_CONTROL_BASE_URL:-http://127.0.0.1:8800}"
CLIENT_KEY="${GENIEHIVE_CLIENT_KEY:-change-me-client-key}"

# check NAME URL: report whether a quiet, fail-fast GET succeeds.
check() {
  local name="$1"
  local url="$2"
  if curl -fsS "$url" >/dev/null 2>&1; then
    printf '[ok] %s -> %s\n' "$name" "$url"
  else
    printf '[fail] %s -> %s\n' "$name" "$url"
  fi
}

echo "GenieHive single-box health check"
echo

check "gpu0 upstream" "http://127.0.0.1:18091/health"
check "gpu1 upstream" "http://127.0.0.1:18092/health"
check "cpu upstream" "http://127.0.0.1:18093/health"
check "control plane" "$CONTROL_BASE_URL/health"
check "node agent" "http://127.0.0.1:8891/health"

echo
echo "Authenticated GenieHive checks"
echo

if curl -fsS "$CONTROL_BASE_URL/v1/cluster/health" -H "X-Api-Key: $CLIENT_KEY" >/dev/null 2>&1; then
  echo "[ok] cluster health endpoint"
else
  echo "[fail] cluster health endpoint"
fi

if curl -fsS "$CONTROL_BASE_URL/v1/models" -H "X-Api-Key: $CLIENT_KEY" >/dev/null 2>&1; then
  echo "[ok] model catalog endpoint"
else
  echo "[fail] model catalog endpoint"
fi
|
||||||
|
|
@ -0,0 +1,92 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_models(client: httpx.Client, base_url: str, api_key: str) -> list[dict[str, Any]]:
    """Return the model catalog advertised by the GenieHive control plane.

    Issues ``GET {base_url}/v1/models`` with the client API key header and
    returns the ``"data"`` list from the JSON body (empty list when absent).

    Raises:
        httpx.HTTPStatusError: on a non-2xx response.
    """
    url = f"{base_url.rstrip('/')}/v1/models"
    resp = client.get(url, headers={"X-Api-Key": api_key})
    resp.raise_for_status()
    payload = resp.json()
    return payload.get("data", [])
|
||||||
|
|
||||||
|
|
||||||
|
def choose_chat_model(models: list[dict[str, Any]]) -> str:
    """Pick the best chat-capable model id from the advertised catalog.

    Ranking (highest wins): offload suitability bucket, then role routes
    over direct routes, then lower observed p50 latency; remaining ties
    resolve to the lexicographically greatest model id.

    Raises:
        SystemExit: when no chat-capable model is advertised.
    """
    suitability_order = {
        "good_for_low_complexity": 3,
        "usable_for_background_tasks": 2,
        "available_but_slow": 1,
        "cold_only": 0,
    }
    ranked: list[tuple[int, int, float, str]] = []
    for entry in models:
        info = entry.get("geniehive", {})
        if info.get("operation") != "chat":
            continue
        hint = info.get("offload_hint", {})
        # Prefer the precomputed best latency; fall back to the live sample.
        p50 = info.get("best_p50_latency_ms")
        if p50 is None:
            p50 = info.get("observed", {}).get("p50_latency_ms")
        latency = float(p50) if p50 is not None else float("inf")
        ranked.append(
            (
                suitability_order.get(hint.get("suitability", ""), 0),
                1 if info.get("route_type") == "role" else 0,
                -latency,  # negate so max() favors the lowest latency
                entry["id"],
            )
        )
    if not ranked:
        raise SystemExit("No chat-capable models were advertised by GenieHive.")
    return max(ranked)[3]
|
||||||
|
|
||||||
|
|
||||||
|
def run_task(base_url: str, api_key: str, model: str, task: str) -> dict[str, Any]:
    """Send one chat-completion request and return the parsed JSON response.

    Posts a two-message conversation (fixed system prompt + the user task)
    to ``/v1/chat/completions`` with a generous 120s timeout.

    Raises:
        httpx.HTTPStatusError: on a non-2xx response.
    """
    endpoint = f"{base_url.rstrip('/')}/v1/chat/completions"
    headers = {
        "X-Api-Key": api_key,
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a concise demo client agent."},
            {"role": "user", "content": task},
        ],
    }
    with httpx.Client(timeout=120.0) as client:
        resp = client.post(endpoint, headers=headers, json=payload)
        resp.raise_for_status()
        return resp.json()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: list advertised models or run one chat task."""
    parser = argparse.ArgumentParser(description="Exercise GenieHive as a small client agent.")
    parser.add_argument("--base-url", required=True, help="GenieHive control-plane base URL")
    parser.add_argument("--api-key", required=True, help="GenieHive client API key")
    parser.add_argument("--model", help="Explicit chat model or role alias to use")
    parser.add_argument("--task", help="Task text to send")
    parser.add_argument("--list-models", action="store_true", help="List advertised models and exit")
    args = parser.parse_args()

    # Fetch the catalog first: needed for both --list-models and auto-pick.
    with httpx.Client(timeout=30.0) as client:
        catalog = fetch_models(client, args.base_url, args.api_key)

    if args.list_models:
        print(json.dumps(catalog, indent=2))
        return

    if not args.task:
        raise SystemExit("--task is required unless --list-models is used.")

    chosen = args.model or choose_chat_model(catalog)
    print(f"Using model: {chosen}")
    outcome = run_task(args.base_url, args.api_key, chosen, args.task)
    print(json.dumps(outcome, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Walk the read-only GenieHive client API and dump each JSON response,
# one endpoint per line. Target and key are env-overridable.
BASE_URL="${GENIEHIVE_CONTROL_BASE_URL:-http://127.0.0.1:8800}"
CLIENT_KEY="${GENIEHIVE_CLIENT_KEY:-change-me-client-key}"

ENDPOINTS=(
  "/v1/models"
  "/v1/cluster/health"
  "/v1/cluster/hosts"
  "/v1/cluster/services"
  "/v1/cluster/roles"
  "/v1/cluster/routes/resolve?model=mentor"
)

for endpoint in "${ENDPOINTS[@]}"; do
  curl -sS "$BASE_URL$endpoint" -H "X-Api-Key: $CLIENT_KEY"
  printf '\n'
done
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Start the three llama-server upstreams (gpu0/gpu1/cpu) in a detached tmux
# session; when tmux is absent, print the manual commands instead.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
SESSION="${GENIEHIVE_TMUX_SESSION:-geniehive-p40}"
STATUS_CMD="$ROOT/scripts/tmux_session_status.sh"

GPU0_CMD="$ROOT/scripts/p40_triple_gpu0.sh"
GPU1_CMD="$ROOT/scripts/p40_triple_gpu1.sh"
CPU_CMD="$ROOT/scripts/p40_triple_cpu.sh"

if command -v tmux >/dev/null 2>&1; then
  # Refuse to clobber an existing session; point the user at the status helper.
  if tmux has-session -t "$SESSION" 2>/dev/null; then
    echo "tmux session already exists: $SESSION"
    echo "Inspect panes with: bash '$STATUS_CMD' '$SESSION'"
    exit 1
  fi

  # One pane per upstream; remain-on-exit keeps dead panes visible for triage.
  tmux new-session -d -s "$SESSION" "cd '$ROOT' && bash '$GPU0_CMD'"
  tmux split-window -h -t "$SESSION:0" "cd '$ROOT' && bash '$GPU1_CMD'"
  tmux split-window -v -t "$SESSION:0" "cd '$ROOT' && bash '$CPU_CMD'"
  tmux set-option -t "$SESSION:0" remain-on-exit on >/dev/null
  tmux select-pane -t "$SESSION:0.0" -T gpu0 >/dev/null
  tmux select-pane -t "$SESSION:0.1" -T gpu1 >/dev/null
  tmux select-pane -t "$SESSION:0.2" -T cpu >/dev/null
  tmux select-layout -t "$SESSION" tiled >/dev/null
  echo "Started tmux session: $SESSION"
  echo "Inspect panes with: bash '$STATUS_CMD' '$SESSION'"
  echo "Attach manually only if needed: tmux attach -t $SESSION"
  exit 0
fi

echo "tmux not found. Run these in three shells:"
echo
echo "bash '$GPU0_CMD'"
echo "bash '$GPU1_CMD'"
echo "bash '$CPU_CMD'"
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Launch the CPU-only fallback chat model on :18093.
# -ngl 0 keeps every layer off the GPU; -t 12 uses 12 CPU threads.
# Model and binary paths are env-overridable.
MODEL_CPU="${MODEL_CPU:-/home/netuser/bin/models/llm/rocket-3b.Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"

exec "$LLAMA_SERVER_BIN" -m "$MODEL_CPU" --host 127.0.0.1 --port 18093 -ngl 0 -t 12
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Launch the GPU0 chat model on :18091, pinned to the first GPU via
# CUDA_VISIBLE_DEVICES=0. Model and binary paths are env-overridable.
MODEL_GPU0="${MODEL_GPU0:-/home/netuser/bin/models/llm/Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"

exec env CUDA_VISIBLE_DEVICES=0 "$LLAMA_SERVER_BIN" -m "$MODEL_GPU0" --host 127.0.0.1 --port 18091
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Launch the GPU1 chat model on :18092, either natively or inside the
# llama.cpp CUDA server container (GPU1_USE_CONTAINER=1).
# All knobs are env-overridable.
MODEL_GPU1="${MODEL_GPU1:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
HOST="${GPU1_HOST:-127.0.0.1}"
PORT="${GPU1_PORT:-18092}"
CTX_SIZE="${GPU1_CTX_SIZE:-4096}"
NGL="${GPU1_NGL:-999}"  # presumably "offload all layers" for llama.cpp -ngl — confirm
GPU_INDEX="${GPU1_INDEX:-1}"
USE_CONTAINER="${GPU1_USE_CONTAINER:-0}"
CONTAINER_IMAGE="${GPU1_CONTAINER_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"

if [[ "${USE_CONTAINER}" == "1" ]]; then
  # Container path: mount the model directory read-only; host networking
  # makes the bound port match the native path.
  exec docker run --rm --gpus all \
    --network host \
    -e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
    -v "$(dirname "${MODEL_GPU1}"):/models:ro" \
    "${CONTAINER_IMAGE}" \
    -m "/models/$(basename "${MODEL_GPU1}")" \
    -ngl "${NGL}" \
    --ctx-size "${CTX_SIZE}" \
    --host "${HOST}" \
    --port "${PORT}"
fi

# Native path: same flags, pinned to GPU_INDEX.
exec env CUDA_VISIBLE_DEVICES="${GPU_INDEX}" "$LLAMA_SERVER_BIN" \
  -m "$MODEL_GPU1" \
  -ngl "${NGL}" \
  --ctx-size "${CTX_SIZE}" \
  --host "${HOST}" \
  --port "${PORT}"
|
||||||
|
|
@ -0,0 +1,11 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Run the control plane with the example configs on 127.0.0.1:8800.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

export GENIEHIVE_CONTROL_CONFIG="$ROOT/configs/control.example.yaml"
export GENIEHIVE_ROLES_CONFIG="$ROOT/configs/roles.example.yaml"
# src-layout: make the packages importable without installing.
export PYTHONPATH="$ROOT/src"

exec python -m uvicorn geniehive_control.main:app --host 127.0.0.1 --port 8800
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# LAN variant: bind the single-box control plane to 192.168.40.207 by
# default (override via GENIEHIVE_BIND_HOST/PORT), then delegate to the
# generic single-box launcher with the P40 example config.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

export GENIEHIVE_BIND_HOST="${GENIEHIVE_BIND_HOST:-192.168.40.207}"
export GENIEHIVE_BIND_PORT="${GENIEHIVE_BIND_PORT:-8800}"

exec bash "$ROOT/scripts/run_control_singlebox.sh" "$ROOT/configs/control.singlebox.p40.example.yaml"
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# LAN variant: bind the single-box control plane to 172.24.50.65 by
# default (override via GENIEHIVE_BIND_HOST/PORT), then delegate to the
# generic single-box launcher with the P40 example config.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

export GENIEHIVE_BIND_HOST="${GENIEHIVE_BIND_HOST:-172.24.50.65}"
export GENIEHIVE_BIND_PORT="${GENIEHIVE_BIND_PORT:-8800}"

exec bash "$ROOT/scripts/run_control_singlebox.sh" "$ROOT/configs/control.singlebox.p40.example.yaml"
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Generic single-box control-plane launcher.
# $1 (optional): control config path; defaults to the single-box example.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

CONTROL_CONFIG="${1:-$ROOT/configs/control.singlebox.example.yaml}"

export GENIEHIVE_CONTROL_CONFIG="$CONTROL_CONFIG"
# Respect a caller-provided roles config; otherwise fall back to the example.
if [[ -z "${GENIEHIVE_ROLES_CONFIG:-}" ]]; then
  export GENIEHIVE_ROLES_CONFIG="$ROOT/configs/roles.example.yaml"
fi
# src-layout: make the packages importable without installing.
export PYTHONPATH="$ROOT/src"

HOST="${GENIEHIVE_BIND_HOST:-127.0.0.1}"
PORT="${GENIEHIVE_BIND_PORT:-8800}"

exec python -m uvicorn geniehive_control.main:app --host "$HOST" --port "$PORT"
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Run the node agent with the example config on 127.0.0.1:8891.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

export GENIEHIVE_NODE_CONFIG="$ROOT/configs/node.example.yaml"
# src-layout: make the packages importable without installing.
export PYTHONPATH="$ROOT/src"

exec python -m uvicorn geniehive_node.main:app --host 127.0.0.1 --port 8891
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Single-box node-agent launcher.
# $1 (optional): node config path; defaults to the ollama single-box example.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

NODE_CONFIG="${1:-$ROOT/configs/node.singlebox.ollama.example.yaml}"

export GENIEHIVE_NODE_CONFIG="$NODE_CONFIG"
# src-layout: make the packages importable without installing.
export PYTHONPATH="$ROOT/src"

HOST="${GENIEHIVE_NODE_BIND_HOST:-127.0.0.1}"
PORT="${GENIEHIVE_NODE_BIND_PORT:-8891}"

exec python -m uvicorn geniehive_node.main:app --host "$HOST" --port "$PORT"
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Example launcher pattern for:
# - GPU0 chat model on :18091
# - GPU1 chat model on :18092
# - CPU fallback chat model on :18093
#
# Defaults are based on models already present under /home/netuser/bin/models/llm.
# Override them via env vars if you want different weights.
#
# NOTE: this script only PRINTS the commands; it does not start anything.

MODEL_GPU0="${MODEL_GPU0:-/home/netuser/bin/models/llm/Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf}"
MODEL_GPU1="${MODEL_GPU1:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
MODEL_CPU="${MODEL_CPU:-/home/netuser/bin/models/llm/rocket-3b.Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"

echo "Start these in separate shells or tmux panes."
echo "Helper scripts are available too:"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_gpu0.sh"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_gpu1.sh"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_cpu.sh"
echo
echo "Or try the combined launcher:"
echo "bash /home/netuser/bin/geniehive/scripts/launch_p40_triple.sh"
echo
echo "Equivalent raw commands:"
echo
echo "CUDA_VISIBLE_DEVICES=0 \"$LLAMA_SERVER_BIN\" -m \"$MODEL_GPU0\" --host 127.0.0.1 --port 18091"
echo
echo "CUDA_VISIBLE_DEVICES=1 \"$LLAMA_SERVER_BIN\" -m \"$MODEL_GPU1\" --host 127.0.0.1 --port 18092"
echo
echo "\"$LLAMA_SERVER_BIN\" -m \"$MODEL_CPU\" --host 127.0.0.1 --port 18093 -ngl 0 -t 12"
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Probe: run the llama.cpp CUDA server container against one model and
# confirm it can load the weights and start serving. The run is bounded by
# `timeout`, so a timeout exit AFTER a successful startup still counts as a
# pass for this test.
IMAGE="${IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"
MODEL_PATH="${MODEL_PATH:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
GPU_INDEX="${GPU_INDEX:-0}"
CTX_SIZE="${CTX_SIZE:-512}"  # small context: only startup is being tested
PORT="${PORT:-19091}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-90}"

if [[ ! -f "${MODEL_PATH}" ]]; then
  echo "Model not found: ${MODEL_PATH}" >&2
  exit 1
fi

echo "Image: ${IMAGE}"
echo "Model: ${MODEL_PATH}"
echo "GPU: ${GPU_INDEX}"
echo "Port: ${PORT}"
echo "Timeout: ${TIMEOUT_SECONDS}s"
echo
echo "This probe is successful if llama-server loads the model and begins serving."
echo "A timeout exit after successful startup is acceptable for this test."
echo

# Mount the model directory read-only; the container only needs the weights.
timeout "${TIMEOUT_SECONDS}"s docker run --rm --gpus all \
  -e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
  -v "$(dirname "${MODEL_PATH}"):/models:ro" \
  "${IMAGE}" \
  -m "/models/$(basename "${MODEL_PATH}")" \
  -ngl 999 \
  --ctx-size "${CTX_SIZE}" \
  --host 127.0.0.1 \
  --port "${PORT}"
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Report per-pane status of a GenieHive tmux session.
# Exit codes: 0 at least one pane alive, 1 session missing,
#             2 all pane processes exited, 127 tmux not installed.
SESSION="${1:-${GENIEHIVE_TMUX_SESSION:-geniehive-p40}}"

if ! command -v tmux >/dev/null 2>&1; then
  echo "tmux not found"
  exit 127
fi

if ! tmux has-session -t "$SESSION" 2>/dev/null; then
  echo "tmux session not found: $SESSION"
  exit 1
fi

printf 'tmux session: %s\n' "$SESSION"
printf '%-6s %-8s %-10s %-8s %s\n' "pane" "title" "state" "status" "command"

live_count=0
# Read tab-separated pane fields from tmux. A pane counts as running only
# when tmux reports it not dead AND kill -0 confirms the PID still exists.
while IFS=$'\t' read -r pane_id pane_title pane_pid pane_dead pane_dead_status pane_start_command; do
  state="exited"
  status="$pane_dead_status"
  if [[ "$pane_dead" == "0" ]] && kill -0 "$pane_pid" 2>/dev/null; then
    state="running"
    status="-"
    live_count=$((live_count + 1))
  fi

  printf '%-6s %-8s %-10s %-8s %s\n' "$pane_id" "${pane_title:--}" "$state" "${status:--}" "$pane_start_command"
done < <(
  tmux list-panes -t "$SESSION" -F "#{pane_index}\t#{pane_title}\t#{pane_pid}\t#{pane_dead}\t#{pane_dead_status}\t#{pane_start_command}"
)

if [[ "$live_count" -eq 0 ]]; then
  echo
  echo "No pane processes are still running."
  exit 2
fi
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
"""GenieHive control-plane package."""
|
||||||
|
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import HTTPException, Request, status
|
||||||
|
|
||||||
|
|
||||||
|
def _check_key(request: Request, allowed_keys: list[str], header_name: str) -> None:
|
||||||
|
if not allowed_keys:
|
||||||
|
return
|
||||||
|
provided = request.headers.get(header_name)
|
||||||
|
if provided in allowed_keys:
|
||||||
|
return
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||||
|
detail="unauthorized",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def require_client_auth(request: Request) -> None:
    """FastAPI dependency enforcing the client-facing X-Api-Key header."""
    app_cfg = request.app.state.cfg
    _check_key(request, app_cfg.auth.client_api_keys, "X-Api-Key")
|
||||||
|
|
||||||
|
|
||||||
|
def require_node_auth(request: Request) -> None:
    """FastAPI dependency enforcing the node-agent X-GenieHive-Node-Key header."""
    app_cfg = request.app.state.cfg
    _check_key(request, app_cfg.auth.node_api_keys, "X-GenieHive-Node-Key")
|
||||||
|
|
@ -0,0 +1,74 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .registry import Registry
|
||||||
|
from .routing import choose_upstream_model_id
|
||||||
|
from .upstream import UpstreamClient
|
||||||
|
|
||||||
|
|
||||||
|
class ProxyError(RuntimeError):
    """Routing or validation failure raised while proxying a client request.

    Carries the HTTP status code the API layer should send back.
    """

    def __init__(self, message: str, *, status_code: int) -> None:
        self.status_code = status_code
        super().__init__(message)
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_reasoning_fields(payload: Any) -> Any:
|
||||||
|
if isinstance(payload, list):
|
||||||
|
return [_strip_reasoning_fields(item) for item in payload]
|
||||||
|
if not isinstance(payload, dict):
|
||||||
|
return payload
|
||||||
|
|
||||||
|
cleaned: dict[str, Any] = {}
|
||||||
|
for key, value in payload.items():
|
||||||
|
if key in {"reasoning_content", "reasoning"}:
|
||||||
|
continue
|
||||||
|
cleaned[key] = _strip_reasoning_fields(value)
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
async def proxy_chat_completion(
    body: dict[str, Any],
    *,
    registry: Registry,
    upstream: UpstreamClient,
) -> Any:
    """Route an OpenAI-style chat request to a healthy upstream and relay it.

    Resolves the requested model (or role alias) via the registry, rewrites
    the 'model' field to the upstream's own model id, forwards the request,
    and strips reasoning fields from the response before returning it.

    Raises:
        ProxyError: 400 missing model, 404 unknown route, 503 no healthy target.
    """
    requested = body.get("model")
    if not requested:
        raise ProxyError("Missing 'model' in request body.", status_code=400)

    route = registry.resolve_route(requested, kind="chat")
    if route is None:
        raise ProxyError(f"Unknown model or role '{requested}'.", status_code=404)

    target = route.get("service")
    if target is None:
        raise ProxyError(f"No healthy chat target available for '{requested}'.", status_code=503)

    forwarded = dict(body)
    forwarded["model"] = choose_upstream_model_id(requested, target)
    raw = await upstream.chat_completions(target["endpoint"], forwarded)
    return _strip_reasoning_fields(raw)
|
||||||
|
|
||||||
|
|
||||||
|
async def proxy_embeddings(
    body: dict[str, Any],
    *,
    registry: Registry,
    upstream: UpstreamClient,
) -> Any:
    """Route an OpenAI-style embeddings request to a healthy upstream.

    Mirrors proxy_chat_completion but resolves against the embeddings route
    table and relays the upstream response unmodified.

    Raises:
        ProxyError: 400 missing model, 404 unknown route, 503 no healthy target.
    """
    requested = body.get("model")
    if not requested:
        raise ProxyError("Missing 'model' in request body.", status_code=400)

    route = registry.resolve_route(requested, kind="embeddings")
    if route is None:
        raise ProxyError(f"Unknown model or role '{requested}'.", status_code=404)

    target = route.get("service")
    if target is None:
        raise ProxyError(f"No healthy embeddings target available for '{requested}'.", status_code=503)

    forwarded = dict(body)
    forwarded["model"] = choose_upstream_model_id(requested, target)
    return await upstream.embeddings(target["endpoint"], forwarded)
|
||||||
|
|
@ -0,0 +1,40 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class ServerConfig(BaseModel):
    """Bind address for the control-plane HTTP server."""

    host: str = "127.0.0.1"
    port: int = 8800


class AuthConfig(BaseModel):
    """Accepted API keys; an empty list disables auth for that audience."""

    # Keys clients present via X-Api-Key.
    client_api_keys: list[str] = Field(default_factory=list)
    # Keys node agents present via X-GenieHive-Node-Key.
    node_api_keys: list[str] = Field(default_factory=list)


class StorageConfig(BaseModel):
    """Where the registry persists its SQLite database."""

    sqlite_path: str = "state/geniehive.sqlite3"


class RoutingConfig(BaseModel):
    """Routing knobs for the control plane."""

    default_strategy: str = "loaded_first"
    # Staleness window (seconds) passed to Registry.cluster_health.
    health_stale_after_s: float = 30.0


class ControlConfig(BaseModel):
    """Root of the control-plane YAML config; every section has defaults."""

    server: ServerConfig = Field(default_factory=ServerConfig)
    auth: AuthConfig = Field(default_factory=AuthConfig)
    storage: StorageConfig = Field(default_factory=StorageConfig)
    routing: RoutingConfig = Field(default_factory=RoutingConfig)
    # Optional roles YAML path; GENIEHIVE_ROLES_CONFIG env is the fallback.
    roles_path: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(path: str | Path) -> ControlConfig:
    """Load and validate a control-plane config from a YAML file.

    Args:
        path: Path to the YAML document; an empty file yields all defaults.

    Returns:
        A validated ControlConfig.

    Raises:
        ValueError: If the YAML root is not a mapping.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the host locale.
    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    if not isinstance(raw, dict):
        raise ValueError("Control config must be a YAML mapping.")
    return ControlConfig.model_validate(raw)
|
||||||
|
|
@ -0,0 +1,127 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import Depends, FastAPI, Request
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
|
from .auth import require_client_auth, require_node_auth
|
||||||
|
from .chat import ProxyError, proxy_chat_completion, proxy_embeddings
|
||||||
|
from .config import ControlConfig, load_config
|
||||||
|
from .models import HostHeartbeat, HostRegistration
|
||||||
|
from .roles import load_role_catalog
|
||||||
|
from .registry import Registry
|
||||||
|
from .upstream import UpstreamClient, UpstreamError
|
||||||
|
|
||||||
|
|
||||||
|
def create_app(
    config_path: str | Path | None = None,
    *,
    upstream_client: UpstreamClient | None = None,
) -> FastAPI:
    """Build the GenieHive control-plane FastAPI application.

    Args:
        config_path: Control config YAML; falls back to the
            GENIEHIVE_CONTROL_CONFIG env var, then to all defaults.
        upstream_client: Injectable upstream HTTP client (for tests).

    Returns:
        A configured FastAPI app with registry, config, and upstream client
        attached to ``app.state``.
    """
    cfg_path = config_path or os.environ.get("GENIEHIVE_CONTROL_CONFIG")
    cfg = load_config(cfg_path) if cfg_path else ControlConfig()
    registry = Registry(cfg.storage.sqlite_path)
    # Preload role aliases so role routes resolve from the first request.
    roles_path = cfg.roles_path or os.environ.get("GENIEHIVE_ROLES_CONFIG")
    if roles_path:
        registry.upsert_roles(load_role_catalog(roles_path).roles)
    upstream = upstream_client or UpstreamClient()

    app = FastAPI(title="GenieHive Control", version="0.1.0")
    # Shared singletons for route handlers and auth dependencies.
    app.state.cfg = cfg
    app.state.registry = registry
    app.state.upstream = upstream

    @app.get("/health")
    async def health() -> dict[str, str]:
        # Unauthenticated liveness probe.
        return {"status": "ok"}

    @app.post("/v1/nodes/register")
    async def register_node(request: Request, _=Depends(require_node_auth)) -> dict:
        # Node agent announces itself and its services.
        payload = await request.json()
        reg = HostRegistration.model_validate(payload)
        host = request.app.state.registry.register_host(reg)
        return {"status": "ok", "host": host}

    @app.post("/v1/nodes/heartbeat")
    async def heartbeat_node(request: Request, _=Depends(require_node_auth)):
        # Periodic refresh; 404 when the host never registered.
        payload = await request.json()
        hb = HostHeartbeat.model_validate(payload)
        host = request.app.state.registry.heartbeat_host(hb)
        if host is None:
            return JSONResponse(status_code=404, content={"error": "unknown_host", "host_id": hb.host_id})
        return {"status": "ok", "host": host}

    @app.get("/v1/cluster/hosts")
    async def list_hosts(request: Request, _=Depends(require_client_auth)) -> dict:
        return {"object": "list", "data": request.app.state.registry.list_hosts()}

    @app.get("/v1/models")
    async def list_models(request: Request, _=Depends(require_client_auth)) -> dict:
        # OpenAI-compatible model catalog.
        return {"object": "list", "data": request.app.state.registry.list_client_models()}

    @app.post("/v1/chat/completions")
    async def chat_completions(request: Request, _=Depends(require_client_auth)):
        # OpenAI-compatible chat endpoint; proxy errors are mapped to
        # structured JSON errors with the appropriate status code.
        body = await request.json()
        try:
            return await proxy_chat_completion(
                body,
                registry=request.app.state.registry,
                upstream=request.app.state.upstream,
            )
        except ProxyError as exc:
            return JSONResponse(
                status_code=exc.status_code,
                content={"error": {"message": str(exc), "type": "geniehive_error", "code": "chat_proxy_error"}},
            )
        except UpstreamError as exc:
            return JSONResponse(
                status_code=exc.status_code or 502,
                content={"error": {"message": str(exc), "type": "geniehive_error", "code": "upstream_error"}},
            )

    @app.post("/v1/embeddings")
    async def embeddings(request: Request, _=Depends(require_client_auth)):
        # OpenAI-compatible embeddings endpoint; same error mapping as chat.
        body = await request.json()
        try:
            return await proxy_embeddings(
                body,
                registry=request.app.state.registry,
                upstream=request.app.state.upstream,
            )
        except ProxyError as exc:
            return JSONResponse(
                status_code=exc.status_code,
                content={"error": {"message": str(exc), "type": "geniehive_error", "code": "embeddings_proxy_error"}},
            )
        except UpstreamError as exc:
            return JSONResponse(
                status_code=exc.status_code or 502,
                content={"error": {"message": str(exc), "type": "geniehive_error", "code": "upstream_error"}},
            )

    @app.get("/v1/cluster/services")
    async def list_services(request: Request, _=Depends(require_client_auth)) -> dict:
        return {"object": "list", "data": request.app.state.registry.list_services()}

    @app.get("/v1/cluster/roles")
    async def list_roles(request: Request, _=Depends(require_client_auth)) -> dict:
        return {"object": "list", "data": request.app.state.registry.list_roles()}

    @app.get("/v1/cluster/health")
    async def cluster_health(request: Request, _=Depends(require_client_auth)) -> dict:
        # Aggregated health using the configured heartbeat-staleness window.
        cfg: ControlConfig = request.app.state.cfg
        return request.app.state.registry.cluster_health(cfg.routing.health_stale_after_s)

    @app.get("/v1/cluster/routes/resolve")
    async def resolve_route(model: str, request: Request, kind: str | None = None, _=Depends(require_client_auth)) -> dict:
        # Dry-run route resolution: which service would serve this model/role.
        resolved = request.app.state.registry.resolve_route(model, kind=kind)
        if resolved is None:
            return JSONResponse(status_code=404, content={"error": "no_route", "model": model, "kind": kind})
        return {"status": "ok", "resolution": resolved}

    return app
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level ASGI app so `uvicorn geniehive_control.main:app` works.
app = create_app()
|
||||||
|
|
@ -0,0 +1,90 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Literal
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class ServiceAsset(BaseModel):
    """A model artifact a service can serve, and whether it is resident."""

    asset_id: str
    # True when the weights are currently loaded on the service.
    loaded: bool = False


class ServiceRuntime(BaseModel):
    """How the service process is run (engine binary, launcher script)."""

    engine: str | None = None
    launcher: str | None = None


class ServiceState(BaseModel):
    """Service lifecycle flags reported by the node agent."""

    health: str | None = None
    load_state: str | None = None
    # NOTE(review): presumably False tells the router to skip this service
    # even when healthy — confirm against the registry's routing logic.
    accept_requests: bool = True


class ServiceObserved(BaseModel):
    """Runtime measurements reported for a service; all optional."""

    p50_latency_ms: float | None = None
    p95_latency_ms: float | None = None
    tokens_per_sec: float | None = None
    queue_depth: int | None = None
    in_flight: int | None = None


class RegisteredService(BaseModel):
    """One inference endpoint exposed by a host.

    `kind` restricts routing to chat/embeddings/transcription; `protocol`
    defaults to the OpenAI-compatible wire format.
    """

    service_id: str
    host_id: str
    kind: Literal["chat", "embeddings", "transcription"]
    protocol: str = "openai"
    endpoint: str
    runtime: ServiceRuntime = Field(default_factory=ServiceRuntime)
    assets: list[ServiceAsset] = Field(default_factory=list)
    state: ServiceState = Field(default_factory=ServiceState)
    observed: ServiceObserved = Field(default_factory=ServiceObserved)
|
||||||
|
|
||||||
|
|
||||||
|
class HostStatus(BaseModel):
|
||||||
|
state: str = "online"
|
||||||
|
last_seen: float | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class HostRegistration(BaseModel):
    """Payload a node sends to register itself and its services."""

    host_id: str
    display_name: str | None = None
    address: str
    # Free-form key/value labels used for routing preferences.
    labels: dict[str, str] = Field(default_factory=dict)
    capabilities: dict[str, Any] = Field(default_factory=dict)
    resources: dict[str, Any] = Field(default_factory=dict)
    # Full service list; replaces any previously registered services.
    services: list[RegisteredService] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class HostHeartbeat(BaseModel):
    """Periodic liveness update from a registered host."""

    host_id: str
    status: HostStatus = Field(default_factory=HostStatus)
    metrics: dict[str, Any] = Field(default_factory=dict)
    # Optional: when non-empty, replaces the host's service set
    # (see Registry.heartbeat_host).
    services: list[RegisteredService] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class PromptPolicy(BaseModel):
    """Prompt templating attached to a role; both parts optional."""

    system_prompt: str | None = None
    user_template: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class RoutingPolicy(BaseModel):
    """Preferences that influence service selection for a role."""

    # Model family substrings matched (case-insensitively) against asset ids.
    preferred_families: list[str] = Field(default_factory=list)
    preferred_labels: list[str] = Field(default_factory=list)
    min_context: int | None = None
    # When True, candidates with a loaded asset are preferred exclusively
    # (soft requirement — see Registry.resolve_route).
    require_loaded: bool = False
    fallback_roles: list[str] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class RoleProfile(BaseModel):
    """A named, routable "role" abstraction clients can address as a model id."""

    role_id: str
    display_name: str | None = None
    description: str | None = None
    # Which operation the role maps to; must match a service "kind".
    operation: Literal["chat", "embeddings", "transcription"]
    modality: str
    prompt_policy: PromptPolicy = Field(default_factory=PromptPolicy)
    routing_policy: RoutingPolicy = Field(default_factory=RoutingPolicy)
|
||||||
|
|
||||||
|
|
||||||
|
class RoleCatalog(BaseModel):
    """Top-level container for a set of role profiles (YAML catalog root)."""

    roles: list[RoleProfile] = Field(default_factory=list)
|
||||||
|
|
@ -0,0 +1,464 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .models import HostHeartbeat, HostRegistration, RegisteredService, RoleProfile
|
||||||
|
|
||||||
|
|
||||||
|
def _json_dumps(value: object) -> str:
|
||||||
|
return json.dumps(value, sort_keys=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Registry:
|
||||||
|
def __init__(self, db_path: str | Path) -> None:
    """Open (or create) the registry database at *db_path*.

    Creates parent directories as needed and ensures the schema exists.
    """
    self.db_path = Path(db_path)
    self.db_path.parent.mkdir(parents=True, exist_ok=True)
    self._init_db()
|
||||||
|
|
||||||
|
def _connect(self) -> sqlite3.Connection:
    """Return a fresh sqlite connection with dict-like row access.

    Each call opens a new connection; callers use it as a context manager
    so the transaction commits (or rolls back) on exit.
    """
    conn = sqlite3.connect(self.db_path)
    conn.row_factory = sqlite3.Row
    return conn
|
||||||
|
|
||||||
|
def _init_db(self) -> None:
    """Create the hosts/services/roles tables if they do not already exist.

    Nested JSON payloads are stored as TEXT columns (``*_json``) rather
    than normalized tables; services reference their host via FK.
    """
    with self._connect() as conn:
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS hosts (
                host_id TEXT PRIMARY KEY,
                display_name TEXT,
                address TEXT NOT NULL,
                labels_json TEXT NOT NULL,
                capabilities_json TEXT NOT NULL,
                resources_json TEXT NOT NULL,
                status_state TEXT NOT NULL DEFAULT 'online',
                last_seen REAL NOT NULL,
                metrics_json TEXT NOT NULL DEFAULT '{}'
            );

            CREATE TABLE IF NOT EXISTS services (
                service_id TEXT PRIMARY KEY,
                host_id TEXT NOT NULL,
                kind TEXT NOT NULL,
                protocol TEXT NOT NULL,
                endpoint TEXT NOT NULL,
                runtime_json TEXT NOT NULL,
                assets_json TEXT NOT NULL,
                state_json TEXT NOT NULL,
                observed_json TEXT NOT NULL,
                updated_at REAL NOT NULL,
                FOREIGN KEY(host_id) REFERENCES hosts(host_id)
            );

            CREATE TABLE IF NOT EXISTS roles (
                role_id TEXT PRIMARY KEY,
                display_name TEXT,
                description TEXT,
                operation TEXT NOT NULL,
                modality TEXT NOT NULL,
                prompt_policy_json TEXT NOT NULL,
                routing_policy_json TEXT NOT NULL,
                updated_at REAL NOT NULL
            );
            """
        )
|
||||||
|
|
||||||
|
def register_host(self, reg: HostRegistration) -> dict:
    """Insert or update a host record and replace its service set.

    Registration always flips the host back to 'online' and refreshes
    last_seen; metrics are reset to '{}' only on first insert (the upsert
    branch leaves metrics_json untouched).  Returns the stored host dict.
    """
    now = time.time()
    with self._connect() as conn:
        conn.execute(
            """
            INSERT INTO hosts (
                host_id, display_name, address, labels_json, capabilities_json,
                resources_json, status_state, last_seen, metrics_json
            )
            VALUES (?, ?, ?, ?, ?, ?, 'online', ?, '{}')
            ON CONFLICT(host_id) DO UPDATE SET
                display_name=excluded.display_name,
                address=excluded.address,
                labels_json=excluded.labels_json,
                capabilities_json=excluded.capabilities_json,
                resources_json=excluded.resources_json,
                status_state='online',
                last_seen=excluded.last_seen
            """,
            (
                reg.host_id,
                reg.display_name,
                reg.address,
                _json_dumps(reg.labels),
                _json_dumps(reg.capabilities),
                _json_dumps(reg.resources),
                now,
            ),
        )
        # Registration is authoritative: wipe and rewrite the service rows.
        self._replace_services(conn, reg.host_id, reg.services, now)
    # Read back after the `with` block so the transaction has committed.
    return self.get_host(reg.host_id)
|
||||||
|
|
||||||
|
def heartbeat_host(self, hb: HostHeartbeat) -> dict | None:
    """Record a heartbeat for a known host; return None for unknown hosts.

    Updates status/last_seen/metrics.  Services are replaced only when the
    heartbeat carries a non-empty list, so light heartbeats leave the
    service table untouched.
    """
    now = time.time()
    with self._connect() as conn:
        cur = conn.execute(
            "SELECT host_id FROM hosts WHERE host_id = ?",
            (hb.host_id,),
        )
        if cur.fetchone() is None:
            # Unknown host: heartbeats do not implicitly register.
            return None
        conn.execute(
            """
            UPDATE hosts
            SET status_state = ?, last_seen = ?, metrics_json = ?
            WHERE host_id = ?
            """,
            (
                hb.status.state,
                now,
                _json_dumps(hb.metrics),
                hb.host_id,
            ),
        )
        if hb.services:
            self._replace_services(conn, hb.host_id, hb.services, now)
    # Read back after commit.
    return self.get_host(hb.host_id)
|
||||||
|
|
||||||
|
def _replace_services(
    self,
    conn: sqlite3.Connection,
    host_id: str,
    services: list[RegisteredService],
    now: float,
) -> None:
    """Delete-then-insert the full service set for *host_id*.

    Runs inside the caller's open transaction on *conn*; nested models are
    serialized to sorted-key JSON columns.
    """
    conn.execute("DELETE FROM services WHERE host_id = ?", (host_id,))
    for service in services:
        conn.execute(
            """
            INSERT INTO services (
                service_id, host_id, kind, protocol, endpoint,
                runtime_json, assets_json, state_json, observed_json, updated_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                service.service_id,
                host_id,
                service.kind,
                service.protocol,
                service.endpoint,
                _json_dumps(service.runtime.model_dump()),
                _json_dumps([asset.model_dump() for asset in service.assets]),
                _json_dumps(service.state.model_dump()),
                _json_dumps(service.observed.model_dump()),
                now,
            ),
        )
|
||||||
|
|
||||||
|
def get_host(self, host_id: str) -> dict | None:
    """Fetch one host as a plain dict, or None when it does not exist."""
    with self._connect() as conn:
        row = conn.execute("SELECT * FROM hosts WHERE host_id = ?", (host_id,)).fetchone()
        if row is None:
            return None
        return self._host_row_to_dict(row)
|
||||||
|
|
||||||
|
def upsert_roles(self, roles: list[RoleProfile]) -> list[dict]:
    """Insert or update each role profile, then return the full role list.

    All upserts share one transaction and one ``updated_at`` timestamp.
    """
    now = time.time()
    with self._connect() as conn:
        for role in roles:
            conn.execute(
                """
                INSERT INTO roles (
                    role_id, display_name, description, operation, modality,
                    prompt_policy_json, routing_policy_json, updated_at
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(role_id) DO UPDATE SET
                    display_name=excluded.display_name,
                    description=excluded.description,
                    operation=excluded.operation,
                    modality=excluded.modality,
                    prompt_policy_json=excluded.prompt_policy_json,
                    routing_policy_json=excluded.routing_policy_json,
                    updated_at=excluded.updated_at
                """,
                (
                    role.role_id,
                    role.display_name,
                    role.description,
                    role.operation,
                    role.modality,
                    _json_dumps(role.prompt_policy.model_dump()),
                    _json_dumps(role.routing_policy.model_dump()),
                    now,
                ),
            )
    # Read back after commit so callers see the persisted state.
    return self.list_roles()
|
||||||
|
|
||||||
|
def get_role(self, role_id: str) -> dict | None:
    """Fetch one role as a plain dict, or None when it does not exist."""
    with self._connect() as conn:
        row = conn.execute("SELECT * FROM roles WHERE role_id = ?", (role_id,)).fetchone()
        if row is None:
            return None
        return self._role_row_to_dict(row)
|
||||||
|
|
||||||
|
def list_roles(self) -> list[dict]:
    """Return every role as a dict, ordered by role_id."""
    with self._connect() as conn:
        rows = conn.execute("SELECT * FROM roles ORDER BY role_id").fetchall()
        return [self._role_row_to_dict(row) for row in rows]
|
||||||
|
|
||||||
|
def list_hosts(self) -> list[dict]:
    """Return every host as a dict, ordered by host_id."""
    with self._connect() as conn:
        rows = conn.execute("SELECT * FROM hosts ORDER BY host_id").fetchall()
        return [self._host_row_to_dict(row) for row in rows]
|
||||||
|
|
||||||
|
def list_services(self) -> list[dict]:
    """Return every service as a dict, ordered by (host_id, service_id)."""
    with self._connect() as conn:
        rows = conn.execute("SELECT * FROM services ORDER BY host_id, service_id").fetchall()
        return [self._service_row_to_dict(row) for row in rows]
|
||||||
|
|
||||||
|
def list_client_models(self) -> list[dict]:
    """Build the OpenAI-style ``/v1/models`` listing exposed to clients.

    Emits one entry per healthy, accepting service, one per asset of such
    services, and one per role (roles are listed even with zero healthy
    targets).  Duplicate ids are deduplicated last-wins, so a role entry
    overrides a service/asset entry with the same id; output is sorted by id.
    """
    services = self.list_services()
    roles = self.list_roles()
    items: list[dict] = []

    for service in services:
        # Only advertise services that can actually take traffic now.
        if not service["state"].get("accept_requests", True):
            continue
        if service["state"].get("health") != "healthy":
            continue
        item = {
            "id": service["service_id"],
            "object": "model",
            "owned_by": service["host_id"],
            "geniehive": self._service_metadata(service),
        }
        items.append(item)
        # Each named asset is also addressable as its own "model" id.
        for asset in service["assets"]:
            asset_id = asset.get("asset_id")
            if not asset_id:
                continue
            items.append(
                {
                    "id": asset_id,
                    "object": "model",
                    "owned_by": service["host_id"],
                    "geniehive": self._service_metadata(service) | {"route_type": "asset", "asset_id": asset_id},
                }
            )

    for role in roles:
        # Healthy, accepting services whose kind matches the role's operation.
        matching_services = [
            service
            for service in services
            if service["kind"] == role["operation"]
            and service["state"].get("accept_requests", True)
            and service["state"].get("health") == "healthy"
        ]
        loaded_count = sum(1 for service in matching_services if any(asset.get("loaded") for asset in service["assets"]))
        latencies = [
            service["observed"].get("p50_latency_ms")
            for service in matching_services
            if service["observed"].get("p50_latency_ms") is not None
        ]
        best_latency_ms = min(latencies) if latencies else None
        items.append(
            {
                "id": role["role_id"],
                "object": "model",
                "owned_by": "geniehive-role",
                "geniehive": {
                    "route_type": "role",
                    "role_id": role["role_id"],
                    "display_name": role["display_name"],
                    "operation": role["operation"],
                    "modality": role["modality"],
                    "healthy_target_count": len(matching_services),
                    "loaded_target_count": loaded_count,
                    "best_p50_latency_ms": best_latency_ms,
                    "offload_hint": self._offload_hint(
                        operation=role["operation"],
                        loaded_count=loaded_count,
                        best_latency_ms=best_latency_ms,
                    ),
                    "routing_policy": role["routing_policy"],
                },
            }
        )

    # Last-wins dedup by id, then deterministic sorted output.
    deduped: dict[str, dict] = {}
    for item in items:
        deduped[item["id"]] = item
    return [deduped[key] for key in sorted(deduped)]
|
||||||
|
|
||||||
|
def resolve_route(self, requested_model: str, *, kind: str | None = None) -> dict | None:
    """Resolve a requested model id to a concrete service.

    Direct matches (service id or asset id) win.  Otherwise the id is
    treated as a role: among healthy, accepting services of the role's
    operation, the best candidate is chosen by (family match, loaded,
    lowest p50 latency, service_id) — ``max`` with a negated latency term
    prefers lower and known latencies.  Returns None when nothing matches;
    a role with no candidates yields ``service: None``.
    """
    direct = self._resolve_direct(requested_model, kind=kind)
    if direct is not None:
        return {"match_type": "direct", **direct}

    role = self.get_role(requested_model)
    if role is None:
        return None

    # An explicit kind overrides the role's declared operation.
    matched_kind = kind or role["operation"]
    candidates = [
        service
        for service in self.list_services()
        if service["kind"] == matched_kind
        and service["state"].get("accept_requests", True)
        and service["state"].get("health") == "healthy"
    ]
    if not candidates:
        return {"match_type": "role", "role": role, "service": None}

    preferred_families = [family.lower() for family in role["routing_policy"].get("preferred_families", [])]

    def score(service: dict) -> tuple[int, int, float, str]:
        # Higher tuple wins under max(): family match, then loaded,
        # then lower latency (negated; unknown latency scores worst).
        loaded = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
        family_match = 0
        if preferred_families:
            asset_names = " ".join(asset.get("asset_id", "") for asset in service["assets"]).lower()
            family_match = 1 if any(family in asset_names for family in preferred_families) else 0
        latency = service["observed"].get("p50_latency_ms")
        latency_score = float(latency) if latency is not None else float("inf")
        return (family_match, loaded, -latency_score, service["service_id"])

    if role["routing_policy"].get("require_loaded"):
        # Soft requirement: restrict to loaded candidates only if any exist.
        loaded_candidates = [service for service in candidates if any(asset.get("loaded") for asset in service["assets"])]
        if loaded_candidates:
            candidates = loaded_candidates

    service = max(candidates, key=score)
    return {"match_type": "role", "role": role, "service": service}
|
||||||
|
|
||||||
|
def _resolve_direct(self, requested_model: str, *, kind: str | None = None) -> dict | None:
    """Find a healthy, accepting service addressed directly by id.

    A service matches when *requested_model* equals its service_id or one
    of its asset ids; *kind* additionally filters by operation.  Among
    matches, prefers loaded services, then lowest p50 latency, then the
    lexicographically greatest service_id as a final tie-break.
    """
    candidates = []
    for service in self.list_services():
        if kind is not None and service["kind"] != kind:
            continue
        if not service["state"].get("accept_requests", True):
            continue
        if service["state"].get("health") != "healthy":
            continue
        asset_ids = {asset.get("asset_id") for asset in service["assets"]}
        if service["service_id"] == requested_model or requested_model in asset_ids:
            candidates.append(service)
    if not candidates:
        return None

    def score(service: dict) -> tuple[int, float, str]:
        # Higher tuple wins under max(); unknown latency scores worst.
        loaded = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
        latency = service["observed"].get("p50_latency_ms")
        latency_score = float(latency) if latency is not None else float("inf")
        return (loaded, -latency_score, service["service_id"])

    service = max(candidates, key=score)
    return {"service": service}
|
||||||
|
|
||||||
|
def cluster_health(self, stale_after_s: float) -> dict:
    """Summarize cluster liveness for a health endpoint.

    A host is "stale" when its last_seen is older than *stale_after_s*
    seconds; stale hosts are never counted as online even if their stored
    state is 'online'.
    """
    hosts = self.list_hosts()
    services = self.list_services()
    now = time.time()
    online = 0
    stale = 0
    for host in hosts:
        is_stale = (now - host["status"]["last_seen"]) > stale_after_s
        if is_stale:
            stale += 1
        elif host["status"]["state"] == "online":
            online += 1
    healthy_services = sum(1 for service in services if service["state"].get("health") == "healthy")
    return {
        "status": "ok",
        "host_count": len(hosts),
        "online_host_count": online,
        "stale_host_count": stale,
        "service_count": len(services),
        "healthy_service_count": healthy_services,
    }
|
||||||
|
|
||||||
|
@staticmethod
def _offload_hint(*, operation: str, loaded_count: int, best_latency_ms: float | None) -> dict:
    """Derive an advisory suitability hint from loaded-target count and latency.

    Tiers: no loaded targets -> "cold_only"; otherwise bucketed by best
    observed p50 latency (<=1500ms good, <=4000ms background, else/unknown
    slow).  Thresholds are heuristic constants, not measured guarantees.
    """
    if loaded_count <= 0:
        suitability = "cold_only"
    elif best_latency_ms is not None and best_latency_ms <= 1500:
        suitability = "good_for_low_complexity"
    elif best_latency_ms is not None and best_latency_ms <= 4000:
        suitability = "usable_for_background_tasks"
    else:
        suitability = "available_but_slow"
    return {
        "operation": operation,
        "suitability": suitability,
        "recommended_for": "lower-complexity offload" if operation == "chat" else f"{operation} offload",
        "inference_basis": {
            "loaded_target_count": loaded_count,
            "best_p50_latency_ms": best_latency_ms,
        },
    }
|
||||||
|
|
||||||
|
def _service_metadata(self, service: dict) -> dict:
    """Build the ``geniehive`` metadata payload for a service model entry."""
    lat = service["observed"].get("p50_latency_ms")
    # 1/0 flag: the service is "warm" when any asset is loaded.
    loaded_count = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
    return {
        "route_type": "service",
        "service_id": service["service_id"],
        "host_id": service["host_id"],
        "operation": service["kind"],
        "protocol": service["protocol"],
        "endpoint": service["endpoint"],
        "health": service["state"].get("health"),
        "loaded_asset_count": loaded_count,
        "assets": service["assets"],
        "runtime": service["runtime"],
        "observed": service["observed"],
        "offload_hint": self._offload_hint(
            operation=service["kind"],
            loaded_count=loaded_count,
            best_latency_ms=lat,
        ),
    }
|
||||||
|
|
||||||
|
@staticmethod
def _host_row_to_dict(row: sqlite3.Row) -> dict:
    """Deserialize a hosts-table row back into the nested dict shape."""
    return {
        "host_id": row["host_id"],
        "display_name": row["display_name"],
        "address": row["address"],
        "labels": json.loads(row["labels_json"]),
        "capabilities": json.loads(row["capabilities_json"]),
        "resources": json.loads(row["resources_json"]),
        "status": {
            "state": row["status_state"],
            "last_seen": row["last_seen"],
        },
        "metrics": json.loads(row["metrics_json"]),
    }
|
||||||
|
|
||||||
|
@staticmethod
def _service_row_to_dict(row: sqlite3.Row) -> dict:
    """Deserialize a services-table row back into the nested dict shape."""
    return {
        "service_id": row["service_id"],
        "host_id": row["host_id"],
        "kind": row["kind"],
        "protocol": row["protocol"],
        "endpoint": row["endpoint"],
        "runtime": json.loads(row["runtime_json"]),
        "assets": json.loads(row["assets_json"]),
        "state": json.loads(row["state_json"]),
        "observed": json.loads(row["observed_json"]),
        "updated_at": row["updated_at"],
    }
|
||||||
|
|
||||||
|
@staticmethod
def _role_row_to_dict(row: sqlite3.Row) -> dict:
    """Deserialize a roles-table row back into the nested dict shape."""
    return {
        "role_id": row["role_id"],
        "display_name": row["display_name"],
        "description": row["description"],
        "operation": row["operation"],
        "modality": row["modality"],
        "prompt_policy": json.loads(row["prompt_policy_json"]),
        "routing_policy": json.loads(row["routing_policy_json"]),
        "updated_at": row["updated_at"],
    }
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from .models import RoleCatalog
|
||||||
|
|
||||||
|
|
||||||
|
def load_role_catalog(path: str | Path) -> RoleCatalog:
    """Load and validate a role catalog from a YAML file.

    An empty file yields an empty catalog; a non-mapping root raises
    ValueError; pydantic validation errors propagate from model_validate.
    """
    raw = yaml.safe_load(Path(path).read_text()) or {}
    if not isinstance(raw, dict):
        raise ValueError("Role catalog must be a YAML mapping.")
    return RoleCatalog.model_validate(raw)
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def choose_upstream_model_id(requested_model: str, service: dict[str, Any]) -> str:
    """Pick the model id to send upstream for *service*.

    Preference order: the requested id itself when the service lists it as
    an asset, else the first loaded asset, else the first known asset, and
    finally the requested id unchanged.
    """
    assets = service.get("assets", [])
    known_ids = [entry.get("asset_id") for entry in assets if entry.get("asset_id")]
    if requested_model in known_ids:
        return requested_model

    loaded_id = next(
        (entry.get("asset_id") for entry in assets if entry.get("loaded") and entry.get("asset_id")),
        None,
    )
    if loaded_id is not None:
        return loaded_id
    return known_ids[0] if known_ids else requested_model
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Protocol
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
class UpstreamError(RuntimeError):
|
||||||
|
def __init__(self, message: str, *, status_code: int | None = None) -> None:
|
||||||
|
super().__init__(message)
|
||||||
|
self.status_code = status_code
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncPoster(Protocol):
    """Structural interface for anything that can POST JSON asynchronously.

    Satisfied by httpx.AsyncClient; lets tests inject lightweight doubles.
    """

    async def post(self, url: str, *, json: dict[str, Any], headers: dict[str, str] | None = None) -> object:
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class UpstreamClient:
    """Async client for OpenAI-compatible upstream services.

    When no poster is injected an httpx.AsyncClient is created and owned
    here (closed by aclose()); an injected poster is never closed.
    """

    def __init__(self, client: AsyncPoster | None = None) -> None:
        self._owns_client = client is None
        # Long read timeout: generation responses can take minutes.
        self._client = client or httpx.AsyncClient(
            timeout=httpx.Timeout(connect=10.0, read=600.0, write=60.0, pool=60.0)
        )

    async def _post_json(
        self,
        base_url: str,
        path: str,
        body: dict[str, Any],
        headers: dict[str, str] | None,
    ) -> Any:
        """POST *body* to base_url+path and decode the response.

        Raises UpstreamError on HTTP status >= 400.  Returns the decoded
        JSON when the response exposes .json(), otherwise the raw response
        object (tolerates minimal injected doubles).
        """
        url = base_url.rstrip("/") + path
        response = await self._client.post(url, json=body, headers=headers)
        # Doubles without a status_code are treated as success.
        status_code = getattr(response, "status_code", 200)
        if status_code >= 400:
            text = getattr(response, "text", "")
            raise UpstreamError(
                text or f"upstream error from {url}",
                status_code=status_code,
            )
        if hasattr(response, "json"):
            return response.json()
        return response

    async def chat_completions(
        self,
        base_url: str,
        body: dict[str, Any],
        *,
        headers: dict[str, str] | None = None,
    ) -> Any:
        """Forward a chat-completion request to the upstream service."""
        return await self._post_json(base_url, "/v1/chat/completions", body, headers)

    async def embeddings(
        self,
        base_url: str,
        body: dict[str, Any],
        *,
        headers: dict[str, str] | None = None,
    ) -> Any:
        """Forward an embeddings request to the upstream service."""
        return await self._post_json(base_url, "/v1/embeddings", body, headers)

    async def aclose(self) -> None:
        """Close the underlying httpx client only if this instance created it."""
        if self._owns_client and isinstance(self._client, httpx.AsyncClient):
            await self._client.aclose()
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
"""GenieHive node-agent package."""
|
||||||
|
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
ServiceKind = Literal["chat", "embeddings", "transcription"]
|
||||||
|
|
||||||
|
|
||||||
|
class NodeConfigBlock(BaseModel):
    """Identity and listen settings for this node agent."""

    host_id: str = "node-1"
    display_name: str | None = None
    listen_host: str = "127.0.0.1"
    listen_port: int = 8891
    # Externally reachable address; falls back to listen_host when unset
    # (see build_inventory).
    address: str | None = None
    labels: dict[str, str] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class ControlPlaneConfig(BaseModel):
    """How (and whether) this node talks to the control plane.

    A None base_url disables registration and heartbeats entirely.
    """

    base_url: str | None = None
    node_api_key: str | None = None
    heartbeat_interval_s: float = 5.0
|
||||||
|
|
||||||
|
|
||||||
|
class InventoryConfig(BaseModel):
    """What the node reports about its local resources."""

    # Directories scanned recursively for *.gguf model files.
    model_roots: list[str] = Field(default_factory=list)
    cpu_threads: int | None = None
    ram_gb: float | None = None
    capabilities: dict[str, bool] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class ManagedRuntimesConfig(BaseModel):
    """Opt-in settings for node-managed inference runtimes (disabled by default)."""

    enabled: bool = False
    # Path to a llama-server binary, presumably used when enabled —
    # no consumer is visible in this chunk; confirm against the launcher code.
    llama_server_bin: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class NodeServiceAssetConfig(BaseModel):
    """One asset entry in a statically configured node service."""

    asset_id: str
    loaded: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class NodeServiceConfig(BaseModel):
    """A service this node advertises to the control plane."""

    service_id: str
    kind: ServiceKind
    protocol: str = "openai"
    # When None, build_inventory derives it from listen_host/listen_port.
    endpoint: str | None = None
    runtime: dict[str, str] = Field(default_factory=dict)
    assets: list[NodeServiceAssetConfig] = Field(default_factory=list)
    state: dict[str, object] = Field(default_factory=dict)
    observed: dict[str, object] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class NodeConfig(BaseModel):
    """Root of the node agent's YAML configuration; every section has defaults."""

    node: NodeConfigBlock = Field(default_factory=NodeConfigBlock)
    control_plane: ControlPlaneConfig = Field(default_factory=ControlPlaneConfig)
    inventory: InventoryConfig = Field(default_factory=InventoryConfig)
    managed_runtimes: ManagedRuntimesConfig = Field(default_factory=ManagedRuntimesConfig)
    services: list[NodeServiceConfig] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(path: str | Path) -> NodeConfig:
    """Load and validate the node config from a YAML file.

    An empty file yields an all-defaults config; a non-mapping root raises
    ValueError; pydantic validation errors propagate from model_validate.
    """
    raw = yaml.safe_load(Path(path).read_text()) or {}
    if not isinstance(raw, dict):
        raise ValueError("Node config must be a YAML mapping.")
    return NodeConfig.model_validate(raw)
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
import time
|
||||||
|
|
||||||
|
from .config import NodeConfig
|
||||||
|
from .models import NodeInventory
|
||||||
|
|
||||||
|
|
||||||
|
def discover_model_files(roots: list[str]) -> list[dict[str, object]]:
    """Recursively scan *roots* for GGUF model files.

    Nonexistent roots are skipped.  For each root, matches are reported in
    sorted path order as {"path", "name", "size_bytes"} records.
    """
    found: list[dict[str, object]] = []
    for root in roots:
        base = Path(root)
        if not base.exists():
            continue
        found.extend(
            {
                "path": str(gguf_path),
                "name": gguf_path.name,
                "size_bytes": gguf_path.stat().st_size,
            }
            for gguf_path in sorted(base.rglob("*.gguf"))
        )
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def build_inventory(cfg: NodeConfig) -> NodeInventory:
    """Assemble the node's inventory snapshot from static config plus disk scan.

    The advertised address falls back to the listen host; each configured
    service without an explicit endpoint gets the node's own listen URL.
    """
    address = cfg.node.address or cfg.node.listen_host
    resources: dict[str, object] = {}
    if cfg.inventory.cpu_threads is not None:
        resources["cpu_threads"] = cfg.inventory.cpu_threads
    if cfg.inventory.ram_gb is not None:
        resources["ram_gb"] = cfg.inventory.ram_gb
    # Fresh filesystem scan on every call — inventory reflects current disk state.
    resources["discovered_models"] = discover_model_files(cfg.inventory.model_roots)

    services: list[dict] = []
    for service in cfg.services:
        endpoint = service.endpoint or f"http://{cfg.node.listen_host}:{cfg.node.listen_port}"
        services.append(
            {
                "service_id": service.service_id,
                "host_id": cfg.node.host_id,
                "kind": service.kind,
                "protocol": service.protocol,
                "endpoint": endpoint,
                "runtime": service.runtime,
                "assets": [asset.model_dump() for asset in service.assets],
                "state": service.state,
                "observed": service.observed,
            }
        )

    return NodeInventory(
        host_id=cfg.node.host_id,
        display_name=cfg.node.display_name,
        address=address,
        labels=cfg.node.labels,
        capabilities=cfg.inventory.capabilities,
        resources=resources,
        services=services,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_registration_payload(cfg: NodeConfig) -> dict:
    """Serialize the node's full inventory for the register endpoint."""
    inventory = build_inventory(cfg)
    return inventory.model_dump()
|
||||||
|
|
||||||
|
|
||||||
|
def build_heartbeat_payload(cfg: NodeConfig) -> dict:
    """Build a heartbeat payload with summary metrics (no service list).

    Rebuilds the inventory (including the disk scan) to derive counts; the
    services themselves are intentionally omitted, so heartbeats do not
    replace the control plane's service records.
    """
    inventory = build_inventory(cfg)
    healthy_service_count = sum(
        1 for service in inventory.services if service.get("state", {}).get("health") == "healthy"
    )
    return {
        "host_id": inventory.host_id,
        "status": {
            "state": "online",
            "last_seen": time.time(),
        },
        "metrics": {
            "service_count": len(inventory.services),
            "healthy_service_count": healthy_service_count,
            "discovered_model_count": len(inventory.resources.get("discovered_models", [])),
        },
    }
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from contextlib import asynccontextmanager, suppress
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from .config import NodeConfig, load_config
|
||||||
|
from .inventory import build_inventory, build_registration_payload
|
||||||
|
from .sync import ControlPlaneClient
|
||||||
|
|
||||||
|
|
||||||
|
def create_app(
    config_path: str | Path | None = None,
    *,
    sync_enabled: bool = True,
    control_client: ControlPlaneClient | None = None,
) -> FastAPI:
    """Build the node-agent FastAPI app.

    Config comes from *config_path*, the GENIEHIVE_NODE_CONFIG env var, or
    all defaults.  When sync is enabled and a control plane is configured,
    the lifespan registers once (best-effort) and runs a heartbeat task,
    tearing both down on shutdown.  *control_client* allows injecting a
    test double.
    """
    cfg_path = config_path or os.environ.get("GENIEHIVE_NODE_CONFIG")
    cfg = load_config(cfg_path) if cfg_path else NodeConfig()
    sync_client = control_client or ControlPlaneClient(cfg)

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        heartbeat_task: asyncio.Task[None] | None = None
        stop_event = asyncio.Event()
        if sync_enabled and sync_client.enabled:
            # Best-effort initial registration: startup must not fail if
            # the control plane is unreachable.
            with suppress(Exception):
                await sync_client.register_once()
            heartbeat_task = asyncio.create_task(sync_client.heartbeat_loop(stop_event))
        try:
            yield
        finally:
            if heartbeat_task is not None:
                # Signal, then cancel, then await — ensures the loop exits
                # whether it honors the event or only the cancellation.
                stop_event.set()
                heartbeat_task.cancel()
                with suppress(asyncio.CancelledError):
                    await heartbeat_task
            await sync_client.aclose()

    app = FastAPI(title="GenieHive Node", version="0.1.0", lifespan=lifespan)
    app.state.cfg = cfg
    app.state.control_client = sync_client

    @app.get("/health")
    async def health() -> dict[str, str]:
        """Liveness probe."""
        return {"status": "ok"}

    @app.get("/v1/node/inventory")
    async def inventory() -> dict:
        """Current inventory snapshot (re-scans model roots on each call)."""
        return build_inventory(cfg).model_dump()

    @app.get("/v1/node/registration")
    async def registration() -> dict:
        """The exact payload this node would send to register."""
        return build_registration_payload(cfg)

    return app
|
||||||
|
|
||||||
|
|
||||||
|
app = create_app()
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class NodeInventory(BaseModel):
    """Snapshot of a node's identity, capacity, and hosted services."""

    host_id: str  # unique node identifier, e.g. "atlas-01"
    display_name: str | None = None  # optional human-friendly name
    address: str  # network address peers should use to reach this node
    labels: dict[str, str] = Field(default_factory=dict)  # free-form grouping tags
    capabilities: dict[str, bool] = Field(default_factory=dict)  # feature flags, e.g. {"cuda": True}
    resources: dict[str, object] = Field(default_factory=dict)  # hardware facts (cpu_threads, ram_gb, ...)
    services: list[dict] = Field(default_factory=list)  # serialized service registrations hosted here
|
||||||
|
|
||||||
|
|
@ -0,0 +1,84 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from contextlib import suppress
|
||||||
|
from typing import Protocol
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from .config import NodeConfig
|
||||||
|
from .inventory import build_heartbeat_payload, build_registration_payload
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncPoster(Protocol):
    """Structural interface for an async HTTP POST client (satisfied by httpx.AsyncClient)."""

    async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> object:
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class ControlPlaneClient:
    """Pushes this node's registration and heartbeat payloads to the control plane.

    An ``AsyncPoster`` may be injected for testing; otherwise an owned
    ``httpx.AsyncClient`` is created and closed by :meth:`aclose`.
    """

    def __init__(self, cfg: NodeConfig, http_client: AsyncPoster | None = None) -> None:
        self.cfg = cfg
        self._registered = False
        # Only close the HTTP client in aclose() when we created it ourselves.
        self._owns_client = http_client is None
        self._http = http_client or httpx.AsyncClient(
            timeout=httpx.Timeout(connect=5.0, read=30.0, write=30.0, pool=30.0)
        )

    @property
    def enabled(self) -> bool:
        """True when a control-plane base URL is configured."""
        return bool(self.cfg.control_plane.base_url)

    def _url(self, path: str) -> str:
        # Join the configured base URL with an API path, tolerating a trailing slash.
        return str(self.cfg.control_plane.base_url).rstrip("/") + path

    def _headers(self) -> dict[str, str]:
        """Auth headers for node-to-control-plane calls; empty when no key is set."""
        key = self.cfg.control_plane.node_api_key
        return {"X-GenieHive-Node-Key": key} if key else {}

    async def register_once(self) -> None:
        """POST the registration payload and mark this client as registered."""
        if not self.enabled:
            return
        response = await self._http.post(
            self._url("/v1/nodes/register"),
            json=build_registration_payload(self.cfg),
            headers=self._headers(),
        )
        # Injected test doubles may return non-httpx objects; only real
        # responses get a status check.
        if isinstance(response, httpx.Response):
            response.raise_for_status()
        self._registered = True

    async def heartbeat_once(self) -> None:
        """POST one heartbeat, registering first (or re-registering on a 404)."""
        if not self.enabled:
            return
        if not self._registered:
            await self.register_once()
        url = self._url("/v1/nodes/heartbeat")
        response = await self._http.post(
            url,
            json=build_heartbeat_payload(self.cfg),
            headers=self._headers(),
        )
        if isinstance(response, httpx.Response):
            if response.status_code == 404:
                # The control plane lost our registration; register again and retry once.
                self._registered = False
                await self.register_once()
                response = await self._http.post(
                    url,
                    json=build_heartbeat_payload(self.cfg),
                    headers=self._headers(),
                )
            response.raise_for_status()

    async def heartbeat_loop(self, stop_event: asyncio.Event) -> None:
        """Heartbeat at the configured interval until *stop_event* is set."""
        interval = max(self.cfg.control_plane.heartbeat_interval_s, 0.1)
        while not stop_event.is_set():
            # Best effort: a failed beat must never kill the loop.
            with suppress(Exception):
                await self.heartbeat_once()
            try:
                # Sleep, but wake immediately when shutdown is requested.
                await asyncio.wait_for(stop_event.wait(), timeout=interval)
            except asyncio.TimeoutError:
                continue

    async def aclose(self) -> None:
        """Close the underlying HTTP client if this instance created it."""
        if self._owns_client and isinstance(self._http, httpx.AsyncClient):
            await self._http.aclose()
|
||||||
|
|
@ -0,0 +1,9 @@
|
||||||
|
from pathlib import Path
import sys


# Make the in-repo "src" layout importable when pytest runs from the repository
# root, without requiring an editable install of the packages.
ROOT = Path(__file__).resolve().parents[1]  # repository root (parent of tests/)
SRC = ROOT / "src"

# Prepend only once so repeated conftest imports stay idempotent.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
|
||||||
|
|
@ -0,0 +1,224 @@
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from geniehive_control.chat import ProxyError, proxy_chat_completion, proxy_embeddings
|
||||||
|
from geniehive_control.models import HostRegistration, RegisteredService, RoleProfile
|
||||||
|
from geniehive_control.registry import Registry
|
||||||
|
from geniehive_control.upstream import UpstreamClient
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeResponse:
|
||||||
|
def __init__(self, payload: dict, status_code: int = 200) -> None:
|
||||||
|
self._payload = payload
|
||||||
|
self.status_code = status_code
|
||||||
|
self.text = str(payload)
|
||||||
|
|
||||||
|
def json(self) -> dict:
|
||||||
|
return self._payload
|
||||||
|
|
||||||
|
|
||||||
|
class _FakePoster:
    """Async POST double that records every call and echoes the request's model."""

    def __init__(self) -> None:
        # One dict per call: {"url", "json", "headers"}.
        self.calls: list[dict] = []

    async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> _FakeResponse:
        """Record the call and answer with a canned success payload."""
        record = {"url": url, "json": json, "headers": headers or {}}
        self.calls.append(record)
        return _FakeResponse({"ok": True, "echo_model": json["model"]})
|
||||||
|
|
||||||
|
|
||||||
|
def _build_registry(tmp_path: Path) -> Registry:
    """Create a SQLite-backed registry seeded with one host (a chat service and
    an embeddings service, each with one loaded asset) and two role profiles
    ("mentor" -> chat, "embedder" -> embeddings)."""
    registry = Registry(tmp_path / "geniehive.sqlite3")
    registry.register_host(
        HostRegistration(
            host_id="atlas-01",
            address="192.168.1.101",
            services=[
                # Chat service with a single loaded asset.
                RegisteredService(
                    service_id="atlas-01/chat/qwen3-8b",
                    host_id="atlas-01",
                    kind="chat",
                    endpoint="http://192.168.1.101:18091",
                    assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
                    state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
                    observed={"p50_latency_ms": 900},
                ),
                # Embeddings service with a single loaded asset.
                RegisteredService(
                    service_id="atlas-01/embeddings/bge-small",
                    host_id="atlas-01",
                    kind="embeddings",
                    endpoint="http://192.168.1.101:18092",
                    assets=[{"asset_id": "bge-small-en", "loaded": True}],
                    state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
                    observed={"p50_latency_ms": 120},
                )
            ],
        )
    )
    registry.upsert_roles(
        [
            RoleProfile(
                role_id="mentor",
                display_name="Mentor",
                operation="chat",
                modality="text",
                routing_policy={"preferred_families": ["qwen3"]},
            ),
            RoleProfile(
                role_id="embedder",
                display_name="Embedder",
                operation="embeddings",
                modality="text",
                routing_policy={"require_loaded": True},
            )
        ]
    )
    return registry
|
||||||
|
|
||||||
|
|
||||||
|
def test_proxy_chat_completion_rewrites_role_to_loaded_asset(tmp_path: Path) -> None:
    """A role id ("mentor") is rewritten to its loaded asset before proxying."""
    registry = _build_registry(tmp_path)
    poster = _FakePoster()
    upstream = UpstreamClient(client=poster)

    request_body = {
        "model": "mentor",
        "messages": [{"role": "user", "content": "hello"}],
    }

    async def invoke() -> dict:
        return await proxy_chat_completion(request_body, registry=registry, upstream=upstream)

    result = asyncio.run(invoke())
    assert result["ok"] is True
    assert result["echo_model"] == "qwen3-8b-q4km"
    first_call = poster.calls[0]
    assert first_call["url"] == "http://192.168.1.101:18091/v1/chat/completions"
    assert first_call["json"]["model"] == "qwen3-8b-q4km"
|
||||||
|
|
||||||
|
|
||||||
|
def test_proxy_chat_completion_preserves_direct_asset_match(tmp_path: Path) -> None:
    """A request naming a concrete asset id passes through with the model unchanged."""
    registry = _build_registry(tmp_path)
    poster = _FakePoster()
    upstream = UpstreamClient(client=poster)

    request_body = {
        "model": "qwen3-8b-q4km",
        "messages": [{"role": "user", "content": "hello"}],
    }

    async def invoke() -> dict:
        return await proxy_chat_completion(request_body, registry=registry, upstream=upstream)

    result = asyncio.run(invoke())
    assert result["echo_model"] == "qwen3-8b-q4km"
|
||||||
|
|
||||||
|
|
||||||
|
def test_proxy_chat_completion_strips_reasoning_fields(tmp_path: Path) -> None:
    """Reasoning fields returned by the upstream must not leak to the client."""
    registry = _build_registry(tmp_path)

    class _ReasoningPoster:
        """Upstream double whose completion includes reasoning fields."""

        async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> _FakeResponse:
            payload = {
                "object": "chat.completion",
                "model": json["model"],
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": "GPU1 route is live.",
                            "reasoning_content": "hidden chain of thought",
                        },
                        "reasoning": {"tokens": 42},
                    }
                ],
            }
            return _FakeResponse(payload)

    upstream = UpstreamClient(client=_ReasoningPoster())

    async def invoke() -> dict:
        return await proxy_chat_completion(
            {
                "model": "mentor",
                "messages": [{"role": "user", "content": "hello"}],
            },
            registry=registry,
            upstream=upstream,
        )

    result = asyncio.run(invoke())
    choice = result["choices"][0]
    assert choice["message"]["content"] == "GPU1 route is live."
    assert "reasoning_content" not in choice["message"]
    assert "reasoning" not in choice
|
||||||
|
|
||||||
|
|
||||||
|
def test_proxy_chat_completion_fails_for_unknown_model(tmp_path: Path) -> None:
    """An unknown model id surfaces as a ProxyError with 404 semantics."""
    registry = _build_registry(tmp_path)
    upstream = UpstreamClient(client=_FakePoster())

    async def run() -> None:
        await proxy_chat_completion(
            {
                "model": "unknown-model",
                "messages": [{"role": "user", "content": "hello"}],
            },
            registry=registry,
            upstream=upstream,
        )

    try:
        asyncio.run(run())
    except ProxyError as exc:
        assert exc.status_code == 404
    else:
        # Fix: the failure message previously named a nonexistent "ChatProxyError";
        # the exception class imported and caught here is ProxyError.
        raise AssertionError("expected ProxyError")
|
||||||
|
|
||||||
|
|
||||||
|
def test_proxy_embeddings_rewrites_role_to_loaded_asset(tmp_path: Path) -> None:
    """The "embedder" role must resolve to the loaded embeddings asset."""
    registry = _build_registry(tmp_path)
    poster = _FakePoster()
    upstream = UpstreamClient(client=poster)

    async def invoke() -> dict:
        return await proxy_embeddings(
            {"model": "embedder", "input": "hello"},
            registry=registry,
            upstream=upstream,
        )

    result = asyncio.run(invoke())
    assert result["ok"] is True
    assert result["echo_model"] == "bge-small-en"
    call = poster.calls[0]
    assert call["url"] == "http://192.168.1.101:18092/v1/embeddings"
    assert call["json"]["model"] == "bge-small-en"
|
||||||
|
|
||||||
|
|
||||||
|
def test_proxy_embeddings_fails_for_unknown_model(tmp_path: Path) -> None:
    """An unknown embeddings model id surfaces as a ProxyError with 404 semantics."""
    registry = _build_registry(tmp_path)
    upstream = UpstreamClient(client=_FakePoster())

    async def invoke() -> None:
        await proxy_embeddings(
            {"model": "unknown-embedder", "input": "hello"},
            registry=registry,
            upstream=upstream,
        )

    try:
        asyncio.run(invoke())
    except ProxyError as exc:
        assert exc.status_code == 404
    else:
        raise AssertionError("expected ProxyError")
|
||||||
|
|
@ -0,0 +1,152 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from geniehive_control.main import create_app
|
||||||
|
from geniehive_control.models import HostHeartbeat, HostRegistration, RegisteredService, RoleProfile
|
||||||
|
from geniehive_control.registry import Registry
|
||||||
|
|
||||||
|
|
||||||
|
def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
    """Registration and a follow-up heartbeat both round-trip through SQLite."""
    registry = Registry(tmp_path / "geniehive.sqlite3")

    chat_service = RegisteredService(
        service_id="atlas-01/chat/qwen3-8b",
        host_id="atlas-01",
        kind="chat",
        protocol="openai",
        endpoint="http://192.168.1.101:18091",
        runtime={"engine": "llama.cpp", "launcher": "managed"},
        assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
        state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
        observed={"p50_latency_ms": 900, "tokens_per_sec": 40},
    )
    host = registry.register_host(
        HostRegistration(
            host_id="atlas-01",
            display_name="Atlas GPU Box",
            address="192.168.1.101",
            labels={"site": "home-lab"},
            capabilities={"cuda": True},
            resources={"cpu_threads": 24},
            services=[chat_service],
        )
    )
    assert host is not None
    assert host["host_id"] == "atlas-01"

    updated = registry.heartbeat_host(
        HostHeartbeat(
            host_id="atlas-01",
            status={"state": "online"},
            metrics={"gpu_utilization_pct": 77},
        )
    )
    assert updated is not None
    assert updated["metrics"]["gpu_utilization_pct"] == 77

    # Persisted views must reflect the single registered host and service.
    hosts = registry.list_hosts()
    services = registry.list_services()
    health = registry.cluster_health(stale_after_s=30)

    assert len(hosts) == 1
    assert len(services) == 1
    assert services[0]["service_id"] == "atlas-01/chat/qwen3-8b"
    assert services[0]["state"]["health"] == "healthy"
    assert health["host_count"] == 1
    assert health["healthy_service_count"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_registry_persists_roles_and_resolves_direct_and_role_routes(tmp_path: Path) -> None:
    """Routes resolve both by direct asset id and via role profiles, and the
    client-facing model list exposes services, assets, and roles."""
    registry = Registry(tmp_path / "geniehive.sqlite3")

    hosted = [
        RegisteredService(
            service_id="atlas-01/chat/qwen3-8b",
            host_id="atlas-01",
            kind="chat",
            endpoint="http://192.168.1.101:18091",
            assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
            state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
            observed={"p50_latency_ms": 900},
        ),
        RegisteredService(
            service_id="atlas-01/embeddings/bge-small",
            host_id="atlas-01",
            kind="embeddings",
            endpoint="http://192.168.1.101:18092",
            assets=[{"asset_id": "bge-small-en", "loaded": True}],
            state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
            observed={"p50_latency_ms": 120},
        ),
    ]
    registry.register_host(
        HostRegistration(host_id="atlas-01", address="192.168.1.101", services=hosted)
    )
    registry.upsert_roles(
        [
            RoleProfile(
                role_id="mentor",
                display_name="Mentor",
                operation="chat",
                modality="text",
                routing_policy={"preferred_families": ["qwen3"]},
            ),
            RoleProfile(
                role_id="embedder",
                display_name="Embedder",
                operation="embeddings",
                modality="text",
                routing_policy={"require_loaded": True},
            ),
        ]
    )

    roles = registry.list_roles()
    assert len(roles) == 2
    assert roles[0]["role_id"] == "embedder"

    # Direct asset-id lookup.
    direct = registry.resolve_route("qwen3-8b-q4km")
    assert direct is not None
    assert direct["match_type"] == "direct"
    assert direct["service"]["service_id"] == "atlas-01/chat/qwen3-8b"

    # Role-based lookup for chat.
    by_role = registry.resolve_route("mentor")
    assert by_role is not None
    assert by_role["match_type"] == "role"
    assert by_role["role"]["role_id"] == "mentor"
    assert by_role["service"]["service_id"] == "atlas-01/chat/qwen3-8b"

    # Role-based lookup for embeddings.
    embed_role = registry.resolve_route("embedder")
    assert embed_role is not None
    assert embed_role["service"]["service_id"] == "atlas-01/embeddings/bge-small"

    models = registry.list_client_models()
    ids = {item["id"] for item in models}
    assert "atlas-01/chat/qwen3-8b" in ids
    assert "qwen3-8b-q4km" in ids
    assert "mentor" in ids
    mentor = next(item for item in models if item["id"] == "mentor")
    assert mentor["geniehive"]["route_type"] == "role"
    assert mentor["geniehive"]["offload_hint"]["suitability"] == "good_for_low_complexity"
    asset = next(item for item in models if item["id"] == "qwen3-8b-q4km")
    assert asset["geniehive"]["route_type"] == "asset"
    assert asset["geniehive"]["offload_hint"]["recommended_for"] == "lower-complexity offload"
|
||||||
|
|
||||||
|
|
||||||
|
def test_control_app_exposes_expected_routes() -> None:
    """The control app mounts every documented HTTP endpoint."""
    mounted = {route.path for route in create_app().routes}
    expected = {
        "/health",
        "/v1/models",
        "/v1/nodes/register",
        "/v1/nodes/heartbeat",
        "/v1/cluster/hosts",
        "/v1/cluster/services",
        "/v1/cluster/roles",
        "/v1/cluster/health",
        "/v1/cluster/routes/resolve",
    }
    assert expected <= mounted
|
||||||
|
|
@ -0,0 +1,104 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from geniehive_control.main import create_app as create_control_app
|
||||||
|
from geniehive_control.models import HostHeartbeat, HostRegistration
|
||||||
|
from geniehive_node.config import load_config as load_node_config
|
||||||
|
from geniehive_node.inventory import build_heartbeat_payload, build_registration_payload
|
||||||
|
|
||||||
|
|
||||||
|
def _write_demo_files(tmp_path: Path) -> tuple[Path, Path, Path]:
    """Write a minimal demo fixture set and return (control_cfg, node_cfg, roles) paths.

    NOTE(review): the leading whitespace inside the quoted YAML lines was
    reconstructed from a garbled source view — verify against the committed file.
    """
    # One fake GGUF file so inventory discovery has something to find.
    models_dir = tmp_path / "models"
    models_dir.mkdir()
    (models_dir / "qwen3-demo.gguf").write_bytes(b"demo")

    # Role catalogue consumed by the control plane.
    roles_path = tmp_path / "roles.yaml"
    roles_path.write_text(
        "\n".join(
            [
                "roles:",
                '  - role_id: "mentor"',
                '    display_name: "Mentor"',
                '    operation: "chat"',
                '    modality: "text"',
                "    routing_policy:",
                '      preferred_families: ["qwen3"]',
            ]
        )
    )

    # Control-plane config: auth keys, sqlite storage, and the roles file above.
    control_path = tmp_path / "control.yaml"
    control_path.write_text(
        "\n".join(
            [
                "auth:",
                "  client_api_keys:",
                '    - "client-key"',
                "  node_api_keys:",
                '    - "node-key"',
                "storage:",
                f'  sqlite_path: "{tmp_path / "state.sqlite3"}"',
                f'roles_path: "{roles_path}"',
            ]
        )
    )

    # Node config: identity, control-plane endpoint, inventory roots, one chat service.
    node_path = tmp_path / "node.yaml"
    node_path.write_text(
        "\n".join(
            [
                "node:",
                '  host_id: "atlas-01"',
                '  display_name: "Atlas GPU Box"',
                '  listen_host: "127.0.0.1"',
                "  listen_port: 8891",
                '  address: "192.168.1.101"',
                "control_plane:",
                '  base_url: "http://127.0.0.1:8800"',
                '  node_api_key: "node-key"',
                "inventory:",
                f'  model_roots:\n    - "{models_dir}"',
                "  capabilities:",
                "    cuda: true",
                "services:",
                '  - service_id: "atlas-01/chat/qwen3-8b"',
                '    kind: "chat"',
                '    endpoint: "http://127.0.0.1:18091"',
                "    assets:",
                '      - asset_id: "qwen3-8b-q4km"',
                "        loaded: true",
                "    state:",
                '      health: "healthy"',
                '      load_state: "loaded"',
                "      accept_requests: true",
                "    observed:",
                "      p50_latency_ms: 900",
            ]
        )
    )
    return control_path, node_path, roles_path
|
||||||
|
|
||||||
|
|
||||||
|
def test_demo_flow_registers_node_and_resolves_role(tmp_path: Path) -> None:
    """End-to-end wiring: node payloads feed the control registry, roles resolve."""
    control_path, node_path, _ = _write_demo_files(tmp_path)
    control_app = create_control_app(control_path)
    registry = control_app.state.registry
    node_cfg = load_node_config(node_path)

    # Register the node from its own registration payload.
    host = registry.register_host(
        HostRegistration.model_validate(build_registration_payload(node_cfg))
    )
    assert host["host_id"] == "atlas-01"

    # One heartbeat refreshes the metrics.
    updated = registry.heartbeat_host(
        HostHeartbeat.model_validate(build_heartbeat_payload(node_cfg))
    )
    assert updated is not None
    assert updated["metrics"]["service_count"] == 1

    roles = registry.list_roles()
    assert len(roles) == 1
    assert roles[0]["role_id"] == "mentor"

    resolved = registry.resolve_route("mentor")
    assert resolved is not None
    assert resolved["match_type"] == "role"
    assert resolved["service"]["service_id"] == "atlas-01/chat/qwen3-8b"
|
||||||
|
|
@ -0,0 +1,108 @@
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from geniehive_node.config import load_config
|
||||||
|
from geniehive_node.inventory import build_heartbeat_payload, build_inventory, build_registration_payload
|
||||||
|
from geniehive_node.main import create_app
|
||||||
|
from geniehive_node.sync import ControlPlaneClient
|
||||||
|
|
||||||
|
|
||||||
|
def _write_node_config(tmp_path: Path) -> Path:
    """Write a complete node YAML config (one discoverable model file and one
    chat service) into *tmp_path* and return the config path.

    NOTE(review): leading whitespace inside the quoted YAML lines was
    reconstructed from a garbled source view — verify against the committed file.
    """
    models_dir = tmp_path / "models"
    models_dir.mkdir()
    # A single fake GGUF so model discovery finds exactly one entry.
    (models_dir / "demo.gguf").write_bytes(b"gguf-demo")

    cfg_path = tmp_path / "node.yaml"
    cfg_path.write_text(
        "\n".join(
            [
                "node:",
                '  host_id: "atlas-01"',
                '  display_name: "Atlas GPU Box"',
                '  listen_host: "127.0.0.1"',
                "  listen_port: 8891",
                '  address: "192.168.1.101"',
                "  labels:",
                '    site: "home-lab"',
                "inventory:",
                f'  model_roots:\n    - "{models_dir}"',
                "  cpu_threads: 24",
                "  ram_gb: 128",
                "  capabilities:",
                "    cuda: true",
                "services:",
                '  - service_id: "atlas-01/chat/qwen3-8b"',
                '    kind: "chat"',
                '    endpoint: "http://127.0.0.1:18091"',
                "    runtime:",
                '      engine: "llama.cpp"',
                '      launcher: "managed"',
                "    assets:",
                '      - asset_id: "qwen3-8b-q4km"',
                "        loaded: true",
                "    state:",
                '      health: "healthy"',
                '      load_state: "loaded"',
                "      accept_requests: true",
            ]
        )
    )
    return cfg_path
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_inventory_and_registration_payload(tmp_path: Path) -> None:
    """Inventory, registration, and heartbeat payloads all derive from one config."""
    cfg = load_config(_write_node_config(tmp_path))

    inventory = build_inventory(cfg)
    assert inventory.host_id == "atlas-01"
    assert inventory.address == "192.168.1.101"
    assert inventory.capabilities["cuda"] is True
    assert inventory.resources["cpu_threads"] == 24
    assert len(inventory.resources["discovered_models"]) == 1
    first_service = inventory.services[0]
    assert first_service["host_id"] == "atlas-01"
    assert first_service["service_id"] == "atlas-01/chat/qwen3-8b"

    payload = build_registration_payload(cfg)
    assert payload["services"][0]["kind"] == "chat"

    heartbeat = build_heartbeat_payload(cfg)
    assert heartbeat["host_id"] == "atlas-01"
    assert heartbeat["metrics"]["service_count"] == 1
    assert heartbeat["metrics"]["healthy_service_count"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_node_app_exposes_inventory_routes(tmp_path: Path) -> None:
    """With sync disabled, the node app still serves health + inventory routes."""
    node_app = create_app(_write_node_config(tmp_path), sync_enabled=False)
    mounted = {route.path for route in node_app.routes}
    assert {"/health", "/v1/node/inventory", "/v1/node/registration"} <= mounted
|
||||||
|
|
||||||
|
|
||||||
|
class _FakePoster:
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.calls: list[dict] = []
|
||||||
|
|
||||||
|
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> object:
|
||||||
|
self.calls.append({"url": url, "json": json, "headers": headers or {}})
|
||||||
|
return object()
|
||||||
|
|
||||||
|
|
||||||
|
def test_control_plane_client_posts_register_and_heartbeat(tmp_path: Path) -> None:
    """register_once + heartbeat_once post exactly one payload each, with auth."""
    cfg = load_config(_write_node_config(tmp_path))
    cfg.control_plane.base_url = "http://127.0.0.1:8800"
    cfg.control_plane.node_api_key = "node-key"
    poster = _FakePoster()
    client = ControlPlaneClient(cfg, http_client=poster)

    async def invoke() -> None:
        await client.register_once()
        await client.heartbeat_once()

    asyncio.run(invoke())

    assert len(poster.calls) == 2
    register_call, heartbeat_call = poster.calls
    assert register_call["url"] == "http://127.0.0.1:8800/v1/nodes/register"
    assert register_call["headers"]["X-GenieHive-Node-Key"] == "node-key"
    assert register_call["json"]["host_id"] == "atlas-01"
    assert heartbeat_call["url"] == "http://127.0.0.1:8800/v1/nodes/heartbeat"
    assert heartbeat_call["json"]["metrics"]["service_count"] == 1
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
from geniehive_control.main import create_app as create_control_app
|
||||||
|
from geniehive_node.main import create_app as create_node_app
|
||||||
|
|
||||||
|
|
||||||
|
def test_control_app_title() -> None:
    """The control app factory sets its advertised title."""
    control_app = create_control_app()
    assert control_app.title == "GenieHive Control"
|
||||||
|
|
||||||
|
|
||||||
|
def test_node_app_title() -> None:
    """The node app factory sets its advertised title."""
    node_app = create_node_app()
    assert node_app.title == "GenieHive Node"
|
||||||
Loading…
Reference in New Issue