Initial commit

This commit is contained in:
welberr 2026-04-07 13:17:28 -04:00
parent dabbebd3ba
commit b9270df3e8
60 changed files with 4021 additions and 224 deletions

230
.gitignore vendored
View File

@ -1,229 +1,13 @@
# ---> Python
# Byte-compiled / optimized / DLL files
.pytest_cache/
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
.venv/
.benchmarks/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# ---> Emacs
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
# ---> Rust
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
state/*.sqlite3
state/*.db
state/*.log
.DS_Store

33
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,33 @@
# Contributing
GenieHive is still early-stage infrastructure code. Keep changes small, explicit, and easy to verify.
## Setup
```bash
cd /home/netuser/bin/geniehive
python -m venv .venv
. .venv/bin/activate
pip install -e '.[dev]'
```
## Common Checks
```bash
make test
make smoke
```
## Guidelines
- Prefer narrowly scoped patches over broad rewrites.
- Keep the control-plane and node-agent contracts in sync.
- Add or update tests with behavior changes.
- Do not commit local runtime state from `state/`.
- Do not commit benchmark artifacts or cache directories.
## Runtime Notes
- Example configs under `configs/` are meant to stay runnable.
- Scripts under `scripts/` should remain usable as operator entrypoints, not just test helpers.
- If a startup dependency can race in practice, prefer self-healing behavior over one-shot initialization.
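A minimal sketch of that guideline, assuming a hypothetical `retry_startup` helper rather than existing project code:
```python
# Self-healing startup sketch: retry a flaky startup dependency instead of
# failing the whole process on the first miss. Names and defaults here are
# illustrative, not part of the GenieHive codebase.
import time

def retry_startup(action, attempts: int = 30, delay_s: float = 2.0):
    last_error = None
    for _ in range(attempts):
        try:
            return action()
        except Exception as exc:  # e.g. the control plane is not reachable yet
            last_error = exc
            time.sleep(delay_s)
    raise RuntimeError("startup dependency never became ready") from last_error
```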

13
Makefile Normal file
View File

@ -0,0 +1,13 @@
PYTHON ?= python
PYTEST ?= pytest
.PHONY: test smoke health
test:
$(PYTEST) -q
smoke:
$(PYTEST) -q tests/test_smoke.py
health:
bash scripts/check_singlebox_health.sh

README.md
View File

@ -1,3 +1,60 @@
# GenieHive
GenieHive is a generative AI router. It starts by presenting an OpenAI API-compatible endpoint to clients and routes their requests across one or more nodes that register their running servers with the control host. From running multiple LLMs on a single host to running them across a distributed cluster, GenieHive aims to make local AI easier to actually use.
GenieHive is a local-first control plane for heterogeneous generative AI services running across one or more hosts.
V1 scope:
- chat completions
- embeddings
- transcription
Core goals:
- register hosts and services
- track health, inventory, and observed performance
- expose a stable client-facing API
- support direct model addressing and higher-level role addressing
- route requests to healthy loaded services first
Repository layout:
- `docs/architecture.md`: system overview and v1 scope
- `docs/roadmap.md`: current milestones and near-term priorities
- `docs/schemas.md`: canonical data models
- `docs/deployment.md`: intended deployment approach
- `docs/demo.md`: first end-to-end control-plus-node demo flow
- `docs/llm_demo.md`: detailed master/peer/client LLM demo runbook
- `docs/reverse_proxy.md`: safer external exposure patterns
- `configs/`: example control-plane, node, and role configs
- `scripts/`: small launch and inspection helpers
- `src/geniehive_control/`: control-plane package
- `src/geniehive_node/`: node-agent package
There is now a documented single-machine path as well as the cluster-oriented path, so GenieHive can be exercised as a useful local router even without multiple hosts.
This repository is intended as the clean successor to narrower local gateway experiments. OpenAI-compatible routing remains important, but it is treated as one client facade within a broader cluster control-plane design.
## Development
Local development setup:
```bash
cd /home/netuser/bin/geniehive
python -m venv .venv
. .venv/bin/activate
pip install -e '.[dev]'
```
Common commands:
```bash
make test
make smoke
make health
```
Repository conventions:
- local runtime state lives under `state/` and should not be committed
- example configs under `configs/` should remain runnable
- operator scripts under `scripts/` are part of the supported workflow

configs/control.example.yaml
View File

@ -0,0 +1,18 @@
server:
host: "127.0.0.1"
port: 8800
auth:
client_api_keys:
- "change-me-client-key"
node_api_keys:
- "change-me-node-key"
storage:
sqlite_path: "state/geniehive.sqlite3"
roles_path: "configs/roles.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

configs/control.singlebox.example.yaml
View File

@ -0,0 +1,18 @@
server:
host: "127.0.0.1"
port: 8800
auth:
client_api_keys:
- "change-me-client-key"
node_api_keys:
- "change-me-node-key"
storage:
sqlite_path: "state/geniehive-singlebox.sqlite3"
roles_path: "configs/roles.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

configs/control.singlebox.p40.example.yaml
View File

@ -0,0 +1,18 @@
server:
host: "127.0.0.1"
port: 8800
auth:
client_api_keys:
- "change-me-client-key"
node_api_keys:
- "change-me-node-key"
storage:
sqlite_path: "state/geniehive-p40.sqlite3"
roles_path: "configs/roles.singlebox.p40.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

56
configs/node.example.yaml Normal file
View File

@ -0,0 +1,56 @@
node:
host_id: "atlas-01"
display_name: "Atlas GPU Box"
listen_host: "127.0.0.1"
listen_port: 8891
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots:
- "/path/to/models"
cpu_threads: 24
ram_gb: 128
capabilities:
cuda: true
rocm: false
metal: false
managed_runtimes:
enabled: true
llama_server_bin: "/path/to/llama-server"
services:
- service_id: "atlas-01/chat/qwen3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llama.cpp"
launcher: "managed"
assets:
- asset_id: "qwen3-8b-q4km"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 900
tokens_per_sec: 40
- service_id: "atlas-01/embeddings/bge-small"
kind: "embeddings"
endpoint: "http://127.0.0.1:18092"
runtime:
engine: "llama.cpp"
launcher: "managed"
assets:
- asset_id: "bge-small-en"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true

configs/node.singlebox.llamacpp.example.yaml
View File

@ -0,0 +1,43 @@
node:
host_id: "singlebox-llamacpp"
display_name: "SingleBox llama.cpp"
listen_host: "127.0.0.1"
listen_port: 8891
address: "127.0.0.1"
labels:
topology: "singlebox"
runtime: "llama.cpp"
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots:
- "/path/to/models"
cpu_threads: 24
ram_gb: 64
capabilities:
cpu: true
cuda: true
managed_runtimes:
enabled: false
services:
- service_id: "singlebox/chat/qwen3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llama.cpp"
launcher: "external"
assets:
- asset_id: "qwen3-8b-q4_k_m"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 900

configs/node.singlebox.llamafile.example.yaml
View File

@ -0,0 +1,43 @@
node:
host_id: "singlebox-llamafile"
display_name: "SingleBox llamafile"
listen_host: "127.0.0.1"
listen_port: 8891
address: "127.0.0.1"
labels:
topology: "singlebox"
runtime: "llamafile"
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots:
- "/path/to/models"
cpu_threads: 24
ram_gb: 64
capabilities:
cpu: true
cuda: true
managed_runtimes:
enabled: false
services:
- service_id: "singlebox/chat/qwen3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llamafile"
launcher: "external"
assets:
- asset_id: "qwen3-8b-q4_k_m"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 900

configs/node.singlebox.ollama.example.yaml
View File

@ -0,0 +1,58 @@
node:
host_id: "singlebox-ollama"
display_name: "SingleBox Ollama"
listen_host: "127.0.0.1"
listen_port: 8891
address: "127.0.0.1"
labels:
topology: "singlebox"
runtime: "ollama"
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots: []
cpu_threads: 24
ram_gb: 64
capabilities:
cpu: true
cuda: false
managed_runtimes:
enabled: false
services:
- service_id: "singlebox/chat/qwen3"
kind: "chat"
endpoint: "http://127.0.0.1:11434"
runtime:
engine: "ollama"
launcher: "external"
assets:
- asset_id: "qwen3"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 900
- service_id: "singlebox/embeddings/nomic-embed-text"
kind: "embeddings"
endpoint: "http://127.0.0.1:11434"
runtime:
engine: "ollama"
launcher: "external"
assets:
- asset_id: "nomic-embed-text"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 150

configs/node.singlebox.p40-triple.example.yaml
View File

@ -0,0 +1,84 @@
node:
host_id: "p40-box"
display_name: "Dual P40 + CPU Fallback"
listen_host: "127.0.0.1"
listen_port: 8891
address: "127.0.0.1"
labels:
topology: "singlebox"
runtime: "llama.cpp"
gpu0: "Tesla P40"
gpu1: "Tesla P40"
cpu: "Ryzen 5600G"
control_plane:
base_url: "http://127.0.0.1:8800"
node_api_key: "change-me-node-key"
heartbeat_interval_s: 5
inventory:
model_roots:
- "/path/to/models"
cpu_threads: 12
ram_gb: 128
capabilities:
cpu: true
cuda: true
managed_runtimes:
enabled: false
services:
- service_id: "p40-box/chat/gpu0-primary"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llama.cpp"
launcher: "external"
device: "gpu0"
assets:
- asset_id: "Qwen2.5-14B-Instruct-1M-Q5_K_M"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 1200
tokens_per_sec: 24
- service_id: "p40-box/chat/gpu1-secondary"
kind: "chat"
endpoint: "http://127.0.0.1:18092"
runtime:
engine: "llama.cpp"
launcher: "external"
device: "gpu1"
assets:
- asset_id: "Qwen3.5-9B-Q5_K_M"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 1000
tokens_per_sec: 30
- service_id: "p40-box/chat/cpu-fallback"
kind: "chat"
endpoint: "http://127.0.0.1:18093"
runtime:
engine: "llama.cpp"
launcher: "external"
device: "cpu"
assets:
- asset_id: "rocket-3b.Q5_K_M"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 4500
tokens_per_sec: 7

configs/roles.example.yaml
View File

@ -0,0 +1,22 @@
roles:
- role_id: "mentor"
display_name: "Mentor"
operation: "chat"
modality: "text"
prompt_policy:
system_prompt: "Guide the user without taking over the task."
routing_policy:
preferred_families: ["Qwen3", "Mistral"]
min_context: 8192
- role_id: "embedder"
display_name: "Embedder"
operation: "embeddings"
modality: "text"
routing_policy:
require_loaded: true
- role_id: "transcriber"
display_name: "Transcriber"
operation: "transcription"
modality: "audio"

configs/roles.singlebox.p40.example.yaml
View File

@ -0,0 +1,33 @@
roles:
- role_id: "mentor"
display_name: "Mentor"
description: "Primary high-quality reasoning/chat route"
operation: "chat"
modality: "text"
prompt_policy:
system_prompt: "Be concise, helpful, and technically accurate."
routing_policy:
preferred_families: ["qwen2.5-14b", "qwen2.5"]
require_loaded: true
- role_id: "general_assistant"
display_name: "General Assistant"
description: "Secondary fast chat route"
operation: "chat"
modality: "text"
prompt_policy:
system_prompt: "Answer clearly and directly."
routing_policy:
preferred_families: ["qwen3.5-9b", "qwen3.5"]
require_loaded: true
- role_id: "background_summarizer"
display_name: "Background Summarizer"
description: "Slow fallback route for low-priority work"
operation: "chat"
modality: "text"
prompt_policy:
system_prompt: "Summarize briefly and conservatively."
routing_policy:
preferred_families: ["rocket-3b", "rocket"]
require_loaded: true

194
docs/architecture.md Normal file
View File

@ -0,0 +1,194 @@
# GenieHive Architecture
Status: proposed v1 architecture
Drafted: 2026-04-05
## Repo Name
Chosen name: `GenieHive`
Why this name:
- suggestive: "genie" implies generative AI services, "hive" implies a cooperating cluster
- accessible: easy to say, remember, and explain
- whimsical enough to feel like a project name rather than a dry infrastructure label
Tradeoff:
- `GenieHive` is less search-distinct than `Geniewarren` because `hive` is a common product metaphor
## Mission
GenieHive is a local-first control plane for heterogeneous generative AI services running across one or more hosts.
It should:
- register hosts and their available services
- expose a stable client-facing API
- track health, capacity, and observed performance
- support direct model addressing and higher-level role addressing
- route requests to healthy loaded services first
- optionally coordinate loading or swapping when policy allows
- remain practical for a small self-hosted deployment with two hosts
## Non-Goals For V1
Out of scope initially:
- peer-to-peer consensus
- autonomous global model swapping across many nodes
- full WAN zero-trust platform engineering
- image and TTS generation orchestration
- distributed vector database management
- billing or multi-tenant quota accounting
## Architectural Position
GenieHive is not just an OpenAI-compatible gateway.
It is a control plane with these layers:
1. Control API
- authoritative registry
- routing and scheduling
- role catalog
- operator inspection
2. Node Agent
- host discovery
- service discovery
- telemetry reporting
- optional local process management
3. Provider Adapters
- OpenAI-compatible chat backends
- OpenAI-compatible embedding backends
- transcription backends
- future adapters for image and speech synthesis
4. Client Facades
- OpenAI-compatible facade for completions and embeddings
- operator API for topology, health, and inventory
## Core Concepts
### Host
A physical or virtual machine participating in the cluster.
### Service
A concrete callable capability on a host. Examples:
- chat completion endpoint
- embedding endpoint
- transcription endpoint
### Asset
A model weight, model name, application, or runtime target that a service can serve.
### Role
A reusable task profile that describes how requests should be fulfilled. A role is policy, not a concrete model.
### Route Resolution
Request handling order:
1. If the requested `model` matches a currently loaded and healthy concrete asset or service alias, route directly.
2. Otherwise, if the requested `model` matches a known role, resolve the role to the best eligible service.
3. Otherwise, fail clearly.
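A minimal sketch of that order, using illustrative registry helpers (`find_loaded_healthy_service`, `get_role`, `resolve_role`) rather than the actual registry API:
```python
# Resolution-order sketch: direct alias first, then role, then a clear failure.
def resolve(requested_model: str, registry) -> dict:
    # 1. Direct match against a loaded, healthy asset or service alias.
    service = registry.find_loaded_healthy_service(requested_model)
    if service is not None:
        return {"route_type": "direct", "service": service}
    # 2. Otherwise, try to treat the requested name as a role.
    role = registry.get_role(requested_model)
    if role is not None:
        service = registry.resolve_role(role)
        if service is not None:
            return {"route_type": "role", "service": service}
    # 3. Otherwise, fail clearly rather than guessing.
    raise LookupError(f"no route for '{requested_model}'")
```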
## V1 Capability Scope
V1 supports only:
- chat completions
- embeddings
- transcription
## Topology
Recommended initial topology:
- 1 control plane
- 2 node agents
- 1 or more clients
- LAN-first deployment
- API key auth in v1
- VPN or mTLS in v1.5
## API Families
### Client API
- `GET /v1/models`
- `POST /v1/chat/completions`
- `POST /v1/embeddings`
- `POST /v1/audio/transcriptions`
`GET /v1/models` should expose enough metadata for programmatic clients to make routing decisions about what GenieHive can handle cheaply, especially for lower-complexity offloaded work. That metadata should include direct assets, service-backed aliases, role aliases, operation kind, health, loaded status, and observed performance hints.
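As an illustration only, a single catalog entry might carry metadata shaped like the hints used by `scripts/demo_client_agent.py`; this is not a frozen schema, and the values below are made up:
```python
# Illustrative /v1/models entry for a role alias.
example_entry = {
    "id": "mentor",
    "object": "model",
    "geniehive": {
        "route_type": "role",               # role alias vs direct asset/service alias
        "operation": "chat",
        "loaded_target_count": 1,           # healthy, loaded services behind this alias
        "best_p50_latency_ms": 1200,
        "offload_hint": {"suitability": "good_for_low_complexity"},
    },
}
```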
### Operator API
- `GET /v1/cluster/hosts`
- `GET /v1/cluster/services`
- `GET /v1/cluster/roles`
- `GET /v1/cluster/health`
- `GET /v1/cluster/routes/resolve?model=...`
### Node API
- `POST /v1/nodes/register`
- `POST /v1/nodes/heartbeat`
- `GET /v1/node/inventory`
- `POST /v1/node/services/refresh`
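A hedged sketch of how a node agent might exercise the node-facing endpoints above; the payload fields are illustrative, while the paths, header name, and five-second interval come from this repository:
```python
# Register once, then heartbeat on the configured interval.
import time
import httpx

BASE_URL = "http://127.0.0.1:8800"
HEADERS = {"X-GenieHive-Node-Key": "change-me-node-key"}

with httpx.Client(base_url=BASE_URL, headers=HEADERS, timeout=10.0) as client:
    client.post("/v1/nodes/register", json={"host_id": "atlas-01"}).raise_for_status()
    while True:
        client.post("/v1/nodes/heartbeat", json={"host_id": "atlas-01"}).raise_for_status()
        time.sleep(5)  # heartbeat_interval_s in the example node configs
```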
## Data Store
V1 should use SQLite for durable state.
## Routing Rules
### Direct Model Resolution
If a request names a concrete asset alias or service alias:
- prefer loaded and healthy services
- choose the lowest-cost healthy target if multiple matches exist
- fail clearly if all matches are unhealthy
### Role Resolution
If direct resolution fails, treat the requested name as a role.
Role resolution should filter by:
- operation kind
- modality
- health
- auth and exposure compatibility
- minimum context or memory requirements
- preferred model families
Then rank by:
- already loaded
- recent health
- expected latency
- queue pressure
- operator priority
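A hedged sketch of that filter-then-rank policy; the service dictionaries mirror `docs/schemas.md`, while `min_context_ok`, `queue_depth`, and `operator_priority` are illustrative fields rather than the shipped scheduler:
```python
# Filter to eligible services, then rank: loaded first, then lower latency,
# then lighter queues, then operator priority.
def rank_role_candidates(role: dict, services: list[dict]) -> list[dict]:
    def eligible(svc: dict) -> bool:
        return (
            svc["kind"] == role["operation"]
            and svc["state"]["health"] == "healthy"
            and svc.get("min_context_ok", True)
        )

    def score(svc: dict) -> tuple:
        return (
            svc["state"]["load_state"] == "loaded",
            -svc.get("observed", {}).get("p50_latency_ms", 10**9),
            -svc.get("queue_depth", 0),
            svc.get("operator_priority", 0),
        )

    return sorted((s for s in services if eligible(s)), key=score, reverse=True)
```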
## First Implementation Sequence
1. Create the repo skeleton and docs.
2. Implement SQLite-backed registry models.
3. Implement node registration and heartbeat.
4. Implement operator inspection endpoints.
5. Implement client-facing chat routing.
6. Add embeddings routing.
7. Add transcription routing.
8. Add truthful readiness and health reporting.
9. Add role catalog and role-based resolution.
10. Add optional managed local runtime support.

61
docs/demo.md Normal file
View File

@ -0,0 +1,61 @@
# GenieHive Demo
This is the first end-to-end demo path for GenieHive using the example configs already in the repo.
## Goal
Bring up:
- one control plane
- one node agent
- one route-resolution check
The node should auto-register with the control plane on startup and then send periodic heartbeats.
## 1. Start the control plane
From the repo root:
```bash
bash scripts/run_control.sh
```
This uses:
- `configs/control.example.yaml`
- `configs/roles.example.yaml`
## 2. Start the node agent
In another shell:
```bash
bash scripts/run_node.sh
```
This uses:
- `configs/node.example.yaml`
## 3. Inspect the cluster
In another shell:
```bash
bash scripts/demo_inspect.sh
```
That script checks:
- client-facing model metadata
- cluster health
- registered hosts
- registered services
- loaded roles
- route resolution for `mentor`
## Notes
- The example configs use API keys; the inspection script sends the example client key.
- The example node config assumes the underlying model-serving endpoints already exist. The current demo proves control-plane registration and routing metadata, not full inference proxying yet.
- The control plane stores state in `state/geniehive.sqlite3` by default.

48
docs/deployment.md Normal file
View File

@ -0,0 +1,48 @@
# GenieHive Deployment
## Initial Deployment Target
V1 should be easy to deploy on a small self-hosted setup:
- 1 control plane
- 2 node agents
- private LAN or VPN
- API-key auth first
## Binding Guidance
Defaults should be conservative:
- control plane binds to localhost by default during development
- node agents bind to localhost unless remote registration is needed
- managed inference runtimes should stay node-local unless there is a specific reason to expose them
## Security Baseline
Required in v1:
- client API keys
- node registration keys
- clear separation between client-facing and node-facing credentials
Planned after v1:
- mTLS between control plane and nodes
- scoped client tokens
## Persistence
Use SQLite first for:
- host registry
- service registry
- role catalog
- recent health and benchmark samples
## Startup Order
1. Start the control plane.
2. Start node agents.
3. Confirm registration and heartbeat visibility.
4. Confirm client API readiness.
5. Exercise chat, embeddings, and transcription paths.

676
docs/llm_demo.md Normal file
View File

@ -0,0 +1,676 @@
# GenieHive LLM Demo
This runbook covers the first practical GenieHive LLM demo with three roles:
- master: the GenieHive control plane
- peer: a GenieHive node agent attached to one or more local LLM servers
- client: a demo client agent or Codex using GenieHive as the API front door
## Current Readiness
GenieHive is ready for a first live chat demo now.
What works in GenieHive already:
- node registration
- heartbeat
- role-aware route resolution
- `GET /v1/models`
- `POST /v1/chat/completions`
- `POST /v1/embeddings`
What GenieHive does not do yet:
- launch upstream LLM servers for you automatically
- provide `POST /v1/audio/transcriptions`
- maintain advanced benchmark history or queue-aware scheduling
For the first demo, treat GenieHive as a metadata-rich router over already-running local servers.
## Topologies
### Smallest Demo
Run everything on one host:
- control plane on `127.0.0.1:8800`
- node agent on `127.0.0.1:8891`
- one or more upstream model servers on local ports
This is also the recommended setup for users who do not have a cluster. GenieHive still provides value as:
- a local router
- a metadata-rich local model catalog
- a role-to-model indirection layer
- a common front door for client tools
### Two-Host Demo
- master host runs GenieHive control plane
- peer host runs GenieHive node agent and one or more local LLM servers
- client runs anywhere that can reach the master
## Master Instructions
On the control-plane host:
1. Create a repo-local Python environment if you want isolation.
2. Start GenieHive control:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control.sh
```
3. Confirm health:
```bash
curl -sS http://127.0.0.1:8800/health
```
Expected result:
- JSON containing `{"status":"ok"}`
4. Keep note of the example client and node keys from `configs/control.example.yaml`.
### Single-Box Shortcut
If you are running control and node on the same machine, use:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_singlebox.sh
```
For your P40 host, repo-provided external bind helpers now exist:
LAN:
```bash
bash scripts/run_control_p40_lan.sh
```
ZeroTier:
```bash
bash scripts/run_control_p40_zerotier.sh
```
Both use the P40-specific control config and only change the bind interface.
## Peer Instructions
On each peer host you need:
- one or more local LLM servers already running
- one GenieHive node config that points at those servers
- the control-plane base URL and node API key
For a single-machine setup, the peer is simply another process on the same host.
The node agent should advertise upstream server roots, not endpoint suffixes. For example:
- good: `http://127.0.0.1:11434`
- good: `http://127.0.0.1:18091`
- not good: `http://127.0.0.1:11434/v1/chat/completions`
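If you are unsure whether a configured endpoint is already a root, a tiny check like this (an illustrative helper, not part of the node agent) shows the intent:
```python
# Reduce a configured endpoint to its server root; the control plane is
# expected to append the OpenAI-compatible paths itself when proxying.
from urllib.parse import urlsplit

def endpoint_root(url: str) -> str:
    parts = urlsplit(url)
    return f"{parts.scheme}://{parts.netloc}"

assert endpoint_root("http://127.0.0.1:11434/v1/chat/completions") == "http://127.0.0.1:11434"
```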
### Option A: Ollama
Use this when you want the lowest-friction chat and embeddings demo.
1. Start Ollama if it is not already running:
```bash
ollama serve
```
2. Pull the model or models you want:
```bash
ollama pull qwen3
ollama pull nomic-embed-text
```
3. Example peer service config:
```yaml
services:
- service_id: "peer1/chat/qwen3"
kind: "chat"
endpoint: "http://127.0.0.1:11434"
runtime:
engine: "ollama"
launcher: "external"
assets:
- asset_id: "qwen3"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
- service_id: "peer1/embeddings/nomic-embed-text"
kind: "embeddings"
endpoint: "http://127.0.0.1:11434"
runtime:
engine: "ollama"
launcher: "external"
assets:
- asset_id: "nomic-embed-text"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
```
4. Start the node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.ollama.example.yaml
```
### Option B: llama.cpp
Use this when you want direct GGUF serving with `llama-server`.
1. Start a chat server:
```bash
llama-server -m /path/to/model.gguf --host 127.0.0.1 --port 18091
```
2. Example peer service config:
```yaml
services:
- service_id: "peer1/chat/qwen3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llama.cpp"
launcher: "external"
assets:
- asset_id: "qwen3-8b-q4_k_m"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
```
Then start the node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamacpp.example.yaml
```
Note:
- The official `llama.cpp` docs clearly show OpenAI-compatible chat serving.
- For embeddings, some `llama.cpp` builds document non-OpenAI embedding endpoints such as `/embedding`, so GenieHive's current `POST /v1/embeddings` path is safest with Ollama or vLLM unless you have verified your specific build.
### Option C: llamafile
Use this when you want a single-file local server built around llama.cpp.
1. Start a chat server:
```bash
./your-model.llamafile --server --host 127.0.0.1 --port 18091 --nobrowser
```
2. Example peer service config:
```yaml
services:
- service_id: "peer1/chat/llamafile-qwen3"
kind: "chat"
endpoint: "http://127.0.0.1:18091"
runtime:
engine: "llamafile"
launcher: "external"
assets:
- asset_id: "qwen3-8b-q4_k_m"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
```
Then start the node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamafile.example.yaml
```
### Option D: vLLM
Use this when you want a more server-oriented OpenAI-compatible stack and you have the hardware budget for it.
1. Start the server:
```bash
vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
```
2. Example peer service config:
```yaml
services:
- service_id: "peer1/chat/llama3-8b"
kind: "chat"
endpoint: "http://127.0.0.1:8000"
runtime:
engine: "vllm"
launcher: "external"
assets:
- asset_id: "NousResearch/Meta-Llama-3-8B-Instruct"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
- service_id: "peer1/embeddings/bge-base"
kind: "embeddings"
endpoint: "http://127.0.0.1:8001"
runtime:
engine: "vllm"
launcher: "external"
assets:
- asset_id: "BAAI/bge-base-en-v1.5"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
```
## Minimal Node Config Pattern
For a real peer host, the fields you most likely need to edit in `configs/node.example.yaml` are:
- `node.host_id`
- `node.display_name`
- `node.address`
- `control_plane.base_url`
- `control_plane.node_api_key`
- `inventory.capabilities`
- `services`
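If you want a quick sanity check before starting the node agent, a small PyYAML read (PyYAML is already a project dependency; the dotted paths below follow the field list above) can confirm the values you edited:
```python
# Print the most commonly edited node-config fields from a copied config file.
import yaml

with open("configs/node.example.yaml") as fh:
    cfg = yaml.safe_load(fh)

for dotted in ("node.host_id", "control_plane.base_url", "control_plane.node_api_key"):
    value = cfg
    for key in dotted.split("."):
        value = value[key]
    print(f"{dotted} = {value}")
```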
## Client Instructions
You now have two simple ways to exercise GenieHive as a client.
### Option 1: Inspect and call it manually
List models:
```bash
curl -sS http://127.0.0.1:8800/v1/models \
-H 'X-Api-Key: change-me-client-key'
```
Chat using a role:
```bash
curl -sS http://127.0.0.1:8800/v1/chat/completions \
-H 'Content-Type: application/json' \
-H 'X-Api-Key: change-me-client-key' \
-d '{
"model": "mentor",
"messages": [{"role":"user","content":"Give me a 2-sentence summary of why SQLite is useful here."}]
}'
```
Embeddings using a direct embedding asset:
```bash
curl -sS http://127.0.0.1:8800/v1/embeddings \
-H 'Content-Type: application/json' \
-H 'X-Api-Key: change-me-client-key' \
-d '{
"model": "nomic-embed-text",
"input": "GenieHive is a local-first control plane."
}'
```
### Option 2: Use the demo client agent
Run:
```bash
cd /home/netuser/bin/geniehive
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--task "Summarize the current GenieHive demo in three bullets."
```
That script will:
- read `GET /v1/models`
- choose a chat-capable model automatically if you do not specify one
- prefer entries GenieHive marks as suitable for lower-complexity offload
- submit a chat request and print the answer
If you want to force a specific route:
```bash
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--model mentor \
--task "State what host and route type you would expect for this demo."
```
## Codex-As-Client
For Codex or another agentic client, the intended pattern is:
1. Read `GET /v1/models`.
2. Filter for `geniehive.operation == "chat"`.
3. Prefer:
- `geniehive.offload_hint.suitability == "good_for_low_complexity"`
- `geniehive.loaded_target_count > 0` for role entries
- lower `best_p50_latency_ms`
4. Send lower-complexity requests to GenieHive.
5. Keep higher-complexity, high-context, or high-risk tasks local unless the catalog indicates a better remote fit.
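A hedged sketch of that selection pattern, using only the catalog fields named above:
```python
# Pick a chat-capable catalog entry, preferring good low-complexity offload
# hints, role entries with loaded targets, and lower observed latency.
def pick_offload_model(models: list[dict]) -> str | None:
    chat = [m for m in models if m.get("geniehive", {}).get("operation") == "chat"]

    def key(m: dict) -> tuple:
        g = m["geniehive"]
        good = g.get("offload_hint", {}).get("suitability") == "good_for_low_complexity"
        loaded = g.get("loaded_target_count", 0) > 0
        latency = g.get("best_p50_latency_ms") or float("inf")
        return (good, loaded, -latency)

    return max(chat, key=key)["id"] if chat else None
```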
## Good First Live Demo
If you want the safest first success path:
- control plane on one host
- node agent on the same host
- Ollama upstream with one chat model
- role alias `mentor`
- demo client agent calling `mentor`
That avoids GGUF-specific launch tuning while still exercising the full GenieHive master/peer/client path.
## Single-Machine End-to-End Example
### Ollama-backed single box
1. Start Ollama:
```bash
ollama serve
```
2. Pull models:
```bash
ollama pull qwen3
ollama pull nomic-embed-text
```
3. Start GenieHive control:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_singlebox.sh
```
4. Start GenieHive node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.ollama.example.yaml
```
5. Inspect:
```bash
bash scripts/demo_inspect.sh
```
6. Run the client agent:
```bash
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--task "Explain in three bullets what GenieHive is doing in this single-machine demo."
```
### llama.cpp-backed single box
1. Start the local server:
```bash
llama-server -m /path/to/model.gguf --host 127.0.0.1 --port 18091
```
2. Start GenieHive control:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_singlebox.sh
```
3. Start GenieHive node:
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_node_singlebox.sh configs/node.singlebox.llamacpp.example.yaml
```
4. Run the client agent:
```bash
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--task "Summarize why a single-machine GenieHive setup can still be useful."
```
## Host-Specific Note: Dual Tesla P40 + 128 GB RAM
For a machine with:
- `2 x Nvidia Tesla P40`
- `AMD Ryzen 5600G`
- `128 GB RAM`
the most practical first GenieHive layout is:
- one chat model on `GPU0`
- one chat or utility model on `GPU1`
- one slower fallback chat model on CPU
This is now sketched in:
- `configs/node.singlebox.p40-triple.example.yaml`
- `configs/control.singlebox.p40.example.yaml`
- `configs/roles.singlebox.p40.example.yaml`
- `scripts/start_p40_triple_llamacpp.sh`
- `scripts/launch_p40_triple.sh`
- `scripts/p40_triple_gpu0.sh`
- `scripts/p40_triple_gpu1.sh`
- `scripts/p40_triple_cpu.sh`
The current concrete defaults use models already present under `/home/netuser/bin/models/llm`:
- `GPU0`: `Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf`
- `GPU1`: `Qwen3.5-9B-Q5_K_M.gguf`
- `CPU`: `rocket-3b.Q5_K_M.gguf`
### Why this layout works
- each P40 has enough VRAM for a quantized 7B to 14B model comfortably
- 128 GB RAM is enough to hold a separate CPU-served fallback model without much trouble
- the CPU route will be much slower, but it is still useful for low-priority offload or fallback handling
### Suggested role usage
- `mentor` or primary chat role -> `GPU0`
- `general_assistant` or alternate chat role -> `GPU1`
- `fallback_writer` or `background_summarizer` -> CPU route
The repo now includes a host-specific role catalog with exactly that intent.
### Launch pattern
1. Review the default model paths and print the launch commands:
```bash
cd /home/netuser/bin/geniehive
bash scripts/start_p40_triple_llamacpp.sh
```
If the defaults look good, you do not need to edit them before trying the first run.
If `tmux` is available, you can also launch the three processes detached:
```bash
cd /home/netuser/bin/geniehive
bash scripts/launch_p40_triple.sh
```
Then inspect pane state without binding your current terminal to the session:
```bash
bash scripts/tmux_session_status.sh
```
That status helper checks whether the session exists and whether each pane's launcher process is still running or has already exited. If `tmux` is not installed, the combined launcher prints the three helper commands instead.
2. Start the three `llama-server` processes in separate shells.
3. Start GenieHive control:
```bash
bash scripts/run_control_singlebox.sh configs/control.singlebox.p40.example.yaml
```
4. Start GenieHive node with the host-specific config:
```bash
bash scripts/run_node_singlebox.sh configs/node.singlebox.p40-triple.example.yaml
```
5. Inspect the catalog:
```bash
bash scripts/demo_inspect.sh
```
If something is not coming up cleanly, run:
```bash
bash scripts/check_singlebox_health.sh
```
That checks:
- `GPU0` upstream health
- `GPU1` upstream health
- CPU fallback upstream health
- GenieHive control health
- GenieHive node health
- authenticated cluster and model-catalog endpoints
6. Exercise the chat path:
```bash
python scripts/demo_client_agent.py \
--base-url http://127.0.0.1:8800 \
--api-key change-me-client-key \
--model mentor \
--task "State which route should be preferred for low-latency chat and which should be the slow fallback."
```
### Practical expectations
- `GPU0` and `GPU1` should be the preferred targets for normal chat work
- the CPU route should mostly be treated as fallback or low-priority background work
- GenieHive metadata should make that visible to clients through latency and offload hints
### Containerized Qwen3.5 probe
If the host-installed `llama-server` is too old for `Qwen3.5`, but the NVIDIA Container Toolkit is installed, you can test a newer CUDA-enabled `llama.cpp` without changing the host CUDA stack:
```bash
cd /home/netuser/bin/geniehive
bash scripts/test_qwen35_server_cuda_container.sh
```
Useful overrides:
```bash
GPU_INDEX=1 PORT=19092 bash scripts/test_qwen35_server_cuda_container.sh
MODEL_PATH=/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf bash scripts/test_qwen35_server_cuda_container.sh
```
That probe uses the official `ghcr.io/ggml-org/llama.cpp:server-cuda` image. If it loads the model and starts serving, then the remaining blocker is your host `llama.cpp` install, not GPU compatibility.
## External Client Access
For your current host addresses:
- LAN: `192.168.40.207`
- ZeroTier: `172.24.50.65`
The cleanest rule is:
- keep upstream model servers on `127.0.0.1`
- keep the GenieHive node on `127.0.0.1` unless you specifically need remote node access
- expose only the GenieHive control plane to LAN or ZeroTier clients
That gives remote clients a single stable endpoint without exposing the underlying model servers directly.
### LAN bind
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_p40_lan.sh
```
Remote client example:
```bash
python scripts/demo_client_agent.py \
--base-url http://192.168.40.207:8800 \
--api-key change-me-client-key \
--model mentor \
--task "Briefly describe the preferred and fallback routes on this host."
```
### ZeroTier bind
```bash
cd /home/netuser/bin/geniehive
bash scripts/run_control_p40_zerotier.sh
```
Remote client example:
```bash
python scripts/demo_client_agent.py \
--base-url http://172.24.50.65:8800 \
--api-key change-me-client-key \
--model mentor \
--task "Briefly describe the preferred and fallback routes on this host."
```
### Security note
Prefer ZeroTier over general LAN exposure when possible. In both cases:
- do not expose the upstream `llama-server` ports
- keep the client API key enabled
- if you later open this beyond trusted networks, add a reverse proxy or VPN-only boundary rather than binding GenieHive broadly
### Role meanings for this host
- `mentor` should bias toward the `GPU0` Qwen2.5 14B route
- `general_assistant` should bias toward the `GPU1` Qwen3.5 9B route
- `background_summarizer` should bias toward the CPU Rocket 3B fallback route

94
docs/reverse_proxy.md Normal file
View File

@ -0,0 +1,94 @@
# GenieHive Reverse Proxy
For external clients, a reverse proxy is cleaner than binding GenieHive directly to every interface.
Recommended pattern:
- keep upstream model servers on `127.0.0.1`
- keep GenieHive node on `127.0.0.1`
- keep GenieHive control on `127.0.0.1`
- expose only the reverse proxy on LAN or ZeroTier
## Caddy Example
Config file:
```caddy
192.168.40.207:8080 {
reverse_proxy 127.0.0.1:8800
}
```
ZeroTier variant:
```caddy
172.24.50.65:8080 {
reverse_proxy 127.0.0.1:8800
}
```
Advantages:
- simple config
- easy to move to TLS later
- good default operational behavior
## Nginx Example
Server block:
```nginx
server {
listen 192.168.40.207:8080;
server_name _;
location / {
proxy_pass http://127.0.0.1:8800;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}
```
ZeroTier variant:
```nginx
server {
listen 172.24.50.65:8080;
server_name _;
location / {
proxy_pass http://127.0.0.1:8800;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}
```
## Operational Recommendation
For your current host, the cleanest shape is:
1. GenieHive control on `127.0.0.1:8800`
2. reverse proxy on either:
- `192.168.40.207:8080`
- `172.24.50.65:8080`
3. clients talk only to the reverse proxy
## Client Example
```bash
python scripts/demo_client_agent.py \
--base-url http://172.24.50.65:8080 \
--api-key change-me-client-key \
--model mentor \
--task "Describe the preferred and fallback routes on this host."
```
## Security Note
The API key is still required. The reverse proxy improves exposure hygiene, but it is not a substitute for network trust boundaries.

34
docs/roadmap.md Normal file
View File

@ -0,0 +1,34 @@
# GenieHive Roadmap
## Completed Foundations
- control-plane registry with SQLite persistence
- node registration and heartbeat
- role catalog and route resolution
- client-facing `GET /v1/models`
- client-facing `POST /v1/chat/completions`
- client-facing `POST /v1/embeddings`
- first control-plus-node demo flow
## Immediate Next Milestones
1. Run and document the first live LLM demo against real upstream servers.
2. Validate the `GET /v1/models` metadata as a Codex-friendly offload catalog for lower-complexity tasks.
3. Add `POST /v1/audio/transcriptions`.
4. Add a richer node metrics model for queue depth, current load, and observed performance over time.
5. Add a stronger operator/client distinction in the public metadata and auth surfaces.
## LLM Demo Note
The project is now ready for a first live LLM demo using GenieHive as:
- master: control plane
- peer: one or more node agents with pre-existing local LLM servers
- client: a small demo agent or Codex configured against GenieHive
The current live-demo priority is chat-first. Embeddings are also wired in GenieHive, but upstream compatibility differs across local servers, so the safest first demo matrix is:
- Ollama for chat and embeddings
- vLLM for chat and embeddings
- llama.cpp for chat
- llamafile for chat

128
docs/schemas.md Normal file
View File

@ -0,0 +1,128 @@
# GenieHive Schemas
These are canonical logical schemas for v1. They are documentation first, not final implementation code.
## Host
```yaml
host:
host_id: "atlas-01"
display_name: "Atlas GPU Box"
address: "192.168.1.101"
labels:
site: "home-lab"
class: "gpu"
capabilities:
cuda: true
rocm: false
metal: false
resources:
cpu_threads: 24
ram_gb: 128
gpus:
- gpu_id: "cuda:0"
name: "RTX 4090"
vram_gb: 24
auth:
node_key_id: "nk_atlas_01"
status:
state: "online"
last_seen: "2026-04-05T15:30:00Z"
```
## Service
```yaml
service:
service_id: "atlas-01/chat/qwen3-8b"
host_id: "atlas-01"
kind: "chat"
protocol: "openai"
endpoint: "http://192.168.1.101:18091"
runtime:
engine: "llama.cpp"
launcher: "managed"
assets:
- asset_id: "qwen3-8b-q4km"
loaded: true
state:
health: "healthy"
load_state: "loaded"
accept_requests: true
observed:
p50_latency_ms: 920
p95_latency_ms: 1900
tokens_per_sec: 42
```
## Asset
```yaml
asset:
asset_id: "qwen3-8b-q4km"
family: "Qwen3-8B"
modality: "text"
operation: "chat"
format: "gguf"
locator:
kind: "path"
value: "/models/qwen3-8b/qwen3-8b-q4_k_m.gguf"
metadata:
quant: "Q4_K_M"
ctx_train: 32768
```
## Role Profile
```yaml
role:
role_id: "mentor"
display_name: "Mentor"
description: "Guidance-oriented instructional reasoning"
modality: "text"
operation: "chat"
prompt_policy:
system_prompt: "You guide without doing the user's work for them."
user_template: "{{ user_input }}"
routing_policy:
preferred_families: ["Qwen3", "Mistral"]
preferred_labels: ["instruction", "stable"]
min_context: 8192
require_loaded: false
fallback_roles: ["general_assistant"]
```
## Health Sample
```yaml
health_sample:
sample_id: "hs_01"
target_type: "service"
target_id: "atlas-01/chat/qwen3-8b"
observed_at: "2026-04-05T15:30:00Z"
status: "healthy"
checks:
http_ok: true
models_ok: true
auth_ok: true
metrics:
queue_depth: 1
in_flight: 1
mem_used_gb: 18.4
```
## Benchmark Sample
```yaml
benchmark_sample:
benchmark_id: "bench_01"
service_id: "atlas-01/chat/qwen3-8b"
asset_id: "qwen3-8b-q4km"
observed_at: "2026-04-05T15:25:00Z"
workload: "chat.short_reasoning"
results:
prompt_tokens: 512
completion_tokens: 256
ttft_ms: 780
tokens_per_sec: 44
```

28
pyproject.toml Normal file
View File

@ -0,0 +1,28 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "geniehive"
version = "0.1.0"
description = "Local-first control plane for heterogeneous generative AI services"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"fastapi>=0.110",
"httpx>=0.27",
"pydantic>=2.6",
"pyyaml>=6.0.1",
"uvicorn>=0.29",
]
[project.optional-dependencies]
dev = [
"pytest>=8.0",
]
[tool.setuptools]
package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]

scripts/check_singlebox_health.sh
View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -euo pipefail
check() {
local name="$1"
local url="$2"
if curl -fsS "$url" >/dev/null 2>&1; then
printf '[ok] %s -> %s\n' "$name" "$url"
else
printf '[fail] %s -> %s\n' "$name" "$url"
fi
}
echo "GenieHive single-box health check"
echo
check "gpu0 upstream" "http://127.0.0.1:18091/health"
check "gpu1 upstream" "http://127.0.0.1:18092/health"
check "cpu upstream" "http://127.0.0.1:18093/health"
check "control plane" "http://127.0.0.1:8800/health"
check "node agent" "http://127.0.0.1:8891/health"
echo
echo "Authenticated GenieHive checks"
echo
if curl -fsS http://127.0.0.1:8800/v1/cluster/health -H 'X-Api-Key: change-me-client-key' >/dev/null 2>&1; then
echo "[ok] cluster health endpoint"
else
echo "[fail] cluster health endpoint"
fi
if curl -fsS http://127.0.0.1:8800/v1/models -H 'X-Api-Key: change-me-client-key' >/dev/null 2>&1; then
echo "[ok] model catalog endpoint"
else
echo "[fail] model catalog endpoint"
fi

scripts/demo_client_agent.py
View File

@ -0,0 +1,92 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
from typing import Any
import httpx
def fetch_models(client: httpx.Client, base_url: str, api_key: str) -> list[dict[str, Any]]:
response = client.get(
f"{base_url.rstrip('/')}/v1/models",
headers={"X-Api-Key": api_key},
)
response.raise_for_status()
return response.json().get("data", [])
def choose_chat_model(models: list[dict[str, Any]]) -> str:
candidates = []
for item in models:
meta = item.get("geniehive", {})
if meta.get("operation") != "chat":
continue
offload = meta.get("offload_hint", {})
route_type = meta.get("route_type")
suitability = offload.get("suitability", "")
latency = meta.get("best_p50_latency_ms")
if latency is None:
latency = meta.get("observed", {}).get("p50_latency_ms")
latency_score = float(latency) if latency is not None else float("inf")
role_preference = 1 if route_type == "role" else 0
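# Rank by offload suitability first, then prefer role aliases, then lower observed latency; the model id is only a deterministic tiebreaker.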
suitability_rank = {
"good_for_low_complexity": 3,
"usable_for_background_tasks": 2,
"available_but_slow": 1,
"cold_only": 0,
}.get(suitability, 0)
candidates.append((suitability_rank, role_preference, -latency_score, item["id"]))
if not candidates:
raise SystemExit("No chat-capable models were advertised by GenieHive.")
return max(candidates)[3]
def run_task(base_url: str, api_key: str, model: str, task: str) -> dict[str, Any]:
with httpx.Client(timeout=120.0) as client:
response = client.post(
f"{base_url.rstrip('/')}/v1/chat/completions",
headers={
"X-Api-Key": api_key,
"Content-Type": "application/json",
},
json={
"model": model,
"messages": [
{"role": "system", "content": "You are a concise demo client agent."},
{"role": "user", "content": task},
],
},
)
response.raise_for_status()
return response.json()
def main() -> None:
parser = argparse.ArgumentParser(description="Exercise GenieHive as a small client agent.")
parser.add_argument("--base-url", required=True, help="GenieHive control-plane base URL")
parser.add_argument("--api-key", required=True, help="GenieHive client API key")
parser.add_argument("--model", help="Explicit chat model or role alias to use")
parser.add_argument("--task", help="Task text to send")
parser.add_argument("--list-models", action="store_true", help="List advertised models and exit")
args = parser.parse_args()
with httpx.Client(timeout=30.0) as client:
models = fetch_models(client, args.base_url, args.api_key)
if args.list_models:
print(json.dumps(models, indent=2))
return
if not args.task:
raise SystemExit("--task is required unless --list-models is used.")
model = args.model or choose_chat_model(models)
print(f"Using model: {model}")
result = run_task(args.base_url, args.api_key, model, args.task)
print(json.dumps(result, indent=2))
if __name__ == "__main__":
main()

18
scripts/demo_inspect.sh Normal file
View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
BASE_URL="${GENIEHIVE_CONTROL_BASE_URL:-http://127.0.0.1:8800}"
CLIENT_KEY="${GENIEHIVE_CLIENT_KEY:-change-me-client-key}"
curl -sS "$BASE_URL/v1/models" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/health" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/hosts" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/services" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/roles" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'
curl -sS "$BASE_URL/v1/cluster/routes/resolve?model=mentor" -H "X-Api-Key: $CLIENT_KEY"
printf '\n'

scripts/launch_p40_triple.sh
View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
SESSION="${GENIEHIVE_TMUX_SESSION:-geniehive-p40}"
STATUS_CMD="$ROOT/scripts/tmux_session_status.sh"
GPU0_CMD="$ROOT/scripts/p40_triple_gpu0.sh"
GPU1_CMD="$ROOT/scripts/p40_triple_gpu1.sh"
CPU_CMD="$ROOT/scripts/p40_triple_cpu.sh"
if command -v tmux >/dev/null 2>&1; then
if tmux has-session -t "$SESSION" 2>/dev/null; then
echo "tmux session already exists: $SESSION"
echo "Inspect panes with: bash '$STATUS_CMD' '$SESSION'"
exit 1
fi
tmux new-session -d -s "$SESSION" "cd '$ROOT' && bash '$GPU0_CMD'"
tmux split-window -h -t "$SESSION:0" "cd '$ROOT' && bash '$GPU1_CMD'"
tmux split-window -v -t "$SESSION:0" "cd '$ROOT' && bash '$CPU_CMD'"
tmux set-option -t "$SESSION:0" remain-on-exit on >/dev/null
tmux select-pane -t "$SESSION:0.0" -T gpu0 >/dev/null
tmux select-pane -t "$SESSION:0.1" -T gpu1 >/dev/null
tmux select-pane -t "$SESSION:0.2" -T cpu >/dev/null
tmux select-layout -t "$SESSION" tiled >/dev/null
echo "Started tmux session: $SESSION"
echo "Inspect panes with: bash '$STATUS_CMD' '$SESSION'"
echo "Attach manually only if needed: tmux attach -t $SESSION"
exit 0
fi
echo "tmux not found. Run these in three shells:"
echo
echo "bash '$GPU0_CMD'"
echo "bash '$GPU1_CMD'"
echo "bash '$CPU_CMD'"

scripts/p40_triple_cpu.sh
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
MODEL_CPU="${MODEL_CPU:-/home/netuser/bin/models/llm/rocket-3b.Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
exec "$LLAMA_SERVER_BIN" -m "$MODEL_CPU" --host 127.0.0.1 --port 18093 -ngl 0 -t 12

scripts/p40_triple_gpu0.sh
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
MODEL_GPU0="${MODEL_GPU0:-/home/netuser/bin/models/llm/Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
exec env CUDA_VISIBLE_DEVICES=0 "$LLAMA_SERVER_BIN" -m "$MODEL_GPU0" --host 127.0.0.1 --port 18091

scripts/p40_triple_gpu1.sh
View File

@ -0,0 +1,32 @@
#!/usr/bin/env bash
set -euo pipefail
MODEL_GPU1="${MODEL_GPU1:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
HOST="${GPU1_HOST:-127.0.0.1}"
PORT="${GPU1_PORT:-18092}"
CTX_SIZE="${GPU1_CTX_SIZE:-4096}"
NGL="${GPU1_NGL:-999}"
GPU_INDEX="${GPU1_INDEX:-1}"
USE_CONTAINER="${GPU1_USE_CONTAINER:-0}"
CONTAINER_IMAGE="${GPU1_CONTAINER_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"
if [[ "${USE_CONTAINER}" == "1" ]]; then
exec docker run --rm --gpus all \
--network host \
-e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
-v "$(dirname "${MODEL_GPU1}"):/models:ro" \
"${CONTAINER_IMAGE}" \
-m "/models/$(basename "${MODEL_GPU1}")" \
-ngl "${NGL}" \
--ctx-size "${CTX_SIZE}" \
--host "${HOST}" \
--port "${PORT}"
fi
exec env CUDA_VISIBLE_DEVICES="${GPU_INDEX}" "$LLAMA_SERVER_BIN" \
-m "$MODEL_GPU1" \
-ngl "${NGL}" \
--ctx-size "${CTX_SIZE}" \
--host "${HOST}" \
--port "${PORT}"

11
scripts/run_control.sh Normal file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
export GENIEHIVE_CONTROL_CONFIG="$ROOT/configs/control.example.yaml"
export GENIEHIVE_ROLES_CONFIG="$ROOT/configs/roles.example.yaml"
export PYTHONPATH="$ROOT/src"
exec python -m uvicorn geniehive_control.main:app --host 127.0.0.1 --port 8800

scripts/run_control_p40_lan.sh
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
export GENIEHIVE_BIND_HOST="${GENIEHIVE_BIND_HOST:-192.168.40.207}"
export GENIEHIVE_BIND_PORT="${GENIEHIVE_BIND_PORT:-8800}"
exec bash "$ROOT/scripts/run_control_singlebox.sh" "$ROOT/configs/control.singlebox.p40.example.yaml"

scripts/run_control_p40_zerotier.sh
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
export GENIEHIVE_BIND_HOST="${GENIEHIVE_BIND_HOST:-172.24.50.65}"
export GENIEHIVE_BIND_PORT="${GENIEHIVE_BIND_PORT:-8800}"
exec bash "$ROOT/scripts/run_control_singlebox.sh" "$ROOT/configs/control.singlebox.p40.example.yaml"

scripts/run_control_singlebox.sh
View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
CONTROL_CONFIG="${1:-$ROOT/configs/control.singlebox.example.yaml}"
export GENIEHIVE_CONTROL_CONFIG="$CONTROL_CONFIG"
if [[ -z "${GENIEHIVE_ROLES_CONFIG:-}" ]]; then
export GENIEHIVE_ROLES_CONFIG="$ROOT/configs/roles.example.yaml"
fi
export PYTHONPATH="$ROOT/src"
HOST="${GENIEHIVE_BIND_HOST:-127.0.0.1}"
PORT="${GENIEHIVE_BIND_PORT:-8800}"
exec python -m uvicorn geniehive_control.main:app --host "$HOST" --port "$PORT"

10
scripts/run_node.sh Normal file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
export GENIEHIVE_NODE_CONFIG="$ROOT/configs/node.example.yaml"
export PYTHONPATH="$ROOT/src"
exec python -m uvicorn geniehive_node.main:app --host 127.0.0.1 --port 8891

scripts/run_node_singlebox.sh
View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
NODE_CONFIG="${1:-$ROOT/configs/node.singlebox.ollama.example.yaml}"
export GENIEHIVE_NODE_CONFIG="$NODE_CONFIG"
export PYTHONPATH="$ROOT/src"
HOST="${GENIEHIVE_NODE_BIND_HOST:-127.0.0.1}"
PORT="${GENIEHIVE_NODE_BIND_PORT:-8891}"
exec python -m uvicorn geniehive_node.main:app --host "$HOST" --port "$PORT"

scripts/start_p40_triple_llamacpp.sh
View File

@ -0,0 +1,35 @@
#!/usr/bin/env bash
set -euo pipefail
# Example launcher pattern for:
# - GPU0 chat model on :18091
# - GPU1 chat model on :18092
# - CPU fallback chat model on :18093
#
# Defaults are based on models already present under /home/netuser/bin/models/llm.
# Override them via env vars if you want different weights.
MODEL_GPU0="${MODEL_GPU0:-/home/netuser/bin/models/llm/Qwen2.5-14B-Instruct-1M-Q5_K_M.gguf}"
MODEL_GPU1="${MODEL_GPU1:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
MODEL_CPU="${MODEL_CPU:-/home/netuser/bin/models/llm/rocket-3b.Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"
echo "Start these in separate shells or tmux panes."
echo "Helper scripts are available too:"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_gpu0.sh"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_gpu1.sh"
echo
echo "bash /home/netuser/bin/geniehive/scripts/p40_triple_cpu.sh"
echo
echo "Or try the combined launcher:"
echo "bash /home/netuser/bin/geniehive/scripts/launch_p40_triple.sh"
echo
echo "Equivalent raw commands:"
echo
echo "CUDA_VISIBLE_DEVICES=0 \"$LLAMA_SERVER_BIN\" -m \"$MODEL_GPU0\" --host 127.0.0.1 --port 18091"
echo
echo "CUDA_VISIBLE_DEVICES=1 \"$LLAMA_SERVER_BIN\" -m \"$MODEL_GPU1\" --host 127.0.0.1 --port 18092"
echo
echo "\"$LLAMA_SERVER_BIN\" -m \"$MODEL_CPU\" --host 127.0.0.1 --port 18093 -ngl 0 -t 12"

scripts/test_qwen35_server_cuda_container.sh
View File

@ -0,0 +1,34 @@
#!/usr/bin/env bash
set -euo pipefail
IMAGE="${IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"
MODEL_PATH="${MODEL_PATH:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
GPU_INDEX="${GPU_INDEX:-0}"
CTX_SIZE="${CTX_SIZE:-512}"
PORT="${PORT:-19091}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-90}"
if [[ ! -f "${MODEL_PATH}" ]]; then
echo "Model not found: ${MODEL_PATH}" >&2
exit 1
fi
echo "Image: ${IMAGE}"
echo "Model: ${MODEL_PATH}"
echo "GPU: ${GPU_INDEX}"
echo "Port: ${PORT}"
echo "Timeout: ${TIMEOUT_SECONDS}s"
echo
echo "This probe is successful if llama-server loads the model and begins serving."
echo "A timeout exit after successful startup is acceptable for this test."
echo
timeout "${TIMEOUT_SECONDS}"s docker run --rm --gpus all \
-e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
-v "$(dirname "${MODEL_PATH}"):/models:ro" \
"${IMAGE}" \
-m "/models/$(basename "${MODEL_PATH}")" \
-ngl 999 \
--ctx-size "${CTX_SIZE}" \
--host 127.0.0.1 \
--port "${PORT}"

38
scripts/tmux_session_status.sh Executable file
View File

@ -0,0 +1,38 @@
#!/usr/bin/env bash
set -euo pipefail
SESSION="${1:-${GENIEHIVE_TMUX_SESSION:-geniehive-p40}}"
if ! command -v tmux >/dev/null 2>&1; then
echo "tmux not found"
exit 127
fi
if ! tmux has-session -t "$SESSION" 2>/dev/null; then
echo "tmux session not found: $SESSION"
exit 1
fi
printf 'tmux session: %s\n' "$SESSION"
printf '%-6s %-8s %-10s %-8s %s\n' "pane" "title" "state" "status" "command"
live_count=0
while IFS=$'\t' read -r pane_id pane_title pane_pid pane_dead pane_dead_status pane_start_command; do
state="exited"
status="$pane_dead_status"
if [[ "$pane_dead" == "0" ]] && kill -0 "$pane_pid" 2>/dev/null; then
state="running"
status="-"
live_count=$((live_count + 1))
fi
printf '%-6s %-8s %-10s %-8s %s\n' "$pane_id" "${pane_title:--}" "$state" "${status:--}" "$pane_start_command"
done < <(
tmux list-panes -t "$SESSION" -F "#{pane_index}\t#{pane_title}\t#{pane_pid}\t#{pane_dead}\t#{pane_dead_status}\t#{pane_start_command}"
)
if [[ "$live_count" -eq 0 ]]; then
echo
echo "No pane processes are still running."
exit 2
fi

View File

@ -0,0 +1,2 @@
"""GenieHive control-plane package."""

View File

@ -0,0 +1,25 @@
from __future__ import annotations
from fastapi import HTTPException, Request, status
def _check_key(request: Request, allowed_keys: list[str], header_name: str) -> None:
if not allowed_keys:
return
provided = request.headers.get(header_name)
if provided in allowed_keys:
return
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="unauthorized",
)
def require_client_auth(request: Request) -> None:
cfg = request.app.state.cfg
_check_key(request, cfg.auth.client_api_keys, "X-Api-Key")
def require_node_auth(request: Request) -> None:
cfg = request.app.state.cfg
_check_key(request, cfg.auth.node_api_keys, "X-GenieHive-Node-Key")
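For reference, a client call against these checks could look like the sketch below; the base URL and "client-key" are placeholder values assumed to match auth.client_api_keys in the control config.
# Example only: base URL and key are assumed values matching the control config.
import httpx

resp = httpx.get(
    "http://127.0.0.1:8800/v1/models",
    headers={"X-Api-Key": "client-key"},  # node endpoints expect X-GenieHive-Node-Key instead
    timeout=10.0,
)
resp.raise_for_status()
print(resp.json())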

View File

@ -0,0 +1,74 @@
from __future__ import annotations
from typing import Any
from .registry import Registry
from .routing import choose_upstream_model_id
from .upstream import UpstreamClient
class ProxyError(RuntimeError):
def __init__(self, message: str, *, status_code: int) -> None:
super().__init__(message)
self.status_code = status_code
def _strip_reasoning_fields(payload: Any) -> Any:
if isinstance(payload, list):
return [_strip_reasoning_fields(item) for item in payload]
if not isinstance(payload, dict):
return payload
cleaned: dict[str, Any] = {}
for key, value in payload.items():
if key in {"reasoning_content", "reasoning"}:
continue
cleaned[key] = _strip_reasoning_fields(value)
return cleaned
async def proxy_chat_completion(
body: dict[str, Any],
*,
registry: Registry,
upstream: UpstreamClient,
) -> Any:
requested_model = body.get("model")
if not requested_model:
raise ProxyError("Missing 'model' in request body.", status_code=400)
resolved = registry.resolve_route(requested_model, kind="chat")
if resolved is None:
raise ProxyError(f"Unknown model or role '{requested_model}'.", status_code=404)
service = resolved.get("service")
if service is None:
raise ProxyError(f"No healthy chat target available for '{requested_model}'.", status_code=503)
upstream_body = dict(body)
upstream_body["model"] = choose_upstream_model_id(requested_model, service)
response = await upstream.chat_completions(service["endpoint"], upstream_body)
return _strip_reasoning_fields(response)
async def proxy_embeddings(
body: dict[str, Any],
*,
registry: Registry,
upstream: UpstreamClient,
) -> Any:
requested_model = body.get("model")
if not requested_model:
raise ProxyError("Missing 'model' in request body.", status_code=400)
resolved = registry.resolve_route(requested_model, kind="embeddings")
if resolved is None:
raise ProxyError(f"Unknown model or role '{requested_model}'.", status_code=404)
service = resolved.get("service")
if service is None:
raise ProxyError(f"No healthy embeddings target available for '{requested_model}'.", status_code=503)
upstream_body = dict(body)
upstream_body["model"] = choose_upstream_model_id(requested_model, service)
return await upstream.embeddings(service["endpoint"], upstream_body)
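As a quick illustration of the reasoning scrub: _strip_reasoning_fields drops reasoning and reasoning_content keys at any depth before the response is returned to the client. A minimal sketch:
# Minimal sketch exercising the internal helper above.
from geniehive_control.chat import _strip_reasoning_fields

raw = {
    "choices": [
        {
            "message": {"role": "assistant", "content": "hi", "reasoning_content": "hidden"},
            "reasoning": {"tokens": 12},
        }
    ]
}
assert _strip_reasoning_fields(raw) == {
    "choices": [{"message": {"role": "assistant", "content": "hi"}}]
}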

View File

@ -0,0 +1,40 @@
from __future__ import annotations
from pathlib import Path
import yaml
from pydantic import BaseModel, Field
class ServerConfig(BaseModel):
host: str = "127.0.0.1"
port: int = 8800
class AuthConfig(BaseModel):
client_api_keys: list[str] = Field(default_factory=list)
node_api_keys: list[str] = Field(default_factory=list)
class StorageConfig(BaseModel):
sqlite_path: str = "state/geniehive.sqlite3"
class RoutingConfig(BaseModel):
default_strategy: str = "loaded_first"
health_stale_after_s: float = 30.0
class ControlConfig(BaseModel):
server: ServerConfig = Field(default_factory=ServerConfig)
auth: AuthConfig = Field(default_factory=AuthConfig)
storage: StorageConfig = Field(default_factory=StorageConfig)
routing: RoutingConfig = Field(default_factory=RoutingConfig)
roles_path: str | None = None
def load_config(path: str | Path) -> ControlConfig:
raw = yaml.safe_load(Path(path).read_text()) or {}
if not isinstance(raw, dict):
raise ValueError("Control config must be a YAML mapping.")
return ControlConfig.model_validate(raw)
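A minimal sketch of loading this config from YAML; the file name and key values are examples only, and omitted sections keep their defaults.
# Sketch: write a small control config and load it; paths and keys are examples.
from pathlib import Path
from geniehive_control.config import load_config

Path("control.example.yaml").write_text(
    "server:\n"
    "  host: 127.0.0.1\n"
    "  port: 8800\n"
    "auth:\n"
    "  client_api_keys: ['client-key']\n"
    "  node_api_keys: ['node-key']\n"
)
cfg = load_config("control.example.yaml")
assert cfg.server.port == 8800
assert cfg.auth.client_api_keys == ["client-key"]
assert cfg.routing.default_strategy == "loaded_first"  # defaults still apply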

View File

@ -0,0 +1,127 @@
from __future__ import annotations
import os
from pathlib import Path
from fastapi import Depends, FastAPI, Request
from fastapi.responses import JSONResponse
from .auth import require_client_auth, require_node_auth
from .chat import ProxyError, proxy_chat_completion, proxy_embeddings
from .config import ControlConfig, load_config
from .models import HostHeartbeat, HostRegistration
from .roles import load_role_catalog
from .registry import Registry
from .upstream import UpstreamClient, UpstreamError
def create_app(
config_path: str | Path | None = None,
*,
upstream_client: UpstreamClient | None = None,
) -> FastAPI:
cfg_path = config_path or os.environ.get("GENIEHIVE_CONTROL_CONFIG")
cfg = load_config(cfg_path) if cfg_path else ControlConfig()
registry = Registry(cfg.storage.sqlite_path)
roles_path = cfg.roles_path or os.environ.get("GENIEHIVE_ROLES_CONFIG")
if roles_path:
registry.upsert_roles(load_role_catalog(roles_path).roles)
upstream = upstream_client or UpstreamClient()
app = FastAPI(title="GenieHive Control", version="0.1.0")
app.state.cfg = cfg
app.state.registry = registry
app.state.upstream = upstream
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.post("/v1/nodes/register")
async def register_node(request: Request, _=Depends(require_node_auth)) -> dict:
payload = await request.json()
reg = HostRegistration.model_validate(payload)
host = request.app.state.registry.register_host(reg)
return {"status": "ok", "host": host}
@app.post("/v1/nodes/heartbeat")
async def heartbeat_node(request: Request, _=Depends(require_node_auth)):
payload = await request.json()
hb = HostHeartbeat.model_validate(payload)
host = request.app.state.registry.heartbeat_host(hb)
if host is None:
return JSONResponse(status_code=404, content={"error": "unknown_host", "host_id": hb.host_id})
return {"status": "ok", "host": host}
@app.get("/v1/cluster/hosts")
async def list_hosts(request: Request, _=Depends(require_client_auth)) -> dict:
return {"object": "list", "data": request.app.state.registry.list_hosts()}
@app.get("/v1/models")
async def list_models(request: Request, _=Depends(require_client_auth)) -> dict:
return {"object": "list", "data": request.app.state.registry.list_client_models()}
@app.post("/v1/chat/completions")
async def chat_completions(request: Request, _=Depends(require_client_auth)):
body = await request.json()
try:
return await proxy_chat_completion(
body,
registry=request.app.state.registry,
upstream=request.app.state.upstream,
)
except ProxyError as exc:
return JSONResponse(
status_code=exc.status_code,
content={"error": {"message": str(exc), "type": "geniehive_error", "code": "chat_proxy_error"}},
)
except UpstreamError as exc:
return JSONResponse(
status_code=exc.status_code or 502,
content={"error": {"message": str(exc), "type": "geniehive_error", "code": "upstream_error"}},
)
@app.post("/v1/embeddings")
async def embeddings(request: Request, _=Depends(require_client_auth)):
body = await request.json()
try:
return await proxy_embeddings(
body,
registry=request.app.state.registry,
upstream=request.app.state.upstream,
)
except ProxyError as exc:
return JSONResponse(
status_code=exc.status_code,
content={"error": {"message": str(exc), "type": "geniehive_error", "code": "embeddings_proxy_error"}},
)
except UpstreamError as exc:
return JSONResponse(
status_code=exc.status_code or 502,
content={"error": {"message": str(exc), "type": "geniehive_error", "code": "upstream_error"}},
)
@app.get("/v1/cluster/services")
async def list_services(request: Request, _=Depends(require_client_auth)) -> dict:
return {"object": "list", "data": request.app.state.registry.list_services()}
@app.get("/v1/cluster/roles")
async def list_roles(request: Request, _=Depends(require_client_auth)) -> dict:
return {"object": "list", "data": request.app.state.registry.list_roles()}
@app.get("/v1/cluster/health")
async def cluster_health(request: Request, _=Depends(require_client_auth)) -> dict:
cfg: ControlConfig = request.app.state.cfg
return request.app.state.registry.cluster_health(cfg.routing.health_stale_after_s)
@app.get("/v1/cluster/routes/resolve")
async def resolve_route(model: str, request: Request, kind: str | None = None, _=Depends(require_client_auth)) -> dict:
resolved = request.app.state.registry.resolve_route(model, kind=kind)
if resolved is None:
return JSONResponse(status_code=404, content={"error": "no_route", "model": model, "kind": kind})
return {"status": "ok", "resolution": resolved}
return app
app = create_app()
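A minimal smoke exercise of the app via FastAPI's TestClient, assuming GENIEHIVE_CONTROL_CONFIG is unset: the default ControlConfig leaves both key lists empty, so the auth dependencies pass every request.
# Sketch: default config means empty key lists, so auth is effectively disabled here.
# The default Registry path state/geniehive.sqlite3 is created under the current directory.
from fastapi.testclient import TestClient
from geniehive_control.main import create_app

client = TestClient(create_app())
assert client.get("/health").json() == {"status": "ok"}
models = client.get("/v1/models").json()
assert models["object"] == "list"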

View File

@ -0,0 +1,90 @@
from __future__ import annotations
from typing import Any, Literal
from pydantic import BaseModel, Field
class ServiceAsset(BaseModel):
asset_id: str
loaded: bool = False
class ServiceRuntime(BaseModel):
engine: str | None = None
launcher: str | None = None
class ServiceState(BaseModel):
health: str | None = None
load_state: str | None = None
accept_requests: bool = True
class ServiceObserved(BaseModel):
p50_latency_ms: float | None = None
p95_latency_ms: float | None = None
tokens_per_sec: float | None = None
queue_depth: int | None = None
in_flight: int | None = None
class RegisteredService(BaseModel):
service_id: str
host_id: str
kind: Literal["chat", "embeddings", "transcription"]
protocol: str = "openai"
endpoint: str
runtime: ServiceRuntime = Field(default_factory=ServiceRuntime)
assets: list[ServiceAsset] = Field(default_factory=list)
state: ServiceState = Field(default_factory=ServiceState)
observed: ServiceObserved = Field(default_factory=ServiceObserved)
class HostStatus(BaseModel):
state: str = "online"
last_seen: float | None = None
class HostRegistration(BaseModel):
host_id: str
display_name: str | None = None
address: str
labels: dict[str, str] = Field(default_factory=dict)
capabilities: dict[str, Any] = Field(default_factory=dict)
resources: dict[str, Any] = Field(default_factory=dict)
services: list[RegisteredService] = Field(default_factory=list)
class HostHeartbeat(BaseModel):
host_id: str
status: HostStatus = Field(default_factory=HostStatus)
metrics: dict[str, Any] = Field(default_factory=dict)
services: list[RegisteredService] = Field(default_factory=list)
class PromptPolicy(BaseModel):
system_prompt: str | None = None
user_template: str | None = None
class RoutingPolicy(BaseModel):
preferred_families: list[str] = Field(default_factory=list)
preferred_labels: list[str] = Field(default_factory=list)
min_context: int | None = None
require_loaded: bool = False
fallback_roles: list[str] = Field(default_factory=list)
class RoleProfile(BaseModel):
role_id: str
display_name: str | None = None
description: str | None = None
operation: Literal["chat", "embeddings", "transcription"]
modality: str
prompt_policy: PromptPolicy = Field(default_factory=PromptPolicy)
routing_policy: RoutingPolicy = Field(default_factory=RoutingPolicy)
class RoleCatalog(BaseModel):
roles: list[RoleProfile] = Field(default_factory=list)

View File

@ -0,0 +1,464 @@
from __future__ import annotations
import json
import sqlite3
import time
from pathlib import Path
from .models import HostHeartbeat, HostRegistration, RegisteredService, RoleProfile
def _json_dumps(value: object) -> str:
return json.dumps(value, sort_keys=True)
class Registry:
def __init__(self, db_path: str | Path) -> None:
self.db_path = Path(db_path)
self.db_path.parent.mkdir(parents=True, exist_ok=True)
self._init_db()
def _connect(self) -> sqlite3.Connection:
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row
return conn
def _init_db(self) -> None:
with self._connect() as conn:
conn.executescript(
"""
CREATE TABLE IF NOT EXISTS hosts (
host_id TEXT PRIMARY KEY,
display_name TEXT,
address TEXT NOT NULL,
labels_json TEXT NOT NULL,
capabilities_json TEXT NOT NULL,
resources_json TEXT NOT NULL,
status_state TEXT NOT NULL DEFAULT 'online',
last_seen REAL NOT NULL,
metrics_json TEXT NOT NULL DEFAULT '{}'
);
CREATE TABLE IF NOT EXISTS services (
service_id TEXT PRIMARY KEY,
host_id TEXT NOT NULL,
kind TEXT NOT NULL,
protocol TEXT NOT NULL,
endpoint TEXT NOT NULL,
runtime_json TEXT NOT NULL,
assets_json TEXT NOT NULL,
state_json TEXT NOT NULL,
observed_json TEXT NOT NULL,
updated_at REAL NOT NULL,
FOREIGN KEY(host_id) REFERENCES hosts(host_id)
);
CREATE TABLE IF NOT EXISTS roles (
role_id TEXT PRIMARY KEY,
display_name TEXT,
description TEXT,
operation TEXT NOT NULL,
modality TEXT NOT NULL,
prompt_policy_json TEXT NOT NULL,
routing_policy_json TEXT NOT NULL,
updated_at REAL NOT NULL
);
"""
)
def register_host(self, reg: HostRegistration) -> dict:
now = time.time()
with self._connect() as conn:
conn.execute(
"""
INSERT INTO hosts (
host_id, display_name, address, labels_json, capabilities_json,
resources_json, status_state, last_seen, metrics_json
)
VALUES (?, ?, ?, ?, ?, ?, 'online', ?, '{}')
ON CONFLICT(host_id) DO UPDATE SET
display_name=excluded.display_name,
address=excluded.address,
labels_json=excluded.labels_json,
capabilities_json=excluded.capabilities_json,
resources_json=excluded.resources_json,
status_state='online',
last_seen=excluded.last_seen
""",
(
reg.host_id,
reg.display_name,
reg.address,
_json_dumps(reg.labels),
_json_dumps(reg.capabilities),
_json_dumps(reg.resources),
now,
),
)
self._replace_services(conn, reg.host_id, reg.services, now)
return self.get_host(reg.host_id)
def heartbeat_host(self, hb: HostHeartbeat) -> dict | None:
now = time.time()
with self._connect() as conn:
cur = conn.execute(
"SELECT host_id FROM hosts WHERE host_id = ?",
(hb.host_id,),
)
if cur.fetchone() is None:
return None
conn.execute(
"""
UPDATE hosts
SET status_state = ?, last_seen = ?, metrics_json = ?
WHERE host_id = ?
""",
(
hb.status.state,
now,
_json_dumps(hb.metrics),
hb.host_id,
),
)
if hb.services:
self._replace_services(conn, hb.host_id, hb.services, now)
return self.get_host(hb.host_id)
def _replace_services(
self,
conn: sqlite3.Connection,
host_id: str,
services: list[RegisteredService],
now: float,
) -> None:
conn.execute("DELETE FROM services WHERE host_id = ?", (host_id,))
for service in services:
conn.execute(
"""
INSERT INTO services (
service_id, host_id, kind, protocol, endpoint,
runtime_json, assets_json, state_json, observed_json, updated_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
service.service_id,
host_id,
service.kind,
service.protocol,
service.endpoint,
_json_dumps(service.runtime.model_dump()),
_json_dumps([asset.model_dump() for asset in service.assets]),
_json_dumps(service.state.model_dump()),
_json_dumps(service.observed.model_dump()),
now,
),
)
def get_host(self, host_id: str) -> dict | None:
with self._connect() as conn:
row = conn.execute("SELECT * FROM hosts WHERE host_id = ?", (host_id,)).fetchone()
if row is None:
return None
return self._host_row_to_dict(row)
def upsert_roles(self, roles: list[RoleProfile]) -> list[dict]:
now = time.time()
with self._connect() as conn:
for role in roles:
conn.execute(
"""
INSERT INTO roles (
role_id, display_name, description, operation, modality,
prompt_policy_json, routing_policy_json, updated_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(role_id) DO UPDATE SET
display_name=excluded.display_name,
description=excluded.description,
operation=excluded.operation,
modality=excluded.modality,
prompt_policy_json=excluded.prompt_policy_json,
routing_policy_json=excluded.routing_policy_json,
updated_at=excluded.updated_at
""",
(
role.role_id,
role.display_name,
role.description,
role.operation,
role.modality,
_json_dumps(role.prompt_policy.model_dump()),
_json_dumps(role.routing_policy.model_dump()),
now,
),
)
return self.list_roles()
def get_role(self, role_id: str) -> dict | None:
with self._connect() as conn:
row = conn.execute("SELECT * FROM roles WHERE role_id = ?", (role_id,)).fetchone()
if row is None:
return None
return self._role_row_to_dict(row)
def list_roles(self) -> list[dict]:
with self._connect() as conn:
rows = conn.execute("SELECT * FROM roles ORDER BY role_id").fetchall()
return [self._role_row_to_dict(row) for row in rows]
def list_hosts(self) -> list[dict]:
with self._connect() as conn:
rows = conn.execute("SELECT * FROM hosts ORDER BY host_id").fetchall()
return [self._host_row_to_dict(row) for row in rows]
def list_services(self) -> list[dict]:
with self._connect() as conn:
rows = conn.execute("SELECT * FROM services ORDER BY host_id, service_id").fetchall()
return [self._service_row_to_dict(row) for row in rows]
def list_client_models(self) -> list[dict]:
services = self.list_services()
roles = self.list_roles()
items: list[dict] = []
for service in services:
if not service["state"].get("accept_requests", True):
continue
if service["state"].get("health") != "healthy":
continue
item = {
"id": service["service_id"],
"object": "model",
"owned_by": service["host_id"],
"geniehive": self._service_metadata(service),
}
items.append(item)
for asset in service["assets"]:
asset_id = asset.get("asset_id")
if not asset_id:
continue
items.append(
{
"id": asset_id,
"object": "model",
"owned_by": service["host_id"],
"geniehive": self._service_metadata(service) | {"route_type": "asset", "asset_id": asset_id},
}
)
for role in roles:
matching_services = [
service
for service in services
if service["kind"] == role["operation"]
and service["state"].get("accept_requests", True)
and service["state"].get("health") == "healthy"
]
loaded_count = sum(1 for service in matching_services if any(asset.get("loaded") for asset in service["assets"]))
latencies = [
service["observed"].get("p50_latency_ms")
for service in matching_services
if service["observed"].get("p50_latency_ms") is not None
]
best_latency_ms = min(latencies) if latencies else None
items.append(
{
"id": role["role_id"],
"object": "model",
"owned_by": "geniehive-role",
"geniehive": {
"route_type": "role",
"role_id": role["role_id"],
"display_name": role["display_name"],
"operation": role["operation"],
"modality": role["modality"],
"healthy_target_count": len(matching_services),
"loaded_target_count": loaded_count,
"best_p50_latency_ms": best_latency_ms,
"offload_hint": self._offload_hint(
operation=role["operation"],
loaded_count=loaded_count,
best_latency_ms=best_latency_ms,
),
"routing_policy": role["routing_policy"],
},
}
)
deduped: dict[str, dict] = {}
for item in items:
deduped[item["id"]] = item
return [deduped[key] for key in sorted(deduped)]
def resolve_route(self, requested_model: str, *, kind: str | None = None) -> dict | None:
direct = self._resolve_direct(requested_model, kind=kind)
if direct is not None:
return {"match_type": "direct", **direct}
role = self.get_role(requested_model)
if role is None:
return None
matched_kind = kind or role["operation"]
candidates = [
service
for service in self.list_services()
if service["kind"] == matched_kind
and service["state"].get("accept_requests", True)
and service["state"].get("health") == "healthy"
]
if not candidates:
return {"match_type": "role", "role": role, "service": None}
preferred_families = [family.lower() for family in role["routing_policy"].get("preferred_families", [])]
def score(service: dict) -> tuple[int, int, float, str]:
loaded = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
family_match = 0
if preferred_families:
asset_names = " ".join(asset.get("asset_id", "") for asset in service["assets"]).lower()
family_match = 1 if any(family in asset_names for family in preferred_families) else 0
latency = service["observed"].get("p50_latency_ms")
latency_score = float(latency) if latency is not None else float("inf")
return (family_match, loaded, -latency_score, service["service_id"])
if role["routing_policy"].get("require_loaded"):
loaded_candidates = [service for service in candidates if any(asset.get("loaded") for asset in service["assets"])]
if loaded_candidates:
candidates = loaded_candidates
service = max(candidates, key=score)
return {"match_type": "role", "role": role, "service": service}
def _resolve_direct(self, requested_model: str, *, kind: str | None = None) -> dict | None:
candidates = []
for service in self.list_services():
if kind is not None and service["kind"] != kind:
continue
if not service["state"].get("accept_requests", True):
continue
if service["state"].get("health") != "healthy":
continue
asset_ids = {asset.get("asset_id") for asset in service["assets"]}
if service["service_id"] == requested_model or requested_model in asset_ids:
candidates.append(service)
if not candidates:
return None
def score(service: dict) -> tuple[int, float, str]:
loaded = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
latency = service["observed"].get("p50_latency_ms")
latency_score = float(latency) if latency is not None else float("inf")
return (loaded, -latency_score, service["service_id"])
service = max(candidates, key=score)
return {"service": service}
def cluster_health(self, stale_after_s: float) -> dict:
hosts = self.list_hosts()
services = self.list_services()
now = time.time()
online = 0
stale = 0
for host in hosts:
is_stale = (now - host["status"]["last_seen"]) > stale_after_s
if is_stale:
stale += 1
elif host["status"]["state"] == "online":
online += 1
healthy_services = sum(1 for service in services if service["state"].get("health") == "healthy")
return {
"status": "ok",
"host_count": len(hosts),
"online_host_count": online,
"stale_host_count": stale,
"service_count": len(services),
"healthy_service_count": healthy_services,
}
@staticmethod
def _offload_hint(*, operation: str, loaded_count: int, best_latency_ms: float | None) -> dict:
if loaded_count <= 0:
suitability = "cold_only"
elif best_latency_ms is not None and best_latency_ms <= 1500:
suitability = "good_for_low_complexity"
elif best_latency_ms is not None and best_latency_ms <= 4000:
suitability = "usable_for_background_tasks"
else:
suitability = "available_but_slow"
return {
"operation": operation,
"suitability": suitability,
"recommended_for": "lower-complexity offload" if operation == "chat" else f"{operation} offload",
"inference_basis": {
"loaded_target_count": loaded_count,
"best_p50_latency_ms": best_latency_ms,
},
}
def _service_metadata(self, service: dict) -> dict:
lat = service["observed"].get("p50_latency_ms")
loaded_count = 1 if any(asset.get("loaded") for asset in service["assets"]) else 0
return {
"route_type": "service",
"service_id": service["service_id"],
"host_id": service["host_id"],
"operation": service["kind"],
"protocol": service["protocol"],
"endpoint": service["endpoint"],
"health": service["state"].get("health"),
"loaded_asset_count": loaded_count,
"assets": service["assets"],
"runtime": service["runtime"],
"observed": service["observed"],
"offload_hint": self._offload_hint(
operation=service["kind"],
loaded_count=loaded_count,
best_latency_ms=lat,
),
}
@staticmethod
def _host_row_to_dict(row: sqlite3.Row) -> dict:
return {
"host_id": row["host_id"],
"display_name": row["display_name"],
"address": row["address"],
"labels": json.loads(row["labels_json"]),
"capabilities": json.loads(row["capabilities_json"]),
"resources": json.loads(row["resources_json"]),
"status": {
"state": row["status_state"],
"last_seen": row["last_seen"],
},
"metrics": json.loads(row["metrics_json"]),
}
@staticmethod
def _service_row_to_dict(row: sqlite3.Row) -> dict:
return {
"service_id": row["service_id"],
"host_id": row["host_id"],
"kind": row["kind"],
"protocol": row["protocol"],
"endpoint": row["endpoint"],
"runtime": json.loads(row["runtime_json"]),
"assets": json.loads(row["assets_json"]),
"state": json.loads(row["state_json"]),
"observed": json.loads(row["observed_json"]),
"updated_at": row["updated_at"],
}
@staticmethod
def _role_row_to_dict(row: sqlite3.Row) -> dict:
return {
"role_id": row["role_id"],
"display_name": row["display_name"],
"description": row["description"],
"operation": row["operation"],
"modality": row["modality"],
"prompt_policy": json.loads(row["prompt_policy_json"]),
"routing_policy": json.loads(row["routing_policy_json"]),
"updated_at": row["updated_at"],
}
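A compact sketch of the role-routing preference implemented above: given two healthy chat services, resolve_route picks the one whose asset is loaded and whose observed latency is lower. Ids and the sqlite path are example values.
# Sketch: the loaded, lower-latency service wins the role route.
from geniehive_control.models import HostRegistration, RegisteredService, RoleProfile
from geniehive_control.registry import Registry

registry = Registry("state/demo.sqlite3")  # example path; creates state/ if missing
registry.register_host(HostRegistration(
    host_id="atlas-01",
    address="192.168.1.101",
    services=[
        RegisteredService(
            service_id="atlas-01/chat/loaded", host_id="atlas-01", kind="chat",
            endpoint="http://192.168.1.101:18091",
            assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
            state={"health": "healthy"}, observed={"p50_latency_ms": 900},
        ),
        RegisteredService(
            service_id="atlas-01/chat/cold", host_id="atlas-01", kind="chat",
            endpoint="http://192.168.1.101:18092",
            assets=[{"asset_id": "qwen3-8b-q4km", "loaded": False}],
            state={"health": "healthy"}, observed={},
        ),
    ],
))
registry.upsert_roles([RoleProfile(role_id="mentor", operation="chat", modality="text")])
resolved = registry.resolve_route("mentor")
assert resolved["match_type"] == "role"
assert resolved["service"]["service_id"] == "atlas-01/chat/loaded"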

View File

@ -0,0 +1,14 @@
from __future__ import annotations
from pathlib import Path
import yaml
from .models import RoleCatalog
def load_role_catalog(path: str | Path) -> RoleCatalog:
raw = yaml.safe_load(Path(path).read_text()) or {}
if not isinstance(raw, dict):
raise ValueError("Role catalog must be a YAML mapping.")
return RoleCatalog.model_validate(raw)

View File

@ -0,0 +1,17 @@
from __future__ import annotations
from typing import Any
def choose_upstream_model_id(requested_model: str, service: dict[str, Any]) -> str:
assets = service.get("assets", [])
asset_ids = [asset.get("asset_id") for asset in assets if asset.get("asset_id")]
if requested_model in asset_ids:
return requested_model
loaded_assets = [asset.get("asset_id") for asset in assets if asset.get("loaded") and asset.get("asset_id")]
if loaded_assets:
return loaded_assets[0]
if asset_ids:
return asset_ids[0]
return requested_model
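Illustrating the selection order above: an exact asset match wins, then the first loaded asset, then the first known asset, otherwise the requested id passes through unchanged.
# Sketch of choose_upstream_model_id with example asset ids.
from geniehive_control.routing import choose_upstream_model_id

service = {
    "assets": [
        {"asset_id": "qwen3-8b-q4km", "loaded": True},
        {"asset_id": "qwen3-14b-q4km", "loaded": False},
    ]
}
assert choose_upstream_model_id("qwen3-14b-q4km", service) == "qwen3-14b-q4km"  # exact match
assert choose_upstream_model_id("mentor", service) == "qwen3-8b-q4km"  # first loaded asset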

View File

@ -0,0 +1,68 @@
from __future__ import annotations
from typing import Any, Protocol
import httpx
class UpstreamError(RuntimeError):
def __init__(self, message: str, *, status_code: int | None = None) -> None:
super().__init__(message)
self.status_code = status_code
class AsyncPoster(Protocol):
async def post(self, url: str, *, json: dict[str, Any], headers: dict[str, str] | None = None) -> object:
...
class UpstreamClient:
def __init__(self, client: AsyncPoster | None = None) -> None:
self._owns_client = client is None
self._client = client or httpx.AsyncClient(
timeout=httpx.Timeout(connect=10.0, read=600.0, write=60.0, pool=60.0)
)
async def chat_completions(
self,
base_url: str,
body: dict[str, Any],
*,
headers: dict[str, str] | None = None,
) -> Any:
url = base_url.rstrip("/") + "/v1/chat/completions"
response = await self._client.post(url, json=body, headers=headers)
status_code = getattr(response, "status_code", 200)
if status_code >= 400:
text = getattr(response, "text", "")
raise UpstreamError(
text or f"upstream error from {url}",
status_code=status_code,
)
if hasattr(response, "json"):
return response.json()
return response
async def embeddings(
self,
base_url: str,
body: dict[str, Any],
*,
headers: dict[str, str] | None = None,
) -> Any:
url = base_url.rstrip("/") + "/v1/embeddings"
response = await self._client.post(url, json=body, headers=headers)
status_code = getattr(response, "status_code", 200)
if status_code >= 400:
text = getattr(response, "text", "")
raise UpstreamError(
text or f"upstream error from {url}",
status_code=status_code,
)
if hasattr(response, "json"):
return response.json()
return response
async def aclose(self) -> None:
if self._owns_client and isinstance(self._client, httpx.AsyncClient):
await self._client.aclose()

View File

@ -0,0 +1,2 @@
"""GenieHive node-agent package."""

View File

@ -0,0 +1,68 @@
from __future__ import annotations
from pathlib import Path
from typing import Literal
import yaml
from pydantic import BaseModel, Field
ServiceKind = Literal["chat", "embeddings", "transcription"]
class NodeConfigBlock(BaseModel):
host_id: str = "node-1"
display_name: str | None = None
listen_host: str = "127.0.0.1"
listen_port: int = 8891
address: str | None = None
labels: dict[str, str] = Field(default_factory=dict)
class ControlPlaneConfig(BaseModel):
base_url: str | None = None
node_api_key: str | None = None
heartbeat_interval_s: float = 5.0
class InventoryConfig(BaseModel):
model_roots: list[str] = Field(default_factory=list)
cpu_threads: int | None = None
ram_gb: float | None = None
capabilities: dict[str, bool] = Field(default_factory=dict)
class ManagedRuntimesConfig(BaseModel):
enabled: bool = False
llama_server_bin: str | None = None
class NodeServiceAssetConfig(BaseModel):
asset_id: str
loaded: bool = False
class NodeServiceConfig(BaseModel):
service_id: str
kind: ServiceKind
protocol: str = "openai"
endpoint: str | None = None
runtime: dict[str, str] = Field(default_factory=dict)
assets: list[NodeServiceAssetConfig] = Field(default_factory=list)
state: dict[str, object] = Field(default_factory=dict)
observed: dict[str, object] = Field(default_factory=dict)
class NodeConfig(BaseModel):
node: NodeConfigBlock = Field(default_factory=NodeConfigBlock)
control_plane: ControlPlaneConfig = Field(default_factory=ControlPlaneConfig)
inventory: InventoryConfig = Field(default_factory=InventoryConfig)
managed_runtimes: ManagedRuntimesConfig = Field(default_factory=ManagedRuntimesConfig)
services: list[NodeServiceConfig] = Field(default_factory=list)
def load_config(path: str | Path) -> NodeConfig:
raw = yaml.safe_load(Path(path).read_text()) or {}
if not isinstance(raw, dict):
raise ValueError("Node config must be a YAML mapping.")
return NodeConfig.model_validate(raw)

View File

@ -0,0 +1,85 @@
from __future__ import annotations
from pathlib import Path
import time
from .config import NodeConfig
from .models import NodeInventory
def discover_model_files(roots: list[str]) -> list[dict[str, object]]:
discovered: list[dict[str, object]] = []
for root in roots:
path = Path(root)
if not path.exists():
continue
for model_path in sorted(path.rglob("*.gguf")):
discovered.append(
{
"path": str(model_path),
"name": model_path.name,
"size_bytes": model_path.stat().st_size,
}
)
return discovered
def build_inventory(cfg: NodeConfig) -> NodeInventory:
address = cfg.node.address or cfg.node.listen_host
resources: dict[str, object] = {}
if cfg.inventory.cpu_threads is not None:
resources["cpu_threads"] = cfg.inventory.cpu_threads
if cfg.inventory.ram_gb is not None:
resources["ram_gb"] = cfg.inventory.ram_gb
resources["discovered_models"] = discover_model_files(cfg.inventory.model_roots)
services: list[dict] = []
for service in cfg.services:
endpoint = service.endpoint or f"http://{cfg.node.listen_host}:{cfg.node.listen_port}"
services.append(
{
"service_id": service.service_id,
"host_id": cfg.node.host_id,
"kind": service.kind,
"protocol": service.protocol,
"endpoint": endpoint,
"runtime": service.runtime,
"assets": [asset.model_dump() for asset in service.assets],
"state": service.state,
"observed": service.observed,
}
)
return NodeInventory(
host_id=cfg.node.host_id,
display_name=cfg.node.display_name,
address=address,
labels=cfg.node.labels,
capabilities=cfg.inventory.capabilities,
resources=resources,
services=services,
)
def build_registration_payload(cfg: NodeConfig) -> dict:
inventory = build_inventory(cfg)
return inventory.model_dump()
def build_heartbeat_payload(cfg: NodeConfig) -> dict:
inventory = build_inventory(cfg)
healthy_service_count = sum(
1 for service in inventory.services if service.get("state", {}).get("health") == "healthy"
)
return {
"host_id": inventory.host_id,
"status": {
"state": "online",
"last_seen": time.time(),
},
"metrics": {
"service_count": len(inventory.services),
"healthy_service_count": healthy_service_count,
"discovered_model_count": len(inventory.resources.get("discovered_models", [])),
},
}
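A minimal sketch of the payload shapes, using an in-memory NodeConfig with defaults instead of a YAML file:
# Sketch: defaults give host_id "node-1", 127.0.0.1:8891, and no services.
from geniehive_node.config import NodeConfig
from geniehive_node.inventory import build_heartbeat_payload, build_registration_payload

cfg = NodeConfig()
registration = build_registration_payload(cfg)
heartbeat = build_heartbeat_payload(cfg)
assert registration["host_id"] == "node-1"
assert registration["address"] == "127.0.0.1"
assert heartbeat["metrics"]["service_count"] == 0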

View File

@ -0,0 +1,62 @@
from __future__ import annotations
import asyncio
from contextlib import asynccontextmanager, suppress
import os
from pathlib import Path
from fastapi import FastAPI
from .config import NodeConfig, load_config
from .inventory import build_inventory, build_registration_payload
from .sync import ControlPlaneClient
def create_app(
config_path: str | Path | None = None,
*,
sync_enabled: bool = True,
control_client: ControlPlaneClient | None = None,
) -> FastAPI:
cfg_path = config_path or os.environ.get("GENIEHIVE_NODE_CONFIG")
cfg = load_config(cfg_path) if cfg_path else NodeConfig()
sync_client = control_client or ControlPlaneClient(cfg)
@asynccontextmanager
async def lifespan(app: FastAPI):
heartbeat_task: asyncio.Task[None] | None = None
stop_event = asyncio.Event()
if sync_enabled and sync_client.enabled:
with suppress(Exception):
await sync_client.register_once()
heartbeat_task = asyncio.create_task(sync_client.heartbeat_loop(stop_event))
try:
yield
finally:
if heartbeat_task is not None:
stop_event.set()
heartbeat_task.cancel()
with suppress(asyncio.CancelledError):
await heartbeat_task
await sync_client.aclose()
app = FastAPI(title="GenieHive Node", version="0.1.0", lifespan=lifespan)
app.state.cfg = cfg
app.state.control_client = sync_client
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.get("/v1/node/inventory")
async def inventory() -> dict:
return build_inventory(cfg).model_dump()
@app.get("/v1/node/registration")
async def registration() -> dict:
return build_registration_payload(cfg)
return app
app = create_app()
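A quick sketch running the node app without control-plane sync, assuming GENIEHIVE_NODE_CONFIG is unset so the defaults apply:
# Sketch: no sync, default config; the lifespan runs inside the TestClient context.
from fastapi.testclient import TestClient
from geniehive_node.main import create_app

with TestClient(create_app(sync_enabled=False)) as client:
    assert client.get("/health").json() == {"status": "ok"}
    inv = client.get("/v1/node/inventory").json()
    assert inv["host_id"] == "node-1"  # default host_id from NodeConfig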

View File

@ -0,0 +1,14 @@
from __future__ import annotations
from pydantic import BaseModel, Field
class NodeInventory(BaseModel):
host_id: str
display_name: str | None = None
address: str
labels: dict[str, str] = Field(default_factory=dict)
capabilities: dict[str, bool] = Field(default_factory=dict)
resources: dict[str, object] = Field(default_factory=dict)
services: list[dict] = Field(default_factory=list)

View File

@ -0,0 +1,84 @@
from __future__ import annotations
import asyncio
from contextlib import suppress
from typing import Protocol
import httpx
from .config import NodeConfig
from .inventory import build_heartbeat_payload, build_registration_payload
class AsyncPoster(Protocol):
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> object:
...
class ControlPlaneClient:
def __init__(self, cfg: NodeConfig, http_client: AsyncPoster | None = None) -> None:
self.cfg = cfg
self._owns_client = http_client is None
self._registered = False
self._http = http_client or httpx.AsyncClient(
timeout=httpx.Timeout(connect=5.0, read=30.0, write=30.0, pool=30.0)
)
@property
def enabled(self) -> bool:
return bool(self.cfg.control_plane.base_url)
def _headers(self) -> dict[str, str]:
headers: dict[str, str] = {}
if self.cfg.control_plane.node_api_key:
headers["X-GenieHive-Node-Key"] = self.cfg.control_plane.node_api_key
return headers
async def register_once(self) -> None:
if not self.enabled:
return
url = str(self.cfg.control_plane.base_url).rstrip("/") + "/v1/nodes/register"
response = await self._http.post(
url,
json=build_registration_payload(self.cfg),
headers=self._headers(),
)
if isinstance(response, httpx.Response):
response.raise_for_status()
self._registered = True
async def heartbeat_once(self) -> None:
if not self.enabled:
return
if not self._registered:
await self.register_once()
url = str(self.cfg.control_plane.base_url).rstrip("/") + "/v1/nodes/heartbeat"
response = await self._http.post(
url,
json=build_heartbeat_payload(self.cfg),
headers=self._headers(),
)
if isinstance(response, httpx.Response):
if response.status_code == 404:
self._registered = False
await self.register_once()
response = await self._http.post(
url,
json=build_heartbeat_payload(self.cfg),
headers=self._headers(),
)
response.raise_for_status()
async def heartbeat_loop(self, stop_event: asyncio.Event) -> None:
interval = max(self.cfg.control_plane.heartbeat_interval_s, 0.1)
while not stop_event.is_set():
with suppress(Exception):
await self.heartbeat_once()
try:
await asyncio.wait_for(stop_event.wait(), timeout=interval)
except asyncio.TimeoutError:
continue
async def aclose(self) -> None:
if self._owns_client and isinstance(self._http, httpx.AsyncClient):
await self._http.aclose()

9
tests/conftest.py Normal file
View File

@ -0,0 +1,9 @@
from pathlib import Path
import sys
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))

224
tests/test_control_chat.py Normal file
View File

@ -0,0 +1,224 @@
import asyncio
from pathlib import Path
from geniehive_control.chat import ProxyError, proxy_chat_completion, proxy_embeddings
from geniehive_control.models import HostRegistration, RegisteredService, RoleProfile
from geniehive_control.registry import Registry
from geniehive_control.upstream import UpstreamClient
class _FakeResponse:
def __init__(self, payload: dict, status_code: int = 200) -> None:
self._payload = payload
self.status_code = status_code
self.text = str(payload)
def json(self) -> dict:
return self._payload
class _FakePoster:
def __init__(self) -> None:
self.calls: list[dict] = []
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> _FakeResponse:
self.calls.append({"url": url, "json": json, "headers": headers or {}})
return _FakeResponse({"ok": True, "echo_model": json["model"]})
def _build_registry(tmp_path: Path) -> Registry:
registry = Registry(tmp_path / "geniehive.sqlite3")
registry.register_host(
HostRegistration(
host_id="atlas-01",
address="192.168.1.101",
services=[
RegisteredService(
service_id="atlas-01/chat/qwen3-8b",
host_id="atlas-01",
kind="chat",
endpoint="http://192.168.1.101:18091",
assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 900},
),
RegisteredService(
service_id="atlas-01/embeddings/bge-small",
host_id="atlas-01",
kind="embeddings",
endpoint="http://192.168.1.101:18092",
assets=[{"asset_id": "bge-small-en", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 120},
)
],
)
)
registry.upsert_roles(
[
RoleProfile(
role_id="mentor",
display_name="Mentor",
operation="chat",
modality="text",
routing_policy={"preferred_families": ["qwen3"]},
),
RoleProfile(
role_id="embedder",
display_name="Embedder",
operation="embeddings",
modality="text",
routing_policy={"require_loaded": True},
)
]
)
return registry
def test_proxy_chat_completion_rewrites_role_to_loaded_asset(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
fake = _FakePoster()
upstream = UpstreamClient(client=fake)
async def run() -> dict:
return await proxy_chat_completion(
{
"model": "mentor",
"messages": [{"role": "user", "content": "hello"}],
},
registry=registry,
upstream=upstream,
)
result = asyncio.run(run())
assert result["ok"] is True
assert result["echo_model"] == "qwen3-8b-q4km"
assert fake.calls[0]["url"] == "http://192.168.1.101:18091/v1/chat/completions"
assert fake.calls[0]["json"]["model"] == "qwen3-8b-q4km"
def test_proxy_chat_completion_preserves_direct_asset_match(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
fake = _FakePoster()
upstream = UpstreamClient(client=fake)
async def run() -> dict:
return await proxy_chat_completion(
{
"model": "qwen3-8b-q4km",
"messages": [{"role": "user", "content": "hello"}],
},
registry=registry,
upstream=upstream,
)
result = asyncio.run(run())
assert result["echo_model"] == "qwen3-8b-q4km"
def test_proxy_chat_completion_strips_reasoning_fields(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
class _ReasoningPoster:
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> _FakeResponse:
return _FakeResponse(
{
"object": "chat.completion",
"model": json["model"],
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "GPU1 route is live.",
"reasoning_content": "hidden chain of thought",
},
"reasoning": {"tokens": 42},
}
],
}
)
upstream = UpstreamClient(client=_ReasoningPoster())
async def run() -> dict:
return await proxy_chat_completion(
{
"model": "mentor",
"messages": [{"role": "user", "content": "hello"}],
},
registry=registry,
upstream=upstream,
)
result = asyncio.run(run())
choice = result["choices"][0]
assert choice["message"]["content"] == "GPU1 route is live."
assert "reasoning_content" not in choice["message"]
assert "reasoning" not in choice
def test_proxy_chat_completion_fails_for_unknown_model(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
upstream = UpstreamClient(client=_FakePoster())
async def run() -> None:
await proxy_chat_completion(
{
"model": "unknown-model",
"messages": [{"role": "user", "content": "hello"}],
},
registry=registry,
upstream=upstream,
)
try:
asyncio.run(run())
except ProxyError as exc:
assert exc.status_code == 404
else:
raise AssertionError("expected ChatProxyError")
def test_proxy_embeddings_rewrites_role_to_loaded_asset(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
fake = _FakePoster()
upstream = UpstreamClient(client=fake)
async def run() -> dict:
return await proxy_embeddings(
{
"model": "embedder",
"input": "hello",
},
registry=registry,
upstream=upstream,
)
result = asyncio.run(run())
assert result["ok"] is True
assert result["echo_model"] == "bge-small-en"
assert fake.calls[0]["url"] == "http://192.168.1.101:18092/v1/embeddings"
assert fake.calls[0]["json"]["model"] == "bge-small-en"
def test_proxy_embeddings_fails_for_unknown_model(tmp_path: Path) -> None:
registry = _build_registry(tmp_path)
upstream = UpstreamClient(client=_FakePoster())
async def run() -> None:
await proxy_embeddings(
{
"model": "unknown-embedder",
"input": "hello",
},
registry=registry,
upstream=upstream,
)
try:
asyncio.run(run())
except ProxyError as exc:
assert exc.status_code == 404
else:
raise AssertionError("expected ProxyError")

View File

@ -0,0 +1,152 @@
from pathlib import Path
from geniehive_control.main import create_app
from geniehive_control.models import HostHeartbeat, HostRegistration, RegisteredService, RoleProfile
from geniehive_control.registry import Registry
def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
db_path = tmp_path / "geniehive.sqlite3"
registry = Registry(db_path)
host = registry.register_host(
HostRegistration(
host_id="atlas-01",
display_name="Atlas GPU Box",
address="192.168.1.101",
labels={"site": "home-lab"},
capabilities={"cuda": True},
resources={"cpu_threads": 24},
services=[
RegisteredService(
service_id="atlas-01/chat/qwen3-8b",
host_id="atlas-01",
kind="chat",
protocol="openai",
endpoint="http://192.168.1.101:18091",
runtime={"engine": "llama.cpp", "launcher": "managed"},
assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 900, "tokens_per_sec": 40},
)
],
)
)
assert host is not None
assert host["host_id"] == "atlas-01"
updated = registry.heartbeat_host(
HostHeartbeat(
host_id="atlas-01",
status={"state": "online"},
metrics={"gpu_utilization_pct": 77},
)
)
assert updated is not None
assert updated["metrics"]["gpu_utilization_pct"] == 77
hosts = registry.list_hosts()
services = registry.list_services()
health = registry.cluster_health(stale_after_s=30)
assert len(hosts) == 1
assert len(services) == 1
assert services[0]["service_id"] == "atlas-01/chat/qwen3-8b"
assert services[0]["state"]["health"] == "healthy"
assert health["host_count"] == 1
assert health["healthy_service_count"] == 1
def test_registry_persists_roles_and_resolves_direct_and_role_routes(tmp_path: Path) -> None:
db_path = tmp_path / "geniehive.sqlite3"
registry = Registry(db_path)
registry.register_host(
HostRegistration(
host_id="atlas-01",
address="192.168.1.101",
services=[
RegisteredService(
service_id="atlas-01/chat/qwen3-8b",
host_id="atlas-01",
kind="chat",
endpoint="http://192.168.1.101:18091",
assets=[{"asset_id": "qwen3-8b-q4km", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 900},
),
RegisteredService(
service_id="atlas-01/embeddings/bge-small",
host_id="atlas-01",
kind="embeddings",
endpoint="http://192.168.1.101:18092",
assets=[{"asset_id": "bge-small-en", "loaded": True}],
state={"health": "healthy", "load_state": "loaded", "accept_requests": True},
observed={"p50_latency_ms": 120},
),
],
)
)
registry.upsert_roles(
[
RoleProfile(
role_id="mentor",
display_name="Mentor",
operation="chat",
modality="text",
routing_policy={"preferred_families": ["qwen3"]},
),
RoleProfile(
role_id="embedder",
display_name="Embedder",
operation="embeddings",
modality="text",
routing_policy={"require_loaded": True},
),
]
)
roles = registry.list_roles()
assert len(roles) == 2
assert roles[0]["role_id"] == "embedder"
direct = registry.resolve_route("qwen3-8b-q4km")
assert direct is not None
assert direct["match_type"] == "direct"
assert direct["service"]["service_id"] == "atlas-01/chat/qwen3-8b"
by_role = registry.resolve_route("mentor")
assert by_role is not None
assert by_role["match_type"] == "role"
assert by_role["role"]["role_id"] == "mentor"
assert by_role["service"]["service_id"] == "atlas-01/chat/qwen3-8b"
embed_role = registry.resolve_route("embedder")
assert embed_role is not None
assert embed_role["service"]["service_id"] == "atlas-01/embeddings/bge-small"
models = registry.list_client_models()
ids = {item["id"] for item in models}
assert "atlas-01/chat/qwen3-8b" in ids
assert "qwen3-8b-q4km" in ids
assert "mentor" in ids
mentor = next(item for item in models if item["id"] == "mentor")
assert mentor["geniehive"]["route_type"] == "role"
assert mentor["geniehive"]["offload_hint"]["suitability"] == "good_for_low_complexity"
asset = next(item for item in models if item["id"] == "qwen3-8b-q4km")
assert asset["geniehive"]["route_type"] == "asset"
assert asset["geniehive"]["offload_hint"]["recommended_for"] == "lower-complexity offload"
def test_control_app_exposes_expected_routes() -> None:
app = create_app()
paths = {route.path for route in app.routes}
assert "/health" in paths
assert "/v1/models" in paths
assert "/v1/nodes/register" in paths
assert "/v1/nodes/heartbeat" in paths
assert "/v1/cluster/hosts" in paths
assert "/v1/cluster/services" in paths
assert "/v1/cluster/roles" in paths
assert "/v1/cluster/health" in paths
assert "/v1/cluster/routes/resolve" in paths

104
tests/test_demo_flow.py Normal file
View File

@ -0,0 +1,104 @@
from pathlib import Path
from geniehive_control.main import create_app as create_control_app
from geniehive_control.models import HostHeartbeat, HostRegistration
from geniehive_node.config import load_config as load_node_config
from geniehive_node.inventory import build_heartbeat_payload, build_registration_payload
def _write_demo_files(tmp_path: Path) -> tuple[Path, Path, Path]:
models_dir = tmp_path / "models"
models_dir.mkdir()
(models_dir / "qwen3-demo.gguf").write_bytes(b"demo")
roles_path = tmp_path / "roles.yaml"
roles_path.write_text(
"\n".join(
[
"roles:",
' - role_id: "mentor"',
' display_name: "Mentor"',
' operation: "chat"',
' modality: "text"',
" routing_policy:",
' preferred_families: ["qwen3"]',
]
)
)
control_path = tmp_path / "control.yaml"
control_path.write_text(
"\n".join(
[
"auth:",
" client_api_keys:",
' - "client-key"',
" node_api_keys:",
' - "node-key"',
"storage:",
f' sqlite_path: "{tmp_path / "state.sqlite3"}"',
f'roles_path: "{roles_path}"',
]
)
)
node_path = tmp_path / "node.yaml"
node_path.write_text(
"\n".join(
[
"node:",
' host_id: "atlas-01"',
' display_name: "Atlas GPU Box"',
' listen_host: "127.0.0.1"',
" listen_port: 8891",
' address: "192.168.1.101"',
"control_plane:",
' base_url: "http://127.0.0.1:8800"',
' node_api_key: "node-key"',
"inventory:",
f' model_roots:\n - "{models_dir}"',
" capabilities:",
" cuda: true",
"services:",
' - service_id: "atlas-01/chat/qwen3-8b"',
' kind: "chat"',
' endpoint: "http://127.0.0.1:18091"',
" assets:",
' - asset_id: "qwen3-8b-q4km"',
" loaded: true",
" state:",
' health: "healthy"',
' load_state: "loaded"',
" accept_requests: true",
" observed:",
" p50_latency_ms: 900",
]
)
)
return control_path, node_path, roles_path
def test_demo_flow_registers_node_and_resolves_role(tmp_path: Path) -> None:
control_path, node_path, _ = _write_demo_files(tmp_path)
control_app = create_control_app(control_path)
registry = control_app.state.registry
node_cfg = load_node_config(node_path)
registration = build_registration_payload(node_cfg)
heartbeat = build_heartbeat_payload(node_cfg)
host = registry.register_host(HostRegistration.model_validate(registration))
assert host["host_id"] == "atlas-01"
updated = registry.heartbeat_host(HostHeartbeat.model_validate(heartbeat))
assert updated is not None
assert updated["metrics"]["service_count"] == 1
roles = registry.list_roles()
assert len(roles) == 1
assert roles[0]["role_id"] == "mentor"
resolved = registry.resolve_route("mentor")
assert resolved is not None
assert resolved["match_type"] == "role"
assert resolved["service"]["service_id"] == "atlas-01/chat/qwen3-8b"

View File

@ -0,0 +1,108 @@
import asyncio
from pathlib import Path
from geniehive_node.config import load_config
from geniehive_node.inventory import build_heartbeat_payload, build_inventory, build_registration_payload
from geniehive_node.main import create_app
from geniehive_node.sync import ControlPlaneClient
def _write_node_config(tmp_path: Path) -> Path:
models_dir = tmp_path / "models"
models_dir.mkdir()
(models_dir / "demo.gguf").write_bytes(b"gguf-demo")
cfg_path = tmp_path / "node.yaml"
cfg_path.write_text(
"\n".join(
[
"node:",
' host_id: "atlas-01"',
' display_name: "Atlas GPU Box"',
' listen_host: "127.0.0.1"',
" listen_port: 8891",
' address: "192.168.1.101"',
" labels:",
' site: "home-lab"',
"inventory:",
f' model_roots:\n - "{models_dir}"',
" cpu_threads: 24",
" ram_gb: 128",
" capabilities:",
" cuda: true",
"services:",
' - service_id: "atlas-01/chat/qwen3-8b"',
' kind: "chat"',
' endpoint: "http://127.0.0.1:18091"',
" runtime:",
' engine: "llama.cpp"',
' launcher: "managed"',
" assets:",
' - asset_id: "qwen3-8b-q4km"',
" loaded: true",
" state:",
' health: "healthy"',
' load_state: "loaded"',
" accept_requests: true",
]
)
)
return cfg_path
def test_build_inventory_and_registration_payload(tmp_path: Path) -> None:
cfg = load_config(_write_node_config(tmp_path))
inventory = build_inventory(cfg)
payload = build_registration_payload(cfg)
heartbeat = build_heartbeat_payload(cfg)
assert inventory.host_id == "atlas-01"
assert inventory.address == "192.168.1.101"
assert inventory.capabilities["cuda"] is True
assert inventory.resources["cpu_threads"] == 24
assert len(inventory.resources["discovered_models"]) == 1
assert inventory.services[0]["host_id"] == "atlas-01"
assert inventory.services[0]["service_id"] == "atlas-01/chat/qwen3-8b"
assert payload["services"][0]["kind"] == "chat"
assert heartbeat["host_id"] == "atlas-01"
assert heartbeat["metrics"]["service_count"] == 1
assert heartbeat["metrics"]["healthy_service_count"] == 1
def test_node_app_exposes_inventory_routes(tmp_path: Path) -> None:
app = create_app(_write_node_config(tmp_path), sync_enabled=False)
paths = {route.path for route in app.routes}
assert "/health" in paths
assert "/v1/node/inventory" in paths
assert "/v1/node/registration" in paths
class _FakePoster:
def __init__(self) -> None:
self.calls: list[dict] = []
async def post(self, url: str, *, json: dict, headers: dict[str, str] | None = None) -> object:
self.calls.append({"url": url, "json": json, "headers": headers or {}})
return object()
def test_control_plane_client_posts_register_and_heartbeat(tmp_path: Path) -> None:
cfg_path = _write_node_config(tmp_path)
cfg = load_config(cfg_path)
cfg.control_plane.base_url = "http://127.0.0.1:8800"
cfg.control_plane.node_api_key = "node-key"
fake = _FakePoster()
client = ControlPlaneClient(cfg, http_client=fake)
async def run() -> None:
await client.register_once()
await client.heartbeat_once()
asyncio.run(run())
assert len(fake.calls) == 2
assert fake.calls[0]["url"] == "http://127.0.0.1:8800/v1/nodes/register"
assert fake.calls[0]["headers"]["X-GenieHive-Node-Key"] == "node-key"
assert fake.calls[0]["json"]["host_id"] == "atlas-01"
assert fake.calls[1]["url"] == "http://127.0.0.1:8800/v1/nodes/heartbeat"
assert fake.calls[1]["json"]["metrics"]["service_count"] == 1

10
tests/test_smoke.py Normal file
View File

@ -0,0 +1,10 @@
from geniehive_control.main import create_app as create_control_app
from geniehive_node.main import create_app as create_node_app
def test_control_app_title() -> None:
assert create_control_app().title == "GenieHive Control"
def test_node_app_title() -> None:
assert create_node_app().title == "GenieHive Node"