From b4e5a1af7d854ee3b5977f7da08ab4880c9d3803 Mon Sep 17 00:00:00 2001 From: welberr Date: Mon, 27 Apr 2026 10:45:04 -0400 Subject: [PATCH] P0: remove dead default_strategy field; fix benchmark quality score MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove RoutingConfig.default_strategy: the field was never read by resolve_route() or any other code path, creating a false impression that routing behaviour was configurable. Also removed from all three example config files. Fix _benchmark_quality_score: the previous implementation used max() for correctness signals and then *added* speed bonuses on top, allowing the score to accumulate past 1.0 before the final clamp. Speed bonuses were therefore partly or wholly discarded by that clamp once correctness was high: with pass_rate or quality_score ≥ 0.65, a maxed tokens/sec bonus (0.35) already reached 1.0, so the TTFT bonus contributed nothing. Replace with an explicit weighted average: correctness (pass_rate / quality_score) carries 0.65 and a normalised speed component carries 0.35. When no correctness signal is available the speed component carries full weight. Because each component is individually capped at 1.0, the weighted result stays at or below 1.0 without a final clamp; it is non-negative as long as tokens_per_sec is non-negative. Add test_benchmark_quality_score_stays_bounded_and_weighted to lock in the corrected behaviour: bounded at 1.0, correctness-dominant, speed-only case non-zero, empty input zero, speed bonus never hurts. 
Co-Authored-By: Claude Sonnet 4.6 --- configs/control.example.yaml | 1 - configs/control.singlebox.example.yaml | 1 - configs/control.singlebox.p40.example.yaml | 1 - src/geniehive_control/config.py | 1 - src/geniehive_control/registry.py | 43 +++++++++++++++++----- tests/test_control_registry.py | 27 +++++++++++++- 6 files changed, 60 insertions(+), 14 deletions(-) diff --git a/configs/control.example.yaml b/configs/control.example.yaml index 100122c..0292ae4 100644 --- a/configs/control.example.yaml +++ b/configs/control.example.yaml @@ -14,5 +14,4 @@ storage: roles_path: "configs/roles.example.yaml" routing: - default_strategy: "loaded_first" health_stale_after_s: 30 diff --git a/configs/control.singlebox.example.yaml b/configs/control.singlebox.example.yaml index 8b59174..fac73d0 100644 --- a/configs/control.singlebox.example.yaml +++ b/configs/control.singlebox.example.yaml @@ -14,5 +14,4 @@ storage: roles_path: "configs/roles.example.yaml" routing: - default_strategy: "loaded_first" health_stale_after_s: 30 diff --git a/configs/control.singlebox.p40.example.yaml b/configs/control.singlebox.p40.example.yaml index b4e0850..18012a5 100644 --- a/configs/control.singlebox.p40.example.yaml +++ b/configs/control.singlebox.p40.example.yaml @@ -14,5 +14,4 @@ storage: roles_path: "configs/roles.singlebox.p40.example.yaml" routing: - default_strategy: "loaded_first" health_stale_after_s: 30 diff --git a/src/geniehive_control/config.py b/src/geniehive_control/config.py index 85d55e6..68c0557 100644 --- a/src/geniehive_control/config.py +++ b/src/geniehive_control/config.py @@ -21,7 +21,6 @@ class StorageConfig(BaseModel): class RoutingConfig(BaseModel): - default_strategy: str = "loaded_first" health_stale_after_s: float = 30.0 diff --git a/src/geniehive_control/registry.py b/src/geniehive_control/registry.py index a6ca72f..7254e5b 100644 --- a/src/geniehive_control/registry.py +++ b/src/geniehive_control/registry.py @@ -778,24 +778,49 @@ def _overlap_score(task_tokens: 
set[str], candidate_tokens: set[str]) -> float: def _benchmark_quality_score(results: dict) -> float: + """ + Combine correctness and speed signals into a [0, 1] quality score. + + Correctness (weight 0.65): pass_rate or quality_score from the workload run. + Speed (weight 0.35): tokens_per_sec and TTFT, each contributing up to 0.5 of + the speed component. + + When a correctness signal is absent the speed component carries the full weight + so that services with runtime data but no workload results still rank above + those with no data at all. + """ if not results: return 0.0 - quality = 0.0 + tokens_per_sec = results.get("tokens_per_sec") ttft_ms = results.get("ttft_ms") pass_rate = results.get("pass_rate") quality_score = results.get("quality_score") + + # Correctness component: best available signal in [0, 1]. + correctness: float | None = None if isinstance(quality_score, (int, float)): - quality = max(quality, max(0.0, min(1.0, float(quality_score)))) + correctness = max(0.0, min(1.0, float(quality_score))) if isinstance(pass_rate, (int, float)): - quality = max(quality, max(0.0, min(1.0, float(pass_rate)))) + pr = max(0.0, min(1.0, float(pass_rate))) + correctness = pr if correctness is None else max(correctness, pr) + + # Speed component: tokens/sec (up to 0.5) + TTFT bands (up to 0.5), in [0, 1]. + speed = 0.0 if isinstance(tokens_per_sec, (int, float)): - quality += min(0.35, float(tokens_per_sec) / 100.0) + speed += min(0.5, float(tokens_per_sec) / 80.0) if isinstance(ttft_ms, (int, float)): - if float(ttft_ms) <= 1000: - quality += 0.25 + if float(ttft_ms) <= 500: + speed += 0.50 + elif float(ttft_ms) <= 1000: + speed += 0.40 elif float(ttft_ms) <= 2500: - quality += 0.15 + speed += 0.25 else: - quality += 0.05 - return min(1.0, quality) + speed += 0.10 + speed = min(1.0, speed) + + if correctness is None: + # No correctness data — speed signal carries everything. 
+ return speed + return 0.65 * correctness + 0.35 * speed diff --git a/tests/test_control_registry.py b/tests/test_control_registry.py index 4ea3048..322ed56 100644 --- a/tests/test_control_registry.py +++ b/tests/test_control_registry.py @@ -2,7 +2,7 @@ from pathlib import Path from geniehive_control.main import create_app from geniehive_control.models import BenchmarkSample, HostHeartbeat, HostRegistration, RegisteredService, RoleProfile, RouteMatchRequest -from geniehive_control.registry import Registry +from geniehive_control.registry import Registry, _benchmark_quality_score def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None: @@ -366,3 +366,28 @@ def test_registry_exposes_asset_request_policy_in_model_metadata(tmp_path: Path) asset = next(item for item in models if item["id"] == "custom-model-v1") assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["temperature"] == 0.2 assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["chat_template_kwargs"]["custom_flag"] == "yes" + + +def test_benchmark_quality_score_stays_bounded_and_weighted() -> None: + # High correctness + fast speed must not exceed 1.0. + score = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 80, "ttft_ms": 400}) + assert score <= 1.0 + assert score > 0.9 # should be near 1.0 + + # Correctness dominates: high pass_rate with slow speed should still score well. + high_correct_slow = _benchmark_quality_score({"pass_rate": 0.95, "tokens_per_sec": 5, "ttft_ms": 4000}) + low_correct_fast = _benchmark_quality_score({"pass_rate": 0.3, "tokens_per_sec": 80, "ttft_ms": 400}) + assert high_correct_slow > low_correct_fast + + # Speed-only (no correctness signal) returns a non-zero score. + speed_only = _benchmark_quality_score({"tokens_per_sec": 40, "ttft_ms": 800}) + assert 0.0 < speed_only < 1.0 + + # Empty results return 0. 
+ assert _benchmark_quality_score({}) == 0.0 + + # No stacking: pass_rate=1.0 alone should not score above 1.0 when speed is added. + perfect_correct = _benchmark_quality_score({"pass_rate": 1.0}) + with_speed = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 100, "ttft_ms": 100}) + assert with_speed <= 1.0 + assert with_speed >= perfect_correct # speed can only help, not hurt