P0: remove dead default_strategy field; fix benchmark quality score

Remove RoutingConfig.default_strategy: the field was never read by resolve_route() or any other code path, creating a false impression that routing behaviour was configurable. It is also removed from all three example config files. Fix _benchmark_quality_score: the previous implementation used max() for the correctness signals and then *added* speed bonuses on top, allowing the score to accumulate past 1.0 before the final clamp. Speed bonuses were therefore partly or wholly discarded by the clamp whenever correctness was high: once pass_rate or quality_score reached 0.65, any bonus beyond 0.35 was lost, and at 1.0 the bonuses had no effect at all. Replace this with an explicit weighted average: correctness (the higher of pass_rate and quality_score) carries 0.65 and a normalised speed component carries 0.35. When no correctness signal is available the speed component carries the full weight. The final score stays in [0, 1] without a final clamp, because both components are individually bounded to [0, 1] (assuming a non-negative tokens_per_sec; the speed component is still capped at 1.0 internally). Add test_benchmark_quality_score_stays_bounded_and_weighted to lock in the corrected behaviour: the score is bounded at 1.0, correctness dominates, the speed-only case is non-zero, empty input yields zero, and a speed bonus never lowers the score. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 10:45:04 -04:00 · 2026-04-27 10:45:04 -04:00 · b4e5a1af7d
parent a76c7e81f4
commit b4e5a1af7d
6 changed files with 60 additions and 14 deletions
--- a/configs/control.example.yaml
+++ b/configs/control.example.yaml
@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.example.yaml"

 routing:
-  default_strategy: "loaded_first"
  health_stale_after_s: 30
--- a/configs/control.singlebox.example.yaml
+++ b/configs/control.singlebox.example.yaml
@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.example.yaml"

 routing:
-  default_strategy: "loaded_first"
  health_stale_after_s: 30
--- a/configs/control.singlebox.p40.example.yaml
+++ b/configs/control.singlebox.p40.example.yaml
@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.singlebox.p40.example.yaml"

 routing:
-  default_strategy: "loaded_first"
  health_stale_after_s: 30
--- a/src/geniehive_control/config.py
+++ b/src/geniehive_control/config.py
@ -21,7 +21,6 @@ class StorageConfig(BaseModel):


 class RoutingConfig(BaseModel):
-    default_strategy: str = "loaded_first"
    health_stale_after_s: float = 30.0


--- a/src/geniehive_control/registry.py
+++ b/src/geniehive_control/registry.py
@ -778,24 +778,49 @@ def _overlap_score(task_tokens: set[str], candidate_tokens: set[str]) -> float:


 def _benchmark_quality_score(results: dict) -> float:
+    """
+    Combine correctness and speed signals into a [0, 1] quality score.
+
+    Correctness (weight 0.65): pass_rate or quality_score from the workload run.
+    Speed (weight 0.35): tokens_per_sec and TTFT, each contributing up to 0.5 of
+    the speed component.
+
+    When a correctness signal is absent the speed component carries the full weight
+    so that services with runtime data but no workload results still rank above
+    those with no data at all.
+    """
    if not results:
        return 0.0
-    quality = 0.0
+
    tokens_per_sec = results.get("tokens_per_sec")
    ttft_ms = results.get("ttft_ms")
    pass_rate = results.get("pass_rate")
    quality_score = results.get("quality_score")
+
+    # Correctness component: best available signal in [0, 1].
+    correctness: float | None = None
    if isinstance(quality_score, (int, float)):
-        quality = max(quality, max(0.0, min(1.0, float(quality_score))))
+        correctness = max(0.0, min(1.0, float(quality_score)))
    if isinstance(pass_rate, (int, float)):
-        quality = max(quality, max(0.0, min(1.0, float(pass_rate))))
+        pr = max(0.0, min(1.0, float(pass_rate)))
+        correctness = pr if correctness is None else max(correctness, pr)
+
+    # Speed component: tokens/sec (up to 0.5) + TTFT bands (up to 0.5), in [0, 1].
+    speed = 0.0
    if isinstance(tokens_per_sec, (int, float)):
-        quality += min(0.35, float(tokens_per_sec) / 100.0)
+        speed += min(0.5, float(tokens_per_sec) / 80.0)
    if isinstance(ttft_ms, (int, float)):
-        if float(ttft_ms) <= 1000:
-            quality += 0.25
+        if float(ttft_ms) <= 500:
+            speed += 0.50
+        elif float(ttft_ms) <= 1000:
+            speed += 0.40
        elif float(ttft_ms) <= 2500:
-            quality += 0.15
+            speed += 0.25
        else:
-            quality += 0.05
-    return min(1.0, quality)
+            speed += 0.10
+    speed = min(1.0, speed)
+
+    if correctness is None:
+        # No correctness data — speed signal carries everything.
+        return speed
+    return 0.65 * correctness + 0.35 * speed
--- a/tests/test_control_registry.py
+++ b/tests/test_control_registry.py
@ -2,7 +2,7 @@ from pathlib import Path

 from geniehive_control.main import create_app
 from geniehive_control.models import BenchmarkSample, HostHeartbeat, HostRegistration, RegisteredService, RoleProfile, RouteMatchRequest
-from geniehive_control.registry import Registry
+from geniehive_control.registry import Registry, _benchmark_quality_score


 def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
@ -366,3 +366,28 @@ def test_registry_exposes_asset_request_policy_in_model_metadata(tmp_path: Path)
    asset = next(item for item in models if item["id"] == "custom-model-v1")
    assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["temperature"] == 0.2
    assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["chat_template_kwargs"]["custom_flag"] == "yes"
+
+
+def test_benchmark_quality_score_stays_bounded_and_weighted() -> None:
+    # High correctness + fast speed must not exceed 1.0.
+    score = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 80, "ttft_ms": 400})
+    assert score <= 1.0
+    assert score > 0.9  # should be near 1.0
+
+    # Correctness dominates: high pass_rate with slow speed should still score well.
+    high_correct_slow = _benchmark_quality_score({"pass_rate": 0.95, "tokens_per_sec": 5, "ttft_ms": 4000})
+    low_correct_fast = _benchmark_quality_score({"pass_rate": 0.3, "tokens_per_sec": 80, "ttft_ms": 400})
+    assert high_correct_slow > low_correct_fast
+
+    # Speed-only (no correctness signal) returns a non-zero score.
+    speed_only = _benchmark_quality_score({"tokens_per_sec": 40, "ttft_ms": 800})
+    assert 0.0 < speed_only < 1.0
+
+    # Empty results return 0.
+    assert _benchmark_quality_score({}) == 0.0
+
+    # No stacking: pass_rate=1.0 alone should not score above 1.0 when speed is added.
+    perfect_correct = _benchmark_quality_score({"pass_rate": 1.0})
+    with_speed = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 100, "ttft_ms": 100})
+    assert with_speed <= 1.0
+    assert with_speed >= perfect_correct  # speed can only help, not hurt