P0: remove dead default_strategy field; fix benchmark quality score

Remove RoutingConfig.default_strategy: the field was never read by resolve_route() or any other code path, creating a false impression that routing behaviour was configurable. Also removed from all three example config files. Fix _benchmark_quality_score: the previous implementation used max() for correctness signals and then *added* speed bonuses on top, allowing the score to accumulate past 1.0 before the final clamp. Speed bonuses were therefore dead weight whenever pass_rate or quality_score was already ≥ 0.65. Replace with an explicit weighted average: correctness (pass_rate / quality_score) carries 0.65 and a normalised speed component carries 0.35. When no correctness signal is available the speed component carries full weight. Score is always in [0, 1] without needing a clamp. Add test_benchmark_quality_score_stays_bounded_and_weighted to lock in the corrected behaviour: bounded at 1.0, correctness-dominant, speed-only case non-zero, empty input zero, speed bonus never hurts. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 10:45:04 -04:00 · 2026-04-27 10:45:04 -04:00 · b4e5a1af7d
parent a76c7e81f4
commit b4e5a1af7d
6 changed files with 60 additions and 14 deletions
--- a/configs/control.example.yaml
+++ b/configs/control.example.yaml
@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.example.yaml"
 routing:
  default_strategy: "loaded_first"
  health_stale_after_s: 30
--- a/configs/control.singlebox.example.yaml
+++ b/configs/control.singlebox.example.yaml
@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.example.yaml"
 routing:
  default_strategy: "loaded_first"
  health_stale_after_s: 30
--- a/configs/control.singlebox.p40.example.yaml
+++ b/configs/control.singlebox.p40.example.yaml
@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.singlebox.p40.example.yaml"
 routing:
  default_strategy: "loaded_first"
  health_stale_after_s: 30
--- a/src/geniehive_control/config.py
+++ b/src/geniehive_control/config.py
@ -21,7 +21,6 @@ class StorageConfig(BaseModel):
 class RoutingConfig(BaseModel):
    default_strategy: str = "loaded_first"
    health_stale_after_s: float = 30.0
--- a/src/geniehive_control/registry.py
+++ b/src/geniehive_control/registry.py
@ -778,24 +778,49 @@ def _overlap_score(task_tokens: set[str], candidate_tokens: set[str]) -> float:
 def _benchmark_quality_score(results: dict) -> float:
    """
    Combine correctness and speed signals into a [0, 1] quality score.
    Correctness (weight 0.65): pass_rate or quality_score from the workload run.
    Speed (weight 0.35): tokens_per_sec and TTFT, each contributing up to 0.5 of
    the speed component.
    When a correctness signal is absent the speed component carries the full weight
    so that services with runtime data but no workload results still rank above
    those with no data at all.
    """
    if not results:
        return 0.0
-    quality = 0.0
+
    tokens_per_sec = results.get("tokens_per_sec")
    ttft_ms = results.get("ttft_ms")
    pass_rate = results.get("pass_rate")
    quality_score = results.get("quality_score")
    # Correctness component: best available signal in [0, 1].
    correctness: float | None = None
    if isinstance(quality_score, (int, float)):
-        quality = max(quality, max(0.0, min(1.0, float(quality_score))))
+        correctness = max(0.0, min(1.0, float(quality_score)))
    if isinstance(pass_rate, (int, float)):
-        quality = max(quality, max(0.0, min(1.0, float(pass_rate))))
+        pr = max(0.0, min(1.0, float(pass_rate)))
        correctness = pr if correctness is None else max(correctness, pr)
    # Speed component: tokens/sec (up to 0.5) + TTFT bands (up to 0.5), in [0, 1].
    speed = 0.0
    if isinstance(tokens_per_sec, (int, float)):
-        quality += min(0.35, float(tokens_per_sec) / 100.0)
+        speed += min(0.5, float(tokens_per_sec) / 80.0)
    if isinstance(ttft_ms, (int, float)):
-        if float(ttft_ms) <= 1000:
+        if float(ttft_ms) <= 500:
-            quality += 0.25
+            speed += 0.50
        elif float(ttft_ms) <= 1000:
            speed += 0.40
        elif float(ttft_ms) <= 2500:
-            quality += 0.15
+            speed += 0.25
        else:
-            quality += 0.05
+            speed += 0.10
-    return min(1.0, quality)
+    speed = min(1.0, speed)
    if correctness is None:
        # No correctness data — speed signal carries everything.
        return speed
    return 0.65 * correctness + 0.35 * speed
--- a/tests/test_control_registry.py
+++ b/tests/test_control_registry.py
@ -2,7 +2,7 @@ from pathlib import Path
 from geniehive_control.main import create_app
 from geniehive_control.models import BenchmarkSample, HostHeartbeat, HostRegistration, RegisteredService, RoleProfile, RouteMatchRequest
-from geniehive_control.registry import Registry
+from geniehive_control.registry import Registry, _benchmark_quality_score
 def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
@ -366,3 +366,28 @@ def test_registry_exposes_asset_request_policy_in_model_metadata(tmp_path: Path)
    asset = next(item for item in models if item["id"] == "custom-model-v1")
    assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["temperature"] == 0.2
    assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["chat_template_kwargs"]["custom_flag"] == "yes"
 def test_benchmark_quality_score_stays_bounded_and_weighted() -> None:
     """Check bounds and relative ordering of ``_benchmark_quality_score``."""
     score_of = _benchmark_quality_score

     # Perfect pass rate with fast speed: close to, but never above, 1.0.
     best_case = score_of({"pass_rate": 1.0, "tokens_per_sec": 80, "ttft_ms": 400})
     assert best_case <= 1.0
     assert best_case > 0.9

     # A high pass rate on slow hardware must outrank a low pass rate on fast
     # hardware.
     accurate_but_slow = score_of({"pass_rate": 0.95, "tokens_per_sec": 5, "ttft_ms": 4000})
     fast_but_inaccurate = score_of({"pass_rate": 0.3, "tokens_per_sec": 80, "ttft_ms": 400})
     assert accurate_but_slow > fast_but_inaccurate

     # With no correctness signal the score is strictly between 0 and 1.
     runtime_only = score_of({"tokens_per_sec": 40, "ttft_ms": 800})
     assert 0.0 < runtime_only < 1.0

     # An empty result set scores exactly zero.
     assert score_of({}) == 0.0

     # Adding speed data to a perfect pass rate stays within the bound and
     # never lowers the score.
     correctness_only = score_of({"pass_rate": 1.0})
     correctness_and_speed = score_of({"pass_rate": 1.0, "tokens_per_sec": 100, "ttft_ms": 100})
     assert correctness_and_speed <= 1.0
     assert correctness_and_speed >= correctness_only