From b4e5a1af7d854ee3b5977f7da08ab4880c9d3803 Mon Sep 17 00:00:00 2001
From: welberr <welsberr@gmail.com>
Date: Mon, 27 Apr 2026 10:45:04 -0400
Subject: [PATCH] P0: remove dead default_strategy field; fix benchmark quality
 score
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove RoutingConfig.default_strategy: the field was never read by
resolve_route() or any other code path, creating a false impression
that routing behaviour was configurable. Also removed from all three
example config files.

Fix _benchmark_quality_score: the previous implementation took the
max() of the correctness signals and then *added* speed bonuses (up to
0.60 in total) on top, allowing the score to accumulate past 1.0 before
the final clamp. Speed bonuses were therefore partly or wholly discarded
once correctness was high: at a pass_rate or quality_score of 0.65 a
full tokens/sec bonus already reached 1.0, leaving the TTFT bonus with
no effect, and at 1.0 neither bonus mattered. Replace with an explicit
weighted average: correctness (the higher of pass_rate and
quality_score) carries 0.65 and a normalised speed component carries
0.35. When no correctness signal is available the speed component
carries full weight. For non-negative inputs the score stays in [0, 1]
by construction, so no final clamp on the combined result is needed.

Add test_benchmark_quality_score_stays_bounded_and_weighted to lock in
the corrected behaviour: bounded at 1.0, correctness-dominant, speed-
only case non-zero, empty input zero, speed bonus never hurts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 configs/control.example.yaml               |  1 -
 configs/control.singlebox.example.yaml     |  1 -
 configs/control.singlebox.p40.example.yaml |  1 -
 src/geniehive_control/config.py            |  1 -
 src/geniehive_control/registry.py          | 43 +++++++++++++++++-----
 tests/test_control_registry.py             | 27 +++++++++++++-
 6 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/configs/control.example.yaml b/configs/control.example.yaml
index 100122c..0292ae4 100644
--- a/configs/control.example.yaml
+++ b/configs/control.example.yaml
@@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.example.yaml"
 
 routing:
-  default_strategy: "loaded_first"
   health_stale_after_s: 30
diff --git a/configs/control.singlebox.example.yaml b/configs/control.singlebox.example.yaml
index 8b59174..fac73d0 100644
--- a/configs/control.singlebox.example.yaml
+++ b/configs/control.singlebox.example.yaml
@@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.example.yaml"
 
 routing:
-  default_strategy: "loaded_first"
   health_stale_after_s: 30
diff --git a/configs/control.singlebox.p40.example.yaml b/configs/control.singlebox.p40.example.yaml
index b4e0850..18012a5 100644
--- a/configs/control.singlebox.p40.example.yaml
+++ b/configs/control.singlebox.p40.example.yaml
@@ -14,5 +14,4 @@ storage:
 roles_path: "configs/roles.singlebox.p40.example.yaml"
 
 routing:
-  default_strategy: "loaded_first"
   health_stale_after_s: 30
diff --git a/src/geniehive_control/config.py b/src/geniehive_control/config.py
index 85d55e6..68c0557 100644
--- a/src/geniehive_control/config.py
+++ b/src/geniehive_control/config.py
@@ -21,7 +21,6 @@ class StorageConfig(BaseModel):
 
 
 class RoutingConfig(BaseModel):
-    default_strategy: str = "loaded_first"
     health_stale_after_s: float = 30.0
 
 
diff --git a/src/geniehive_control/registry.py b/src/geniehive_control/registry.py
index a6ca72f..7254e5b 100644
--- a/src/geniehive_control/registry.py
+++ b/src/geniehive_control/registry.py
@@ -778,24 +778,49 @@ def _overlap_score(task_tokens: set[str], candidate_tokens: set[str]) -> float:
 
 
 def _benchmark_quality_score(results: dict) -> float:
+    """
+    Combine correctness and speed signals into a [0, 1] quality score.
+
+    Correctness (weight 0.65): the higher of pass_rate and quality_score from the workload run.
+    Speed (weight 0.35): tokens_per_sec and TTFT, each contributing up to 0.5 of
+    the speed component.
+
+    When a correctness signal is absent the speed component carries the full weight
+    so that services with runtime data but no workload results still rank above
+    those with no data at all.
+    """
     if not results:
         return 0.0
-    quality = 0.0
+
     tokens_per_sec = results.get("tokens_per_sec")
     ttft_ms = results.get("ttft_ms")
     pass_rate = results.get("pass_rate")
     quality_score = results.get("quality_score")
+
+    # Correctness component: best available signal in [0, 1].
+    correctness: float | None = None
     if isinstance(quality_score, (int, float)):
-        quality = max(quality, max(0.0, min(1.0, float(quality_score))))
+        correctness = max(0.0, min(1.0, float(quality_score)))
     if isinstance(pass_rate, (int, float)):
-        quality = max(quality, max(0.0, min(1.0, float(pass_rate))))
+        pr = max(0.0, min(1.0, float(pass_rate)))
+        correctness = pr if correctness is None else max(correctness, pr)
+
+    # Speed component: tokens/sec (up to 0.5) + TTFT bands (up to 0.5), in [0, 1].
+    speed = 0.0
     if isinstance(tokens_per_sec, (int, float)):
-        quality += min(0.35, float(tokens_per_sec) / 100.0)
+        speed += min(0.5, float(tokens_per_sec) / 80.0)
     if isinstance(ttft_ms, (int, float)):
-        if float(ttft_ms) <= 1000:
-            quality += 0.25
+        if float(ttft_ms) <= 500:
+            speed += 0.50
+        elif float(ttft_ms) <= 1000:
+            speed += 0.40
         elif float(ttft_ms) <= 2500:
-            quality += 0.15
+            speed += 0.25
         else:
-            quality += 0.05
-    return min(1.0, quality)
+            speed += 0.10
+    speed = min(1.0, speed)
+
+    if correctness is None:
+        # No correctness data — speed signal carries everything.
+        return speed
+    return 0.65 * correctness + 0.35 * speed
diff --git a/tests/test_control_registry.py b/tests/test_control_registry.py
index 4ea3048..322ed56 100644
--- a/tests/test_control_registry.py
+++ b/tests/test_control_registry.py
@@ -2,7 +2,7 @@ from pathlib import Path
 
 from geniehive_control.main import create_app
 from geniehive_control.models import BenchmarkSample, HostHeartbeat, HostRegistration, RegisteredService, RoleProfile, RouteMatchRequest
-from geniehive_control.registry import Registry
+from geniehive_control.registry import Registry, _benchmark_quality_score
 
 
 def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
@@ -366,3 +366,28 @@ def test_registry_exposes_asset_request_policy_in_model_metadata(tmp_path: Path)
     asset = next(item for item in models if item["id"] == "custom-model-v1")
     assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["temperature"] == 0.2
     assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["chat_template_kwargs"]["custom_flag"] == "yes"
+
+
+def test_benchmark_quality_score_stays_bounded_and_weighted() -> None:
+    # High correctness + fast speed must not exceed 1.0.
+    score = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 80, "ttft_ms": 400})
+    assert score <= 1.0
+    assert score > 0.9  # should be near 1.0
+
+    # Correctness dominates: high pass_rate with slow speed should still score well.
+    high_correct_slow = _benchmark_quality_score({"pass_rate": 0.95, "tokens_per_sec": 5, "ttft_ms": 4000})
+    low_correct_fast = _benchmark_quality_score({"pass_rate": 0.3, "tokens_per_sec": 80, "ttft_ms": 400})
+    assert high_correct_slow > low_correct_fast
+
+    # Speed-only (no correctness signal) returns a non-zero score.
+    speed_only = _benchmark_quality_score({"tokens_per_sec": 40, "ttft_ms": 800})
+    assert 0.0 < speed_only < 1.0
+
+    # Empty results return 0.
+    assert _benchmark_quality_score({}) == 0.0
+
+    # No stacking: adding speed data to pass_rate=1.0 must not push the score above 1.0.
+    perfect_correct = _benchmark_quality_score({"pass_rate": 1.0})
+    with_speed = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 100, "ttft_ms": 100})
+    assert with_speed <= 1.0
+    assert with_speed >= perfect_correct  # speed can only help, not hurt