P0: remove dead default_strategy field; fix benchmark quality score

Remove RoutingConfig.default_strategy: the field was never read by
resolve_route() or any other code path, creating a false impression
that routing behaviour was configurable. Also removed from all three
example config files.

Fix _benchmark_quality_score: the previous implementation used max()
for correctness signals and then *added* speed bonuses (up to 0.60) on
top, allowing the score to accumulate past 1.0 before the final clamp.
Speed bonuses were therefore partly or wholly discarded once correctness
was high: at pass_rate or quality_score ≥ 0.65 a full tokens/sec bonus
alone saturated the score, and at 1.0 speed had no effect at all.
Replace with an explicit weighted average: correctness (the larger of
pass_rate and quality_score) carries 0.65 and a normalised speed
component carries 0.35. When no correctness signal is available the
speed component carries full weight. For valid (non-negative) inputs
the score is always in [0, 1] without needing a final clamp.

Add test_benchmark_quality_score_stays_bounded_and_weighted to lock in
the corrected behaviour: bounded at 1.0, correctness-dominant, speed-
only case non-zero, empty input zero, speed bonus never hurts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
welberr 2026-04-27 10:45:04 -04:00
parent a76c7e81f4
commit b4e5a1af7d
6 changed files with 60 additions and 14 deletions

View File

@ -14,5 +14,4 @@ storage:
roles_path: "configs/roles.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

View File

@ -14,5 +14,4 @@ storage:
roles_path: "configs/roles.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

View File

@ -14,5 +14,4 @@ storage:
roles_path: "configs/roles.singlebox.p40.example.yaml"
routing:
default_strategy: "loaded_first"
health_stale_after_s: 30

View File

@ -21,7 +21,6 @@ class StorageConfig(BaseModel):
class RoutingConfig(BaseModel):
default_strategy: str = "loaded_first"
health_stale_after_s: float = 30.0

View File

@ -778,24 +778,49 @@ def _overlap_score(task_tokens: set[str], candidate_tokens: set[str]) -> float:
def _benchmark_quality_score(results: dict) -> float:
"""
Combine correctness and speed signals into a [0, 1] quality score.
Correctness (weight 0.65): pass_rate or quality_score from the workload run.
Speed (weight 0.35): tokens_per_sec and TTFT, each contributing up to 0.5 of
the speed component.
When a correctness signal is absent the speed component carries the full weight
so that services with runtime data but no workload results still rank above
those with no data at all.
"""
if not results:
return 0.0
quality = 0.0
tokens_per_sec = results.get("tokens_per_sec")
ttft_ms = results.get("ttft_ms")
pass_rate = results.get("pass_rate")
quality_score = results.get("quality_score")
# Correctness component: best available signal in [0, 1].
correctness: float | None = None
if isinstance(quality_score, (int, float)):
quality = max(quality, max(0.0, min(1.0, float(quality_score))))
correctness = max(0.0, min(1.0, float(quality_score)))
if isinstance(pass_rate, (int, float)):
quality = max(quality, max(0.0, min(1.0, float(pass_rate))))
pr = max(0.0, min(1.0, float(pass_rate)))
correctness = pr if correctness is None else max(correctness, pr)
# Speed component: tokens/sec (up to 0.5) + TTFT bands (up to 0.5), in [0, 1].
speed = 0.0
if isinstance(tokens_per_sec, (int, float)):
quality += min(0.35, float(tokens_per_sec) / 100.0)
speed += min(0.5, float(tokens_per_sec) / 80.0)
if isinstance(ttft_ms, (int, float)):
if float(ttft_ms) <= 1000:
quality += 0.25
if float(ttft_ms) <= 500:
speed += 0.50
elif float(ttft_ms) <= 1000:
speed += 0.40
elif float(ttft_ms) <= 2500:
quality += 0.15
speed += 0.25
else:
quality += 0.05
return min(1.0, quality)
speed += 0.10
speed = min(1.0, speed)
if correctness is None:
# No correctness data — speed signal carries everything.
return speed
return 0.65 * correctness + 0.35 * speed

View File

@ -2,7 +2,7 @@ from pathlib import Path
from geniehive_control.main import create_app
from geniehive_control.models import BenchmarkSample, HostHeartbeat, HostRegistration, RegisteredService, RoleProfile, RouteMatchRequest
from geniehive_control.registry import Registry
from geniehive_control.registry import Registry, _benchmark_quality_score
def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
@ -366,3 +366,28 @@ def test_registry_exposes_asset_request_policy_in_model_metadata(tmp_path: Path)
asset = next(item for item in models if item["id"] == "custom-model-v1")
assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["temperature"] == 0.2
assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["chat_template_kwargs"]["custom_flag"] == "yes"
def test_benchmark_quality_score_stays_bounded_and_weighted() -> None:
    """Pin the bounded, correctness-weighted contract of _benchmark_quality_score."""
    score_of = _benchmark_quality_score

    # Perfect correctness plus top-band speed lands close to, never above, 1.0.
    best_case = score_of({"pass_rate": 1.0, "tokens_per_sec": 80, "ttft_ms": 400})
    assert 0.9 < best_case <= 1.0

    # Correctness outweighs speed: accurate-but-slow beats inaccurate-but-fast.
    accurate_but_slow = score_of({"pass_rate": 0.95, "tokens_per_sec": 5, "ttft_ms": 4000})
    fast_but_inaccurate = score_of({"pass_rate": 0.3, "tokens_per_sec": 80, "ttft_ms": 400})
    assert accurate_but_slow > fast_but_inaccurate

    # Runtime data without any correctness signal still yields a partial score.
    runtime_only = score_of({"tokens_per_sec": 40, "ttft_ms": 800})
    assert 0.0 < runtime_only < 1.0

    # No data at all scores zero.
    assert score_of({}) == 0.0

    # Adding speed data to a perfect pass_rate may raise the score but never
    # lowers it, and the result stays capped at 1.0.
    correctness_only = score_of({"pass_rate": 1.0})
    correctness_and_speed = score_of({"pass_rate": 1.0, "tokens_per_sec": 100, "ttft_ms": 100})
    assert correctness_only <= correctness_and_speed <= 1.0