P0: remove dead default_strategy field; fix benchmark quality score
Remove RoutingConfig.default_strategy: the field was never read by resolve_route() or any other code path, creating a false impression that routing behaviour was configurable. Also removed from all three example config files. Fix _benchmark_quality_score: the previous implementation used max() for correctness signals and then *added* speed bonuses on top, allowing the score to accumulate past 1.0 before the final clamp. Speed bonuses were therefore dead weight whenever pass_rate or quality_score was already ≥ 0.65. Replace with an explicit weighted average: correctness (pass_rate / quality_score) carries 0.65 and a normalised speed component carries 0.35. When no correctness signal is available the speed component carries full weight. Score is always in [0, 1] without needing a clamp. Add test_benchmark_quality_score_stays_bounded_and_weighted to lock in the corrected behaviour: bounded at 1.0, correctness-dominant, speed-only case non-zero, empty input zero, speed bonus never hurts. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a76c7e81f4
commit
b4e5a1af7d
|
|
@ -14,5 +14,4 @@ storage:
|
|||
roles_path: "configs/roles.example.yaml"
|
||||
|
||||
routing:
|
||||
default_strategy: "loaded_first"
|
||||
health_stale_after_s: 30
|
||||
|
|
|
|||
|
|
@ -14,5 +14,4 @@ storage:
|
|||
roles_path: "configs/roles.example.yaml"
|
||||
|
||||
routing:
|
||||
default_strategy: "loaded_first"
|
||||
health_stale_after_s: 30
|
||||
|
|
|
|||
|
|
@ -14,5 +14,4 @@ storage:
|
|||
roles_path: "configs/roles.singlebox.p40.example.yaml"
|
||||
|
||||
routing:
|
||||
default_strategy: "loaded_first"
|
||||
health_stale_after_s: 30
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ class StorageConfig(BaseModel):
|
|||
|
||||
|
||||
class RoutingConfig(BaseModel):
    """Routing configuration.

    NOTE: ``default_strategy`` was removed in this commit — it was never read
    by ``resolve_route()`` or any other code path, so keeping it falsely
    suggested routing behaviour was configurable.
    """

    # Host health data older than this many seconds is treated as stale.
    health_stale_after_s: float = 30.0
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -778,24 +778,49 @@ def _overlap_score(task_tokens: set[str], candidate_tokens: set[str]) -> float:
|
|||
|
||||
|
||||
def _benchmark_quality_score(results: dict) -> float:
|
||||
"""
|
||||
Combine correctness and speed signals into a [0, 1] quality score.
|
||||
|
||||
Correctness (weight 0.65): pass_rate or quality_score from the workload run.
|
||||
Speed (weight 0.35): tokens_per_sec and TTFT, each contributing up to 0.5 of
|
||||
the speed component.
|
||||
|
||||
When a correctness signal is absent the speed component carries the full weight
|
||||
so that services with runtime data but no workload results still rank above
|
||||
those with no data at all.
|
||||
"""
|
||||
if not results:
|
||||
return 0.0
|
||||
quality = 0.0
|
||||
|
||||
tokens_per_sec = results.get("tokens_per_sec")
|
||||
ttft_ms = results.get("ttft_ms")
|
||||
pass_rate = results.get("pass_rate")
|
||||
quality_score = results.get("quality_score")
|
||||
|
||||
# Correctness component: best available signal in [0, 1].
|
||||
correctness: float | None = None
|
||||
if isinstance(quality_score, (int, float)):
|
||||
quality = max(quality, max(0.0, min(1.0, float(quality_score))))
|
||||
correctness = max(0.0, min(1.0, float(quality_score)))
|
||||
if isinstance(pass_rate, (int, float)):
|
||||
quality = max(quality, max(0.0, min(1.0, float(pass_rate))))
|
||||
pr = max(0.0, min(1.0, float(pass_rate)))
|
||||
correctness = pr if correctness is None else max(correctness, pr)
|
||||
|
||||
# Speed component: tokens/sec (up to 0.5) + TTFT bands (up to 0.5), in [0, 1].
|
||||
speed = 0.0
|
||||
if isinstance(tokens_per_sec, (int, float)):
|
||||
quality += min(0.35, float(tokens_per_sec) / 100.0)
|
||||
speed += min(0.5, float(tokens_per_sec) / 80.0)
|
||||
if isinstance(ttft_ms, (int, float)):
|
||||
if float(ttft_ms) <= 1000:
|
||||
quality += 0.25
|
||||
if float(ttft_ms) <= 500:
|
||||
speed += 0.50
|
||||
elif float(ttft_ms) <= 1000:
|
||||
speed += 0.40
|
||||
elif float(ttft_ms) <= 2500:
|
||||
quality += 0.15
|
||||
speed += 0.25
|
||||
else:
|
||||
quality += 0.05
|
||||
return min(1.0, quality)
|
||||
speed += 0.10
|
||||
speed = min(1.0, speed)
|
||||
|
||||
if correctness is None:
|
||||
# No correctness data — speed signal carries everything.
|
||||
return speed
|
||||
return 0.65 * correctness + 0.35 * speed
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ from pathlib import Path
|
|||
|
||||
from geniehive_control.main import create_app
|
||||
from geniehive_control.models import BenchmarkSample, HostHeartbeat, HostRegistration, RegisteredService, RoleProfile, RouteMatchRequest
|
||||
from geniehive_control.registry import Registry
|
||||
from geniehive_control.registry import Registry, _benchmark_quality_score
|
||||
|
||||
|
||||
def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
|
||||
|
|
@ -366,3 +366,28 @@ def test_registry_exposes_asset_request_policy_in_model_metadata(tmp_path: Path)
|
|||
asset = next(item for item in models if item["id"] == "custom-model-v1")
|
||||
assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["temperature"] == 0.2
|
||||
assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["chat_template_kwargs"]["custom_flag"] == "yes"
|
||||
|
||||
|
||||
def test_benchmark_quality_score_stays_bounded_and_weighted() -> None:
    """The weighted-average score is bounded, correctness-dominant, and monotone in speed."""
    # A perfect, fast run is capped at 1.0 — no stacking past the bound.
    best = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 80, "ttft_ms": 400})
    assert best <= 1.0
    assert best > 0.9  # should be near 1.0

    # Correctness dominates: an accurate-but-slow service outranks a fast-but-wrong one.
    accurate_but_slow = _benchmark_quality_score({"pass_rate": 0.95, "tokens_per_sec": 5, "ttft_ms": 4000})
    fast_but_wrong = _benchmark_quality_score({"pass_rate": 0.3, "tokens_per_sec": 80, "ttft_ms": 400})
    assert accurate_but_slow > fast_but_wrong

    # Runtime data without a correctness signal still yields a non-zero score.
    runtime_only = _benchmark_quality_score({"tokens_per_sec": 40, "ttft_ms": 800})
    assert 0.0 < runtime_only < 1.0

    # No results at all scores zero.
    assert _benchmark_quality_score({}) == 0.0

    # Adding speed data to a perfect pass rate never exceeds the bound and
    # never lowers the score.
    correctness_only = _benchmark_quality_score({"pass_rate": 1.0})
    correctness_plus_speed = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 100, "ttft_ms": 100})
    assert correctness_plus_speed <= 1.0
    assert correctness_plus_speed >= correctness_only  # speed can only help, not hurt
|
||||
|
|
|
|||
Loading…
Reference in New Issue