P0: remove dead default_strategy field; fix benchmark quality score
Remove RoutingConfig.default_strategy: the field was never read by resolve_route() or any other code path, creating a false impression that routing behaviour was configurable. Also removed from all three example config files. Fix _benchmark_quality_score: the previous implementation used max() for correctness signals and then *added* speed bonuses on top, allowing the score to accumulate past 1.0 before the final clamp. Speed bonuses (at most 0.35 + 0.25 = 0.60 combined) could therefore be partly discarded by the clamp once pass_rate or quality_score exceeded 0.4, and were entirely dead weight when the correctness signal was 1.0. Replace with an explicit weighted average: correctness (the larger of pass_rate and quality_score) carries 0.65 and a normalised speed component carries 0.35. When no correctness signal is available the speed component carries full weight. Score is always in [0, 1] without needing a final clamp. Add test_benchmark_quality_score_stays_bounded_and_weighted to lock in the corrected behaviour: bounded at 1.0, correctness-dominant, speed-only case non-zero, empty input zero, speed bonus never hurts. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a76c7e81f4
commit
b4e5a1af7d
|
|
@ -14,5 +14,4 @@ storage:
|
||||||
roles_path: "configs/roles.example.yaml"
|
roles_path: "configs/roles.example.yaml"
|
||||||
|
|
||||||
routing:
|
routing:
|
||||||
default_strategy: "loaded_first"
|
|
||||||
health_stale_after_s: 30
|
health_stale_after_s: 30
|
||||||
|
|
|
||||||
|
|
@ -14,5 +14,4 @@ storage:
|
||||||
roles_path: "configs/roles.example.yaml"
|
roles_path: "configs/roles.example.yaml"
|
||||||
|
|
||||||
routing:
|
routing:
|
||||||
default_strategy: "loaded_first"
|
|
||||||
health_stale_after_s: 30
|
health_stale_after_s: 30
|
||||||
|
|
|
||||||
|
|
@ -14,5 +14,4 @@ storage:
|
||||||
roles_path: "configs/roles.singlebox.p40.example.yaml"
|
roles_path: "configs/roles.singlebox.p40.example.yaml"
|
||||||
|
|
||||||
routing:
|
routing:
|
||||||
default_strategy: "loaded_first"
|
|
||||||
health_stale_after_s: 30
|
health_stale_after_s: 30
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,6 @@ class StorageConfig(BaseModel):
|
||||||
|
|
||||||
|
|
||||||
class RoutingConfig(BaseModel):
|
class RoutingConfig(BaseModel):
|
||||||
default_strategy: str = "loaded_first"
|
|
||||||
health_stale_after_s: float = 30.0
|
health_stale_after_s: float = 30.0
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -778,24 +778,49 @@ def _overlap_score(task_tokens: set[str], candidate_tokens: set[str]) -> float:
|
||||||
|
|
||||||
|
|
||||||
def _benchmark_quality_score(results: dict) -> float:
|
def _benchmark_quality_score(results: dict) -> float:
|
||||||
|
"""
|
||||||
|
Combine correctness and speed signals into a [0, 1] quality score.
|
||||||
|
|
||||||
|
Correctness (weight 0.65): pass_rate or quality_score from the workload run.
|
||||||
|
Speed (weight 0.35): tokens_per_sec and TTFT, each contributing up to 0.5 of
|
||||||
|
the speed component.
|
||||||
|
|
||||||
|
When a correctness signal is absent the speed component carries the full weight
|
||||||
|
so that services with runtime data but no workload results still rank above
|
||||||
|
those with no data at all.
|
||||||
|
"""
|
||||||
if not results:
|
if not results:
|
||||||
return 0.0
|
return 0.0
|
||||||
quality = 0.0
|
|
||||||
tokens_per_sec = results.get("tokens_per_sec")
|
tokens_per_sec = results.get("tokens_per_sec")
|
||||||
ttft_ms = results.get("ttft_ms")
|
ttft_ms = results.get("ttft_ms")
|
||||||
pass_rate = results.get("pass_rate")
|
pass_rate = results.get("pass_rate")
|
||||||
quality_score = results.get("quality_score")
|
quality_score = results.get("quality_score")
|
||||||
|
|
||||||
|
# Correctness component: best available signal in [0, 1].
|
||||||
|
correctness: float | None = None
|
||||||
if isinstance(quality_score, (int, float)):
|
if isinstance(quality_score, (int, float)):
|
||||||
quality = max(quality, max(0.0, min(1.0, float(quality_score))))
|
correctness = max(0.0, min(1.0, float(quality_score)))
|
||||||
if isinstance(pass_rate, (int, float)):
|
if isinstance(pass_rate, (int, float)):
|
||||||
quality = max(quality, max(0.0, min(1.0, float(pass_rate))))
|
pr = max(0.0, min(1.0, float(pass_rate)))
|
||||||
|
correctness = pr if correctness is None else max(correctness, pr)
|
||||||
|
|
||||||
|
# Speed component: tokens/sec (up to 0.5) + TTFT bands (up to 0.5), in [0, 1].
|
||||||
|
speed = 0.0
|
||||||
if isinstance(tokens_per_sec, (int, float)):
|
if isinstance(tokens_per_sec, (int, float)):
|
||||||
quality += min(0.35, float(tokens_per_sec) / 100.0)
|
speed += min(0.5, float(tokens_per_sec) / 80.0)
|
||||||
if isinstance(ttft_ms, (int, float)):
|
if isinstance(ttft_ms, (int, float)):
|
||||||
if float(ttft_ms) <= 1000:
|
if float(ttft_ms) <= 500:
|
||||||
quality += 0.25
|
speed += 0.50
|
||||||
|
elif float(ttft_ms) <= 1000:
|
||||||
|
speed += 0.40
|
||||||
elif float(ttft_ms) <= 2500:
|
elif float(ttft_ms) <= 2500:
|
||||||
quality += 0.15
|
speed += 0.25
|
||||||
else:
|
else:
|
||||||
quality += 0.05
|
speed += 0.10
|
||||||
return min(1.0, quality)
|
speed = min(1.0, speed)
|
||||||
|
|
||||||
|
if correctness is None:
|
||||||
|
# No correctness data — speed signal carries everything.
|
||||||
|
return speed
|
||||||
|
return 0.65 * correctness + 0.35 * speed
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ from pathlib import Path
|
||||||
|
|
||||||
from geniehive_control.main import create_app
|
from geniehive_control.main import create_app
|
||||||
from geniehive_control.models import BenchmarkSample, HostHeartbeat, HostRegistration, RegisteredService, RoleProfile, RouteMatchRequest
|
from geniehive_control.models import BenchmarkSample, HostHeartbeat, HostRegistration, RegisteredService, RoleProfile, RouteMatchRequest
|
||||||
from geniehive_control.registry import Registry
|
from geniehive_control.registry import Registry, _benchmark_quality_score
|
||||||
|
|
||||||
|
|
||||||
def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
|
def test_registry_persists_registration_and_heartbeat(tmp_path: Path) -> None:
|
||||||
|
|
@ -366,3 +366,28 @@ def test_registry_exposes_asset_request_policy_in_model_metadata(tmp_path: Path)
|
||||||
asset = next(item for item in models if item["id"] == "custom-model-v1")
|
asset = next(item for item in models if item["id"] == "custom-model-v1")
|
||||||
assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["temperature"] == 0.2
|
assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["temperature"] == 0.2
|
||||||
assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["chat_template_kwargs"]["custom_flag"] == "yes"
|
assert asset["geniehive"]["effective_request_policy"]["body_defaults"]["chat_template_kwargs"]["custom_flag"] == "yes"
|
||||||
|
|
||||||
|
|
||||||
|
def test_benchmark_quality_score_stays_bounded_and_weighted() -> None:
|
||||||
|
# High correctness + fast speed must not exceed 1.0.
|
||||||
|
score = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 80, "ttft_ms": 400})
|
||||||
|
assert score <= 1.0
|
||||||
|
assert score > 0.9 # should be near 1.0
|
||||||
|
|
||||||
|
# Correctness dominates: high pass_rate with slow speed should still score well.
|
||||||
|
high_correct_slow = _benchmark_quality_score({"pass_rate": 0.95, "tokens_per_sec": 5, "ttft_ms": 4000})
|
||||||
|
low_correct_fast = _benchmark_quality_score({"pass_rate": 0.3, "tokens_per_sec": 80, "ttft_ms": 400})
|
||||||
|
assert high_correct_slow > low_correct_fast
|
||||||
|
|
||||||
|
# Speed-only (no correctness signal) returns a non-zero score.
|
||||||
|
speed_only = _benchmark_quality_score({"tokens_per_sec": 40, "ttft_ms": 800})
|
||||||
|
assert 0.0 < speed_only < 1.0
|
||||||
|
|
||||||
|
# Empty results return 0.
|
||||||
|
assert _benchmark_quality_score({}) == 0.0
|
||||||
|
|
||||||
|
# No stacking: pass_rate=1.0 alone should not score above 1.0 when speed is added.
|
||||||
|
perfect_correct = _benchmark_quality_score({"pass_rate": 1.0})
|
||||||
|
with_speed = _benchmark_quality_score({"pass_rate": 1.0, "tokens_per_sec": 100, "ttft_ms": 100})
|
||||||
|
assert with_speed <= 1.0
|
||||||
|
assert with_speed >= perfect_correct # speed can only help, not hurt
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue