mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-04 06:29:56 +08:00
The training gate currently always compares candidate vs. current/best using *hard* exact-match accuracy. On environments with a small held-out selection set (e.g. 3-6 items) or partial-credit scoring, hard accuracy is too coarse: candidate skills that meaningfully improve per-item soft scores get rejected because the discrete hard count does not move.

Add three opt-in metrics so users can pick the one that matches their scoring function:

- `gate_metric: hard` — original behavior (default, fully backward compatible).
- `gate_metric: soft` — gate on the soft / F1 / partial-credit score.
- `gate_metric: mixed` — `(1 - w) * hard + w * soft`, where `w` is set by `gate_mixed_weight` (default 0.5).

Changes
-------

- `skillopt/evaluation/gate.py`: extend `evaluate_gate` with `cand_soft`, `metric`, and `mixed_weight` keyword arguments; add a pure helper `select_gate_score(hard, soft, metric, mixed_weight)`. Defaults preserve the original `metric="hard"` behavior — existing callers that only pass `cand_hard` keep working unchanged.
- `skillopt/evaluation/__init__.py`: export the new helper / type.
- `skillopt/engine/trainer.py`: read `evaluation.gate_metric` and `evaluation.gate_mixed_weight` from the config (with safe defaults), pass both metrics into `evaluate_gate`, and project the baseline `current_score` / `best_score` into metric space so subsequent comparisons are consistent. Print the gate metric on the `[6/6 EVALUATE]` line so logs make the decision basis explicit. The selection cache still records both `(hard, soft)` so a metric change on resume is non-destructive.
- `configs/_base_/default.yaml`: document and ship the new keys with backward-compatible defaults (`hard`, `0.5`).

Backward compatibility
----------------------

- Default config does not change behavior: `gate_metric` defaults to `hard`, exactly matching the previous gate.
- `evaluate_gate(...)` keeps its existing positional signature; the new parameters are keyword-only with safe defaults.
- `step_record.json` gains optional `gate_metric` and `candidate_gate_score` fields; old records still load.

Tested
------

- Unit-tested all three metrics + boundary `mixed_weight` values (0.0 / 1.0) and rejection of unknown metric strings. All six cases pass.
- Verified `skillopt.engine.trainer` imports cleanly after the refactor.
149 lines
4.5 KiB
Python
149 lines
4.5 KiB
Python
"""Validation gate — accept / reject candidate skills.

Analogous to validation-based early stopping and model selection in neural
network training: compares the candidate's score against the current and
best scores, then returns an accept/reject decision.

The trainer owns side-effects (cache lookup, rollout, printing, state
mutation). This module is the pure decision function.

Metric selection
----------------
Three gate metrics are supported:

* ``"hard"`` (default, backward-compatible):
  Compare candidate vs current/best using *hard* exact-match accuracy.
* ``"soft"``:
  Compare using *soft* per-item score (F1 / partial credit / etc.).
  Use this when a small held-out selection set has too few items for
  hard accuracy to be sensitive to incremental skill improvements.
* ``"mixed"``:
  Compare using a weighted average ``(1 - w) * hard + w * soft``.
  ``w`` is configurable via ``mixed_weight`` (default ``0.5``).
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Literal
|
|
|
|
|
|
# Closed set of outcomes the gate can report in ``GateResult.action``.
GateAction = Literal["accept_new_best", "accept", "reject"]
|
|
# Names of the comparison metrics accepted by ``select_gate_score`` and
# ``evaluate_gate``.
GateMetric = Literal["hard", "soft", "mixed"]
|
|
|
|
|
|
@dataclass(frozen=True)
class GateResult:
    """Frozen record of a single validation-gate decision.

    ``evaluate_gate`` returns one of these per call. The fields describe the
    state *after* the decision: on a rejection they simply echo the values
    that were passed in.
    """

    # Which of the three outcomes the gate chose.
    action: GateAction
    # Skill that is active after the decision, and the score it holds.
    current_skill: str
    current_score: float
    # Best skill so far, its score, and the step at which it became best.
    best_skill: str
    best_score: float
    best_step: int
|
|
|
|
|
|
def select_gate_score(
    hard: float,
    soft: float,
    metric: GateMetric = "hard",
    mixed_weight: float = 0.5,
) -> float:
    """Collapse a ``(hard, soft)`` score pair into the scalar the gate compares.

    Parameters
    ----------
    hard, soft
        Aggregate hard / soft scores from a rollout batch. They are expected
        to lie in 0..1, but this function does not check that.
    metric
        Name of the metric to project onto: ``"hard"``, ``"soft"`` or
        ``"mixed"``.
    mixed_weight
        Share given to ``soft`` when ``metric == "mixed"``. Values outside
        ``[0, 1]`` are clamped into that interval rather than rejected.
        Not consulted for ``"hard"`` / ``"soft"``.

    Returns
    -------
    float
        ``hard``, ``soft``, or ``(1 - w) * hard + w * soft`` depending on
        ``metric``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the three supported names.
    """
    if metric == "hard":
        score = float(hard)
    elif metric == "soft":
        score = float(soft)
    elif metric == "mixed":
        # Clamp the weight to [0, 1]; upper bound first, then lower bound.
        capped = min(1.0, float(mixed_weight))
        soft_share = max(0.0, capped)
        hard_part = (1.0 - soft_share) * float(hard)
        soft_part = soft_share * float(soft)
        score = hard_part + soft_part
    else:
        raise ValueError(
            f"unknown gate metric {metric!r}; expected 'hard', 'soft', or 'mixed'"
        )
    return score
|
|
|
|
|
|
def evaluate_gate(
    candidate_skill: str,
    cand_hard: float,
    current_skill: str,
    current_score: float,
    best_skill: str,
    best_score: float,
    best_step: int,
    global_step: int,
    *,
    cand_soft: float = 0.0,
    metric: GateMetric = "hard",
    mixed_weight: float = 0.5,
) -> GateResult:
    """Decide whether a candidate skill replaces the current / best skill.

    The candidate's ``(cand_hard, cand_soft)`` pair is first projected onto
    the chosen metric with ``select_gate_score``. Both comparisons are strict,
    so a candidate that merely ties the current score is rejected.

    Parameters
    ----------
    candidate_skill
        The candidate skill content being evaluated.
    cand_hard
        Aggregate hard score of the candidate on the selection set.
    current_skill, current_score
        The currently-active skill and its score, already expressed in
        *metric space* (i.e. on the same scale as the projected candidate).
    best_skill, best_score, best_step
        The best-so-far skill, its *metric-space* score, and the step at
        which it was accepted.
    global_step
        Current global training step; stored as ``best_step`` when the
        candidate becomes the new best.
    cand_soft
        Aggregate soft score of the candidate. Only matters when
        ``metric != "hard"``. Defaults to ``0.0`` so callers that only
        pass ``cand_hard`` keep working.
    metric
        Which metric to compare on; ``"hard"`` by default.
    mixed_weight
        Weight on ``soft`` when ``metric == "mixed"``.

    Returns
    -------
    GateResult
        ``"reject"``: inputs echoed back unchanged.
        ``"accept"``: candidate becomes current; best is left as it was.
        ``"accept_new_best"``: candidate becomes both current and best,
        with ``best_step`` set to ``global_step``.
        The caller decides what to do with the result (print, mutate
        trainer state, log, etc.).
    """
    score = select_gate_score(cand_hard, cand_soft, metric, mixed_weight)

    # Guard clause: anything that does not strictly beat the current score
    # leaves every piece of state untouched.
    beats_current = score > current_score
    if not beats_current:
        return GateResult(
            action="reject",
            current_skill=current_skill,
            current_score=current_score,
            best_skill=best_skill,
            best_score=best_score,
            best_step=best_step,
        )

    # Both accept outcomes promote the candidate to "current".
    promoted = {"current_skill": candidate_skill, "current_score": score}

    if score > best_score:
        return GateResult(
            action="accept_new_best",
            best_skill=candidate_skill,
            best_score=score,
            best_step=global_step,
            **promoted,
        )

    return GateResult(
        action="accept",
        best_skill=best_skill,
        best_score=best_score,
        best_step=best_step,
        **promoted,
    )
|