fix(skillopt-sleep): surface codex auth/model/version failures instead of silently scoring 0 (#92)

Splits CodexCliBackend._call into _call_once + a retry wrapper so transient empties/timeouts are retried instead of silently scored 0, and fails fast on fatal auth/model/version errors (401, refresh_token_reused, token_expired, ChatGPT-account-unsupported, newer-Codex-required). On non-zero exit the CLI error text is surfaced via last_call_error instead of being returned as a model response. Adds per-cycle diagnostics.json (observability only; gate and learning algorithm unchanged) so a 0.0 night self-explains.
This commit is contained in:
Yifan Yang
2026-07-01 03:20:08 +08:00
committed by GitHub
5 changed files with 233 additions and 6 deletions

View File

@@ -486,6 +486,18 @@ class TestConsolidateGate(unittest.TestCase):
self.assertTrue(res.accepted)
self.assertGreater(res.candidate_score, res.baseline_score)
def test_consolidate_records_holdout_detail(self):
# observability: a 0.0 night must carry per-task evidence (was empty
# response vs failing checks?) so it is diagnosable, not a black box.
be = MockBackend()
tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)
res = consolidate(be, tasks, set_learned("", []), "", edit_budget=4,
gate_metric="mixed", night=1)
self.assertTrue(res.holdout_detail) # non-empty per-task rows
row = res.holdout_detail[0]
for k in ("id", "hard", "soft", "response_len", "why"):
self.assertIn(k, row)
def test_no_op_when_already_optimal(self):
be = MockBackend()
tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=1)
@@ -612,6 +624,24 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase):
[], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
self.assertIn("British English", captured["prompt"])
def test_reflect_records_last_raw(self):
# the optimizer's raw reply must be retained so a no-edits night is
# diagnosable (empty/non-JSON reflect vs genuinely no failures).
from skillopt_sleep.backend import CliBackend
from skillopt_sleep.types import ReplayResult
class CapBackend(CliBackend):
name = "cap"
def _call(self, prompt, *, max_tokens=1024):
return '[{"op":"add","content":"a learned rule","rationale":"x"}]'
be = CapBackend()
t = TaskRecord(id="t", project="/p", intent="x", reference_kind="rule",
judge={"checks": [{"op": "contains", "arg": "z"}]})
be.reflect([(t, ReplayResult(id="t", hard=0.0, fail_reason="failed: contains=z"))],
[], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
self.assertIn("a learned rule", be.last_reflect_raw)
def test_replay_records_cost(self):
from skillopt_sleep.backend import MockBackend
from skillopt_sleep.replay import replay_one
@@ -654,6 +684,89 @@ class TestCodexBackend(unittest.TestCase):
self.assertIn("-C", cmd)
self.assertEqual(cmd[cmd.index("-C") + 1], expected_project)
def test_codex_call_retries_transient_failure_not_silent_zero(self):
"""A transient timeout must be RETRIED, not silently returned as "" — an
empty reply scores 0 on every judge and zeroes the held-out baseline,
making a flaky backend look identical to 'nothing to learn'."""
import subprocess as _sp
from skillopt_sleep.backend import CodexCliBackend
calls = {"n": 0}
def fake_run(cmd, **kwargs):
calls["n"] += 1
if calls["n"] == 1:
raise _sp.TimeoutExpired(cmd, kwargs.get("timeout", 1))
out_path = cmd[cmd.index("-o") + 1]
with open(out_path, "w", encoding="utf-8") as f:
f.write("real answer")
class Proc:
returncode = 0
stdout = ""
stderr = ""
return Proc()
backend = CodexCliBackend(codex_path="codex")
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
mock.patch("time.sleep", lambda *_a, **_k: None):
out = backend._call("hello")
self.assertEqual(out, "real answer") # recovered on retry
self.assertGreaterEqual(calls["n"], 2) # proves it did not silently return "" once
def test_codex_auth_error_surfaces_not_scored_as_response(self):
"""An auth 401 must become a clear last_call_error + EMPTY response (not the
9k-char error text scored as a 0 'answer'), and must NOT be retried — the
exact failure that silently stalled learning (refresh_token_reused)."""
from skillopt_sleep.backend import CodexCliBackend
calls = {"n": 0}
def fake_run(cmd, **kwargs):
calls["n"] += 1
out_path = cmd[cmd.index("-o") + 1]
open(out_path, "w").close() # empty output file (codex wrote nothing)
class Proc:
returncode = 1
stdout = ""
stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
return Proc()
be = CodexCliBackend(codex_path="codex")
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
mock.patch("time.sleep", lambda *_a, **_k: None):
out = be._call("hi")
self.assertEqual(out, "") # NOT the error text
self.assertIn("refresh_token_reused", be.last_call_error) # surfaced for the operator
self.assertEqual(calls["n"], 1) # failed fast, no wasted retries
def test_codex_attempt_with_tools_surfaces_error_not_silent(self):
"""A failed tool-rollout (non-zero codex exec) on the tool path must set
last_call_error and return an empty response — not a silent empty->0 the
diagnostics can't see (the gap a _call-only fix would otherwise leave)."""
from skillopt_sleep.backend import CodexCliBackend
def fake_run(cmd, **kwargs):
class Proc:
returncode = 1
stdout = ""
stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
return Proc() # writes nothing to out_path -> empty response
be = CodexCliBackend(codex_path="codex")
task = TaskRecord(id="t", project="/p", intent="answer the question",
reference_kind="rule",
judge={"checks": [{"op": "tool_called", "arg": "search"}]})
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run):
resp, called = be.attempt_with_tools(task, "", "", ["search"])
self.assertEqual(resp, "") # no leaked error text as a "response"
self.assertIn("exited 1", be.last_call_error) # failure surfaced for diagnostics
self.assertEqual(called, []) # no tool actually ran
class TestMultiRolloutAndBudget(unittest.TestCase):
def test_rolloutset_stats(self):