fix(skillopt-sleep): surface codex auth/model/version failures instead of silently scoring 0 (#92)

Splits CodexCliBackend._call into _call_once + a retry wrapper so transient empties/timeouts are retried instead of silently scored 0, and fails fast on fatal auth/model/version errors (401, refresh_token_reused, token_expired, ChatGPT-account-unsupported, newer-Codex-required). On non-zero exit the CLI error text is surfaced via last_call_error instead of being returned as a model response. Adds per-cycle diagnostics.json (observability only; gate and learning algorithm unchanged) so a 0.0 night self-explains.
This commit is contained in:
Yifan Yang
2026-07-01 03:20:08 +08:00
committed by GitHub
5 changed files with 233 additions and 6 deletions

2
.gitignore vendored
View File

@@ -22,6 +22,8 @@ data/*
outputs/
logs/
external/
# SkillOpt-Sleep runtime state (staging proposals, config, diagnostics, cron logs)
.skillopt-sleep/
/BabyVision/
/MMRB/

View File

@@ -520,6 +520,10 @@ class CliBackend(Backend):
arr = _extract_json(raw, "array")
if isinstance(arr, list) and arr:
break
# Expose the last raw optimizer reply so a no-edits night is diagnosable:
# a 0.0->0.0 gate with zero edits is otherwise indistinguishable from
# "nothing to learn" (the cycle persists this in diagnostics.json).
self.last_reflect_raw = raw or ""
edits: List[EditRecord] = []
if isinstance(arr, list):
for e in arr[:edit_budget]:
@@ -750,9 +754,11 @@ class CodexCliBackend(CliBackend):
os.path.abspath(os.path.expanduser(project_dir)) if project_dir else ""
)
def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
def _call_once(self, prompt: str, *, max_tokens: int = 1024) -> str:
"""One codex exec attempt: returns the response text, or "" on
timeout/exception/empty-output (with last_call_error set). ``_call``
wraps this with retries so a transient failure is NOT silently scored 0."""
import tempfile
self.last_call_error = ""
out_path = tempfile.NamedTemporaryFile(
prefix="codex_last_", suffix=".txt", delete=False
).name
@@ -793,6 +799,12 @@ class CodexCliBackend(CliBackend):
stderr = (proc.stderr or "").strip() if proc is not None else ""
if proc is not None and proc.returncode != 0 and not self.last_call_error:
self.last_call_error = f"codex exec exited {proc.returncode}: {stderr[:500]}"
# Do NOT return the CLI's error text as if it were a model response: it
# pollutes rollout/judge/reflect and gets silently scored 0, hiding the
# real cause (e.g. an expired codex auth token surfacing as a 9k-char 401).
# Surface it via last_call_error and return empty instead.
if self.last_call_error:
return ""
return stdout or stderr
finally:
try:
@@ -800,6 +812,45 @@ class CodexCliBackend(CliBackend):
except Exception:
pass
# Fatal codex failures that will NOT recover on retry — fail fast + loud so a
# 0.0 night reads as "codex auth/model/version problem" not "nothing to learn".
# Covers: auth (re-login), and 400 config errors like an unsupported model on a
# ChatGPT account or a model that needs a newer codex CLI (upgrade).
_AUTH_MARKERS = (
"401 Unauthorized", "refresh_token_reused", "token_expired",
"Please log out and sign in", "Not logged in", "Please run /login",
"authentication token is expired", "Unauthorized: invalid",
"is not supported when using Codex", "requires a newer version of Codex",
)
def _call(self, prompt: str, *, max_tokens: int = 1024, retries: int = 3) -> str:
"""Retry transient empties/timeouts instead of silently returning "".
An empty reply scores 0 on every judge, which deflates the held-out
baseline AND blocks the candidate from ever improving — making a flaky
backend indistinguishable from "nothing to learn". The Azure backend
already guards this way (AzureOpenAIBackend._call); codex now does too.
Auth errors are NOT retried (hopeless until the user re-logs-in).
"""
import logging
import random as _r
import time as _t
out = ""
for attempt in range(max(1, retries)):
self.last_call_error = ""
out = self._call_once(prompt, max_tokens=max_tokens)
if out:
return out
err = self.last_call_error or ""
if any(m in err for m in self._AUTH_MARKERS):
logging.getLogger("skillopt_sleep").error(
"codex auth error — re-login required (`codex login`): %s", err[:200]
)
break # fail fast: retrying a 401 just burns calls
if attempt < retries - 1:
_t.sleep(min(6.0, (2 ** attempt) * 0.5) + _r.random() * 0.3)
return out
def attempt_with_tools(self, task, skill, memory, tools):
# Codex exec runs in a sandbox with shell access; expose the same real
# `search` shim and let it run (workspace-write so the shim can log).
@@ -839,16 +890,27 @@ class CodexCliBackend(CliBackend):
if self.model:
cmd += ["-m", self.model]
cmd += ["--", prompt]
self.last_call_error = ""
proc = None
try:
subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
except Exception:
pass
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
except subprocess.TimeoutExpired:
self.last_call_error = f"codex exec (tools) timed out after {self.timeout}s"
except Exception as exc: # noqa: BLE001
self.last_call_error = f"codex exec (tools) failed: {exc}"
resp = ""
try:
with open(out_path, encoding="utf-8") as f:
resp = f.read().strip()
except Exception:
resp = ""
# Surface a failed tool-rollout the SAME way _call does: an auth/model/version
# failure on this path must show up in diagnostics (call_error), not vanish as a
# silent empty->0 scored as a failed rollout. Response stays "" (never the error text).
if not resp and not self.last_call_error and proc is not None and proc.returncode != 0:
self.last_call_error = (
f"codex exec (tools) exited {proc.returncode}: {(proc.stderr or '')[:500]}"
)
self._tokens += len(prompt) // 4 + len(resp) // 4
called: List[str] = []
if os.path.exists(calllog):

View File

@@ -9,7 +9,7 @@ validation gate, vendored self-contained in ``skillopt_sleep.gate``.
from __future__ import annotations
import os
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
from skillopt_sleep.backend import Backend
@@ -36,6 +36,10 @@ class ConsolidationResult:
rejected_edits: List[EditRecord]
holdout_baseline: float
holdout_candidate: float
# ── observability (so a 0.0->0.0 night is self-diagnosing, not a black box) ──
holdout_detail: List[dict] = field(default_factory=list) # per val task: hard/soft/resp/why
reflect_raw: str = "" # the optimizer's last raw reply (empty => reflect produced nothing)
call_error: str = "" # backend's last call error (timeout/auth/empty)
def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]:
@@ -61,6 +65,25 @@ def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]
return train, val
def _holdout_detail(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> List[dict]:
"""Per-task held-out evidence so a 0.0 night explains itself: was the
response empty (backend call failed) or non-empty-but-failing-checks
(judge too strict / edit didn't help)? The two need opposite fixes."""
out: List[dict] = []
for t, r in pairs:
resp = r.response or ""
out.append({
"id": t.id,
"reference_kind": t.reference_kind,
"hard": r.hard,
"soft": r.soft,
"response_len": len(resp),
"response_head": resp[:200],
"why": (r.fail_reason or r.judge_rationale or "")[:200],
})
return out
def consolidate(
backend: Backend,
tasks: List[TaskRecord],
@@ -87,6 +110,7 @@ def consolidate(
"""
train_tasks, val_tasks = _split(tasks)
gate_off = str(gate_mode).strip().lower() in {"off", "none", "false", "greedy"}
holdout_detail: List[dict] = []
# ── baseline on the VAL slice (the gate reference) ────────────────────
# When the gate is OFF the user has opted out of holding out a validation set
@@ -98,6 +122,7 @@ def consolidate(
else:
base_pairs = replay_batch(backend, val_tasks, skill, memory)
base_hard, base_soft = aggregate_scores(base_pairs)
holdout_detail = _holdout_detail(base_pairs)
base_score = select_gate_score(base_hard, base_soft, gate_metric, gate_mixed_weight)
# ── reflect over TRAIN-split failures/successes ───────────────────────
@@ -235,4 +260,7 @@ def consolidate(
rejected_edits=all_rejected,
holdout_baseline=base_hard,
holdout_candidate=final_hard,
holdout_detail=holdout_detail,
reflect_raw=getattr(backend, "last_reflect_raw", "") or "",
call_error=getattr(backend, "last_call_error", "") or "",
)

View File

@@ -276,6 +276,28 @@ def run_sleep_cycle(
live_memory_path=live_memory_path,
report_md=report_md,
)
# Observability: persist per-task held-out evidence + optimizer/codex errors so a
# 0.0->0.0 night self-explains (empty responses vs failing checks vs no edits) — the
# cycle previously captured none of this, making the gate a black box (#learning-stall).
try:
import json as _json
with open(os.path.join(staging_dir, "diagnostics.json"), "w", encoding="utf-8") as _fh:
_json.dump({
"night": night,
"backend": cfg.get("backend"),
"gate_mode": cfg.get("gate_mode"),
"n_tasks": len(tasks),
"baseline_score": result.baseline_score,
"candidate_score": result.candidate_score,
"accepted": result.accepted,
"n_applied_edits": len(result.applied_edits),
"n_rejected_edits": len(result.rejected_edits),
"call_error": getattr(result, "call_error", ""),
"reflect_raw_head": (getattr(result, "reflect_raw", "") or "")[:1200],
"holdout_detail": getattr(result, "holdout_detail", []),
}, _fh, indent=2)
except Exception:
pass
state.set_last_harvest(project, started)
state.record_night({
"night": night, "accepted": result.accepted,

View File

@@ -486,6 +486,18 @@ class TestConsolidateGate(unittest.TestCase):
self.assertTrue(res.accepted)
self.assertGreater(res.candidate_score, res.baseline_score)
def test_consolidate_records_holdout_detail(self):
# observability: a 0.0 night must carry per-task evidence (was empty
# response vs failing checks?) so it is diagnosable, not a black box.
be = MockBackend()
tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)
res = consolidate(be, tasks, set_learned("", []), "", edit_budget=4,
gate_metric="mixed", night=1)
self.assertTrue(res.holdout_detail) # non-empty per-task rows
row = res.holdout_detail[0]
for k in ("id", "hard", "soft", "response_len", "why"):
self.assertIn(k, row)
def test_no_op_when_already_optimal(self):
be = MockBackend()
tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=1)
@@ -612,6 +624,24 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase):
[], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
self.assertIn("British English", captured["prompt"])
def test_reflect_records_last_raw(self):
# the optimizer's raw reply must be retained so a no-edits night is
# diagnosable (empty/non-JSON reflect vs genuinely no failures).
from skillopt_sleep.backend import CliBackend
from skillopt_sleep.types import ReplayResult
class CapBackend(CliBackend):
name = "cap"
def _call(self, prompt, *, max_tokens=1024):
return '[{"op":"add","content":"a learned rule","rationale":"x"}]'
be = CapBackend()
t = TaskRecord(id="t", project="/p", intent="x", reference_kind="rule",
judge={"checks": [{"op": "contains", "arg": "z"}]})
be.reflect([(t, ReplayResult(id="t", hard=0.0, fail_reason="failed: contains=z"))],
[], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
self.assertIn("a learned rule", be.last_reflect_raw)
def test_replay_records_cost(self):
from skillopt_sleep.backend import MockBackend
from skillopt_sleep.replay import replay_one
@@ -654,6 +684,89 @@ class TestCodexBackend(unittest.TestCase):
self.assertIn("-C", cmd)
self.assertEqual(cmd[cmd.index("-C") + 1], expected_project)
def test_codex_call_retries_transient_failure_not_silent_zero(self):
"""A transient timeout must be RETRIED, not silently returned as "" — an
empty reply scores 0 on every judge and zeroes the held-out baseline,
making a flaky backend look identical to 'nothing to learn'."""
import subprocess as _sp
from skillopt_sleep.backend import CodexCliBackend
calls = {"n": 0}
def fake_run(cmd, **kwargs):
calls["n"] += 1
if calls["n"] == 1:
raise _sp.TimeoutExpired(cmd, kwargs.get("timeout", 1))
out_path = cmd[cmd.index("-o") + 1]
with open(out_path, "w", encoding="utf-8") as f:
f.write("real answer")
class Proc:
returncode = 0
stdout = ""
stderr = ""
return Proc()
backend = CodexCliBackend(codex_path="codex")
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
mock.patch("time.sleep", lambda *_a, **_k: None):
out = backend._call("hello")
self.assertEqual(out, "real answer") # recovered on retry
self.assertGreaterEqual(calls["n"], 2) # proves it did not silently return "" once
def test_codex_auth_error_surfaces_not_scored_as_response(self):
"""An auth 401 must become a clear last_call_error + EMPTY response (not the
9k-char error text scored as a 0 'answer'), and must NOT be retried — the
exact failure that silently stalled learning (refresh_token_reused)."""
from skillopt_sleep.backend import CodexCliBackend
calls = {"n": 0}
def fake_run(cmd, **kwargs):
calls["n"] += 1
out_path = cmd[cmd.index("-o") + 1]
open(out_path, "w").close() # empty output file (codex wrote nothing)
class Proc:
returncode = 1
stdout = ""
stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
return Proc()
be = CodexCliBackend(codex_path="codex")
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
mock.patch("time.sleep", lambda *_a, **_k: None):
out = be._call("hi")
self.assertEqual(out, "") # NOT the error text
self.assertIn("refresh_token_reused", be.last_call_error) # surfaced for the operator
self.assertEqual(calls["n"], 1) # failed fast, no wasted retries
def test_codex_attempt_with_tools_surfaces_error_not_silent(self):
"""A failed tool-rollout (non-zero codex exec) on the tool path must set
last_call_error and return an empty response — not a silent empty->0 the
diagnostics can't see (the gap a _call-only fix would otherwise leave)."""
from skillopt_sleep.backend import CodexCliBackend
def fake_run(cmd, **kwargs):
class Proc:
returncode = 1
stdout = ""
stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
return Proc() # writes nothing to out_path -> empty response
be = CodexCliBackend(codex_path="codex")
task = TaskRecord(id="t", project="/p", intent="answer the question",
reference_kind="rule",
judge={"checks": [{"op": "tool_called", "arg": "search"}]})
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run):
resp, called = be.attempt_with_tools(task, "", "", ["search"])
self.assertEqual(resp, "") # no leaked error text as a "response"
self.assertIn("exited 1", be.last_call_error) # failure surfaced for diagnostics
self.assertEqual(called, []) # no tool actually ran
class TestMultiRolloutAndBudget(unittest.TestCase):
def test_rolloutset_stats(self):