mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
fix(skillopt-sleep): surface codex auth/model/version failures instead of silently scoring 0 (#92)
Splits CodexCliBackend._call into _call_once + a retry wrapper so transient empties/timeouts are retried instead of silently scored 0, and fails fast on fatal auth/model/version errors (401, refresh_token_reused, token_expired, ChatGPT-account-unsupported, newer-Codex-required). On non-zero exit the CLI error text is surfaced via last_call_error instead of being returned as a model response. Adds per-cycle diagnostics.json (observability only; gate and learning algorithm unchanged) so a 0.0 night self-explains.
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -22,6 +22,8 @@ data/*
|
||||
outputs/
|
||||
logs/
|
||||
external/
|
||||
# SkillOpt-Sleep runtime state (staging proposals, config, diagnostics, cron logs)
|
||||
.skillopt-sleep/
|
||||
|
||||
/BabyVision/
|
||||
/MMRB/
|
||||
|
||||
@@ -520,6 +520,10 @@ class CliBackend(Backend):
|
||||
arr = _extract_json(raw, "array")
|
||||
if isinstance(arr, list) and arr:
|
||||
break
|
||||
# Expose the last raw optimizer reply so a no-edits night is diagnosable:
|
||||
# a 0.0->0.0 gate with zero edits is otherwise indistinguishable from
|
||||
# "nothing to learn" (the cycle persists this in diagnostics.json).
|
||||
self.last_reflect_raw = raw or ""
|
||||
edits: List[EditRecord] = []
|
||||
if isinstance(arr, list):
|
||||
for e in arr[:edit_budget]:
|
||||
@@ -750,9 +754,11 @@ class CodexCliBackend(CliBackend):
|
||||
os.path.abspath(os.path.expanduser(project_dir)) if project_dir else ""
|
||||
)
|
||||
|
||||
def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
|
||||
def _call_once(self, prompt: str, *, max_tokens: int = 1024) -> str:
|
||||
"""One codex exec attempt: returns the response text, or "" on
|
||||
timeout/exception/empty-output (with last_call_error set). ``_call``
|
||||
wraps this with retries so a transient failure is NOT silently scored 0."""
|
||||
import tempfile
|
||||
self.last_call_error = ""
|
||||
out_path = tempfile.NamedTemporaryFile(
|
||||
prefix="codex_last_", suffix=".txt", delete=False
|
||||
).name
|
||||
@@ -793,6 +799,12 @@ class CodexCliBackend(CliBackend):
|
||||
stderr = (proc.stderr or "").strip() if proc is not None else ""
|
||||
if proc is not None and proc.returncode != 0 and not self.last_call_error:
|
||||
self.last_call_error = f"codex exec exited {proc.returncode}: {stderr[:500]}"
|
||||
# Do NOT return the CLI's error text as if it were a model response: it
|
||||
# pollutes rollout/judge/reflect and gets silently scored 0, hiding the
|
||||
# real cause (e.g. an expired codex auth token surfacing as a 9k-char 401).
|
||||
# Surface it via last_call_error and return empty instead.
|
||||
if self.last_call_error:
|
||||
return ""
|
||||
return stdout or stderr
|
||||
finally:
|
||||
try:
|
||||
@@ -800,6 +812,45 @@ class CodexCliBackend(CliBackend):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fatal codex failures that will NOT recover on retry — fail fast + loud so a
|
||||
# 0.0 night reads as "codex auth/model/version problem" not "nothing to learn".
|
||||
# Covers: auth (re-login), and 400 config errors like an unsupported model on a
|
||||
# ChatGPT account or a model that needs a newer codex CLI (upgrade).
|
||||
_AUTH_MARKERS = (
|
||||
"401 Unauthorized", "refresh_token_reused", "token_expired",
|
||||
"Please log out and sign in", "Not logged in", "Please run /login",
|
||||
"authentication token is expired", "Unauthorized: invalid",
|
||||
"is not supported when using Codex", "requires a newer version of Codex",
|
||||
)
|
||||
|
||||
def _call(self, prompt: str, *, max_tokens: int = 1024, retries: int = 3) -> str:
|
||||
"""Retry transient empties/timeouts instead of silently returning "".
|
||||
|
||||
An empty reply scores 0 on every judge, which deflates the held-out
|
||||
baseline AND blocks the candidate from ever improving — making a flaky
|
||||
backend indistinguishable from "nothing to learn". The Azure backend
|
||||
already guards this way (AzureOpenAIBackend._call); codex now does too.
|
||||
Auth errors are NOT retried (hopeless until the user re-logs-in).
|
||||
"""
|
||||
import logging
|
||||
import random as _r
|
||||
import time as _t
|
||||
out = ""
|
||||
for attempt in range(max(1, retries)):
|
||||
self.last_call_error = ""
|
||||
out = self._call_once(prompt, max_tokens=max_tokens)
|
||||
if out:
|
||||
return out
|
||||
err = self.last_call_error or ""
|
||||
if any(m in err for m in self._AUTH_MARKERS):
|
||||
logging.getLogger("skillopt_sleep").error(
|
||||
"codex auth error — re-login required (`codex login`): %s", err[:200]
|
||||
)
|
||||
break # fail fast: retrying a 401 just burns calls
|
||||
if attempt < retries - 1:
|
||||
_t.sleep(min(6.0, (2 ** attempt) * 0.5) + _r.random() * 0.3)
|
||||
return out
|
||||
|
||||
def attempt_with_tools(self, task, skill, memory, tools):
|
||||
# Codex exec runs in a sandbox with shell access; expose the same real
|
||||
# `search` shim and let it run (workspace-write so the shim can log).
|
||||
@@ -839,16 +890,27 @@ class CodexCliBackend(CliBackend):
|
||||
if self.model:
|
||||
cmd += ["-m", self.model]
|
||||
cmd += ["--", prompt]
|
||||
self.last_call_error = ""
|
||||
proc = None
|
||||
try:
|
||||
subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
|
||||
except Exception:
|
||||
pass
|
||||
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
|
||||
except subprocess.TimeoutExpired:
|
||||
self.last_call_error = f"codex exec (tools) timed out after {self.timeout}s"
|
||||
except Exception as exc: # noqa: BLE001
|
||||
self.last_call_error = f"codex exec (tools) failed: {exc}"
|
||||
resp = ""
|
||||
try:
|
||||
with open(out_path, encoding="utf-8") as f:
|
||||
resp = f.read().strip()
|
||||
except Exception:
|
||||
resp = ""
|
||||
# Surface a failed tool-rollout the SAME way _call does: an auth/model/version
|
||||
# failure on this path must show up in diagnostics (call_error), not vanish as a
|
||||
# silent empty->0 scored as a failed rollout. Response stays "" (never the error text).
|
||||
if not resp and not self.last_call_error and proc is not None and proc.returncode != 0:
|
||||
self.last_call_error = (
|
||||
f"codex exec (tools) exited {proc.returncode}: {(proc.stderr or '')[:500]}"
|
||||
)
|
||||
self._tokens += len(prompt) // 4 + len(resp) // 4
|
||||
called: List[str] = []
|
||||
if os.path.exists(calllog):
|
||||
|
||||
@@ -9,7 +9,7 @@ validation gate, vendored self-contained in ``skillopt_sleep.gate``.
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from skillopt_sleep.backend import Backend
|
||||
@@ -36,6 +36,10 @@ class ConsolidationResult:
|
||||
rejected_edits: List[EditRecord]
|
||||
holdout_baseline: float
|
||||
holdout_candidate: float
|
||||
# ── observability (so a 0.0->0.0 night is self-diagnosing, not a black box) ──
|
||||
holdout_detail: List[dict] = field(default_factory=list) # per val task: hard/soft/resp/why
|
||||
reflect_raw: str = "" # the optimizer's last raw reply (empty => reflect produced nothing)
|
||||
call_error: str = "" # backend's last call error (timeout/auth/empty)
|
||||
|
||||
|
||||
def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]:
|
||||
@@ -61,6 +65,25 @@ def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]
|
||||
return train, val
|
||||
|
||||
|
||||
def _holdout_detail(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> List[dict]:
|
||||
"""Per-task held-out evidence so a 0.0 night explains itself: was the
|
||||
response empty (backend call failed) or non-empty-but-failing-checks
|
||||
(judge too strict / edit didn't help)? The two need opposite fixes."""
|
||||
out: List[dict] = []
|
||||
for t, r in pairs:
|
||||
resp = r.response or ""
|
||||
out.append({
|
||||
"id": t.id,
|
||||
"reference_kind": t.reference_kind,
|
||||
"hard": r.hard,
|
||||
"soft": r.soft,
|
||||
"response_len": len(resp),
|
||||
"response_head": resp[:200],
|
||||
"why": (r.fail_reason or r.judge_rationale or "")[:200],
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
def consolidate(
|
||||
backend: Backend,
|
||||
tasks: List[TaskRecord],
|
||||
@@ -87,6 +110,7 @@ def consolidate(
|
||||
"""
|
||||
train_tasks, val_tasks = _split(tasks)
|
||||
gate_off = str(gate_mode).strip().lower() in {"off", "none", "false", "greedy"}
|
||||
holdout_detail: List[dict] = []
|
||||
|
||||
# ── baseline on the VAL slice (the gate reference) ────────────────────
|
||||
# When the gate is OFF the user has opted out of holding out a validation set
|
||||
@@ -98,6 +122,7 @@ def consolidate(
|
||||
else:
|
||||
base_pairs = replay_batch(backend, val_tasks, skill, memory)
|
||||
base_hard, base_soft = aggregate_scores(base_pairs)
|
||||
holdout_detail = _holdout_detail(base_pairs)
|
||||
base_score = select_gate_score(base_hard, base_soft, gate_metric, gate_mixed_weight)
|
||||
|
||||
# ── reflect over TRAIN-split failures/successes ───────────────────────
|
||||
@@ -235,4 +260,7 @@ def consolidate(
|
||||
rejected_edits=all_rejected,
|
||||
holdout_baseline=base_hard,
|
||||
holdout_candidate=final_hard,
|
||||
holdout_detail=holdout_detail,
|
||||
reflect_raw=getattr(backend, "last_reflect_raw", "") or "",
|
||||
call_error=getattr(backend, "last_call_error", "") or "",
|
||||
)
|
||||
|
||||
@@ -276,6 +276,28 @@ def run_sleep_cycle(
|
||||
live_memory_path=live_memory_path,
|
||||
report_md=report_md,
|
||||
)
|
||||
# Observability: persist per-task held-out evidence + optimizer/codex errors so a
|
||||
# 0.0->0.0 night self-explains (empty responses vs failing checks vs no edits) — the
|
||||
# cycle previously captured none of this, making the gate a black box (#learning-stall).
|
||||
try:
|
||||
import json as _json
|
||||
with open(os.path.join(staging_dir, "diagnostics.json"), "w", encoding="utf-8") as _fh:
|
||||
_json.dump({
|
||||
"night": night,
|
||||
"backend": cfg.get("backend"),
|
||||
"gate_mode": cfg.get("gate_mode"),
|
||||
"n_tasks": len(tasks),
|
||||
"baseline_score": result.baseline_score,
|
||||
"candidate_score": result.candidate_score,
|
||||
"accepted": result.accepted,
|
||||
"n_applied_edits": len(result.applied_edits),
|
||||
"n_rejected_edits": len(result.rejected_edits),
|
||||
"call_error": getattr(result, "call_error", ""),
|
||||
"reflect_raw_head": (getattr(result, "reflect_raw", "") or "")[:1200],
|
||||
"holdout_detail": getattr(result, "holdout_detail", []),
|
||||
}, _fh, indent=2)
|
||||
except Exception:
|
||||
pass
|
||||
state.set_last_harvest(project, started)
|
||||
state.record_night({
|
||||
"night": night, "accepted": result.accepted,
|
||||
|
||||
@@ -486,6 +486,18 @@ class TestConsolidateGate(unittest.TestCase):
|
||||
self.assertTrue(res.accepted)
|
||||
self.assertGreater(res.candidate_score, res.baseline_score)
|
||||
|
||||
def test_consolidate_records_holdout_detail(self):
|
||||
# observability: a 0.0 night must carry per-task evidence (was empty
|
||||
# response vs failing checks?) so it is diagnosable, not a black box.
|
||||
be = MockBackend()
|
||||
tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)
|
||||
res = consolidate(be, tasks, set_learned("", []), "", edit_budget=4,
|
||||
gate_metric="mixed", night=1)
|
||||
self.assertTrue(res.holdout_detail) # non-empty per-task rows
|
||||
row = res.holdout_detail[0]
|
||||
for k in ("id", "hard", "soft", "response_len", "why"):
|
||||
self.assertIn(k, row)
|
||||
|
||||
def test_no_op_when_already_optimal(self):
|
||||
be = MockBackend()
|
||||
tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=1)
|
||||
@@ -612,6 +624,24 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase):
|
||||
[], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
|
||||
self.assertIn("British English", captured["prompt"])
|
||||
|
||||
def test_reflect_records_last_raw(self):
|
||||
# the optimizer's raw reply must be retained so a no-edits night is
|
||||
# diagnosable (empty/non-JSON reflect vs genuinely no failures).
|
||||
from skillopt_sleep.backend import CliBackend
|
||||
from skillopt_sleep.types import ReplayResult
|
||||
|
||||
class CapBackend(CliBackend):
|
||||
name = "cap"
|
||||
def _call(self, prompt, *, max_tokens=1024):
|
||||
return '[{"op":"add","content":"a learned rule","rationale":"x"}]'
|
||||
|
||||
be = CapBackend()
|
||||
t = TaskRecord(id="t", project="/p", intent="x", reference_kind="rule",
|
||||
judge={"checks": [{"op": "contains", "arg": "z"}]})
|
||||
be.reflect([(t, ReplayResult(id="t", hard=0.0, fail_reason="failed: contains=z"))],
|
||||
[], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
|
||||
self.assertIn("a learned rule", be.last_reflect_raw)
|
||||
|
||||
def test_replay_records_cost(self):
|
||||
from skillopt_sleep.backend import MockBackend
|
||||
from skillopt_sleep.replay import replay_one
|
||||
@@ -654,6 +684,89 @@ class TestCodexBackend(unittest.TestCase):
|
||||
self.assertIn("-C", cmd)
|
||||
self.assertEqual(cmd[cmd.index("-C") + 1], expected_project)
|
||||
|
||||
def test_codex_call_retries_transient_failure_not_silent_zero(self):
|
||||
"""A transient timeout must be RETRIED, not silently returned as "" — an
|
||||
empty reply scores 0 on every judge and zeroes the held-out baseline,
|
||||
making a flaky backend look identical to 'nothing to learn'."""
|
||||
import subprocess as _sp
|
||||
|
||||
from skillopt_sleep.backend import CodexCliBackend
|
||||
|
||||
calls = {"n": 0}
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
calls["n"] += 1
|
||||
if calls["n"] == 1:
|
||||
raise _sp.TimeoutExpired(cmd, kwargs.get("timeout", 1))
|
||||
out_path = cmd[cmd.index("-o") + 1]
|
||||
with open(out_path, "w", encoding="utf-8") as f:
|
||||
f.write("real answer")
|
||||
|
||||
class Proc:
|
||||
returncode = 0
|
||||
stdout = ""
|
||||
stderr = ""
|
||||
|
||||
return Proc()
|
||||
|
||||
backend = CodexCliBackend(codex_path="codex")
|
||||
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
|
||||
mock.patch("time.sleep", lambda *_a, **_k: None):
|
||||
out = backend._call("hello")
|
||||
self.assertEqual(out, "real answer") # recovered on retry
|
||||
self.assertGreaterEqual(calls["n"], 2) # proves it did not silently return "" once
|
||||
|
||||
def test_codex_auth_error_surfaces_not_scored_as_response(self):
|
||||
"""An auth 401 must become a clear last_call_error + EMPTY response (not the
|
||||
9k-char error text scored as a 0 'answer'), and must NOT be retried — the
|
||||
exact failure that silently stalled learning (refresh_token_reused)."""
|
||||
from skillopt_sleep.backend import CodexCliBackend
|
||||
|
||||
calls = {"n": 0}
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
calls["n"] += 1
|
||||
out_path = cmd[cmd.index("-o") + 1]
|
||||
open(out_path, "w").close() # empty output file (codex wrote nothing)
|
||||
|
||||
class Proc:
|
||||
returncode = 1
|
||||
stdout = ""
|
||||
stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
|
||||
|
||||
return Proc()
|
||||
|
||||
be = CodexCliBackend(codex_path="codex")
|
||||
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
|
||||
mock.patch("time.sleep", lambda *_a, **_k: None):
|
||||
out = be._call("hi")
|
||||
self.assertEqual(out, "") # NOT the error text
|
||||
self.assertIn("refresh_token_reused", be.last_call_error) # surfaced for the operator
|
||||
self.assertEqual(calls["n"], 1) # failed fast, no wasted retries
|
||||
|
||||
def test_codex_attempt_with_tools_surfaces_error_not_silent(self):
|
||||
"""A failed tool-rollout (non-zero codex exec) on the tool path must set
|
||||
last_call_error and return an empty response — not a silent empty->0 the
|
||||
diagnostics can't see (the gap a _call-only fix would otherwise leave)."""
|
||||
from skillopt_sleep.backend import CodexCliBackend
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
class Proc:
|
||||
returncode = 1
|
||||
stdout = ""
|
||||
stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
|
||||
return Proc() # writes nothing to out_path -> empty response
|
||||
|
||||
be = CodexCliBackend(codex_path="codex")
|
||||
task = TaskRecord(id="t", project="/p", intent="answer the question",
|
||||
reference_kind="rule",
|
||||
judge={"checks": [{"op": "tool_called", "arg": "search"}]})
|
||||
with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run):
|
||||
resp, called = be.attempt_with_tools(task, "", "", ["search"])
|
||||
self.assertEqual(resp, "") # no leaked error text as a "response"
|
||||
self.assertIn("exited 1", be.last_call_error) # failure surfaced for diagnostics
|
||||
self.assertEqual(called, []) # no tool actually ran
|
||||
|
||||
|
||||
class TestMultiRolloutAndBudget(unittest.TestCase):
|
||||
def test_rolloutset_stats(self):
|
||||
|
||||
Reference in New Issue
Block a user