From 9fcf5868c3f00b9c590fd877e1d726be5f2f51d0 Mon Sep 17 00:00:00 2001 From: Daniel Martinez Date: Sat, 27 Jun 2026 22:23:19 -0500 Subject: [PATCH 1/2] fix(skillopt-sleep): surface codex auth/model/version failures instead of silently scoring 0 A nightly sleep cycle could run for weeks emitting held-out 0.0 -> 0.0 (gate reject, zero edits), indistinguishable from "nothing to learn", when the real cause was the codex backend returning an error (expired auth / model unsupported on the account / outdated CLI) that got scored as a failed rollout. backend (CodexCliBackend): - split _call into _call_once + a retry wrapper: transient empties/timeouts are retried instead of silently returning "" (mirrors AzureOpenAIBackend's guard); - on a non-zero exit, surface the reason via last_call_error and return "" rather than leaking the CLI error text as if it were a model response; - fail fast (no retries) on fatal auth/model/version errors (401, refresh_token_reused, token_expired, "not supported when using Codex with a ChatGPT account", "requires a newer version of Codex"). backend (CliBackend.reflect): retain last_reflect_raw so a no-edits night is diagnosable. consolidate: ConsolidationResult now carries per-task held-out detail (response, hard/soft, fail_reason) + reflect_raw + call_error. cycle: write diagnostics.json per cycle so a 0.0 night self-explains instead of being a black box. tests: 4 new (retry-not-silent-zero, auth-error-surfaced-not-scored, holdout-detail, reflect-raw). Also gitignore the .skillopt-sleep/ runtime dir. Co-Authored-By: Claude Opus 4.8 --- .gitignore | 2 + skillopt_sleep/backend.py | 55 ++++++++++++++++++++- skillopt_sleep/consolidate.py | 30 +++++++++++- skillopt_sleep/cycle.py | 22 +++++++++ tests/test_sleep_engine.py | 90 +++++++++++++++++++++++++++++++++++ 5 files changed, 196 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 7108e72..4b90712 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,8 @@ data/* outputs/ logs/ external/ +# SkillOpt-Sleep runtime state (staging proposals, config, diagnostics, cron logs) +.skillopt-sleep/ /BabyVision/ /MMRB/ diff --git a/skillopt_sleep/backend.py b/skillopt_sleep/backend.py index f472da7..d5658c6 100644 --- a/skillopt_sleep/backend.py +++ b/skillopt_sleep/backend.py @@ -520,6 +520,10 @@ class CliBackend(Backend): arr = _extract_json(raw, "array") if isinstance(arr, list) and arr: break + # Expose the last raw optimizer reply so a no-edits night is diagnosable: + # a 0.0->0.0 gate with zero edits is otherwise indistinguishable from + # "nothing to learn" (the cycle persists this in diagnostics.json). + self.last_reflect_raw = raw or "" edits: List[EditRecord] = [] if isinstance(arr, list): for e in arr[:edit_budget]: @@ -750,9 +754,11 @@ class CodexCliBackend(CliBackend): os.path.abspath(os.path.expanduser(project_dir)) if project_dir else "" ) - def _call(self, prompt: str, *, max_tokens: int = 1024) -> str: + def _call_once(self, prompt: str, *, max_tokens: int = 1024) -> str: + """One codex exec attempt: returns the response text, or "" on + timeout/exception/empty-output (with last_call_error set). ``_call`` + wraps this with retries so a transient failure is NOT silently scored 0.""" import tempfile - self.last_call_error = "" out_path = tempfile.NamedTemporaryFile( prefix="codex_last_", suffix=".txt", delete=False ).name @@ -793,6 +799,12 @@ class CodexCliBackend(CliBackend): stderr = (proc.stderr or "").strip() if proc is not None else "" if proc is not None and proc.returncode != 0 and not self.last_call_error: self.last_call_error = f"codex exec exited {proc.returncode}: {stderr[:500]}" + # Do NOT return the CLI's error text as if it were a model response: it + # pollutes rollout/judge/reflect and gets silently scored 0, hiding the + # real cause (e.g. an expired codex auth token surfacing as a 9k-char 401). + # Surface it via last_call_error and return empty instead. + if self.last_call_error: + return "" return stdout or stderr finally: try: @@ -800,6 +812,45 @@ class CodexCliBackend(CliBackend): except Exception: pass + # Fatal codex failures that will NOT recover on retry — fail fast + loud so a + # 0.0 night reads as "codex auth/model/version problem" not "nothing to learn". + # Covers: auth (re-login), and 400 config errors like an unsupported model on a + # ChatGPT account or a model that needs a newer codex CLI (upgrade). + _AUTH_MARKERS = ( + "401 Unauthorized", "refresh_token_reused", "token_expired", + "Please log out and sign in", "Not logged in", "Please run /login", + "authentication token is expired", "Unauthorized: invalid", + "is not supported when using Codex", "requires a newer version of Codex", + ) + + def _call(self, prompt: str, *, max_tokens: int = 1024, retries: int = 3) -> str: + """Retry transient empties/timeouts instead of silently returning "". + + An empty reply scores 0 on every judge, which deflates the held-out + baseline AND blocks the candidate from ever improving — making a flaky + backend indistinguishable from "nothing to learn". The Azure backend + already guards this way (AzureOpenAIBackend._call); codex now does too. + Auth errors are NOT retried (hopeless until the user re-logs-in). + """ + import logging + import random as _r + import time as _t + out = "" + for attempt in range(max(1, retries)): + self.last_call_error = "" + out = self._call_once(prompt, max_tokens=max_tokens) + if out: + return out + err = self.last_call_error or "" + if any(m in err for m in self._AUTH_MARKERS): + logging.getLogger("skillopt_sleep").error( + "codex auth error — re-login required (`codex login`): %s", err[:200] + ) + break # fail fast: retrying a 401 just burns calls + if attempt < retries - 1: + _t.sleep(min(6.0, (2 ** attempt) * 0.5) + _r.random() * 0.3) + return out + def attempt_with_tools(self, task, skill, memory, tools): # Codex exec runs in a sandbox with shell access; expose the same real # `search` shim and let it run (workspace-write so the shim can log). diff --git a/skillopt_sleep/consolidate.py b/skillopt_sleep/consolidate.py index 78ee77d..a9ea662 100644 --- a/skillopt_sleep/consolidate.py +++ b/skillopt_sleep/consolidate.py @@ -9,7 +9,7 @@ validation gate, vendored self-contained in ``skillopt_sleep.gate``. from __future__ import annotations import os -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List, Optional, Tuple from skillopt_sleep.backend import Backend @@ -36,6 +36,10 @@ class ConsolidationResult: rejected_edits: List[EditRecord] holdout_baseline: float holdout_candidate: float + # ── observability (so a 0.0->0.0 night is self-diagnosing, not a black box) ── + holdout_detail: List[dict] = field(default_factory=list) # per val task: hard/soft/resp/why + reflect_raw: str = "" # the optimizer's last raw reply (empty => reflect produced nothing) + call_error: str = "" # backend's last call error (timeout/auth/empty) def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]: @@ -61,6 +65,25 @@ def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]] return train, val +def _holdout_detail(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> List[dict]: + """Per-task held-out evidence so a 0.0 night explains itself: was the + response empty (backend call failed) or non-empty-but-failing-checks + (judge too strict / edit didn't help)? The two need opposite fixes.""" + out: List[dict] = [] + for t, r in pairs: + resp = r.response or "" + out.append({ + "id": t.id, + "reference_kind": t.reference_kind, + "hard": r.hard, + "soft": r.soft, + "response_len": len(resp), + "response_head": resp[:200], + "why": (r.fail_reason or r.judge_rationale or "")[:200], + }) + return out + + def consolidate( backend: Backend, tasks: List[TaskRecord], @@ -87,6 +110,7 @@ def consolidate( """ train_tasks, val_tasks = _split(tasks) gate_off = str(gate_mode).strip().lower() in {"off", "none", "false", "greedy"} + holdout_detail: List[dict] = [] # ── baseline on the VAL slice (the gate reference) ──────────────────── # When the gate is OFF the user has opted out of holding out a validation set @@ -98,6 +122,7 @@ def consolidate( else: base_pairs = replay_batch(backend, val_tasks, skill, memory) base_hard, base_soft = aggregate_scores(base_pairs) + holdout_detail = _holdout_detail(base_pairs) base_score = select_gate_score(base_hard, base_soft, gate_metric, gate_mixed_weight) # ── reflect over TRAIN-split failures/successes ─────────────────────── @@ -235,4 +260,7 @@ def consolidate( rejected_edits=all_rejected, holdout_baseline=base_hard, holdout_candidate=final_hard, + holdout_detail=holdout_detail, + reflect_raw=getattr(backend, "last_reflect_raw", "") or "", + call_error=getattr(backend, "last_call_error", "") or "", ) diff --git a/skillopt_sleep/cycle.py b/skillopt_sleep/cycle.py index 57b06a9..ee726f3 100644 --- a/skillopt_sleep/cycle.py +++ b/skillopt_sleep/cycle.py @@ -276,6 +276,28 @@ def run_sleep_cycle( live_memory_path=live_memory_path, report_md=report_md, ) + # Observability: persist per-task held-out evidence + optimizer/codex errors so a + # 0.0->0.0 night self-explains (empty responses vs failing checks vs no edits) — the + # cycle previously captured none of this, making the gate a black box (#learning-stall). + try: + import json as _json + with open(os.path.join(staging_dir, "diagnostics.json"), "w", encoding="utf-8") as _fh: + _json.dump({ + "night": night, + "backend": cfg.get("backend"), + "gate_mode": cfg.get("gate_mode"), + "n_tasks": len(tasks), + "baseline_score": result.baseline_score, + "candidate_score": result.candidate_score, + "accepted": result.accepted, + "n_applied_edits": len(result.applied_edits), + "n_rejected_edits": len(result.rejected_edits), + "call_error": getattr(result, "call_error", ""), + "reflect_raw_head": (getattr(result, "reflect_raw", "") or "")[:1200], + "holdout_detail": getattr(result, "holdout_detail", []), + }, _fh, indent=2) + except Exception: + pass state.set_last_harvest(project, started) state.record_night({ "night": night, "accepted": result.accepted, diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py index 4e4bc8b..113bc8e 100644 --- a/tests/test_sleep_engine.py +++ b/tests/test_sleep_engine.py @@ -486,6 +486,18 @@ class TestConsolidateGate(unittest.TestCase): self.assertTrue(res.accepted) self.assertGreater(res.candidate_score, res.baseline_score) + def test_consolidate_records_holdout_detail(self): + # observability: a 0.0 night must carry per-task evidence (was empty + # response vs failing checks?) so it is diagnosable, not a black box. + be = MockBackend() + tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42) + res = consolidate(be, tasks, set_learned("", []), "", edit_budget=4, + gate_metric="mixed", night=1) + self.assertTrue(res.holdout_detail) # non-empty per-task rows + row = res.holdout_detail[0] + for k in ("id", "hard", "soft", "response_len", "why"): + self.assertIn(k, row) + def test_no_op_when_already_optimal(self): be = MockBackend() tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=1) @@ -612,6 +624,24 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase): [], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False) self.assertIn("British English", captured["prompt"]) + def test_reflect_records_last_raw(self): + # the optimizer's raw reply must be retained so a no-edits night is + # diagnosable (empty/non-JSON reflect vs genuinely no failures). + from skillopt_sleep.backend import CliBackend + from skillopt_sleep.types import ReplayResult + + class CapBackend(CliBackend): + name = "cap" + def _call(self, prompt, *, max_tokens=1024): + return '[{"op":"add","content":"a learned rule","rationale":"x"}]' + + be = CapBackend() + t = TaskRecord(id="t", project="/p", intent="x", reference_kind="rule", + judge={"checks": [{"op": "contains", "arg": "z"}]}) + be.reflect([(t, ReplayResult(id="t", hard=0.0, fail_reason="failed: contains=z"))], + [], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False) + self.assertIn("a learned rule", be.last_reflect_raw) + def test_replay_records_cost(self): from skillopt_sleep.backend import MockBackend from skillopt_sleep.replay import replay_one @@ -654,6 +684,66 @@ class TestCodexBackend(unittest.TestCase): self.assertIn("-C", cmd) self.assertEqual(cmd[cmd.index("-C") + 1], expected_project) + def test_codex_call_retries_transient_failure_not_silent_zero(self): + """A transient timeout must be RETRIED, not silently returned as "" — an + empty reply scores 0 on every judge and zeroes the held-out baseline, + making a flaky backend look identical to 'nothing to learn'.""" + import subprocess as _sp + + from skillopt_sleep.backend import CodexCliBackend + + calls = {"n": 0} + + def fake_run(cmd, **kwargs): + calls["n"] += 1 + if calls["n"] == 1: + raise _sp.TimeoutExpired(cmd, kwargs.get("timeout", 1)) + out_path = cmd[cmd.index("-o") + 1] + with open(out_path, "w", encoding="utf-8") as f: + f.write("real answer") + + class Proc: + returncode = 0 + stdout = "" + stderr = "" + + return Proc() + + backend = CodexCliBackend(codex_path="codex") + with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \ + mock.patch("time.sleep", lambda *_a, **_k: None): + out = backend._call("hello") + self.assertEqual(out, "real answer") # recovered on retry + self.assertGreaterEqual(calls["n"], 2) # proves it did not silently return "" once + + def test_codex_auth_error_surfaces_not_scored_as_response(self): + """An auth 401 must become a clear last_call_error + EMPTY response (not the + 9k-char error text scored as a 0 'answer'), and must NOT be retried — the + exact failure that silently stalled learning (refresh_token_reused).""" + from skillopt_sleep.backend import CodexCliBackend + + calls = {"n": 0} + + def fake_run(cmd, **kwargs): + calls["n"] += 1 + out_path = cmd[cmd.index("-o") + 1] + open(out_path, "w").close() # empty output file (codex wrote nothing) + + class Proc: + returncode = 1 + stdout = "" + stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused" + + return Proc() + + be = CodexCliBackend(codex_path="codex") + with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \ + mock.patch("time.sleep", lambda *_a, **_k: None): + out = be._call("hi") + self.assertEqual(out, "") # NOT the error text + self.assertIn("refresh_token_reused", be.last_call_error) # surfaced for the operator + self.assertEqual(calls["n"], 1) # failed fast, no wasted retries + class TestMultiRolloutAndBudget(unittest.TestCase): def test_rolloutset_stats(self): From 9fa0716c72b7c67fe6099e75f46a354e2319245c Mon Sep 17 00:00:00 2001 From: Daniel Martinez Date: Sat, 27 Jun 2026 23:56:11 -0500 Subject: [PATCH 2/2] fix(skillopt-sleep): also surface codex failures on the tool-call rollout path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up from a fresh-context review of the prior commit: CodexCliBackend.attempt_with_tools (the rollout path for tool-requiring tasks) ran codex exec inline, swallowed all exceptions, and never set last_call_error — so an auth/model/version failure on the tool path still produced a silent empty->0 with no diagnostic signal, the exact failure class the prior commit fixed for the _call path. Now it surfaces timeout/exception/non-zero-exit via last_call_error (response stays empty; never leaks the CLI error text), so a failed tool rollout shows up in diagnostics.json. Adds a regression test. Co-Authored-By: Claude Opus 4.8 --- skillopt_sleep/backend.py | 17 ++++++++++++++--- tests/test_sleep_engine.py | 23 +++++++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/skillopt_sleep/backend.py b/skillopt_sleep/backend.py index d5658c6..b1c7208 100644 --- a/skillopt_sleep/backend.py +++ b/skillopt_sleep/backend.py @@ -890,16 +890,27 @@ class CodexCliBackend(CliBackend): if self.model: cmd += ["-m", self.model] cmd += ["--", prompt] + self.last_call_error = "" + proc = None try: - subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work) - except Exception: - pass + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work) + except subprocess.TimeoutExpired: + self.last_call_error = f"codex exec (tools) timed out after {self.timeout}s" + except Exception as exc: # noqa: BLE001 + self.last_call_error = f"codex exec (tools) failed: {exc}" resp = "" try: with open(out_path, encoding="utf-8") as f: resp = f.read().strip() except Exception: resp = "" + # Surface a failed tool-rollout the SAME way _call does: an auth/model/version + # failure on this path must show up in diagnostics (call_error), not vanish as a + # silent empty->0 scored as a failed rollout. Response stays "" (never the error text). + if not resp and not self.last_call_error and proc is not None and proc.returncode != 0: + self.last_call_error = ( + f"codex exec (tools) exited {proc.returncode}: {(proc.stderr or '')[:500]}" + ) self._tokens += len(prompt) // 4 + len(resp) // 4 called: List[str] = [] if os.path.exists(calllog): diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py index 113bc8e..bd5b971 100644 --- a/tests/test_sleep_engine.py +++ b/tests/test_sleep_engine.py @@ -744,6 +744,29 @@ class TestCodexBackend(unittest.TestCase): self.assertIn("refresh_token_reused", be.last_call_error) # surfaced for the operator self.assertEqual(calls["n"], 1) # failed fast, no wasted retries + def test_codex_attempt_with_tools_surfaces_error_not_silent(self): + """A failed tool-rollout (non-zero codex exec) on the tool path must set + last_call_error and return an empty response — not a silent empty->0 the + diagnostics can't see (the gap a _call-only fix would otherwise leave).""" + from skillopt_sleep.backend import CodexCliBackend + + def fake_run(cmd, **kwargs): + class Proc: + returncode = 1 + stdout = "" + stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused" + return Proc() # writes nothing to out_path -> empty response + + be = CodexCliBackend(codex_path="codex") + task = TaskRecord(id="t", project="/p", intent="answer the question", + reference_kind="rule", + judge={"checks": [{"op": "tool_called", "arg": "search"}]}) + with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run): + resp, called = be.attempt_with_tools(task, "", "", ["search"]) + self.assertEqual(resp, "") # no leaked error text as a "response" + self.assertIn("exited 1", be.last_call_error) # failure surfaced for diagnostics + self.assertEqual(called, []) # no tool actually ran + class TestMultiRolloutAndBudget(unittest.TestCase): def test_rolloutset_stats(self):