fix(skillopt-sleep): surface codex auth/model/version failures instead of silently scoring 0 (#92)

Splits CodexCliBackend._call into _call_once + a retry wrapper so transient empties/timeouts are retried instead of silently scored 0, and fails fast on fatal auth/model/version errors (401, refresh_token_reused, token_expired, ChatGPT-account-unsupported, newer-Codex-required). On non-zero exit the CLI error text is surfaced via last_call_error instead of being returned as a model response. Adds per-cycle diagnostics.json (observability only; gate and learning algorithm unchanged) so a 0.0 night self-explains.
2026-07-03 14:02:58 +08:00 · 2026-07-01 03:20:08 +08:00
parent 95a9e959fe 9fa0716c72
commit b9142bad24
5 changed files with 233 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,8 @@ data/*
 outputs/
 logs/
 external/
+# SkillOpt-Sleep runtime state (staging proposals, config, diagnostics, cron logs)
+.skillopt-sleep/

 /BabyVision/
 /MMRB/
--- a/skillopt_sleep/backend.py
+++ b/skillopt_sleep/backend.py
@@ -520,6 +520,10 @@ class CliBackend(Backend):
            arr = _extract_json(raw, "array")
            if isinstance(arr, list) and arr:
                break
+        # Expose the last raw optimizer reply so a no-edits night is diagnosable:
+        # a 0.0->0.0 gate with zero edits is otherwise indistinguishable from
+        # "nothing to learn" (the cycle persists this in diagnostics.json).
+        self.last_reflect_raw = raw or ""
        edits: List[EditRecord] = []
        if isinstance(arr, list):
            for e in arr[:edit_budget]:
@@ -750,9 +754,11 @@ class CodexCliBackend(CliBackend):
            os.path.abspath(os.path.expanduser(project_dir)) if project_dir else ""
        )

-    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
+    def _call_once(self, prompt: str, *, max_tokens: int = 1024) -> str:
+        """One codex exec attempt: returns the response text, or "" on
+        timeout/exception/empty-output (with last_call_error set). ``_call``
+        wraps this with retries so a transient failure is NOT silently scored 0."""
        import tempfile
-        self.last_call_error = ""
        out_path = tempfile.NamedTemporaryFile(
            prefix="codex_last_", suffix=".txt", delete=False
        ).name
@@ -793,6 +799,12 @@ class CodexCliBackend(CliBackend):
            stderr = (proc.stderr or "").strip() if proc is not None else ""
            if proc is not None and proc.returncode != 0 and not self.last_call_error:
                self.last_call_error = f"codex exec exited {proc.returncode}: {stderr[:500]}"
+            # Do NOT return the CLI's error text as if it were a model response: it
+            # pollutes rollout/judge/reflect and gets silently scored 0, hiding the
+            # real cause (e.g. an expired codex auth token surfacing as a 9k-char 401).
+            # Surface it via last_call_error and return empty instead.
+            if self.last_call_error:
+                return ""
            return stdout or stderr
        finally:
            try:
@@ -800,6 +812,45 @@ class CodexCliBackend(CliBackend):
            except Exception:
                pass

+    # Fatal codex failures that will NOT recover on retry — fail fast + loud so a
+    # 0.0 night reads as "codex auth/model/version problem" not "nothing to learn".
+    # Covers: auth (re-login), and 400 config errors like an unsupported model on a
+    # ChatGPT account or a model that needs a newer codex CLI (upgrade).
+    _AUTH_MARKERS = (
+        "401 Unauthorized", "refresh_token_reused", "token_expired",
+        "Please log out and sign in", "Not logged in", "Please run /login",
+        "authentication token is expired", "Unauthorized: invalid",
+        "is not supported when using Codex", "requires a newer version of Codex",
+    )
+
+    def _call(self, prompt: str, *, max_tokens: int = 1024, retries: int = 3) -> str:
+        """Retry transient empties/timeouts instead of silently returning "".
+
+        An empty reply scores 0 on every judge, which deflates the held-out
+        baseline AND blocks the candidate from ever improving — making a flaky
+        backend indistinguishable from "nothing to learn". The Azure backend
+        already guards this way (AzureOpenAIBackend._call); codex now does too.
+        Auth errors are NOT retried (hopeless until the user re-logs-in).
+        """
+        import logging
+        import random as _r
+        import time as _t
+        out = ""
+        for attempt in range(max(1, retries)):
+            self.last_call_error = ""
+            out = self._call_once(prompt, max_tokens=max_tokens)
+            if out:
+                return out
+            err = self.last_call_error or ""
+            if any(m in err for m in self._AUTH_MARKERS):
+                logging.getLogger("skillopt_sleep").error(
+                    "codex auth error — re-login required (`codex login`): %s", err[:200]
+                )
+                break  # fail fast: retrying a 401 just burns calls
+            if attempt < retries - 1:
+                _t.sleep(min(6.0, (2 ** attempt) * 0.5) + _r.random() * 0.3)
+        return out
+
    def attempt_with_tools(self, task, skill, memory, tools):
        # Codex exec runs in a sandbox with shell access; expose the same real
        # `search` shim and let it run (workspace-write so the shim can log).
@@ -839,16 +890,27 @@ class CodexCliBackend(CliBackend):
            if self.model:
                cmd += ["-m", self.model]
            cmd += ["--", prompt]
+            self.last_call_error = ""
+            proc = None
            try:
-                subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
-            except Exception:
-                pass
+                proc = subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
+            except subprocess.TimeoutExpired:
+                self.last_call_error = f"codex exec (tools) timed out after {self.timeout}s"
+            except Exception as exc:  # noqa: BLE001
+                self.last_call_error = f"codex exec (tools) failed: {exc}"
            resp = ""
            try:
                with open(out_path, encoding="utf-8") as f:
                    resp = f.read().strip()
            except Exception:
                resp = ""
+            # Surface a failed tool-rollout the SAME way _call does: an auth/model/version
+            # failure on this path must show up in diagnostics (call_error), not vanish as a
+            # silent empty->0 scored as a failed rollout. Response stays "" (never the error text).
+            if not resp and not self.last_call_error and proc is not None and proc.returncode != 0:
+                self.last_call_error = (
+                    f"codex exec (tools) exited {proc.returncode}: {(proc.stderr or '')[:500]}"
+                )
            self._tokens += len(prompt) // 4 + len(resp) // 4
            called: List[str] = []
            if os.path.exists(calllog):
--- a/skillopt_sleep/consolidate.py
+++ b/skillopt_sleep/consolidate.py
@@ -9,7 +9,7 @@ validation gate, vendored self-contained in ``skillopt_sleep.gate``.
 from __future__ import annotations

 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import List, Optional, Tuple

 from skillopt_sleep.backend import Backend
@@ -36,6 +36,10 @@ class ConsolidationResult:
    rejected_edits: List[EditRecord]
    holdout_baseline: float
    holdout_candidate: float
+    # ── observability (so a 0.0->0.0 night is self-diagnosing, not a black box) ──
+    holdout_detail: List[dict] = field(default_factory=list)  # per val task: hard/soft/resp/why
+    reflect_raw: str = ""        # the optimizer's last raw reply (empty => reflect produced nothing)
+    call_error: str = ""         # backend's last call error (timeout/auth/empty)


 def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]:
@@ -61,6 +65,25 @@ def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]
    return train, val


+def _holdout_detail(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> List[dict]:
+    """Per-task held-out evidence so a 0.0 night explains itself: was the
+    response empty (backend call failed) or non-empty-but-failing-checks
+    (judge too strict / edit didn't help)? The two need opposite fixes."""
+    out: List[dict] = []
+    for t, r in pairs:
+        resp = r.response or ""
+        out.append({
+            "id": t.id,
+            "reference_kind": t.reference_kind,
+            "hard": r.hard,
+            "soft": r.soft,
+            "response_len": len(resp),
+            "response_head": resp[:200],
+            "why": (r.fail_reason or r.judge_rationale or "")[:200],
+        })
+    return out
+
+
 def consolidate(
    backend: Backend,
    tasks: List[TaskRecord],
@@ -87,6 +110,7 @@ def consolidate(
    """
    train_tasks, val_tasks = _split(tasks)
    gate_off = str(gate_mode).strip().lower() in {"off", "none", "false", "greedy"}
+    holdout_detail: List[dict] = []

    # ── baseline on the VAL slice (the gate reference) ────────────────────
    # When the gate is OFF the user has opted out of holding out a validation set
@@ -98,6 +122,7 @@ def consolidate(
    else:
        base_pairs = replay_batch(backend, val_tasks, skill, memory)
        base_hard, base_soft = aggregate_scores(base_pairs)
+        holdout_detail = _holdout_detail(base_pairs)
    base_score = select_gate_score(base_hard, base_soft, gate_metric, gate_mixed_weight)

    # ── reflect over TRAIN-split failures/successes ───────────────────────
@@ -235,4 +260,7 @@ def consolidate(
        rejected_edits=all_rejected,
        holdout_baseline=base_hard,
        holdout_candidate=final_hard,
+        holdout_detail=holdout_detail,
+        reflect_raw=getattr(backend, "last_reflect_raw", "") or "",
+        call_error=getattr(backend, "last_call_error", "") or "",
    )
--- a/skillopt_sleep/cycle.py
+++ b/skillopt_sleep/cycle.py
@@ -276,6 +276,28 @@ def run_sleep_cycle(
            live_memory_path=live_memory_path,
            report_md=report_md,
        )
+        # Observability: persist per-task held-out evidence + optimizer/codex errors so a
+        # 0.0->0.0 night self-explains (empty responses vs failing checks vs no edits) — the
+        # cycle previously captured none of this, making the gate a black box (#learning-stall).
+        try:
+            import json as _json
+            with open(os.path.join(staging_dir, "diagnostics.json"), "w", encoding="utf-8") as _fh:
+                _json.dump({
+                    "night": night,
+                    "backend": cfg.get("backend"),
+                    "gate_mode": cfg.get("gate_mode"),
+                    "n_tasks": len(tasks),
+                    "baseline_score": result.baseline_score,
+                    "candidate_score": result.candidate_score,
+                    "accepted": result.accepted,
+                    "n_applied_edits": len(result.applied_edits),
+                    "n_rejected_edits": len(result.rejected_edits),
+                    "call_error": getattr(result, "call_error", ""),
+                    "reflect_raw_head": (getattr(result, "reflect_raw", "") or "")[:1200],
+                    "holdout_detail": getattr(result, "holdout_detail", []),
+                }, _fh, indent=2)
+        except Exception:
+            pass
        state.set_last_harvest(project, started)
        state.record_night({
            "night": night, "accepted": result.accepted,
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -486,6 +486,18 @@ class TestConsolidateGate(unittest.TestCase):
        self.assertTrue(res.accepted)
        self.assertGreater(res.candidate_score, res.baseline_score)

+    def test_consolidate_records_holdout_detail(self):
+        # observability: a 0.0 night must carry per-task evidence (was empty
+        # response vs failing checks?) so it is diagnosable, not a black box.
+        be = MockBackend()
+        tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)
+        res = consolidate(be, tasks, set_learned("", []), "", edit_budget=4,
+                          gate_metric="mixed", night=1)
+        self.assertTrue(res.holdout_detail)  # non-empty per-task rows
+        row = res.holdout_detail[0]
+        for k in ("id", "hard", "soft", "response_len", "why"):
+            self.assertIn(k, row)
+
    def test_no_op_when_already_optimal(self):
        be = MockBackend()
        tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=1)
@@ -612,6 +624,24 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase):
                   [], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
        self.assertIn("British English", captured["prompt"])

+    def test_reflect_records_last_raw(self):
+        # the optimizer's raw reply must be retained so a no-edits night is
+        # diagnosable (empty/non-JSON reflect vs genuinely no failures).
+        from skillopt_sleep.backend import CliBackend
+        from skillopt_sleep.types import ReplayResult
+
+        class CapBackend(CliBackend):
+            name = "cap"
+            def _call(self, prompt, *, max_tokens=1024):
+                return '[{"op":"add","content":"a learned rule","rationale":"x"}]'
+
+        be = CapBackend()
+        t = TaskRecord(id="t", project="/p", intent="x", reference_kind="rule",
+                       judge={"checks": [{"op": "contains", "arg": "z"}]})
+        be.reflect([(t, ReplayResult(id="t", hard=0.0, fail_reason="failed: contains=z"))],
+                   [], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
+        self.assertIn("a learned rule", be.last_reflect_raw)
+
    def test_replay_records_cost(self):
        from skillopt_sleep.backend import MockBackend
        from skillopt_sleep.replay import replay_one
@@ -654,6 +684,89 @@ class TestCodexBackend(unittest.TestCase):
            self.assertIn("-C", cmd)
            self.assertEqual(cmd[cmd.index("-C") + 1], expected_project)

+    def test_codex_call_retries_transient_failure_not_silent_zero(self):
+        """A transient timeout must be RETRIED, not silently returned as "" — an
+        empty reply scores 0 on every judge and zeroes the held-out baseline,
+        making a flaky backend look identical to 'nothing to learn'."""
+        import subprocess as _sp
+
+        from skillopt_sleep.backend import CodexCliBackend
+
+        calls = {"n": 0}
+
+        def fake_run(cmd, **kwargs):
+            calls["n"] += 1
+            if calls["n"] == 1:
+                raise _sp.TimeoutExpired(cmd, kwargs.get("timeout", 1))
+            out_path = cmd[cmd.index("-o") + 1]
+            with open(out_path, "w", encoding="utf-8") as f:
+                f.write("real answer")
+
+            class Proc:
+                returncode = 0
+                stdout = ""
+                stderr = ""
+
+            return Proc()
+
+        backend = CodexCliBackend(codex_path="codex")
+        with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
+             mock.patch("time.sleep", lambda *_a, **_k: None):
+            out = backend._call("hello")
+        self.assertEqual(out, "real answer")     # recovered on retry
+        self.assertGreaterEqual(calls["n"], 2)   # proves it did not silently return "" once
+
+    def test_codex_auth_error_surfaces_not_scored_as_response(self):
+        """An auth 401 must become a clear last_call_error + EMPTY response (not the
+        9k-char error text scored as a 0 'answer'), and must NOT be retried — the
+        exact failure that silently stalled learning (refresh_token_reused)."""
+        from skillopt_sleep.backend import CodexCliBackend
+
+        calls = {"n": 0}
+
+        def fake_run(cmd, **kwargs):
+            calls["n"] += 1
+            out_path = cmd[cmd.index("-o") + 1]
+            open(out_path, "w").close()  # empty output file (codex wrote nothing)
+
+            class Proc:
+                returncode = 1
+                stdout = ""
+                stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
+
+            return Proc()
+
+        be = CodexCliBackend(codex_path="codex")
+        with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
+             mock.patch("time.sleep", lambda *_a, **_k: None):
+            out = be._call("hi")
+        self.assertEqual(out, "")                                   # NOT the error text
+        self.assertIn("refresh_token_reused", be.last_call_error)   # surfaced for the operator
+        self.assertEqual(calls["n"], 1)                             # failed fast, no wasted retries
+
+    def test_codex_attempt_with_tools_surfaces_error_not_silent(self):
+        """A failed tool-rollout (non-zero codex exec) on the tool path must set
+        last_call_error and return an empty response — not a silent empty->0 the
+        diagnostics can't see (the gap a _call-only fix would otherwise leave)."""
+        from skillopt_sleep.backend import CodexCliBackend
+
+        def fake_run(cmd, **kwargs):
+            class Proc:
+                returncode = 1
+                stdout = ""
+                stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
+            return Proc()  # writes nothing to out_path -> empty response
+
+        be = CodexCliBackend(codex_path="codex")
+        task = TaskRecord(id="t", project="/p", intent="answer the question",
+                          reference_kind="rule",
+                          judge={"checks": [{"op": "tool_called", "arg": "search"}]})
+        with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run):
+            resp, called = be.attempt_with_tools(task, "", "", ["search"])
+        self.assertEqual(resp, "")                     # no leaked error text as a "response"
+        self.assertIn("exited 1", be.last_call_error)  # failure surfaced for diagnostics
+        self.assertEqual(called, [])                   # no tool actually ran
+

 class TestMultiRolloutAndBudget(unittest.TestCase):
    def test_rolloutset_stats(self):