From 9fcf5868c3f00b9c590fd877e1d726be5f2f51d0 Mon Sep 17 00:00:00 2001
From: Daniel Martinez <dmmdea@hotmail.com>
Date: Sat, 27 Jun 2026 22:23:19 -0500
Subject: [PATCH 1/2] fix(skillopt-sleep): surface codex auth/model/version
 failures instead of silently scoring 0

A nightly sleep cycle could run for weeks emitting held-out 0.0 -> 0.0 (gate reject, zero
edits), indistinguishable from "nothing to learn", when the real cause was the codex backend
returning an error (expired auth / model unsupported on the account / outdated CLI) that got
scored as a failed rollout.

backend (CodexCliBackend):
- split _call into _call_once + a retry wrapper: transient empties/timeouts are retried
  instead of silently returning "" (mirrors AzureOpenAIBackend's guard);
- on a non-zero exit, surface the reason via last_call_error and return "" rather than
  leaking the CLI error text as if it were a model response;
- fail fast (no retries) on fatal auth/model/version errors (401, refresh_token_reused,
  token_expired, "not supported when using Codex with a ChatGPT account",
  "requires a newer version of Codex").
backend (CliBackend.reflect): retain last_reflect_raw so a no-edits night is diagnosable.
consolidate: ConsolidationResult now carries per-task held-out detail (response, hard/soft,
  fail_reason) + reflect_raw + call_error.
cycle: write diagnostics.json per cycle so a 0.0 night self-explains instead of being a black box.
tests: 4 new (retry-not-silent-zero, auth-error-surfaced-not-scored, holdout-detail, reflect-raw).

Also gitignore the .skillopt-sleep/ runtime dir.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .gitignore                    |  2 +
 skillopt_sleep/backend.py     | 55 ++++++++++++++++++++-
 skillopt_sleep/consolidate.py | 30 +++++++++++-
 skillopt_sleep/cycle.py       | 22 +++++++++
 tests/test_sleep_engine.py    | 90 +++++++++++++++++++++++++++++++++++
 5 files changed, 196 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7108e72..4b90712 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,8 @@ data/*
 outputs/
 logs/
 external/
+# SkillOpt-Sleep runtime state (staging proposals, config, diagnostics, cron logs)
+.skillopt-sleep/
 
 /BabyVision/
 /MMRB/
diff --git a/skillopt_sleep/backend.py b/skillopt_sleep/backend.py
index f472da7..d5658c6 100644
--- a/skillopt_sleep/backend.py
+++ b/skillopt_sleep/backend.py
@@ -520,6 +520,10 @@ class CliBackend(Backend):
             arr = _extract_json(raw, "array")
             if isinstance(arr, list) and arr:
                 break
+        # Expose the last raw optimizer reply so a no-edits night is diagnosable:
+        # a 0.0->0.0 gate with zero edits is otherwise indistinguishable from
+        # "nothing to learn" (the cycle persists this in diagnostics.json).
+        self.last_reflect_raw = raw or ""
         edits: List[EditRecord] = []
         if isinstance(arr, list):
             for e in arr[:edit_budget]:
@@ -750,9 +754,11 @@ class CodexCliBackend(CliBackend):
             os.path.abspath(os.path.expanduser(project_dir)) if project_dir else ""
         )
 
-    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
+    def _call_once(self, prompt: str, *, max_tokens: int = 1024) -> str:
+        """One codex exec attempt: returns the response text, or "" on
+        timeout/exception/empty-output (with last_call_error set). ``_call``
+        wraps this with retries so a transient failure is NOT silently scored 0."""
         import tempfile
-        self.last_call_error = ""
         out_path = tempfile.NamedTemporaryFile(
             prefix="codex_last_", suffix=".txt", delete=False
         ).name
@@ -793,6 +799,12 @@ class CodexCliBackend(CliBackend):
             stderr = (proc.stderr or "").strip() if proc is not None else ""
             if proc is not None and proc.returncode != 0 and not self.last_call_error:
                 self.last_call_error = f"codex exec exited {proc.returncode}: {stderr[:500]}"
+            # Do NOT return the CLI's error text as if it were a model response: it
+            # pollutes rollout/judge/reflect and gets silently scored 0, hiding the
+            # real cause (e.g. an expired codex auth token surfacing as a 9k-char 401).
+            # Surface it via last_call_error and return empty instead.
+            if self.last_call_error:
+                return ""
             return stdout or stderr
         finally:
             try:
@@ -800,6 +812,45 @@ class CodexCliBackend(CliBackend):
             except Exception:
                 pass
 
+    # Fatal codex failures that will NOT recover on retry — fail fast + loud so a
+    # 0.0 night reads as "codex auth/model/version problem" not "nothing to learn".
+    # Covers: auth (re-login), and 400 config errors like an unsupported model on a
+    # ChatGPT account or a model that needs a newer codex CLI (upgrade).
+    _AUTH_MARKERS = (
+        "401 Unauthorized", "refresh_token_reused", "token_expired",
+        "Please log out and sign in", "Not logged in", "Please run /login",
+        "authentication token is expired", "Unauthorized: invalid",
+        "is not supported when using Codex", "requires a newer version of Codex",
+    )
+
+    def _call(self, prompt: str, *, max_tokens: int = 1024, retries: int = 3) -> str:
+        """Retry transient empties/timeouts instead of silently returning "".
+
+        An empty reply scores 0 on every judge, which deflates the held-out
+        baseline AND blocks the candidate from ever improving — making a flaky
+        backend indistinguishable from "nothing to learn". The Azure backend
+        already guards this way (AzureOpenAIBackend._call); codex now does too.
+        Auth errors are NOT retried (hopeless until the user re-logs-in).
+        """
+        import logging
+        import random as _r
+        import time as _t
+        out = ""
+        for attempt in range(max(1, retries)):
+            self.last_call_error = ""
+            out = self._call_once(prompt, max_tokens=max_tokens)
+            if out:
+                return out
+            err = self.last_call_error or ""
+            if any(m in err for m in self._AUTH_MARKERS):
+                logging.getLogger("skillopt_sleep").error(
+                    "codex auth error — re-login required (`codex login`): %s", err[:200]
+                )
+                break  # fail fast: retrying a 401 just burns calls
+            if attempt < retries - 1:
+                _t.sleep(min(6.0, (2 ** attempt) * 0.5) + _r.random() * 0.3)
+        return out
+
     def attempt_with_tools(self, task, skill, memory, tools):
         # Codex exec runs in a sandbox with shell access; expose the same real
         # `search` shim and let it run (workspace-write so the shim can log).
diff --git a/skillopt_sleep/consolidate.py b/skillopt_sleep/consolidate.py
index 78ee77d..a9ea662 100644
--- a/skillopt_sleep/consolidate.py
+++ b/skillopt_sleep/consolidate.py
@@ -9,7 +9,7 @@ validation gate, vendored self-contained in ``skillopt_sleep.gate``.
 from __future__ import annotations
 
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
 
 from skillopt_sleep.backend import Backend
@@ -36,6 +36,10 @@ class ConsolidationResult:
     rejected_edits: List[EditRecord]
     holdout_baseline: float
     holdout_candidate: float
+    # ── observability (so a 0.0->0.0 night is self-diagnosing, not a black box) ──
+    holdout_detail: List[dict] = field(default_factory=list)  # per val task: hard/soft/resp/why
+    reflect_raw: str = ""        # the optimizer's last raw reply (empty => reflect produced nothing)
+    call_error: str = ""         # backend's last call error (timeout/auth/empty)
 
 
 def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]:
@@ -61,6 +65,25 @@ def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]
     return train, val
 
 
+def _holdout_detail(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> List[dict]:
+    """Per-task held-out evidence so a 0.0 night explains itself: was the
+    response empty (backend call failed) or non-empty-but-failing-checks
+    (judge too strict / edit didn't help)? The two need opposite fixes."""
+    out: List[dict] = []
+    for t, r in pairs:
+        resp = r.response or ""
+        out.append({
+            "id": t.id,
+            "reference_kind": t.reference_kind,
+            "hard": r.hard,
+            "soft": r.soft,
+            "response_len": len(resp),
+            "response_head": resp[:200],
+            "why": (r.fail_reason or r.judge_rationale or "")[:200],
+        })
+    return out
+
+
 def consolidate(
     backend: Backend,
     tasks: List[TaskRecord],
@@ -87,6 +110,7 @@ def consolidate(
     """
     train_tasks, val_tasks = _split(tasks)
     gate_off = str(gate_mode).strip().lower() in {"off", "none", "false", "greedy"}
+    holdout_detail: List[dict] = []
 
     # ── baseline on the VAL slice (the gate reference) ────────────────────
     # When the gate is OFF the user has opted out of holding out a validation set
@@ -98,6 +122,7 @@ def consolidate(
     else:
         base_pairs = replay_batch(backend, val_tasks, skill, memory)
         base_hard, base_soft = aggregate_scores(base_pairs)
+        holdout_detail = _holdout_detail(base_pairs)
     base_score = select_gate_score(base_hard, base_soft, gate_metric, gate_mixed_weight)
 
     # ── reflect over TRAIN-split failures/successes ───────────────────────
@@ -235,4 +260,7 @@ def consolidate(
         rejected_edits=all_rejected,
         holdout_baseline=base_hard,
         holdout_candidate=final_hard,
+        holdout_detail=holdout_detail,
+        reflect_raw=getattr(backend, "last_reflect_raw", "") or "",
+        call_error=getattr(backend, "last_call_error", "") or "",
     )
diff --git a/skillopt_sleep/cycle.py b/skillopt_sleep/cycle.py
index 57b06a9..ee726f3 100644
--- a/skillopt_sleep/cycle.py
+++ b/skillopt_sleep/cycle.py
@@ -276,6 +276,28 @@ def run_sleep_cycle(
             live_memory_path=live_memory_path,
             report_md=report_md,
         )
+        # Observability: persist per-task held-out evidence + optimizer/codex errors so a
+        # 0.0->0.0 night self-explains (empty responses vs failing checks vs no edits) — the
+        # cycle previously captured none of this, making the gate a black box (#learning-stall).
+        try:
+            import json as _json
+            with open(os.path.join(staging_dir, "diagnostics.json"), "w", encoding="utf-8") as _fh:
+                _json.dump({
+                    "night": night,
+                    "backend": cfg.get("backend"),
+                    "gate_mode": cfg.get("gate_mode"),
+                    "n_tasks": len(tasks),
+                    "baseline_score": result.baseline_score,
+                    "candidate_score": result.candidate_score,
+                    "accepted": result.accepted,
+                    "n_applied_edits": len(result.applied_edits),
+                    "n_rejected_edits": len(result.rejected_edits),
+                    "call_error": getattr(result, "call_error", ""),
+                    "reflect_raw_head": (getattr(result, "reflect_raw", "") or "")[:1200],
+                    "holdout_detail": getattr(result, "holdout_detail", []),
+                }, _fh, indent=2)
+        except Exception:
+            pass
         state.set_last_harvest(project, started)
         state.record_night({
             "night": night, "accepted": result.accepted,
diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py
index 4e4bc8b..113bc8e 100644
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -486,6 +486,18 @@ class TestConsolidateGate(unittest.TestCase):
         self.assertTrue(res.accepted)
         self.assertGreater(res.candidate_score, res.baseline_score)
 
+    def test_consolidate_records_holdout_detail(self):
+        # observability: a 0.0 night must carry per-task evidence (was empty
+        # response vs failing checks?) so it is diagnosable, not a black box.
+        be = MockBackend()
+        tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)
+        res = consolidate(be, tasks, set_learned("", []), "", edit_budget=4,
+                          gate_metric="mixed", night=1)
+        self.assertTrue(res.holdout_detail)  # non-empty per-task rows
+        row = res.holdout_detail[0]
+        for k in ("id", "hard", "soft", "response_len", "why"):
+            self.assertIn(k, row)
+
     def test_no_op_when_already_optimal(self):
         be = MockBackend()
         tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=1)
@@ -612,6 +624,24 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase):
                    [], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
         self.assertIn("British English", captured["prompt"])
 
+    def test_reflect_records_last_raw(self):
+        # the optimizer's raw reply must be retained so a no-edits night is
+        # diagnosable (empty/non-JSON reflect vs genuinely no failures).
+        from skillopt_sleep.backend import CliBackend
+        from skillopt_sleep.types import ReplayResult
+
+        class CapBackend(CliBackend):
+            name = "cap"
+            def _call(self, prompt, *, max_tokens=1024):
+                return '[{"op":"add","content":"a learned rule","rationale":"x"}]'
+
+        be = CapBackend()
+        t = TaskRecord(id="t", project="/p", intent="x", reference_kind="rule",
+                       judge={"checks": [{"op": "contains", "arg": "z"}]})
+        be.reflect([(t, ReplayResult(id="t", hard=0.0, fail_reason="failed: contains=z"))],
+                   [], "skill", "", edit_budget=2, evolve_skill=True, evolve_memory=False)
+        self.assertIn("a learned rule", be.last_reflect_raw)
+
     def test_replay_records_cost(self):
         from skillopt_sleep.backend import MockBackend
         from skillopt_sleep.replay import replay_one
@@ -654,6 +684,66 @@ class TestCodexBackend(unittest.TestCase):
             self.assertIn("-C", cmd)
             self.assertEqual(cmd[cmd.index("-C") + 1], expected_project)
 
+    def test_codex_call_retries_transient_failure_not_silent_zero(self):
+        """A transient timeout must be RETRIED, not silently returned as "" — an
+        empty reply scores 0 on every judge and zeroes the held-out baseline,
+        making a flaky backend look identical to 'nothing to learn'."""
+        import subprocess as _sp
+
+        from skillopt_sleep.backend import CodexCliBackend
+
+        calls = {"n": 0}
+
+        def fake_run(cmd, **kwargs):
+            calls["n"] += 1
+            if calls["n"] == 1:
+                raise _sp.TimeoutExpired(cmd, kwargs.get("timeout", 1))
+            out_path = cmd[cmd.index("-o") + 1]
+            with open(out_path, "w", encoding="utf-8") as f:
+                f.write("real answer")
+
+            class Proc:
+                returncode = 0
+                stdout = ""
+                stderr = ""
+
+            return Proc()
+
+        backend = CodexCliBackend(codex_path="codex")
+        with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
+             mock.patch("time.sleep", lambda *_a, **_k: None):
+            out = backend._call("hello")
+        self.assertEqual(out, "real answer")     # recovered on retry
+        self.assertGreaterEqual(calls["n"], 2)   # proves it did not silently return "" once
+
+    def test_codex_auth_error_surfaces_not_scored_as_response(self):
+        """An auth 401 must become a clear last_call_error + EMPTY response (not the
+        9k-char error text scored as a 0 'answer'), and must NOT be retried — the
+        exact failure that silently stalled learning (refresh_token_reused)."""
+        from skillopt_sleep.backend import CodexCliBackend
+
+        calls = {"n": 0}
+
+        def fake_run(cmd, **kwargs):
+            calls["n"] += 1
+            out_path = cmd[cmd.index("-o") + 1]
+            open(out_path, "w").close()  # empty output file (codex wrote nothing)
+
+            class Proc:
+                returncode = 1
+                stdout = ""
+                stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
+
+            return Proc()
+
+        be = CodexCliBackend(codex_path="codex")
+        with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run), \
+             mock.patch("time.sleep", lambda *_a, **_k: None):
+            out = be._call("hi")
+        self.assertEqual(out, "")                                   # NOT the error text
+        self.assertIn("refresh_token_reused", be.last_call_error)   # surfaced for the operator
+        self.assertEqual(calls["n"], 1)                             # failed fast, no wasted retries
+
 
 class TestMultiRolloutAndBudget(unittest.TestCase):
     def test_rolloutset_stats(self):

From 9fa0716c72b7c67fe6099e75f46a354e2319245c Mon Sep 17 00:00:00 2001
From: Daniel Martinez <dmmdea@hotmail.com>
Date: Sat, 27 Jun 2026 23:56:11 -0500
Subject: [PATCH 2/2] fix(skillopt-sleep): also surface codex failures on the
 tool-call rollout path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up from a fresh-context review of the prior commit: CodexCliBackend.attempt_with_tools
(the rollout path for tool-requiring tasks) ran codex exec inline, swallowed all exceptions,
and never set last_call_error — so an auth/model/version failure on the tool path still
produced a silent empty->0 with no diagnostic signal, the exact failure class the prior commit
fixed for the _call path. Now it surfaces timeout/exception/non-zero-exit via last_call_error
(response stays empty; never leaks the CLI error text), so a failed tool rollout shows up in
diagnostics.json. Adds a regression test.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 skillopt_sleep/backend.py  | 17 ++++++++++++++---
 tests/test_sleep_engine.py | 23 +++++++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/skillopt_sleep/backend.py b/skillopt_sleep/backend.py
index d5658c6..b1c7208 100644
--- a/skillopt_sleep/backend.py
+++ b/skillopt_sleep/backend.py
@@ -890,16 +890,27 @@ class CodexCliBackend(CliBackend):
             if self.model:
                 cmd += ["-m", self.model]
             cmd += ["--", prompt]
+            self.last_call_error = ""
+            proc = None
             try:
-                subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
-            except Exception:
-                pass
+                proc = subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
+            except subprocess.TimeoutExpired:
+                self.last_call_error = f"codex exec (tools) timed out after {self.timeout}s"
+            except Exception as exc:  # noqa: BLE001
+                self.last_call_error = f"codex exec (tools) failed: {exc}"
             resp = ""
             try:
                 with open(out_path, encoding="utf-8") as f:
                     resp = f.read().strip()
             except Exception:
                 resp = ""
+            # Surface a failed tool-rollout the SAME way _call does: an auth/model/version
+            # failure on this path must show up in diagnostics (call_error), not vanish as a
+            # silent empty->0 scored as a failed rollout. Response stays "" (never the error text).
+            if not resp and not self.last_call_error and proc is not None and proc.returncode != 0:
+                self.last_call_error = (
+                    f"codex exec (tools) exited {proc.returncode}: {(proc.stderr or '')[:500]}"
+                )
             self._tokens += len(prompt) // 4 + len(resp) // 4
             called: List[str] = []
             if os.path.exists(calllog):
diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py
index 113bc8e..bd5b971 100644
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -744,6 +744,29 @@ class TestCodexBackend(unittest.TestCase):
         self.assertIn("refresh_token_reused", be.last_call_error)   # surfaced for the operator
         self.assertEqual(calls["n"], 1)                             # failed fast, no wasted retries
 
+    def test_codex_attempt_with_tools_surfaces_error_not_silent(self):
+        """A failed tool-rollout (non-zero codex exec) on the tool path must set
+        last_call_error and return an empty response — not a silent empty->0 the
+        diagnostics can't see (the gap a _call-only fix would otherwise leave)."""
+        from skillopt_sleep.backend import CodexCliBackend
+
+        def fake_run(cmd, **kwargs):
+            class Proc:
+                returncode = 1
+                stdout = ""
+                stderr = "ERROR codex_core::auth: 401 Unauthorized: refresh_token_reused"
+            return Proc()  # writes nothing to out_path -> empty response
+
+        be = CodexCliBackend(codex_path="codex")
+        task = TaskRecord(id="t", project="/p", intent="answer the question",
+                          reference_kind="rule",
+                          judge={"checks": [{"op": "tool_called", "arg": "search"}]})
+        with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run):
+            resp, called = be.attempt_with_tools(task, "", "", ["search"])
+        self.assertEqual(resp, "")                     # no leaked error text as a "response"
+        self.assertIn("exited 1", be.last_call_error)  # failure surfaced for diagnostics
+        self.assertEqual(called, [])                   # no tool actually ran
+
 
 class TestMultiRolloutAndBudget(unittest.TestCase):
     def test_rolloutset_stats(self):