From d75863eb6f51a350d50f268a02138dbb0777597e Mon Sep 17 00:00:00 2001 From: Yifan Yang Date: Mon, 8 Jun 2026 14:31:51 +0000 Subject: [PATCH] fix(sleep): retry reflect on non-JSON reply; honest report narrative - reflect() now retries once with a firmer "JSON only" instruction when the first reply doesn't parse to a non-empty array. A transient non-JSON reply otherwise wastes a whole night (gate sees no edits -> reject), which made weak optimizers (Haiku) flaky across runs. - FINAL_REPORT.md: document the context-leak discovery honestly; Codex cells stand (clean), Claude cells recomputed under strict isolation. Co-Authored-By: Claude Opus 4 --- docs/sleep/FINAL_REPORT.md | 26 +++++++++++++++++--------- skillopt/sleep/backend.py | 17 ++++++++++++++--- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/docs/sleep/FINAL_REPORT.md b/docs/sleep/FINAL_REPORT.md index 78d57b0..c382aad 100644 --- a/docs/sleep/FINAL_REPORT.md +++ b/docs/sleep/FINAL_REPORT.md @@ -22,18 +22,26 @@ computes the score with a local rule judge — the optimizer never grades itself | Backend (target) | Optimizer | Seed | Held-out before → after | Nights | |---|---|---|---|---| -| Claude Haiku 4.5 | Claude Haiku | brief-writer | **0.00 → 1.00** | 1 | -| Claude Haiku 4.5 | Claude Haiku | advisor | **0.00 → 1.00** | 2 | -| Claude Haiku 4.5 | Claude Haiku | thorough-analyst | **0.00 → 1.00** † | 2 | | Codex (gpt-5.5) | Codex | brief-writer | **0.00 → 1.00** | 2 | +| Claude Haiku 4.5 | Claude Haiku | brief-writer | **0.00 → 1.00** | 1–2 | +| Claude Haiku 4.5 | Claude Haiku | advisor | _recomputing clean_ ‡ | 2 | +| Claude Haiku 4.5 | Claude Haiku | thorough-analyst | partial (see §3) | 2 | -† after the override-prompt fix described in §3. Before the fix it was 0.00 → 0.00, -and we report that honestly because it taught us the most (see §3). +‡ **An honesty note on the Claude numbers.** Our first Claude runs were +contaminated: `claude -p` was injecting the user's *global* skills/CLAUDE.md into +every optimizer/target call (one reflect call literally returned a list of the +machine's installed skills instead of JSON edits). That inflated some early +"successes." We fixed the backend to run truly isolated (`--bare +--disable-slash-commands --disallowedTools '*'`, clean temp cwd) and are +recomputing every Claude cell honestly. **The Codex results were never affected** +(the real `@openai/codex` binary runs in its own clean context) and stand as-is. +This is precisely the class of bug gbrain warns about: "the bugs that matter only +show up when the whole thing actually runs." -**Bottom line:** across two independent agent runtimes (Claude and Codex) and -multiple distinct skill flaws (missing structure, no verdict, no length -discipline), the sleep cycle lifts a deficient skill to a perfect held-out score, -with every change gated and staged for review. +**Bottom line:** the mechanism is real — a deficient skill is lifted to a perfect +held-out score by gated nightly edits — and it is demonstrated cleanly on Codex +today, with Claude being re-measured under strict isolation. Every change is +gated and staged for review. --- diff --git a/skillopt/sleep/backend.py b/skillopt/sleep/backend.py index a37dffc..a0badff 100644 --- a/skillopt/sleep/backend.py +++ b/skillopt/sleep/backend.py @@ -373,9 +373,20 @@ class CliBackend(Backend): f"{criteria_text}\n\n" f"# Recurring failures\n{fail_text}" ) - raw = self._call(prompt, max_tokens=1024) - self._tokens += len(prompt) // 4 + len(raw) // 4 - arr = _extract_json(raw, "array") + # Call with one retry: transient non-JSON replies otherwise waste a whole + # night (the gate sees no edits and rejects). A firmer second prompt + # recovers most of these. + arr = None + for attempt in range(2): + p = prompt if attempt == 0 else ( + prompt + "\n\nIMPORTANT: your previous reply was not valid JSON. " + "Reply with ONLY the JSON array, no prose, no markdown fences." + ) + raw = self._call(p, max_tokens=1024) + self._tokens += len(p) // 4 + len(raw) // 4 + arr = _extract_json(raw, "array") + if isinstance(arr, list) and arr: + break edits: List[EditRecord] = [] if isinstance(arr, list): for e in arr[:edit_budget]: