Files
Yifan Yang 86bad36ffe feat(sleep): SkillOpt-Sleep plugin update (preview) — engine robustness + scheduling
Updates the SkillOpt-Sleep plugin on top of the current main. User-facing and
engine improvements since the initial drop:

* Command renamed /sleep -> /skillopt-sleep across Claude Code + Codex shells;
  refreshed plugin READMEs and install scripts.
* Built-in scheduling (skillopt_sleep/scheduler.py + __main__): schedule /
  unschedule the nightly cycle without external cron wiring.
* Backend robustness: bounded retry with backoff (no more silent empty-string
  on transient 429/timeout), content-filter-safe rollout prompt, an
  output-contract guardrail that rejects edits violating the task's required
  format, and a per-sample cache key so repeated dream rollouts are independent
  samples (fixes degenerate single-sample reflection).
* consolidate / rollout / replay: parallel multi-rollout dreaming, gate-mode
  controls, TaskRecord.system framing field.

Scope: this commit ships only the plugin engine + shells. Research/benchmark
harnesses and their data are intentionally not included; the public package
has no dependency on them (the one research-evaluator import is now guarded).
Marked as an early preview in the README; we'll keep iterating.

99/99 unit tests pass.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-06-14 16:12:00 +00:00

154 lines
6.0 KiB
Python

"""SkillOpt-Sleep — multi-rollout + contrastive reflection (the imagination core).
The core idea: let the agent re-run the SAME task many times, then look at
which rollouts went well vs badly and distill a rule from the *contrast*. This
is a much stronger learning signal than a single failure, and it is the essence
of the offline "dream/imagination" process — train-time rollouts are synthetic,
so doing many is fine.
Pieces:
* multi_rollout — run one task K times under (skill, memory), return scored attempts
* contrastive_reflect — given good vs bad attempts of the same tasks, ask the
optimizer what distinguishes them and propose a general rule
Driven through the Backend abstraction (mock/claude/codex), import-light.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
from skillopt_sleep.backend import Backend, _extract_json
from skillopt_sleep.replay import replay_one
from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
@dataclass
class RolloutSet:
    """The scored attempts collected for a single task under one (skill, memory).

    Every summary below is derived from the ``hard`` score of the attempts and
    degrades gracefully when no attempt has been recorded yet.
    """

    task: TaskRecord
    attempts: List[ReplayResult] = field(default_factory=list)

    @property
    def best(self) -> Optional[ReplayResult]:
        """Attempt with the highest ``hard`` score (first one on ties), or None."""
        if not self.attempts:
            return None
        return max(self.attempts, key=lambda attempt: attempt.hard)

    @property
    def worst(self) -> Optional[ReplayResult]:
        """Attempt with the lowest ``hard`` score (first one on ties), or None."""
        if not self.attempts:
            return None
        return min(self.attempts, key=lambda attempt: attempt.hard)

    @property
    def spread(self) -> float:
        """Gap between the highest and lowest ``hard`` score; 0.0 with no attempts."""
        scores = [attempt.hard for attempt in self.attempts]
        if not scores:
            return 0.0
        return max(scores) - min(scores)

    @property
    def pass_rate(self) -> float:
        """Fraction of attempts whose ``hard`` score is at least 1.0; 0.0 with no attempts."""
        total = len(self.attempts)
        if total == 0:
            return 0.0
        passed = [attempt for attempt in self.attempts if attempt.hard >= 1.0]
        return len(passed) / total
def multi_rollout(
    backend: Backend,
    task: TaskRecord,
    skill: str,
    memory: str,
    *,
    k: int = 3,
    workers: int = 0,
) -> RolloutSet:
    """Replay ``task`` K times under a fixed (skill, memory) and collect the results.

    At least one attempt is always made (``k`` below 1 is raised to 1), and
    attempt ``i`` is replayed with ``sample_id=i``. Attempts are stored in
    sample order regardless of how they were scheduled. replay_one is expected
    to be deterministic under the mock backend; with real backends the
    variation across attempts comes from the model's own sampling.

    A non-positive ``workers`` falls back to the SKILLOPT_SLEEP_WORKERS
    environment variable (1 when it is unset or not an integer). The worker
    count is then clamped to 1..k: a single worker replays serially on the
    calling thread (used by the mock tests), anything larger submits the
    attempts to a thread pool since they are independent of one another.
    """
    import os

    attempts_wanted = max(1, k)

    # Resolve the pool size: explicit argument first, environment second.
    pool_size = workers
    if pool_size <= 0:
        try:
            pool_size = int(os.environ.get("SKILLOPT_SLEEP_WORKERS", "1"))
        except ValueError:
            pool_size = 1
    pool_size = max(1, min(pool_size, attempts_wanted))

    collected = RolloutSet(task=task)

    if pool_size > 1:
        # Imported lazily so the serial path never pays for it.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=pool_size) as pool:
            pending = [
                pool.submit(replay_one, backend, task, skill, memory, sample_id=idx)
                for idx in range(attempts_wanted)
            ]
            # Drain in submission order so attempts[i] is always sample i.
            for future in pending:
                collected.attempts.append(future.result())
        return collected

    for idx in range(attempts_wanted):
        outcome = replay_one(backend, task, skill, memory, sample_id=idx)
        collected.attempts.append(outcome)
    return collected
def contrastive_reflect(
    backend: Backend,
    rollout_sets: List[RolloutSet],
    skill: str,
    memory: str,
    *,
    edit_budget: int = 4,
    target: str = "skill",
) -> List[EditRecord]:
    """Distill general rules from the contrast between good and bad attempts.

    Rollout sets whose attempts disagree (score *spread* > 0) are the
    informative ones. The six with the widest spread are kept; for each, the
    optimizer is shown its best and its worst attempt side by side and asked
    which general rule would make the good behavior reliable.

    ``backend`` is queried once through its ``_call`` method. ``rollout_sets``
    supplies the candidate tasks. ``skill`` and ``memory`` are part of the
    signature but are not read by this function. ``edit_budget`` caps the
    number of edits returned, and ``target`` is stamped on every edit and
    named in the prompt.

    Returns at most ``edit_budget`` EditRecords. The list is empty when the
    budget is not positive, when no rollout set has any spread, or when the
    reply does not parse to a JSON array. Array entries that are not objects,
    or whose ``content`` is missing, null or blank, are skipped without
    consuming budget.
    """
    # A non-positive budget can never yield an edit, so do not pay for a
    # backend call (a negative value would also mis-slice the reply).
    if edit_budget <= 0:
        return []

    informative = [rs for rs in rollout_sets if rs.spread > 0 and rs.best and rs.worst]
    informative.sort(key=lambda rs: rs.spread, reverse=True)
    informative = informative[:6]
    if not informative:
        return []

    # best/worst are computed properties (a max/min scan each); resolve once.
    contrasts = [(rs, rs.best, rs.worst) for rs in informative]

    blocks = []
    for rs, best, worst in contrasts:
        # `or ""` is defensive: it keeps a None/empty field from breaking the
        # slice and leaves ordinary strings untouched.
        intent = (rs.task.intent or "")[:160]
        good_text = (best.response or "")[:200]
        bad_text = (worst.response or "")[:200]
        bad_reason = (worst.fail_reason or "")[:100]
        blocks.append(
            f"## Task: {intent}\n"
            f"- GOOD attempt (score {best.hard:.1f}): {good_text}\n"
            f"- BAD attempt (score {worst.hard:.1f}): {bad_text}\n"
            f" (bad failed: {bad_reason})"
        )

    # the output contract the proposed rules must not violate (same guardrail the
    # single-shot reflect uses — prevents harness-violating rules like "return VBA"
    # or "ask the user for the range" on SpreadsheetBench).
    from skillopt_sleep.backend import _task_guardrail
    guard = _task_guardrail([(rs.task, best) for rs, best, _ in contrasts])

    prompt = (
        "You are SkillOpt's optimizer doing CONTRASTIVE reflection. For each task "
        "below the agent was run multiple times; some attempts succeeded and some "
        "failed. Identify what the GOOD attempts did that the BAD ones did not, "
        f"and propose at most {edit_budget} SHORT, GENERAL, reusable rules for the "
        f"{target} that would make the good behavior reliable every time. Quote "
        "concrete thresholds/formats verbatim; do not paraphrase vaguely. "
        "Every rule MUST obey the task output contract (if shown) — never propose "
        "a rule that changes the required output format/language or tells the agent "
        "to ask the user a question; such a rule scores ZERO.\n"
        f"{guard}"
        'Return ONLY a JSON array: '
        '[{"op":"add","content":"<rule>","rationale":"<what good did that bad didnt>"}].\n\n'
        + "\n\n".join(blocks)
    )
    raw = backend._call(prompt, max_tokens=1024)  # type: ignore[attr-defined]
    arr = _extract_json(raw, "array")

    edits: List[EditRecord] = []
    if not isinstance(arr, list):
        return edits

    for entry in arr:
        # The budget counts accepted edits, so malformed entries earlier in the
        # array do not crowd out valid ones that follow.
        if len(edits) >= edit_budget:
            break
        if not isinstance(entry, dict):
            continue
        raw_content = entry.get("content")
        # JSON null must not turn into the literal rule text "None".
        content = "" if raw_content is None else str(raw_content).strip()
        if not content:
            continue
        op = str(entry.get("op") or "").strip().lower() or "add"
        rationale = str(entry.get("rationale") or "").strip()
        edits.append(EditRecord(
            target=target,
            op=op,
            content=content,
            rationale=rationale,
        ))
    return edits