mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
Updates the SkillOpt-Sleep plugin on top of the current main. User-facing and engine improvements since the initial drop: * Command renamed /sleep -> /skillopt-sleep across Claude Code + Codex shells; refreshed plugin READMEs and install scripts. * Built-in scheduling (skillopt_sleep/scheduler.py + __main__): schedule / unschedule the nightly cycle without external cron wiring. * Backend robustness: bounded retry with backoff (no more silent empty-string on transient 429/timeout), content-filter-safe rollout prompt, an output-contract guardrail that rejects edits violating the task's required format, and a per-sample cache key so repeated dream rollouts are independent samples (fixes degenerate single-sample reflection). * consolidate / rollout / replay: parallel multi-rollout dreaming, gate-mode controls, TaskRecord.system framing field. Scope: this commit ships only the plugin engine + shells. Research/benchmark harnesses and their data are intentionally not included; the public package has no dependency on them (the one research-evaluator import is now guarded). Marked as an early preview in the README; we'll keep iterating. 99/99 unit tests pass. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
154 lines
6.0 KiB
Python
154 lines
6.0 KiB
Python
"""SkillOpt-Sleep — multi-rollout + contrastive reflection (the imagination core).
|
|
|
|
The core idea: let the agent re-run the SAME task many times, then look at
|
|
which rollouts went well vs badly and distill a rule from the *contrast*. This
|
|
is a much stronger learning signal than a single failure, and it is the essence
|
|
of the offline "dream/imagination" process — train-time rollouts are synthetic,
|
|
so doing many is fine.
|
|
|
|
Pieces:
|
|
* multi_rollout — run one task K times under (skill, memory), return scored attempts
|
|
* contrastive_reflect — given good vs bad attempts of the same tasks, ask the
|
|
optimizer what distinguishes them and propose a general rule
|
|
|
|
Driven through the Backend abstraction (mock/claude/codex), import-light.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import List, Optional, Tuple
|
|
|
|
from skillopt_sleep.backend import Backend, _extract_json
|
|
from skillopt_sleep.replay import replay_one
|
|
from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
|
|
|
|
|
|
@dataclass
class RolloutSet:
    """Scored attempts at a single task, all produced under one (skill, memory).

    The properties summarise the attempts through their ``hard`` score; every
    one of them degrades gracefully when no attempt has been recorded yet.
    """

    # The task that every attempt in this set tried to solve.
    task: TaskRecord
    # One scored result per rollout, in the order the rollouts were collected.
    attempts: List[ReplayResult] = field(default_factory=list)

    @property
    def best(self) -> Optional[ReplayResult]:
        """Attempt with the highest ``hard`` score (first on ties), or None."""
        if not self.attempts:
            return None
        return max(self.attempts, key=lambda attempt: attempt.hard)

    @property
    def worst(self) -> Optional[ReplayResult]:
        """Attempt with the lowest ``hard`` score (first on ties), or None."""
        if not self.attempts:
            return None
        return min(self.attempts, key=lambda attempt: attempt.hard)

    @property
    def spread(self) -> float:
        """Gap between the highest and lowest ``hard`` score; 0.0 when empty."""
        scores = [attempt.hard for attempt in self.attempts]
        if scores:
            return max(scores) - min(scores)
        return 0.0

    @property
    def pass_rate(self) -> float:
        """Fraction of attempts whose ``hard`` score is at least 1.0; 0.0 when empty."""
        total = len(self.attempts)
        if total == 0:
            return 0.0
        passed = [attempt for attempt in self.attempts if attempt.hard >= 1.0]
        return len(passed) / total
|
|
|
|
|
|
def multi_rollout(
    backend: Backend,
    task: TaskRecord,
    skill: str,
    memory: str,
    *,
    k: int = 3,
    workers: int = 0,
) -> RolloutSet:
    """Collect K scored attempts at ``task`` under a fixed (skill, memory).

    Each attempt is one ``replay_one`` call tagged with its own ``sample_id``
    (0..K-1). The mock backend replays deterministically; with real backends the
    variation between attempts comes from the model's own sampling.

    Attempts do not depend on one another, so they may run on a thread pool
    (the rollouts are the dominant cost of the dream phase). When ``workers``
    is zero or negative the pool size is read from the SKILLOPT_SLEEP_WORKERS
    environment variable (an unset or non-integer value means 1). The pool size
    is always clamped to the range 1..K; a size of 1 runs the attempts serially
    in the calling thread, which is what the mock tests rely on.

    Returns a RolloutSet whose ``attempts`` are ordered by ``sample_id``.
    """
    import os

    attempt_count = max(1, k)

    pool_size = workers
    if pool_size <= 0:
        try:
            pool_size = int(os.environ.get("SKILLOPT_SLEEP_WORKERS", "1"))
        except ValueError:
            pool_size = 1
    pool_size = max(1, min(pool_size, attempt_count))

    collected = RolloutSet(task=task)

    if pool_size > 1:
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=pool_size) as pool:
            pending = [
                pool.submit(replay_one, backend, task, skill, memory, sample_id=idx)
                for idx in range(attempt_count)
            ]
            # Read results in submission order so attempts stay sorted by
            # sample_id regardless of which rollout finishes first.
            collected.attempts.extend(fut.result() for fut in pending)
        return collected

    # Serial path: no executor, attempts run one after another.
    collected.attempts.extend(
        replay_one(backend, task, skill, memory, sample_id=idx)
        for idx in range(attempt_count)
    )
    return collected
|
|
|
|
|
|
def contrastive_reflect(
    backend: Backend,
    rollout_sets: List[RolloutSet],
    skill: str,
    memory: str,
    *,
    edit_budget: int = 4,
    target: str = "skill",
) -> List[EditRecord]:
    """Distill general rules from the contrast between good and bad attempts.

    Tasks whose attempts disagree (non-zero score *spread*) are the most
    informative, so the (at most six) sets with the largest spread are selected.
    For each, the optimizer is shown the best and the worst attempt and asked
    which general rule would make the good behavior reliable.

    Args:
        backend: Optimizer backend; its ``_call(prompt, max_tokens=...)`` is used.
        rollout_sets: Multi-rollout results, one set per task.
        skill: Current skill text. Not referenced by this function; kept for
            signature compatibility with callers.
        memory: Current memory text. Not referenced by this function; kept for
            signature compatibility with callers.
        edit_budget: Maximum number of rules to request and to return. A value
            of zero or less returns ``[]`` without calling the backend.
        target: What the rules are written for; interpolated into the prompt
            and stored on every returned edit.

    Returns:
        Up to ``edit_budget`` EditRecord objects. Empty when the budget is not
        positive, when no rollout set shows any spread, or when the backend
        reply contains no usable JSON array of edits.
    """
    # A non-positive budget can never produce an edit, so skip the optimizer
    # call entirely. This also keeps a negative budget away from the
    # ``arr[:edit_budget]`` slice below, where it would mean "everything but
    # the last N items" rather than "nothing".
    if edit_budget <= 0:
        return []

    informative = [rs for rs in rollout_sets if rs.spread > 0 and rs.best and rs.worst]
    informative.sort(key=lambda rs: rs.spread, reverse=True)
    informative = informative[:6]
    if not informative:
        return []

    # One markdown-ish section per task; intent/responses/failure reason are
    # truncated to keep the prompt short.
    blocks = [
        f"## Task: {rs.task.intent[:160]}\n"
        f"- GOOD attempt (score {rs.best.hard:.1f}): {rs.best.response[:200]}\n"
        f"- BAD attempt (score {rs.worst.hard:.1f}): {rs.worst.response[:200]}\n"
        f" (bad failed: {rs.worst.fail_reason[:100]})"
        for rs in informative
    ]

    # The output contract the proposed rules must not violate (same guardrail
    # the single-shot reflect uses — prevents harness-violating rules like
    # "return VBA" or "ask the user for the range" on SpreadsheetBench).
    # Imported here rather than at module top, as in the original.
    from skillopt_sleep.backend import _task_guardrail

    guard = _task_guardrail([(rs.task, rs.best) for rs in informative])
    prompt = (
        "You are SkillOpt's optimizer doing CONTRASTIVE reflection. For each task "
        "below the agent was run multiple times; some attempts succeeded and some "
        "failed. Identify what the GOOD attempts did that the BAD ones did not, "
        f"and propose at most {edit_budget} SHORT, GENERAL, reusable rules for the "
        f"{target} that would make the good behavior reliable every time. Quote "
        "concrete thresholds/formats verbatim; do not paraphrase vaguely. "
        "Every rule MUST obey the task output contract (if shown) — never propose "
        "a rule that changes the required output format/language or tells the agent "
        "to ask the user a question; such a rule scores ZERO.\n"
        f"{guard}"
        'Return ONLY a JSON array: '
        '[{"op":"add","content":"<rule>","rationale":"<what good did that bad didnt>"}].\n\n'
        + "\n\n".join(blocks)
    )
    raw = backend._call(prompt, max_tokens=1024)  # type: ignore[attr-defined]
    arr = _extract_json(raw, "array")

    edits: List[EditRecord] = []
    if not isinstance(arr, list):
        return edits

    for item in arr[:edit_budget]:
        if not isinstance(item, dict):
            continue
        raw_content = item.get("content")
        # JSON null must be dropped, not stringified into the rule text "None".
        if raw_content is None:
            continue
        content = str(raw_content).strip()
        if not content:
            continue
        # A missing, null or blank op falls back to "add" — the only op the
        # prompt asks for — instead of becoming "none" or "".
        op = str(item.get("op") or "add").strip().lower() or "add"
        rationale = str(item.get("rationale") or "").strip()
        edits.append(EditRecord(
            target=target,
            op=op,
            content=content,
            rationale=rationale,
        ))
    return edits
|