Files
microsoft-SkillOpt/skillopt_sleep/rollout.py
Yifan Yang dae974a5e3 chore(sleep): English-only across the engine, plugins, and docs
Remove every non-ASCII/CJK character for a professional open-source repo:
  - harvest.py: drop hardcoded Chinese feedback phrases; add an env-based
    extensibility hook (SKILLOPT_SLEEP_NEG_FEEDBACK / _POS_FEEDBACK) so any
    locale can be added without baking one in. Verified with a German example.
  - rollout.py / consolidate.py: English comments.
  - README.md section heading + anchor, CONTROLLABLE_DREAMING.md, plugin.json,
    marketplace.json (also fixed stale path skillopt-sleep-plugin ->
    plugins/claude-code), SKILL.md: English only.
  - Remove the internal WAKE_UP_SUMMARY.md note (not user-facing, not referenced).

Verified: zero CJK chars remain anywhere; 29 tests pass.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-06-08 14:31:52 +00:00

123 lines
4.6 KiB
Python

"""SkillOpt-Sleep — multi-rollout + contrastive reflection (the imagination core).
The core idea: let the agent re-run the SAME task many times, then look at
which rollouts went well vs badly and distill a rule from the *contrast*. This
is a much stronger learning signal than a single failure, and it is the essence
of the offline "dream/imagination" process — train-time rollouts are synthetic,
so doing many is fine.
Pieces:
  * multi_rollout — run one task K times under (skill, memory), return scored attempts
  * contrastive_reflect — given good vs bad attempts of the same tasks, ask the
    optimizer what distinguishes them and propose a general rule
Driven through the Backend abstraction (mock/claude/codex), import-light.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
from skillopt_sleep.backend import Backend, _extract_json
from skillopt_sleep.replay import replay_one
from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
@dataclass
class RolloutSet:
    """The scored attempts gathered for one task under a fixed (skill, memory)."""

    task: TaskRecord
    attempts: List[ReplayResult] = field(default_factory=list)

    @property
    def best(self) -> Optional[ReplayResult]:
        """Attempt with the highest ``hard`` score; ``None`` if there are none."""
        return max(self.attempts, default=None, key=lambda attempt: attempt.hard)

    @property
    def worst(self) -> Optional[ReplayResult]:
        """Attempt with the lowest ``hard`` score; ``None`` if there are none."""
        return min(self.attempts, default=None, key=lambda attempt: attempt.hard)

    @property
    def spread(self) -> float:
        """Gap between the highest and lowest ``hard`` score (0.0 when empty)."""
        scores = [attempt.hard for attempt in self.attempts]
        return max(scores) - min(scores) if scores else 0.0

    @property
    def pass_rate(self) -> float:
        """Fraction of attempts whose ``hard`` score is at least 1.0 (0.0 when empty)."""
        total = len(self.attempts)
        if total == 0:
            return 0.0
        passed = [attempt for attempt in self.attempts if attempt.hard >= 1.0]
        return len(passed) / total
def multi_rollout(
    backend: Backend,
    task: TaskRecord,
    skill: str,
    memory: str,
    *,
    k: int = 3,
) -> RolloutSet:
    """Replay ``task`` repeatedly and bundle the scored attempts.

    At least one attempt is always made, even if ``k`` is zero or negative.
    With the mock backend replay_one is deterministic; with real backends it
    is the model's own sampling that makes the attempts differ.
    """
    n_attempts = k if k > 1 else 1
    attempts = [replay_one(backend, task, skill, memory) for _ in range(n_attempts)]
    return RolloutSet(task=task, attempts=attempts)
def _text_or_empty(value: object) -> str:
    """Return ``value`` as stripped text, mapping ``None`` (JSON null) to ``""``."""
    return "" if value is None else str(value).strip()


def contrastive_reflect(
    backend: Backend,
    rollout_sets: List[RolloutSet],
    skill: str,
    memory: str,
    *,
    edit_budget: int = 4,
    target: str = "skill",
) -> List[EditRecord]:
    """Distill general rules from the contrast between good and bad attempts.

    Tasks whose attempts disagree (score *spread* > 0) are the informative
    ones. The (at most six) highest-spread tasks are shown to the optimizer as
    a best vs. worst attempt each, and it is asked for general rules that make
    the good behavior reliable.

    Args:
        backend: Optimizer backend; its ``_call(prompt, max_tokens=...)`` is
            used to obtain the raw reply.
        rollout_sets: Rollout sets to mine for contrast.
        skill: Accepted for interface parity; not used by this function.
        memory: Accepted for interface parity; not used by this function.
        edit_budget: Maximum number of edits requested and returned. A value
            of zero or less returns ``[]`` without calling the backend.
        target: Stored on every returned edit and named in the prompt.

    Returns:
        Up to ``edit_budget`` edits, in the order the optimizer listed them.
        Empty when no task has spread, or when the reply does not parse to a
        list.
    """
    # A non-positive budget can never yield an edit: skip the backend call
    # (a negative value would also mis-slice the reply from the end).
    if edit_budget <= 0:
        return []

    informative = [
        rs
        for rs in rollout_sets
        if rs.spread > 0 and rs.best is not None and rs.worst is not None
    ]
    informative.sort(key=lambda rs: rs.spread, reverse=True)
    informative = informative[:6]
    if not informative:
        return []

    blocks = []
    for rs in informative:
        # best/worst are computed properties; evaluate each once per task.
        good, bad = rs.best, rs.worst
        blocks.append(
            f"## Task: {rs.task.intent[:160]}\n"
            f"- GOOD attempt (score {good.hard:.1f}): {good.response[:200]}\n"
            f"- BAD attempt (score {bad.hard:.1f}): {bad.response[:200]}\n"
            f"  (bad failed: {bad.fail_reason[:100]})"
        )

    prompt = (
        "You are SkillOpt's optimizer doing CONTRASTIVE reflection. For each task "
        "below the agent was run multiple times; some attempts succeeded and some "
        "failed. Identify what the GOOD attempts did that the BAD ones did not, "
        f"and propose at most {edit_budget} SHORT, GENERAL, reusable rules for the "
        f"{target} that would make the good behavior reliable every time. Quote "
        "concrete thresholds/formats verbatim; do not paraphrase vaguely. "
        'Return ONLY a JSON array: '
        '[{"op":"add","content":"<rule>","rationale":"<what good did that bad didnt>"}].\n\n'
        + "\n\n".join(blocks)
    )
    raw = backend._call(prompt, max_tokens=1024)  # type: ignore[attr-defined]
    arr = _extract_json(raw, "array")
    if not isinstance(arr, list):
        return []

    edits: List[EditRecord] = []
    for entry in arr:
        # Count only valid entries against the budget so malformed items in
        # the reply do not crowd out usable rules that follow them.
        if len(edits) >= edit_budget:
            break
        if not isinstance(entry, dict):
            continue
        content = _text_or_empty(entry.get("content"))
        if not content:
            continue
        edits.append(EditRecord(
            target=target,
            # Missing, null or blank op falls back to "add", the only op the
            # prompt asks for.
            op=_text_or_empty(entry.get("op")).lower() or "add",
            content=content,
            rationale=_text_or_empty(entry.get("rationale")),
        ))
    return edits