mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
Remove every non-ASCII/CJK character for a professional open-source repo:
- harvest.py: drop hardcoded Chinese feedback phrases; add an env-based
extensibility hook (SKILLOPT_SLEEP_NEG_FEEDBACK / _POS_FEEDBACK) so any
locale can be added without baking one in. Verified with a German example.
- rollout.py / consolidate.py: English comments.
- README.md section heading + anchor, CONTROLLABLE_DREAMING.md, plugin.json,
marketplace.json (also fixed stale path skillopt-sleep-plugin ->
plugins/claude-code), SKILL.md: English only.
- Remove the internal WAKE_UP_SUMMARY.md note (not user-facing, not referenced).
Verified: zero CJK chars remain anywhere; 29 tests pass.
Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
123 lines
4.6 KiB
Python
123 lines
4.6 KiB
Python
"""SkillOpt-Sleep — multi-rollout + contrastive reflection (the imagination core).
|
|
|
|
The core idea: let the agent re-run the SAME task many times, then look at
|
|
which rollouts went well vs badly and distill a rule from the *contrast*. This
|
|
is a much stronger learning signal than a single failure, and it is the essence
|
|
of the offline "dream/imagination" process — train-time rollouts are synthetic,
|
|
so doing many is fine.
|
|
|
|
Pieces:
|
|
* multi_rollout — run one task K times under (skill, memory), return scored attempts
|
|
* contrastive_reflect — given good vs bad attempts of the same tasks, ask the
|
|
optimizer what distinguishes them and propose a general rule
|
|
|
|
Driven through the Backend abstraction (mock/claude/codex), import-light.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import List, Optional, Tuple
|
|
|
|
from skillopt_sleep.backend import Backend, _extract_json
|
|
from skillopt_sleep.replay import replay_one
|
|
from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
|
|
|
|
|
|
@dataclass
|
|
class RolloutSet:
|
|
"""K scored attempts at one task under a fixed (skill, memory)."""
|
|
task: TaskRecord
|
|
attempts: List[ReplayResult] = field(default_factory=list)
|
|
|
|
@property
|
|
def best(self) -> Optional[ReplayResult]:
|
|
return max(self.attempts, key=lambda r: r.hard, default=None)
|
|
|
|
@property
|
|
def worst(self) -> Optional[ReplayResult]:
|
|
return min(self.attempts, key=lambda r: r.hard, default=None)
|
|
|
|
@property
|
|
def spread(self) -> float:
|
|
if not self.attempts:
|
|
return 0.0
|
|
hs = [r.hard for r in self.attempts]
|
|
return max(hs) - min(hs)
|
|
|
|
@property
|
|
def pass_rate(self) -> float:
|
|
if not self.attempts:
|
|
return 0.0
|
|
return sum(1 for r in self.attempts if r.hard >= 1.0) / len(self.attempts)
|
|
|
|
|
|
def multi_rollout(
|
|
backend: Backend,
|
|
task: TaskRecord,
|
|
skill: str,
|
|
memory: str,
|
|
*,
|
|
k: int = 3,
|
|
) -> RolloutSet:
|
|
"""Run ``task`` K times. replay_one is deterministic for mock; for real
|
|
backends the model's own sampling yields variation across attempts."""
|
|
rs = RolloutSet(task=task)
|
|
for _ in range(max(1, k)):
|
|
rs.attempts.append(replay_one(backend, task, skill, memory))
|
|
return rs
|
|
|
|
|
|
def contrastive_reflect(
|
|
backend: Backend,
|
|
rollout_sets: List[RolloutSet],
|
|
skill: str,
|
|
memory: str,
|
|
*,
|
|
edit_budget: int = 4,
|
|
target: str = "skill",
|
|
) -> List[EditRecord]:
|
|
"""Distill a rule from the contrast between good and bad attempts.
|
|
|
|
We pick tasks with the highest score *spread* (some attempts passed, some
|
|
failed) — those are the most informative — and show the optimizer a
|
|
high-scoring vs a low-scoring attempt of each, asking what general rule makes
|
|
the good behavior reliable.
|
|
"""
|
|
informative = [rs for rs in rollout_sets if rs.spread > 0 and rs.best and rs.worst]
|
|
informative.sort(key=lambda rs: rs.spread, reverse=True)
|
|
informative = informative[:6]
|
|
if not informative:
|
|
return []
|
|
|
|
blocks = []
|
|
for rs in informative:
|
|
blocks.append(
|
|
f"## Task: {rs.task.intent[:160]}\n"
|
|
f"- GOOD attempt (score {rs.best.hard:.1f}): {rs.best.response[:200]}\n"
|
|
f"- BAD attempt (score {rs.worst.hard:.1f}): {rs.worst.response[:200]}\n"
|
|
f" (bad failed: {rs.worst.fail_reason[:100]})"
|
|
)
|
|
prompt = (
|
|
"You are SkillOpt's optimizer doing CONTRASTIVE reflection. For each task "
|
|
"below the agent was run multiple times; some attempts succeeded and some "
|
|
"failed. Identify what the GOOD attempts did that the BAD ones did not, "
|
|
f"and propose at most {edit_budget} SHORT, GENERAL, reusable rules for the "
|
|
f"{target} that would make the good behavior reliable every time. Quote "
|
|
"concrete thresholds/formats verbatim; do not paraphrase vaguely. "
|
|
'Return ONLY a JSON array: '
|
|
'[{"op":"add","content":"<rule>","rationale":"<what good did that bad didnt>"}].\n\n'
|
|
+ "\n\n".join(blocks)
|
|
)
|
|
raw = backend._call(prompt, max_tokens=1024) # type: ignore[attr-defined]
|
|
arr = _extract_json(raw, "array")
|
|
edits: List[EditRecord] = []
|
|
if isinstance(arr, list):
|
|
for e in arr[:edit_budget]:
|
|
if isinstance(e, dict) and str(e.get("content", "")).strip():
|
|
edits.append(EditRecord(
|
|
target=target, op=str(e.get("op", "add")).strip().lower(),
|
|
content=str(e["content"]).strip(),
|
|
rationale=str(e.get("rationale", "")).strip(),
|
|
))
|
|
return edits
|