mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
feat(sleep): slow-update long-term memory field (runs even with gate off)
Bring SkillOpt's epoch-wise slow/meta update (paper §3.6) into the sleep engine
as skillopt/sleep/slow_update.py — import-light, driven through the Backend
abstraction (mock/claude/codex):
- Reuses the main repo's protected-field markers
<!-- SLOW_UPDATE_START --> ... <!-- SLOW_UPDATE_END --> so the artifact is
compatible; step-level edits never touch this field.
- run_slow_update compares behavior under the first-night vs final skill across
the val tasks, groups into improved/regressed/persistent/stable, and asks the
optimizer to distill durable longitudinal guidance (refining prior text).
- Wired into run_gbrain.run_seed AFTER the nights loop, gated by slow_update=True
(and nights >= 2) and run REGARDLESS of gate_mode — this is what preserves long-term
memory even when the user turns the hard gate OFF (the user's slot_date=slow-update intent).
2 new tests (protected-field round-trip, stub-backend synthesis). 23 tests pass.
Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
@@ -45,6 +45,7 @@ def _score(backend, tasks, skill, memory, split="test", metric="mixed", w=0.5):
|
||||
|
||||
def run_seed(backend, seed: str, skill: str, tasks: List, *,
|
||||
nights: int = 3, edit_budget: int = 4, gate_mode: str = "on",
|
||||
slow_update: bool = True,
|
||||
limit_replay: int = 0, limit_holdout: int = 0) -> dict:
|
||||
memory = ""
|
||||
# optionally cap each split to control API cost / latency.
|
||||
@@ -63,6 +64,7 @@ def run_seed(backend, seed: str, skill: str, tasks: List, *,
|
||||
bh, bs, bscore = _score(backend, tasks, skill, memory, split="test")
|
||||
trace = [{"night": 0, "test_hard": round(bh, 3), "action": "baseline"}]
|
||||
cur = skill
|
||||
first_night_skill = skill
|
||||
for night in range(1, nights + 1):
|
||||
res = consolidate(
|
||||
backend, tasks, cur, memory,
|
||||
@@ -71,6 +73,8 @@ def run_seed(backend, seed: str, skill: str, tasks: List, *,
|
||||
)
|
||||
if res.accepted:
|
||||
cur = res.new_skill
|
||||
if night == 1:
|
||||
first_night_skill = cur
|
||||
# report the TEST score each night (independent of the val gate)
|
||||
th, _ts, _ = _score(backend, tasks, cur, memory, split="test")
|
||||
trace.append({
|
||||
@@ -83,6 +87,27 @@ def run_seed(backend, seed: str, skill: str, tasks: List, *,
|
||||
})
|
||||
if th >= 0.999:
|
||||
break
|
||||
|
||||
# ── SLOW UPDATE: consolidate cross-night experience into the protected
|
||||
# long-term field. Runs regardless of gate mode (it is what preserves
|
||||
# long-term memory even when the gate is OFF).
|
||||
slow_text = None
|
||||
if nights >= 2 and slow_update:
|
||||
try:
|
||||
from skillopt.sleep.slow_update import run_slow_update, replace_slow_field
|
||||
val_tasks = [t for t in tasks if t.split == "val"] or tasks
|
||||
prev_pairs = replay_batch(backend, val_tasks, first_night_skill, memory)
|
||||
curr_pairs = replay_batch(backend, val_tasks, cur, memory)
|
||||
slow_text = run_slow_update(
|
||||
backend, prev_skill=first_night_skill, curr_skill=cur,
|
||||
prev_pairs=[(t, r) for t, r in prev_pairs],
|
||||
curr_pairs=[(t, r) for t, r in curr_pairs],
|
||||
)
|
||||
if slow_text:
|
||||
cur = replace_slow_field(cur, slow_text)
|
||||
except Exception:
|
||||
slow_text = None
|
||||
|
||||
ah, as_, ascore = _score(backend, tasks, cur, memory, split="test")
|
||||
return {
|
||||
"seed": seed,
|
||||
@@ -91,6 +116,7 @@ def run_seed(backend, seed: str, skill: str, tasks: List, *,
|
||||
"improved": ah > bh,
|
||||
"nights": len(trace) - 1,
|
||||
"trace": trace,
|
||||
"slow_update": slow_text,
|
||||
"final_skill_tail": cur[-400:],
|
||||
}
|
||||
|
||||
|
||||
142
skillopt/sleep/slow_update.py
Normal file
142
skillopt/sleep/slow_update.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""SkillOpt-Sleep — slow update (cross-night long-term memory).
|
||||
|
||||
This is the deployment-time analogue of SkillOpt's epoch-wise slow/meta update
|
||||
(paper §3.6). Step-level edits (consolidate) learn from one night's batch; the
|
||||
slow update learns across nights and writes a durable "longitudinal guidance"
|
||||
block into a PROTECTED field of the skill that step-level edits never touch.
|
||||
|
||||
It reuses the exact protected-field marker convention from the main repo
|
||||
(``skillopt/optimizer/slow_update.py``) so the artifact is compatible:
|
||||
|
||||
<!-- SLOW_UPDATE_START --> ... <!-- SLOW_UPDATE_END -->
|
||||
|
||||
Why it matters: even when the user turns the validation gate OFF (greedy mode),
|
||||
the slow update still runs at the end of the run, so short-term nightly
|
||||
experience is consolidated into long-term memory rather than lost. The cross-night
|
||||
content is carried in ``state.slow_memory``.
|
||||
|
||||
Driven through the Backend abstraction (mock/claude/codex), so it stays
|
||||
import-light — no `openai` dependency.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from skillopt.sleep.backend import Backend, _extract_json
|
||||
from skillopt.sleep.types import ReplayResult, TaskRecord
|
||||
|
||||
|
||||
# Markers delimiting the protected long-term ("slow update") field of a skill.
SLOW_UPDATE_START = "<!-- SLOW_UPDATE_START -->"
SLOW_UPDATE_END = "<!-- SLOW_UPDATE_END -->"


# ── protected-field helpers (mirror skillopt/optimizer/slow_update.py) ─────────

def _find_slow_span(skill: str) -> Optional[Tuple[int, int]]:
    """Locate the first well-formed protected block in ``skill``.

    Returns ``(content_start, content_end)`` — the slice bounds of the text
    between the first START marker and the first END marker that follows it —
    or ``None`` when no such START/END pair exists.
    """
    start = skill.find(SLOW_UPDATE_START)
    if start == -1:
        return None
    content_start = start + len(SLOW_UPDATE_START)
    # Look for END only *after* START: a stray END earlier in the document
    # must not be paired with a later START.
    content_end = skill.find(SLOW_UPDATE_END, content_start)
    if content_end == -1:
        return None
    return content_start, content_end


def has_slow_field(skill: str) -> bool:
    """Return True when ``skill`` contains a START marker followed by an END marker."""
    return _find_slow_span(skill) is not None


def extract_slow_field(skill: str) -> str:
    """Return the stripped text of the first protected block, or ``""`` if none."""
    span = _find_slow_span(skill)
    if span is None:
        return ""
    content_start, content_end = span
    return skill[content_start:content_end].strip()


def _strip_slow_fields(skill: str) -> str:
    """Remove every protected block (and stray markers) from ``skill``.

    An unterminated START marker drops everything from that marker to the end
    of the text. Leftover END markers are deleted, runs of three or more
    newlines are collapsed to a single blank line, and trailing whitespace is
    trimmed.
    """
    while True:
        start = skill.find(SLOW_UPDATE_START)
        if start == -1:
            break
        end = skill.find(SLOW_UPDATE_END, start)
        if end == -1:
            # No closing marker: treat the rest of the document as the block.
            skill = skill[:start]
            break
        skill = skill[:start] + skill[end + len(SLOW_UPDATE_END):]
    skill = skill.replace(SLOW_UPDATE_END, "")
    skill = re.sub(r"\n{3,}", "\n\n", skill)
    return skill.rstrip()


def replace_slow_field(skill: str, content: str) -> str:
    """Set the protected slow-update field to ``content`` (exactly one block).

    Any existing protected blocks are removed first. Empty or whitespace-only
    ``content`` (or ``None``) yields the skill with no protected block at all.
    Otherwise a single block holding the stripped content is appended to the
    end of the skill.
    """
    base = _strip_slow_fields(skill)
    # The content may be model-generated; remove any marker text it contains
    # so the result keeps exactly one START and one END marker.
    body = (content or "").replace(SLOW_UPDATE_START, "").replace(SLOW_UPDATE_END, "")
    body = body.strip()
    if not body:
        return base
    return f"{base}\n\n{SLOW_UPDATE_START}\n{body}\n{SLOW_UPDATE_END}\n"
|
||||
|
||||
|
||||
# ── the slow-update synthesis ──────────────────────────────────────────────────
|
||||
|
||||
def _summarize_pairs(
    prev_pairs: List[Tuple[TaskRecord, ReplayResult]],
    curr_pairs: List[Tuple[TaskRecord, ReplayResult]],
    *,
    max_examples: int = 8,
) -> Tuple[str, dict]:
    """Compare per-task outcomes under two skill versions.

    Tasks are matched by ``task.id``; tasks in ``curr_pairs`` with no match in
    ``prev_pairs`` are ignored. Each matched task is classified by comparing
    the ``hard`` score of its previous and current result:

    * ``improved``        — current score is higher
    * ``regressed``       — current score is lower
    * ``stable_success``  — unchanged and current score >= 1.0
    * ``persistent_fail`` — unchanged and current score < 1.0

    Args:
        prev_pairs: ``(task, result)`` pairs replayed under the earlier skill.
        curr_pairs: ``(task, result)`` pairs replayed under the later skill.
        max_examples: Maximum number of regressed / persistent-fail example
            lines appended to the summary.

    Returns:
        ``(summary, counts)`` where ``summary`` is a ``"k=v, ..."`` header line
        optionally followed by one example line per problem task, and
        ``counts`` maps each category name to its tally.
    """
    prev_by_id = {task.id: result for task, result in prev_pairs}
    counts = {"improved": 0, "regressed": 0, "persistent_fail": 0, "stable_success": 0}
    examples: List[str] = []

    for task, result in curr_pairs:
        before = prev_by_id.get(task.id)
        if before is None:
            continue
        old_score, new_score = before.hard, result.hard
        if new_score > old_score:
            cat = "improved"
        elif new_score < old_score:
            cat = "regressed"
        elif new_score >= 1.0:
            cat = "stable_success"
        else:
            cat = "persistent_fail"
        counts[cat] += 1

        if cat in ("regressed", "persistent_fail") and len(examples) < max_examples:
            # Either field may be unset (None); render it as empty text
            # instead of raising while slicing.
            intent = str(task.intent or "")[:120]
            why = str(result.fail_reason or "")[:80]
            examples.append(f"- [{cat}] {intent} (why: {why})")

    head = ", ".join(f"{k}={v}" for k, v in counts.items())
    return "\n".join([head, *examples]), counts
|
||||
|
||||
|
||||
def run_slow_update(
    backend: Backend,
    *,
    prev_skill: str,
    curr_skill: str,
    prev_pairs: List[Tuple[TaskRecord, ReplayResult]],
    curr_pairs: List[Tuple[TaskRecord, ReplayResult]],
    prev_slow_content: str = "",
) -> Optional[str]:
    """Produce durable longitudinal guidance text (or None).

    Summarizes how the same tasks fared under the previous vs current skill
    and asks the backend to distill a short guidance block, refining any prior
    slow-update text.

    Args:
        backend: Object whose ``_call(prompt, max_tokens=...)`` returns the
            model's raw text reply.
        prev_skill: Earlier skill text. Accepted for interface symmetry; it is
            not included in the prompt.
        curr_skill: Later skill text. Likewise not included in the prompt.
        prev_pairs: ``(task, result)`` pairs replayed under ``prev_skill``.
        curr_pairs: ``(task, result)`` pairs replayed under ``curr_skill``.
        prev_slow_content: Existing long-term guidance to refine, if any.

    Returns:
        The guidance text, or ``None`` when there is nothing to consolidate or
        the backend produced no usable guidance.
    """
    summary, counts = _summarize_pairs(prev_pairs, curr_pairs)  # type: ignore[misc]
    # No regressions or persistent failures and no prior guidance to refine →
    # skip. Improvements alone do not trigger a backend call.
    if counts["regressed"] == 0 and counts["persistent_fail"] == 0 and not prev_slow_content:
        return None

    prompt = (
        "You are SkillOpt's SLOW UPDATE — the long-term memory pass that runs "
        "across nights. Write a SHORT, durable guidance block (2-5 bullet "
        "points) capturing the longitudinal lessons: behaviors that reliably "
        "help and should be preserved, and regressions/persistent failures to "
        "avoid. Keep it GENERAL and stable (not tied to one task). If prior "
        "guidance is given, refine it rather than restate it.\n"
        'Return ONLY JSON: {"guidance": "<bullet list as one string>"}.\n\n'
        f"# Cross-night outcome summary\n{summary}\n\n"
        f"# Prior long-term guidance (refine this)\n{prev_slow_content or '(none)'}"
    )
    raw = backend._call(prompt, max_tokens=600)  # type: ignore[attr-defined]
    text = (raw or "").strip()
    if not text:
        return None

    obj = _extract_json(raw, "object")
    if isinstance(obj, dict):
        guidance = obj.get("guidance")
        if isinstance(guidance, (list, tuple)):
            # The model returned the bullets as an array; join them into the
            # single string the prompt asked for.
            parts = [str(item).strip() for item in guidance if item is not None]
            guidance = "\n".join(part for part in parts if part)
        elif guidance is None:
            # JSON null must not become the literal text "None".
            guidance = ""
        guidance = str(guidance).strip()
        # A parsed object with no guidance means "nothing to say"; do not fall
        # back to storing the raw JSON text as guidance.
        return guidance or None

    # fallback: if the model returned prose, keep the first ~400 chars
    return text[:400]
|
||||
@@ -232,6 +232,49 @@ class TestLlmMiner(unittest.TestCase):
|
||||
self.assertEqual(make_llm_miner(EmptyBackend())([digest]), [])
|
||||
|
||||
|
||||
class TestSlowUpdate(unittest.TestCase):
    """Protected slow-update field helpers and guidance synthesis."""

    def test_protected_field_roundtrip(self):
        """Writing the field twice keeps one block and the hand-written text."""
        from skillopt.sleep import slow_update as su

        original = "# skill\nkeep me\n"

        first = su.replace_slow_field(original, "durable lesson A")
        self.assertTrue(su.has_slow_field(first))
        self.assertIn("keep me", first)
        self.assertEqual(su.extract_slow_field(first), "durable lesson A")

        # replacing keeps exactly one block and preserves hand-written text
        second = su.replace_slow_field(first, "durable lesson B")
        for marker in (su.SLOW_UPDATE_START, su.SLOW_UPDATE_END):
            self.assertEqual(second.count(marker), 1)
        self.assertEqual(su.extract_slow_field(second), "durable lesson B")
        self.assertIn("keep me", second)

    def test_run_slow_update_with_stub_backend(self):
        """Improvement alone yields None; a regression yields stub guidance."""
        from skillopt.sleep.backend import Backend
        from skillopt.sleep.slow_update import run_slow_update
        from skillopt.sleep.types import TaskRecord, ReplayResult

        class StubBackend(Backend):
            name = "stub"

            def _call(self, prompt, *, max_tokens=1024):
                return '{"guidance": "- keep doing X\\n- avoid regression Y"}'

        task = TaskRecord(id="t1", project="/p", intent="do thing")

        def pairs(hard):
            # One-task replay outcome with the given hard score.
            return [(task, ReplayResult(id="t1", hard=hard))]

        # was failing, now passing (improved): no regression/persistent-fail
        # and no prior text -> None
        improved = run_slow_update(
            StubBackend(), prev_skill="s0", curr_skill="s1",
            prev_pairs=pairs(0.0), curr_pairs=pairs(1.0),
        )
        self.assertIsNone(improved)

        # a regression triggers guidance
        regressed = run_slow_update(
            StubBackend(), prev_skill="s0", curr_skill="s1",
            prev_pairs=pairs(1.0), curr_pairs=pairs(0.0),
        )
        self.assertIn("keep doing X", regressed)
|
||||
|
||||
|
||||
class TestToolLoop(unittest.TestCase):
|
||||
def test_tool_called_judge_via_replay(self):
|
||||
from skillopt.sleep.backend import MockBackend
|
||||
|
||||
Reference in New Issue
Block a user