feat(sleep): multi-rollout contrastive reflection + token/time budget

The "脑补推演" (roughly: "imagined rehearsal") core the user described — re-run the
same task many times and learn from the contrast between good and bad rollouts:

  - rollout.py: multi_rollout(task, k) runs K scored attempts; RolloutSet exposes
    best/worst/spread/pass_rate. contrastive_reflect picks the highest-spread
    tasks (some attempts passed, some failed — most informative) and asks the
    optimizer what the GOOD attempts did that the BAD ones didn't, distilling a
    general rule. Far stronger signal than a single failure.
  - consolidate(rollouts_k>1) uses contrastive reflection (falls back to
    single-shot reflect if it yields nothing).
  - budget.py: Budget(max_tokens|max_minutes) tracks spend; plan_depth() derives
    (nights, rollouts_k) from a token budget. run_gbrain gains --rollouts-k,
    --budget-tokens, --budget-minutes (auto-plans depth).

3 new tests (rollout stats, budget+plan, contrastive stub). 26 tests pass.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
Yifan Yang
2026-06-08 14:31:51 +00:00
parent c179a24c45
commit 77ac33e8bf
5 changed files with 283 additions and 8 deletions

View File

@@ -232,6 +232,49 @@ class TestLlmMiner(unittest.TestCase):
self.assertEqual(make_llm_miner(EmptyBackend())([digest]), [])
class TestMultiRolloutAndBudget(unittest.TestCase):
    """Tests for RolloutSet statistics, Budget/plan_depth, and contrastive_reflect."""

    def test_rolloutset_stats(self):
        """RolloutSet exposes best, worst, spread and pass_rate over its attempts."""
        from skillopt.sleep.rollout import RolloutSet
        from skillopt.sleep.types import ReplayResult, TaskRecord

        # Three attempts at the same task: two score 1.0, one scores 0.0.
        task = TaskRecord(id="t", project="/p", intent="x")
        hard_scores = (1.0, 0.0, 1.0)
        rollouts = RolloutSet(
            task=task,
            attempts=[ReplayResult(id="t", hard=score) for score in hard_scores],
        )

        self.assertEqual(rollouts.best.hard, 1.0)
        self.assertEqual(rollouts.worst.hard, 0.0)
        self.assertEqual(rollouts.spread, 1.0)
        self.assertAlmostEqual(rollouts.pass_rate, 2 / 3)

    def test_budget_exhaustion_and_plan(self):
        """Budget flags exhaustion at the token cap; plan_depth returns (nights, k)."""
        from skillopt.sleep.budget import Budget, plan_depth

        # Frozen fake clock: the time never advances in this test, so only the
        # token count passed to exhausted() varies between the two checks.
        clock = [0.0]

        def read_clock():
            return clock[0]

        budget = Budget(max_tokens=1000)
        budget.start(read_clock, tokens_now=0)
        self.assertFalse(budget.exhausted(tokens_now=500, clock_fn=read_clock))
        self.assertTrue(budget.exhausted(tokens_now=1000, clock_fn=read_clock))

        # A Budget constructed with no limits hands back the supplied defaults.
        default_plan = plan_depth(
            Budget(), n_tasks=5, default_nights=2, default_k=1
        )
        self.assertEqual(default_plan, (2, 1))

        # With a token cap, both planned values must be at least 1.
        planned_nights, planned_k = plan_depth(
            Budget(max_tokens=100_000), n_tasks=5
        )
        self.assertGreaterEqual(planned_nights, 1)
        self.assertGreaterEqual(planned_k, 1)

    def test_contrastive_reflect_with_stub(self):
        """contrastive_reflect yields one edit built from the stub backend's reply."""
        from skillopt.sleep.backend import Backend
        from skillopt.sleep.rollout import RolloutSet, contrastive_reflect
        from skillopt.sleep.types import ReplayResult, TaskRecord

        # Fixed reply the stub returns for every prompt: a single "add" edit.
        canned_reply = '[{"op":"add","content":"always do the good thing","rationale":"good passed"}]'

        class StubBackend(Backend):
            """Backend whose _call ignores the prompt and returns a fixed reply."""

            name = "stub"

            def _call(self, prompt, *, max_tokens=1024):
                return canned_reply

        # One attempt scored 1.0 and one scored 0.0 for the same task.
        task = TaskRecord(id="t", project="/p", intent="x")
        passing = ReplayResult(id="t", hard=1.0, response="good")
        failing = ReplayResult(id="t", hard=0.0, response="bad")
        rollouts = RolloutSet(task=task, attempts=[passing, failing])

        edits = contrastive_reflect(StubBackend(), [rollouts], "skill", "")

        self.assertEqual(len(edits), 1)
        self.assertIn("good thing", edits[0].content)
class TestSlowUpdate(unittest.TestCase):
def test_protected_field_roundtrip(self):
from skillopt.sleep.slow_update import (