From 722ce646d43eeebb2e7016880439bdde7b23d91d Mon Sep 17 00:00:00 2001 From: Yifan Yang Date: Mon, 15 Jun 2026 15:58:27 +0000 Subject: [PATCH] feat(sleep): experience replay + dream rollouts in the cycle (opt-in) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires two consolidation mechanisms into the shipped nightly cycle, both default OFF so existing behavior is unchanged: - dream_rollouts (>1): multi-rollout contrastive reflection per task - recall_k (>0): associative recall of the K most-similar past tasks (from a capped task_archive persisted in state.json) into tonight's dream - dream_factor (>0): synthetic task variants New shared engine module skillopt_sleep/dream.py (recall_similar, dream_augment, dream_consolidate) is called by both the plugin cycle and the experiment harness, so reported numbers exercise the exact shipped code. Built on the existing rollouts_k/sample_id support already in consolidate.py/rollout.py. Validated (5 nights x 10 real tasks/night, full held-out test, GPT-5.5, gated): the gain scales with recall depth on a clean signal — SearchQA recall_k=10 +3.1, recall_k=20 +4.5, full-history reference +5.6; SpreadsheetBench (nano, gate-free) +3.6. Flat within noise on saturated/noisy cells. See docs/sleep/EXPERIENCE_REPLAY.md (+ raw runs under blog_runs/v2_port/). 
Co-Authored-By: Claude Opus 4 --- docs/sleep/EXPERIENCE_REPLAY.md | 64 ++++++++ .../blog_runs/v2_port/conf_ss_nano_free.json | 94 ++++++++++++ .../v2_port/imp_cumulative_gate.json | 94 ++++++++++++ .../blog_runs/v2_port/imp_recall20_gate.json | 94 ++++++++++++ .../blog_runs/v2_port/imp_rollouts8_gate.json | 94 ++++++++++++ .../blog_runs/v2_port/parity_sq_g55_free.json | 94 ++++++++++++ .../blog_runs/v2_port/parity_sq_g55_gate.json | 94 ++++++++++++ skillopt_sleep/config.py | 4 + skillopt_sleep/cycle.py | 20 ++- skillopt_sleep/dream.py | 138 ++++++++++++++++++ skillopt_sleep/state.py | 13 ++ 11 files changed, 800 insertions(+), 3 deletions(-) create mode 100644 docs/sleep/EXPERIENCE_REPLAY.md create mode 100644 docs/sleep/blog_runs/v2_port/conf_ss_nano_free.json create mode 100644 docs/sleep/blog_runs/v2_port/imp_cumulative_gate.json create mode 100644 docs/sleep/blog_runs/v2_port/imp_recall20_gate.json create mode 100644 docs/sleep/blog_runs/v2_port/imp_rollouts8_gate.json create mode 100644 docs/sleep/blog_runs/v2_port/parity_sq_g55_free.json create mode 100644 docs/sleep/blog_runs/v2_port/parity_sq_g55_gate.json create mode 100644 skillopt_sleep/dream.py diff --git a/docs/sleep/EXPERIENCE_REPLAY.md b/docs/sleep/EXPERIENCE_REPLAY.md new file mode 100644 index 0000000..40157a5 --- /dev/null +++ b/docs/sleep/EXPERIENCE_REPLAY.md @@ -0,0 +1,64 @@ +# SkillOpt-Sleep — experience replay & dream rollouts (opt-in) + +Three opt-in knobs that strengthen the nightly consolidation when your tasks +have a clean correctness signal. All of them default **off**, so behavior is +unchanged until you enable one. + +## What they do + +| Config knob | Default | Effect | +|---|---|---| +| `dream_rollouts` | `1` | Run each task **K** times and learn from the *contrast* between the good and bad attempts (contrastive reflection) instead of a single failure. 
| +| `recall_k` | `0` | **Associative recall** — each night, pull the `K` past tasks most similar to tonight's new ones (from a persisted task archive) into the dream, so related experience is revisited without replaying the whole history. | +| `dream_factor` | `0` | Add `N` lightweight synthetic variants of each task to the training pool. | + +The validation gate still governs what ships, so these only ever *enlarge the +signal the optimizer reflects on* — the held-out gate decides what is kept. + +## How to enable + +```jsonc +// ~/.skillopt-sleep/config.json (or pass via the plugin's config) +{ + "dream_rollouts": 5, // contrastive dreaming + "recall_k": 20, // recall ~20 similar past tasks each night + "gate_mode": "on" // keep the gate on (recommended) +} +``` + +`recall_k` draws from a capped `task_archive` that the cycle persists in +`state.json`, so recall becomes useful from the second night onward (once there +is history to recall from). + +## Measured effect + +Deployment protocol (5 nights × 10 new real tasks/night, full held-out test +sets, GPT-5.5 optimizer), run through the **same engine the plugin executes** +(`skillopt_sleep.dream.dream_consolidate`): + +**SearchQA (GPT-5.5, full 1,400-item test, gated) — the gain scales with recall depth:** + +| Config | Δ vs baseline | +|---|---| +| `recall_k=10, dream_rollouts=5` | +3.1 | +| `recall_k=10, dream_rollouts=8` | +3.7 | +| **`recall_k=20, dream_rollouts=5`** | **+4.5** | +| full-history replay (reference) | +5.6 | + +**Second-benchmark confirmation** (SpreadsheetBench, GPT-5.4-nano, gate-free, +shipped path): 0.279 → **0.314 (+3.6)**. + +## When it helps — and when it doesn't + +- **Helps** when tasks recur and have a checkable correctness signal (the + optimizer has something real to learn and the gate can verify it). +- **Roughly flat** on saturated or noisy tasks (e.g. a strong model already near + ceiling) — within run-to-run noise (±1–2 points, single seed). 
+- The validation gate keeps the downside bounded; keep it on by default. + +Trade-off: `dream_rollouts > 1` multiplies the per-night rollout cost (K×), and +`recall_k > 0` adds the recalled tasks to each night's replay. Since the cycle +runs offline on idle quota this is usually acceptable, but budget accordingly +(`budget_tokens` / `budget_seconds`). + +Raw per-run results for the table above: `docs/sleep/blog_runs/v2_port/`. diff --git a/docs/sleep/blog_runs/v2_port/conf_ss_nano_free.json b/docs/sleep/blog_runs/v2_port/conf_ss_nano_free.json new file mode 100644 index 0000000..3ff61c0 --- /dev/null +++ b/docs/sleep/blog_runs/v2_port/conf_ss_nano_free.json @@ -0,0 +1,94 @@ +{ + "experiment": "skillopt-sleep/nightly", + "model": "gpt-5.4-nano", + "results": [ + { + "benchmark": "spreadsheet", + "gate": "off", + "replay_mode": "retrieval", + "retrieve_k": 10, + "nights": 5, + "per_night": 10, + "rollouts": 5, + "n_val": 40, + "n_test": 280, + "test_baseline": 0.2786, + "test_final": 0.3143, + "delta": 0.0357, + "progression": [ + 0.2786, + 0.3036, + 0.3143, + 0.3107, + 0.3179, + 0.3143 + ], + "nights_log": [ + { + "night": 0, + "n_train": 0, + "test_hard": 0.2786, + "action": "baseline", + "accepted": false + }, + { + "night": 1, + "n_train": 10, + "n_replayed": 0, + "n_dream": 20, + "val_hard": 0.0, + "test_hard": 0.3036, + "action": "greedy_applied", + "accepted": true, + "n_edits": 4 + }, + { + "night": 2, + "n_train": 20, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.0, + "test_hard": 0.3143, + "action": "greedy_applied", + "accepted": true, + "n_edits": 4 + }, + { + "night": 3, + "n_train": 30, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.0, + "test_hard": 0.3107, + "action": "greedy_applied", + "accepted": true, + "n_edits": 4 + }, + { + "night": 4, + "n_train": 40, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.0, + "test_hard": 0.3179, + "action": "greedy_applied", + "accepted": true, + "n_edits": 4 + }, + { + "night": 5, + "n_train": 
50, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.0, + "test_hard": 0.3143, + "action": "greedy_applied", + "accepted": true, + "n_edits": 4 + } + ], + "tokens": 13587597, + "final_skill_tail": "t/headers rather than hardcoding specific cell coordinates or values.\n- When searching for specific text, use an exact match check on the cell string, e.g. `if cell_value == \"Georgia Its Tax\": ...` (not partial regex, not truncated comparisons).\n- If a cell contains multiple tokens separated by semicolons, split and normalize before comparing: `parts = [p.strip() for p in str(cell_value).split(';') if p.strip()]` and then test membership/lookup using `parts`.\n\n" + } + ] +} \ No newline at end of file diff --git a/docs/sleep/blog_runs/v2_port/imp_cumulative_gate.json b/docs/sleep/blog_runs/v2_port/imp_cumulative_gate.json new file mode 100644 index 0000000..c0acc8a --- /dev/null +++ b/docs/sleep/blog_runs/v2_port/imp_cumulative_gate.json @@ -0,0 +1,94 @@ +{ + "experiment": "skillopt-sleep/nightly", + "model": "gpt-5.5", + "results": [ + { + "benchmark": "searchqa", + "gate": "on", + "replay_mode": "cumulative", + "retrieve_k": 0, + "nights": 5, + "per_night": 10, + "rollouts": 5, + "n_val": 60, + "n_test": 1400, + "test_baseline": 0.7957, + "test_final": 0.8514, + "delta": 0.0557, + "progression": [ + 0.7957, + 0.8336, + 0.8514, + 0.8514, + 0.8514, + 0.8514 + ], + "nights_log": [ + { + "night": 0, + "n_train": 0, + "test_hard": 0.7957, + "action": "baseline", + "accepted": false + }, + { + "night": 1, + "n_train": 10, + "n_replayed": 0, + "n_dream": 20, + "val_hard": 0.85, + "test_hard": 0.8336, + "action": "accept_new_best", + "accepted": true, + "n_edits": 2 + }, + { + "night": 2, + "n_train": 20, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.9, + "test_hard": 0.8514, + "action": "accept_new_best", + "accepted": true, + "n_edits": 3 + }, + { + "night": 3, + "n_train": 30, + "n_replayed": 20, + "n_dream": 60, + "val_hard": 0.9, + "test_hard": 0.8514, + 
"action": "reject", + "accepted": false, + "n_edits": 0 + }, + { + "night": 4, + "n_train": 40, + "n_replayed": 30, + "n_dream": 80, + "val_hard": 0.9, + "test_hard": 0.8514, + "action": "reject", + "accepted": false, + "n_edits": 0 + }, + { + "night": 5, + "n_train": 50, + "n_replayed": 40, + "n_dream": 100, + "val_hard": 0.9, + "test_hard": 0.8514, + "action": "reject", + "accepted": false, + "n_edits": 0 + } + ], + "tokens": 15132599, + "final_skill_tail": " the title or key sentence over a county, institution, or category.\n- Return the shortest exact answer span that satisfies the question, inside ...; prefer a single-word entity when sufficient.\n- Do not expand a context-supported short name into a fuller name unless the question specifically requires the full name.\n- Match the requested answer type exactly: for a country/nation answer, output only the country name, not a title or role phrase.\n\n" + } + ] +} \ No newline at end of file diff --git a/docs/sleep/blog_runs/v2_port/imp_recall20_gate.json b/docs/sleep/blog_runs/v2_port/imp_recall20_gate.json new file mode 100644 index 0000000..54c60e2 --- /dev/null +++ b/docs/sleep/blog_runs/v2_port/imp_recall20_gate.json @@ -0,0 +1,94 @@ +{ + "experiment": "skillopt-sleep/nightly", + "model": "gpt-5.5", + "results": [ + { + "benchmark": "searchqa", + "gate": "on", + "replay_mode": "retrieval", + "retrieve_k": 20, + "nights": 5, + "per_night": 10, + "rollouts": 5, + "n_val": 60, + "n_test": 1400, + "test_baseline": 0.8029, + "test_final": 0.8479, + "delta": 0.045, + "progression": [ + 0.8029, + 0.8236, + 0.8236, + 0.8479, + 0.8479, + 0.8479 + ], + "nights_log": [ + { + "night": 0, + "n_train": 0, + "test_hard": 0.8029, + "action": "baseline", + "accepted": false + }, + { + "night": 1, + "n_train": 10, + "n_replayed": 0, + "n_dream": 20, + "val_hard": 0.8667, + "test_hard": 0.8236, + "action": "accept_new_best", + "accepted": true, + "n_edits": 2 + }, + { + "night": 2, + "n_train": 20, + "n_replayed": 10, + 
"n_dream": 40, + "val_hard": 0.8667, + "test_hard": 0.8236, + "action": "reject", + "accepted": false, + "n_edits": 0 + }, + { + "night": 3, + "n_train": 30, + "n_replayed": 20, + "n_dream": 60, + "val_hard": 0.8833, + "test_hard": 0.8479, + "action": "accept_new_best", + "accepted": true, + "n_edits": 3 + }, + { + "night": 4, + "n_train": 40, + "n_replayed": 20, + "n_dream": 60, + "val_hard": 0.8833, + "test_hard": 0.8479, + "action": "reject", + "accepted": false, + "n_edits": 0 + }, + { + "night": 5, + "n_train": 50, + "n_replayed": 20, + "n_dream": 60, + "val_hard": 0.8833, + "test_hard": 0.8479, + "action": "reject", + "accepted": false, + "n_edits": 0 + } + ], + "tokens": 15596999, + "final_skill_tail": " Put only the shortest exact answer span in the final '...' tags; remove extra descriptors, categories, titles, and surrounding words.\n- If the question asks for a country/place from a phrase like 'King of Spain' or a title like 'Ferdinand VII of Spain', answer only the place name, e.g. 
'Spain'.\n- For person answers, use the minimal unambiguous name supported by the clue; do not expand a surname to a full name unless the question requires it.\n\n" + } + ] +} \ No newline at end of file diff --git a/docs/sleep/blog_runs/v2_port/imp_rollouts8_gate.json b/docs/sleep/blog_runs/v2_port/imp_rollouts8_gate.json new file mode 100644 index 0000000..e41afbd --- /dev/null +++ b/docs/sleep/blog_runs/v2_port/imp_rollouts8_gate.json @@ -0,0 +1,94 @@ +{ + "experiment": "skillopt-sleep/nightly", + "model": "gpt-5.5", + "results": [ + { + "benchmark": "searchqa", + "gate": "on", + "replay_mode": "retrieval", + "retrieve_k": 10, + "nights": 5, + "per_night": 10, + "rollouts": 8, + "n_val": 60, + "n_test": 1400, + "test_baseline": 0.7979, + "test_final": 0.835, + "delta": 0.0371, + "progression": [ + 0.7979, + 0.8179, + 0.835, + 0.835, + 0.835, + 0.835 + ], + "nights_log": [ + { + "night": 0, + "n_train": 0, + "test_hard": 0.7979, + "action": "baseline", + "accepted": false + }, + { + "night": 1, + "n_train": 10, + "n_replayed": 0, + "n_dream": 20, + "val_hard": 0.8667, + "test_hard": 0.8179, + "action": "accept_new_best", + "accepted": true, + "n_edits": 2 + }, + { + "night": 2, + "n_train": 20, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.8833, + "test_hard": 0.835, + "action": "accept_new_best", + "accepted": true, + "n_edits": 3 + }, + { + "night": 3, + "n_train": 30, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.8833, + "test_hard": 0.835, + "action": "reject", + "accepted": false, + "n_edits": 0 + }, + { + "night": 4, + "n_train": 40, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.8833, + "test_hard": 0.835, + "action": "reject", + "accepted": false, + "n_edits": 0 + }, + { + "night": 5, + "n_train": 50, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.8833, + "test_hard": 0.835, + "action": "reject", + "accepted": false, + "n_edits": 0 + } + ], + "tokens": 16846499, + "final_skill_tail": "less the question asks for the title 
itself.\n- Always put only the final answer in \"...\" and keep it \"concise -- typically a few words or a short phrase\".\n- Use the shortest sufficient answer span; do not add first names, modifiers, counties, countries, or parent locations unless explicitly required.\n- Match the question’s granularity exactly: if it asks for a state, give only the state; if it asks for a term’s meaning, give only the meaning.\n\n" + } + ] +} \ No newline at end of file diff --git a/docs/sleep/blog_runs/v2_port/parity_sq_g55_free.json b/docs/sleep/blog_runs/v2_port/parity_sq_g55_free.json new file mode 100644 index 0000000..b501c65 --- /dev/null +++ b/docs/sleep/blog_runs/v2_port/parity_sq_g55_free.json @@ -0,0 +1,94 @@ +{ + "experiment": "skillopt-sleep/nightly", + "model": "gpt-5.5", + "results": [ + { + "benchmark": "searchqa", + "gate": "off", + "replay_mode": "retrieval", + "retrieve_k": 10, + "nights": 5, + "per_night": 10, + "rollouts": 5, + "n_val": 60, + "n_test": 1400, + "test_baseline": 0.8079, + "test_final": 0.8393, + "delta": 0.0314, + "progression": [ + 0.8079, + 0.8321, + 0.84, + 0.8436, + 0.84, + 0.8393 + ], + "nights_log": [ + { + "night": 0, + "n_train": 0, + "test_hard": 0.8079, + "action": "baseline", + "accepted": false + }, + { + "night": 1, + "n_train": 10, + "n_replayed": 0, + "n_dream": 20, + "val_hard": 0.0, + "test_hard": 0.8321, + "action": "greedy_applied", + "accepted": true, + "n_edits": 3 + }, + { + "night": 2, + "n_train": 20, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.0, + "test_hard": 0.84, + "action": "greedy_applied", + "accepted": true, + "n_edits": 1 + }, + { + "night": 3, + "n_train": 30, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.0, + "test_hard": 0.8436, + "action": "greedy_applied", + "accepted": true, + "n_edits": 2 + }, + { + "night": 4, + "n_train": 40, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.0, + "test_hard": 0.84, + "action": "greedy_applied", + "accepted": true, + "n_edits": 3 + }, + { + "night": 
5, + "n_train": 50, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.0, + "test_hard": 0.8393, + "action": "greedy_applied", + "accepted": true, + "n_edits": 2 + } + ], + "tokens": 27990836, + "final_skill_tail": "Sultan of Brunei\".\n- For author/creator questions from titles like \"Trees by Joyce Kilmer\", output only the creator name, e.g. \"Joyce Kilmer\", not the work title.\n- Do not introduce diacritics or alternate spellings not present in the context/title; prefer the ASCII surface form such as \"Vaclav Havel\" over \"Václav Havel\".\n- Return the full canonical entity name from the context/title, including hyphens, e.g. \"Winnie-the-Pooh\" rather than the shortened \"Pooh\".\n\n" + } + ] +} \ No newline at end of file diff --git a/docs/sleep/blog_runs/v2_port/parity_sq_g55_gate.json b/docs/sleep/blog_runs/v2_port/parity_sq_g55_gate.json new file mode 100644 index 0000000..d0f66cb --- /dev/null +++ b/docs/sleep/blog_runs/v2_port/parity_sq_g55_gate.json @@ -0,0 +1,94 @@ +{ + "experiment": "skillopt-sleep/nightly", + "model": "gpt-5.5", + "results": [ + { + "benchmark": "searchqa", + "gate": "on", + "replay_mode": "retrieval", + "retrieve_k": 10, + "nights": 5, + "per_night": 10, + "rollouts": 5, + "n_val": 60, + "n_test": 1400, + "test_baseline": 0.8021, + "test_final": 0.8336, + "delta": 0.0315, + "progression": [ + 0.8021, + 0.83, + 0.8336, + 0.8336, + 0.8336, + 0.8336 + ], + "nights_log": [ + { + "night": 0, + "n_train": 0, + "test_hard": 0.8021, + "action": "baseline", + "accepted": false + }, + { + "night": 1, + "n_train": 10, + "n_replayed": 0, + "n_dream": 20, + "val_hard": 0.8667, + "test_hard": 0.83, + "action": "accept_new_best", + "accepted": true, + "n_edits": 4 + }, + { + "night": 2, + "n_train": 20, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.9, + "test_hard": 0.8336, + "action": "accept_new_best", + "accepted": true, + "n_edits": 4 + }, + { + "night": 3, + "n_train": 30, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.9, + 
"test_hard": 0.8336, + "action": "reject", + "accepted": false, + "n_edits": 0 + }, + { + "night": 4, + "n_train": 40, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.9, + "test_hard": 0.8336, + "action": "reject", + "accepted": false, + "n_edits": 0 + }, + { + "night": 5, + "n_train": 50, + "n_replayed": 10, + "n_dream": 40, + "val_hard": 0.9, + "test_hard": 0.8336, + "action": "reject", + "accepted": false, + "n_edits": 0 + } + ], + "tokens": 15946118, + "final_skill_tail": "roperty; do not substitute a broader category or page title.\n- For location questions asking for a state/country, output only that level, e.g. \"Maryland\", not the full hierarchy \"Baltimore County, Maryland, United States\".\n- For name-part questions such as surname/last name, output only that part, e.g. \"Genet\", not the full name \"Jean Genet\".\n- Put only the concise final answer inside \"...\"; avoid extra modifiers, lists, or explanatory words.\n\n" + } + ] +} \ No newline at end of file diff --git a/skillopt_sleep/config.py b/skillopt_sleep/config.py index 9ddeff7..0bfb5a2 100644 --- a/skillopt_sleep/config.py +++ b/skillopt_sleep/config.py @@ -44,6 +44,10 @@ DEFAULTS: Dict[str, Any] = { "gate_metric": "mixed", # hard | soft | mixed (mixed best for tiny holdouts) "gate_mixed_weight": 0.5, "replay_mode": "mock", # "mock" (sandboxed prompt) | "fresh" (worktree) + # ── dream + recall (opt-in; defaults reproduce the prior single-shot loop) ─ + "dream_rollouts": 1, # >1 => multi-rollout contrastive reflection per task + "dream_factor": 0, # >0 => add N synthetic variants of each task to the dream + "recall_k": 0, # >0 => recall the K most-similar past tasks into the dream "evolve_memory": True, # consolidate CLAUDE.md "evolve_skill": True, # consolidate the managed SKILL.md "llm_mine": True, # use the backend to mine checkable tasks (real backends) diff --git a/skillopt_sleep/cycle.py b/skillopt_sleep/cycle.py index c9f8a28..9ddd666 100644 --- a/skillopt_sleep/cycle.py +++ 
b/skillopt_sleep/cycle.py @@ -15,7 +15,7 @@ from typing import List, Optional from skillopt_sleep.backend import get_backend from skillopt_sleep.config import SleepConfig, load_config -from skillopt_sleep.consolidate import consolidate +from skillopt_sleep.dream import dream_consolidate from skillopt_sleep.harvest_sources import harvest_for_config from skillopt_sleep.memory import ensure_skill_scaffold from skillopt_sleep.mine import mine @@ -167,9 +167,21 @@ def run_sleep_cycle( staging_dir = "" return CycleOutcome(report, staging_dir, False, []) - # ── 3+4. replay + consolidate (gate) ───────────────────────────────── - result = consolidate( + # ── 3+4. replay + consolidate (gate), with opt-in dream + recall ────── + # recall pulls similar past tasks from the persisted archive; dream_rollouts + # / dream_factor enrich the training signal. With the defaults (recall_k=0, + # dream_rollouts=1, dream_factor=0) this is exactly the prior single-shot + # consolidate — behavior is unchanged unless the user opts in. 
+ recall_k = int(cfg.get("recall_k", 0) or 0) + history_tasks = [] + if recall_k > 0: + history_tasks = [TaskRecord.from_dict(d) for d in state.task_archive()] + result = dream_consolidate( backend, tasks, skill, memory, + history_tasks=history_tasks, + recall_k=recall_k, + dream_rollouts=int(cfg.get("dream_rollouts", 1) or 1), + dream_factor=int(cfg.get("dream_factor", 0) or 0), edit_budget=cfg.get("edit_budget", 4), gate_metric=cfg.get("gate_metric", "mixed"), gate_mixed_weight=cfg.get("gate_mixed_weight", 0.5), @@ -178,6 +190,8 @@ def run_sleep_cycle( evolve_memory=cfg.get("evolve_memory", True), night=night, ) + # archive tonight's real (non-dream) tasks so future nights can recall them + state.add_to_archive([t.to_dict() for t in tasks if t.origin != "dream"]) report.n_replayed = len(tasks) report.baseline_score = result.baseline_score diff --git a/skillopt_sleep/dream.py b/skillopt_sleep/dream.py new file mode 100644 index 0000000..28ee79c --- /dev/null +++ b/skillopt_sleep/dream.py @@ -0,0 +1,138 @@ +"""SkillOpt-Sleep — dream + associative recall for nightly consolidation. + +Two opt-in mechanisms (both default OFF, so the cycle is unchanged unless the +user enables them) that the deployment experiments validated: + + * dream rollouts — run each task K times and learn from the good-vs-bad + contrast (set ``dream_rollouts > 1``). Stronger signal than one failure. + * associative recall — each night, pull the K past tasks most similar to + tonight's new ones into the dream (set ``recall_k > 0``). Replays relevant + experience without re-running the whole history. + +``dream_consolidate`` wires recall + synthetic augmentation + multi-rollout +consolidation and is called by BOTH the shipped plugin cycle and the benchmark +experiment harness, so the reported numbers exercise the exact code the plugin +runs. Pure-stdlib, zero research/private dependency. 
+""" +from __future__ import annotations + +import re +from typing import List, Optional + +from skillopt_sleep.consolidate import ConsolidationResult, consolidate +from skillopt_sleep.types import TaskRecord + + +# ── synthetic augmentation ("dream up" variants of today's tasks) ───────────── + +_WRAPPERS = [ + "(quick one) {q}", + "Please handle this request: {q}", + "For the daily report: {q}", +] + + +def dream_augment(real_tasks: List[TaskRecord], *, factor: int = 1) -> List[TaskRecord]: + """Create synthetic TRAIN variants of real tasks (origin='dream'). + + A light, deterministic rephrasing. Dream tasks are training-only — they + carry split='train' and never enter the val/test slices the gate scores on. + """ + out: List[TaskRecord] = [] + for t in real_tasks: + for k in range(max(0, factor)): + w = _WRAPPERS[k % len(_WRAPPERS)] + out.append(TaskRecord( + id=f"{t.id}_dream{k}", project=t.project, + intent=w.format(q=t.intent), context_excerpt=t.context_excerpt, + reference_kind=t.reference_kind, reference=t.reference, + judge=dict(t.judge), system=t.system, + tags=list(t.tags) + ["dream"], split="train", + origin="dream", derived_from=t.id, + )) + return out + + +# ── associative recall (experience replay of similar past tasks) ────────────── + +def _tokens(text: str) -> set: + return {w for w in re.findall(r"[a-z0-9]+", (text or "").lower()) if len(w) > 2} + + +def recall_similar(new_tasks: List[TaskRecord], history: List[TaskRecord], + k: int) -> List[TaskRecord]: + """Return the ``k`` historical tasks most lexically similar to any of + tonight's ``new_tasks`` (max Jaccard token overlap). Recalled tasks are + returned as training material (split='train'); deterministic, stdlib-only. 
+ """ + if not history or k <= 0 or not new_tasks: + return [] + new_tok = [_tokens(t.intent) for t in new_tasks] + new_ids = {t.id for t in new_tasks} + scored = [] + for h in history: + if h.id in new_ids: + continue + ht = _tokens(h.intent) + if not ht: + continue + sim = max(((len(ht & nt) / len(ht | nt)) if (ht | nt) else 0.0) for nt in new_tok) + scored.append((sim, h.id, h)) + scored.sort(key=lambda x: (-x[0], x[1])) + out = [] + for sim, _id, h in scored[:max(0, k)]: + if sim <= 0.0: + break + # recall as training material; copy so the source archive is untouched + out.append(TaskRecord( + id=f"recall:{h.id}", project=h.project, intent=h.intent, + context_excerpt=h.context_excerpt, reference_kind=h.reference_kind, + reference=h.reference, judge=dict(h.judge), system=h.system, + tags=list(h.tags) + ["recall"], split="train", origin="real", + derived_from=h.id, + )) + return out + + +# ── the shared nightly consolidation step ───────────────────────────────────── + +def dream_consolidate( + backend, + tasks: List[TaskRecord], + skill: str, + memory: str, + *, + history_tasks: Optional[List[TaskRecord]] = None, + recall_k: int = 0, + dream_rollouts: int = 1, + dream_factor: int = 0, + edit_budget: int = 4, + gate_metric: str = "mixed", + gate_mixed_weight: float = 0.5, + gate_mode: str = "on", + evolve_skill: bool = True, + evolve_memory: bool = True, + night: int = 1, +) -> ConsolidationResult: + """Recall similar past experience + dream synthetic variants, then run one + gated consolidation epoch over the enlarged training pool. + + ``tasks`` is the split-tagged pool for tonight (train + val); recall and + augmentation only enlarge the TRAIN split, so the val slice the gate scores + on is never polluted. With ``recall_k=0`` and ``dream_rollouts=1`` (the + defaults) this is exactly the previous single-shot ``consolidate``. 
+ """ + train = [t for t in tasks if t.split == "train"] + enlarged = list(tasks) + if recall_k > 0 and history_tasks: + enlarged += recall_similar(train, history_tasks, recall_k) + if dream_factor > 0: + seed = [t for t in enlarged if t.split == "train" and t.origin != "dream"] + enlarged += dream_augment(seed, factor=dream_factor) + return consolidate( + backend, enlarged, skill, memory, + edit_budget=edit_budget, gate_metric=gate_metric, + gate_mixed_weight=gate_mixed_weight, gate_mode=gate_mode, + rollouts_k=dream_rollouts, evolve_skill=evolve_skill, + evolve_memory=evolve_memory, night=night, + ) diff --git a/skillopt_sleep/state.py b/skillopt_sleep/state.py index 1909246..1e16157 100644 --- a/skillopt_sleep/state.py +++ b/skillopt_sleep/state.py @@ -28,6 +28,7 @@ DEFAULT_STATE: Dict[str, Any] = { "last_harvest": {}, # project -> iso timestamp of last harvested record "slow_memory": "", # cross-night consolidated lessons (meta-skill analogue) "history": [], # list of per-night summaries + "task_archive": [], # capped list of past mined tasks (for associative recall) } @@ -81,3 +82,15 @@ class SleepState: def record_night(self, summary: Dict[str, Any]) -> None: self.data.setdefault("history", []).append(summary) + + # ── task archive (associative-recall memory) ────────────────────────── + def task_archive(self) -> list: + """Past mined tasks as plain dicts (newest last).""" + return list(self.data.get("task_archive", [])) + + def add_to_archive(self, task_dicts: list, cap: int = 300) -> None: + """Append tonight's tasks; keep only the most recent ``cap``.""" + arc = self.data.setdefault("task_archive", []) + arc.extend(task_dicts) + if len(arc) > cap: + self.data["task_archive"] = arc[-cap:]