diff --git a/docs/sleep/EXPERIENCE_REPLAY.md b/docs/sleep/EXPERIENCE_REPLAY.md
new file mode 100644
index 0000000..40157a5
--- /dev/null
+++ b/docs/sleep/EXPERIENCE_REPLAY.md
@@ -0,0 +1,64 @@
+# SkillOpt-Sleep — experience replay & dream rollouts (opt-in)
+
+Two opt-in mechanisms (plus one auxiliary augmentation knob) that strengthen the
+nightly consolidation when your tasks have a clean correctness signal. All of them
+default **off**, so the cycle behaves exactly as before until you enable one.
+
+## What they do
+
+| Config knob | Default | Effect |
+|---|---|---|
+| `dream_rollouts` | `1` (off) | When set to **K** > 1, run each task **K** times and learn from the *contrast* between the good and bad attempts (contrastive reflection) instead of a single failure. |
+| `recall_k` | `0` | **Associative recall** — each night, pull the `K` past tasks most similar to tonight's new ones (from a persisted task archive) into the dream, so related experience is revisited without replaying the whole history. |
+| `dream_factor` | `0` | Add `N` lightweight synthetic variants of each task to the training pool. |
+
+The validation gate still governs what ships, so these only ever *enlarge the
+signal the optimizer reflects on* — the held-out gate decides what is kept.
+
+## How to enable
+
+```jsonc
+// ~/.skillopt-sleep/config.json (or pass via the plugin's config)
+{
+ "dream_rollouts": 5, // contrastive dreaming
+ "recall_k": 20, // recall ~20 similar past tasks each night
+ "gate_mode": "on" // keep the gate on (recommended)
+}
+```
+
+`recall_k` draws from a `task_archive` (capped at the 300 most recent tasks) that
+the cycle persists in `state.json`, so recall becomes useful from the second night
+onward (once there is history to recall from).
+
+## Measured effect
+
+Deployment protocol (5 nights × 10 new real tasks/night, full held-out test
+sets, GPT-5.5 optimizer), run through the **same engine the plugin executes**
+(`skillopt_sleep.dream.dream_consolidate`):
+
+**SearchQA (GPT-5.5, full 1,400-item test, gated) — the gain scales with recall depth:**
+
+| Config | Δ accuracy vs baseline (points) |
+|---|---|
+| `recall_k=10, dream_rollouts=5` | +3.1 |
+| `recall_k=10, dream_rollouts=8` | +3.7 |
+| **`recall_k=20, dream_rollouts=5`** | **+4.5** |
+| full-history replay, `dream_rollouts=5` (reference) | +5.6 |
+
+**Second-benchmark confirmation** (SpreadsheetBench, GPT-5.4-nano, gate off,
+`recall_k=10, dream_rollouts=5`, shipped path): 0.279 → **0.314** (+3.6 points).
+
+## When it helps — and when it doesn't
+
+- **Helps** when tasks recur and have a checkable correctness signal (the
+ optimizer has something real to learn and the gate can verify it).
+- **Roughly flat** on saturated or noisy tasks (e.g. a strong model already near
+ ceiling) — within run-to-run noise (±1–2 points, single seed).
+- The validation gate keeps the downside bounded; keep it on by default.
+
+Trade-off: `dream_rollouts > 1` multiplies the per-night rollout cost (K×), and
+`recall_k > 0` adds the recalled tasks to each night's replay. Since the cycle
+runs offline on idle quota this is usually acceptable, but budget accordingly
+(`budget_tokens` / `budget_seconds`).
+
+Raw per-run results for all numbers above (SearchQA table and SpreadsheetBench run): `docs/sleep/blog_runs/v2_port/`.
diff --git a/docs/sleep/blog_runs/v2_port/conf_ss_nano_free.json b/docs/sleep/blog_runs/v2_port/conf_ss_nano_free.json
new file mode 100644
index 0000000..3ff61c0
--- /dev/null
+++ b/docs/sleep/blog_runs/v2_port/conf_ss_nano_free.json
@@ -0,0 +1,94 @@
+{
+ "experiment": "skillopt-sleep/nightly",
+ "model": "gpt-5.4-nano",
+ "results": [
+ {
+ "benchmark": "spreadsheet",
+ "gate": "off",
+ "replay_mode": "retrieval",
+ "retrieve_k": 10,
+ "nights": 5,
+ "per_night": 10,
+ "rollouts": 5,
+ "n_val": 40,
+ "n_test": 280,
+ "test_baseline": 0.2786,
+ "test_final": 0.3143,
+ "delta": 0.0357,
+ "progression": [
+ 0.2786,
+ 0.3036,
+ 0.3143,
+ 0.3107,
+ 0.3179,
+ 0.3143
+ ],
+ "nights_log": [
+ {
+ "night": 0,
+ "n_train": 0,
+ "test_hard": 0.2786,
+ "action": "baseline",
+ "accepted": false
+ },
+ {
+ "night": 1,
+ "n_train": 10,
+ "n_replayed": 0,
+ "n_dream": 20,
+ "val_hard": 0.0,
+ "test_hard": 0.3036,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 4
+ },
+ {
+ "night": 2,
+ "n_train": 20,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.0,
+ "test_hard": 0.3143,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 4
+ },
+ {
+ "night": 3,
+ "n_train": 30,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.0,
+ "test_hard": 0.3107,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 4
+ },
+ {
+ "night": 4,
+ "n_train": 40,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.0,
+ "test_hard": 0.3179,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 4
+ },
+ {
+ "night": 5,
+ "n_train": 50,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.0,
+ "test_hard": 0.3143,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 4
+ }
+ ],
+ "tokens": 13587597,
+ "final_skill_tail": "t/headers rather than hardcoding specific cell coordinates or values.\n- When searching for specific text, use an exact match check on the cell string, e.g. `if cell_value == \"Georgia Its Tax\": ...` (not partial regex, not truncated comparisons).\n- If a cell contains multiple tokens separated by semicolons, split and normalize before comparing: `parts = [p.strip() for p in str(cell_value).split(';') if p.strip()]` and then test membership/lookup using `parts`.\n\n"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/sleep/blog_runs/v2_port/imp_cumulative_gate.json b/docs/sleep/blog_runs/v2_port/imp_cumulative_gate.json
new file mode 100644
index 0000000..c0acc8a
--- /dev/null
+++ b/docs/sleep/blog_runs/v2_port/imp_cumulative_gate.json
@@ -0,0 +1,94 @@
+{
+ "experiment": "skillopt-sleep/nightly",
+ "model": "gpt-5.5",
+ "results": [
+ {
+ "benchmark": "searchqa",
+ "gate": "on",
+ "replay_mode": "cumulative",
+ "retrieve_k": 0,
+ "nights": 5,
+ "per_night": 10,
+ "rollouts": 5,
+ "n_val": 60,
+ "n_test": 1400,
+ "test_baseline": 0.7957,
+ "test_final": 0.8514,
+ "delta": 0.0557,
+ "progression": [
+ 0.7957,
+ 0.8336,
+ 0.8514,
+ 0.8514,
+ 0.8514,
+ 0.8514
+ ],
+ "nights_log": [
+ {
+ "night": 0,
+ "n_train": 0,
+ "test_hard": 0.7957,
+ "action": "baseline",
+ "accepted": false
+ },
+ {
+ "night": 1,
+ "n_train": 10,
+ "n_replayed": 0,
+ "n_dream": 20,
+ "val_hard": 0.85,
+ "test_hard": 0.8336,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 2
+ },
+ {
+ "night": 2,
+ "n_train": 20,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.9,
+ "test_hard": 0.8514,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 3
+ },
+ {
+ "night": 3,
+ "n_train": 30,
+ "n_replayed": 20,
+ "n_dream": 60,
+ "val_hard": 0.9,
+ "test_hard": 0.8514,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ },
+ {
+ "night": 4,
+ "n_train": 40,
+ "n_replayed": 30,
+ "n_dream": 80,
+ "val_hard": 0.9,
+ "test_hard": 0.8514,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ },
+ {
+ "night": 5,
+ "n_train": 50,
+ "n_replayed": 40,
+ "n_dream": 100,
+ "val_hard": 0.9,
+ "test_hard": 0.8514,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ }
+ ],
+ "tokens": 15132599,
+ "final_skill_tail": " the title or key sentence over a county, institution, or category.\n- Return the shortest exact answer span that satisfies the question, inside ...; prefer a single-word entity when sufficient.\n- Do not expand a context-supported short name into a fuller name unless the question specifically requires the full name.\n- Match the requested answer type exactly: for a country/nation answer, output only the country name, not a title or role phrase.\n\n"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/sleep/blog_runs/v2_port/imp_recall20_gate.json b/docs/sleep/blog_runs/v2_port/imp_recall20_gate.json
new file mode 100644
index 0000000..54c60e2
--- /dev/null
+++ b/docs/sleep/blog_runs/v2_port/imp_recall20_gate.json
@@ -0,0 +1,94 @@
+{
+ "experiment": "skillopt-sleep/nightly",
+ "model": "gpt-5.5",
+ "results": [
+ {
+ "benchmark": "searchqa",
+ "gate": "on",
+ "replay_mode": "retrieval",
+ "retrieve_k": 20,
+ "nights": 5,
+ "per_night": 10,
+ "rollouts": 5,
+ "n_val": 60,
+ "n_test": 1400,
+ "test_baseline": 0.8029,
+ "test_final": 0.8479,
+ "delta": 0.045,
+ "progression": [
+ 0.8029,
+ 0.8236,
+ 0.8236,
+ 0.8479,
+ 0.8479,
+ 0.8479
+ ],
+ "nights_log": [
+ {
+ "night": 0,
+ "n_train": 0,
+ "test_hard": 0.8029,
+ "action": "baseline",
+ "accepted": false
+ },
+ {
+ "night": 1,
+ "n_train": 10,
+ "n_replayed": 0,
+ "n_dream": 20,
+ "val_hard": 0.8667,
+ "test_hard": 0.8236,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 2
+ },
+ {
+ "night": 2,
+ "n_train": 20,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.8667,
+ "test_hard": 0.8236,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ },
+ {
+ "night": 3,
+ "n_train": 30,
+ "n_replayed": 20,
+ "n_dream": 60,
+ "val_hard": 0.8833,
+ "test_hard": 0.8479,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 3
+ },
+ {
+ "night": 4,
+ "n_train": 40,
+ "n_replayed": 20,
+ "n_dream": 60,
+ "val_hard": 0.8833,
+ "test_hard": 0.8479,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ },
+ {
+ "night": 5,
+ "n_train": 50,
+ "n_replayed": 20,
+ "n_dream": 60,
+ "val_hard": 0.8833,
+ "test_hard": 0.8479,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ }
+ ],
+ "tokens": 15596999,
+ "final_skill_tail": " Put only the shortest exact answer span in the final '...' tags; remove extra descriptors, categories, titles, and surrounding words.\n- If the question asks for a country/place from a phrase like 'King of Spain' or a title like 'Ferdinand VII of Spain', answer only the place name, e.g. 'Spain'.\n- For person answers, use the minimal unambiguous name supported by the clue; do not expand a surname to a full name unless the question requires it.\n\n"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/sleep/blog_runs/v2_port/imp_rollouts8_gate.json b/docs/sleep/blog_runs/v2_port/imp_rollouts8_gate.json
new file mode 100644
index 0000000..e41afbd
--- /dev/null
+++ b/docs/sleep/blog_runs/v2_port/imp_rollouts8_gate.json
@@ -0,0 +1,94 @@
+{
+ "experiment": "skillopt-sleep/nightly",
+ "model": "gpt-5.5",
+ "results": [
+ {
+ "benchmark": "searchqa",
+ "gate": "on",
+ "replay_mode": "retrieval",
+ "retrieve_k": 10,
+ "nights": 5,
+ "per_night": 10,
+ "rollouts": 8,
+ "n_val": 60,
+ "n_test": 1400,
+ "test_baseline": 0.7979,
+ "test_final": 0.835,
+ "delta": 0.0371,
+ "progression": [
+ 0.7979,
+ 0.8179,
+ 0.835,
+ 0.835,
+ 0.835,
+ 0.835
+ ],
+ "nights_log": [
+ {
+ "night": 0,
+ "n_train": 0,
+ "test_hard": 0.7979,
+ "action": "baseline",
+ "accepted": false
+ },
+ {
+ "night": 1,
+ "n_train": 10,
+ "n_replayed": 0,
+ "n_dream": 20,
+ "val_hard": 0.8667,
+ "test_hard": 0.8179,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 2
+ },
+ {
+ "night": 2,
+ "n_train": 20,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.8833,
+ "test_hard": 0.835,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 3
+ },
+ {
+ "night": 3,
+ "n_train": 30,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.8833,
+ "test_hard": 0.835,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ },
+ {
+ "night": 4,
+ "n_train": 40,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.8833,
+ "test_hard": 0.835,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ },
+ {
+ "night": 5,
+ "n_train": 50,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.8833,
+ "test_hard": 0.835,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ }
+ ],
+ "tokens": 16846499,
+ "final_skill_tail": "less the question asks for the title itself.\n- Always put only the final answer in \"...\" and keep it \"concise -- typically a few words or a short phrase\".\n- Use the shortest sufficient answer span; do not add first names, modifiers, counties, countries, or parent locations unless explicitly required.\n- Match the question’s granularity exactly: if it asks for a state, give only the state; if it asks for a term’s meaning, give only the meaning.\n\n"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/sleep/blog_runs/v2_port/parity_sq_g55_free.json b/docs/sleep/blog_runs/v2_port/parity_sq_g55_free.json
new file mode 100644
index 0000000..b501c65
--- /dev/null
+++ b/docs/sleep/blog_runs/v2_port/parity_sq_g55_free.json
@@ -0,0 +1,94 @@
+{
+ "experiment": "skillopt-sleep/nightly",
+ "model": "gpt-5.5",
+ "results": [
+ {
+ "benchmark": "searchqa",
+ "gate": "off",
+ "replay_mode": "retrieval",
+ "retrieve_k": 10,
+ "nights": 5,
+ "per_night": 10,
+ "rollouts": 5,
+ "n_val": 60,
+ "n_test": 1400,
+ "test_baseline": 0.8079,
+ "test_final": 0.8393,
+ "delta": 0.0314,
+ "progression": [
+ 0.8079,
+ 0.8321,
+ 0.84,
+ 0.8436,
+ 0.84,
+ 0.8393
+ ],
+ "nights_log": [
+ {
+ "night": 0,
+ "n_train": 0,
+ "test_hard": 0.8079,
+ "action": "baseline",
+ "accepted": false
+ },
+ {
+ "night": 1,
+ "n_train": 10,
+ "n_replayed": 0,
+ "n_dream": 20,
+ "val_hard": 0.0,
+ "test_hard": 0.8321,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 3
+ },
+ {
+ "night": 2,
+ "n_train": 20,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.0,
+ "test_hard": 0.84,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 1
+ },
+ {
+ "night": 3,
+ "n_train": 30,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.0,
+ "test_hard": 0.8436,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 2
+ },
+ {
+ "night": 4,
+ "n_train": 40,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.0,
+ "test_hard": 0.84,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 3
+ },
+ {
+ "night": 5,
+ "n_train": 50,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.0,
+ "test_hard": 0.8393,
+ "action": "greedy_applied",
+ "accepted": true,
+ "n_edits": 2
+ }
+ ],
+ "tokens": 27990836,
+ "final_skill_tail": "Sultan of Brunei\".\n- For author/creator questions from titles like \"Trees by Joyce Kilmer\", output only the creator name, e.g. \"Joyce Kilmer\", not the work title.\n- Do not introduce diacritics or alternate spellings not present in the context/title; prefer the ASCII surface form such as \"Vaclav Havel\" over \"Václav Havel\".\n- Return the full canonical entity name from the context/title, including hyphens, e.g. \"Winnie-the-Pooh\" rather than the shortened \"Pooh\".\n\n"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/sleep/blog_runs/v2_port/parity_sq_g55_gate.json b/docs/sleep/blog_runs/v2_port/parity_sq_g55_gate.json
new file mode 100644
index 0000000..d0f66cb
--- /dev/null
+++ b/docs/sleep/blog_runs/v2_port/parity_sq_g55_gate.json
@@ -0,0 +1,94 @@
+{
+ "experiment": "skillopt-sleep/nightly",
+ "model": "gpt-5.5",
+ "results": [
+ {
+ "benchmark": "searchqa",
+ "gate": "on",
+ "replay_mode": "retrieval",
+ "retrieve_k": 10,
+ "nights": 5,
+ "per_night": 10,
+ "rollouts": 5,
+ "n_val": 60,
+ "n_test": 1400,
+ "test_baseline": 0.8021,
+ "test_final": 0.8336,
+ "delta": 0.0315,
+ "progression": [
+ 0.8021,
+ 0.83,
+ 0.8336,
+ 0.8336,
+ 0.8336,
+ 0.8336
+ ],
+ "nights_log": [
+ {
+ "night": 0,
+ "n_train": 0,
+ "test_hard": 0.8021,
+ "action": "baseline",
+ "accepted": false
+ },
+ {
+ "night": 1,
+ "n_train": 10,
+ "n_replayed": 0,
+ "n_dream": 20,
+ "val_hard": 0.8667,
+ "test_hard": 0.83,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 4
+ },
+ {
+ "night": 2,
+ "n_train": 20,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.9,
+ "test_hard": 0.8336,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 4
+ },
+ {
+ "night": 3,
+ "n_train": 30,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.9,
+ "test_hard": 0.8336,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ },
+ {
+ "night": 4,
+ "n_train": 40,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.9,
+ "test_hard": 0.8336,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ },
+ {
+ "night": 5,
+ "n_train": 50,
+ "n_replayed": 10,
+ "n_dream": 40,
+ "val_hard": 0.9,
+ "test_hard": 0.8336,
+ "action": "reject",
+ "accepted": false,
+ "n_edits": 0
+ }
+ ],
+ "tokens": 15946118,
+ "final_skill_tail": "roperty; do not substitute a broader category or page title.\n- For location questions asking for a state/country, output only that level, e.g. \"Maryland\", not the full hierarchy \"Baltimore County, Maryland, United States\".\n- For name-part questions such as surname/last name, output only that part, e.g. \"Genet\", not the full name \"Jean Genet\".\n- Put only the concise final answer inside \"...\"; avoid extra modifiers, lists, or explanatory words.\n\n"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/skillopt_sleep/config.py b/skillopt_sleep/config.py
index 9ddeff7..0bfb5a2 100644
--- a/skillopt_sleep/config.py
+++ b/skillopt_sleep/config.py
@@ -44,6 +44,10 @@ DEFAULTS: Dict[str, Any] = {
"gate_metric": "mixed", # hard | soft | mixed (mixed best for tiny holdouts)
"gate_mixed_weight": 0.5,
"replay_mode": "mock", # "mock" (sandboxed prompt) | "fresh" (worktree)
+ # ── dream + recall (opt-in; defaults reproduce the prior single-shot loop) ─
+ "dream_rollouts": 1, # >1 => multi-rollout contrastive reflection per task
+ "dream_factor": 0, # >0 => add N synthetic variants of each task to the dream
+ "recall_k": 0, # >0 => recall the K most-similar past tasks into the dream
"evolve_memory": True, # consolidate CLAUDE.md
"evolve_skill": True, # consolidate the managed SKILL.md
"llm_mine": True, # use the backend to mine checkable tasks (real backends)
diff --git a/skillopt_sleep/cycle.py b/skillopt_sleep/cycle.py
index c9f8a28..9ddd666 100644
--- a/skillopt_sleep/cycle.py
+++ b/skillopt_sleep/cycle.py
@@ -15,7 +15,7 @@ from typing import List, Optional
from skillopt_sleep.backend import get_backend
from skillopt_sleep.config import SleepConfig, load_config
-from skillopt_sleep.consolidate import consolidate
+from skillopt_sleep.dream import dream_consolidate
from skillopt_sleep.harvest_sources import harvest_for_config
from skillopt_sleep.memory import ensure_skill_scaffold
from skillopt_sleep.mine import mine
@@ -167,9 +167,21 @@ def run_sleep_cycle(
staging_dir = ""
return CycleOutcome(report, staging_dir, False, [])
- # ── 3+4. replay + consolidate (gate) ─────────────────────────────────
- result = consolidate(
+ # ── 3+4. replay + consolidate (gate), with opt-in dream + recall ──────
+ # recall pulls similar past tasks from the persisted archive; dream_rollouts
+ # / dream_factor enrich the training signal. With the defaults (recall_k=0,
+ # dream_rollouts=1, dream_factor=0) this is exactly the prior single-shot
+ # consolidate — behavior is unchanged unless the user opts in.
+ recall_k = int(cfg.get("recall_k", 0) or 0)
+ history_tasks = []
+ if recall_k > 0:
+ history_tasks = [TaskRecord.from_dict(d) for d in state.task_archive()]
+ result = dream_consolidate(
backend, tasks, skill, memory,
+ history_tasks=history_tasks,
+ recall_k=recall_k,
+ dream_rollouts=int(cfg.get("dream_rollouts", 1) or 1),
+ dream_factor=int(cfg.get("dream_factor", 0) or 0),
edit_budget=cfg.get("edit_budget", 4),
gate_metric=cfg.get("gate_metric", "mixed"),
gate_mixed_weight=cfg.get("gate_mixed_weight", 0.5),
@@ -178,6 +190,8 @@ def run_sleep_cycle(
evolve_memory=cfg.get("evolve_memory", True),
night=night,
)
+ # archive tonight's real (non-dream) tasks so future nights can recall them
+ state.add_to_archive([t.to_dict() for t in tasks if t.origin != "dream"])
report.n_replayed = len(tasks)
report.baseline_score = result.baseline_score
diff --git a/skillopt_sleep/dream.py b/skillopt_sleep/dream.py
new file mode 100644
index 0000000..28ee79c
--- /dev/null
+++ b/skillopt_sleep/dream.py
@@ -0,0 +1,138 @@
+"""SkillOpt-Sleep — dream + associative recall for nightly consolidation.
+
+Two opt-in mechanisms (both default OFF, so the cycle is unchanged unless the
+user enables them) that the deployment experiments validated:
+
+ * dream rollouts — run each task K times and learn from the good-vs-bad
+ contrast (set ``dream_rollouts > 1``). Stronger signal than one failure.
+ * associative recall — each night, pull the K past tasks most similar to
+ tonight's new ones into the dream (set ``recall_k > 0``). Replays relevant
+ experience without re-running the whole history.
+
+``dream_consolidate`` wires recall + synthetic augmentation + multi-rollout
+consolidation and is called by BOTH the shipped plugin cycle and the benchmark
+experiment harness, so the reported numbers exercise the exact code the plugin
+runs. Pure-stdlib, zero research/private dependency.
+"""
+from __future__ import annotations
+
+import re
+from typing import List, Optional
+
+from skillopt_sleep.consolidate import ConsolidationResult, consolidate
+from skillopt_sleep.types import TaskRecord
+
+
+# ── synthetic augmentation ("dream up" variants of today's tasks) ─────────────
+
+_WRAPPERS = [  # str.format templates; {q} is filled with the task intent, cycled by index
+    "(quick one) {q}",
+    "Please handle this request: {q}",
+    "For the daily report: {q}",
+]
+
+
+def dream_augment(real_tasks: List[TaskRecord], *, factor: int = 1) -> List[TaskRecord]:
+    """Return ``factor`` reworded training copies of every task in ``real_tasks``.
+
+    Variant ``n`` wraps the intent in ``_WRAPPERS[n % len(_WRAPPERS)]`` and is tagged
+    origin='dream', split='train'; a ``factor`` of zero or less yields no variants.
+    """
+    n_variants = max(0, factor)
+    return [
+        TaskRecord(
+            id=f"{src.id}_dream{n}", project=src.project,
+            intent=_WRAPPERS[n % len(_WRAPPERS)].format(q=src.intent),
+            context_excerpt=src.context_excerpt,
+            reference_kind=src.reference_kind, reference=src.reference,
+            judge=dict(src.judge), system=src.system,
+            tags=list(src.tags) + ["dream"], split="train",
+            origin="dream", derived_from=src.id,
+        )
+        for src in real_tasks for n in range(n_variants)
+    ]
+
+
+# ── associative recall (experience replay of similar past tasks) ──────────────
+
+def _tokens(text: str) -> set:  # lowercase ASCII alphanumeric words longer than 2 chars
+    return set(filter(lambda tok: len(tok) > 2, re.findall(r"[a-z0-9]+", (text or "").lower())))
+
+
+def recall_similar(new_tasks: List[TaskRecord], history: List[TaskRecord],
+                   k: int) -> List[TaskRecord]:
+    """Return up to ``k`` distinct past tasks most similar to tonight's ``new_tasks``.
+
+    Similarity is the best Jaccard overlap between a past task's intent tokens and
+    any new task's; ties break on task id, so the result is deterministic. Skipped:
+    ids already in ``new_tasks``, repeated ids (``history`` may hold the same task
+    more than once), intents with no tokens, and zero-overlap tasks. Each hit is a
+    fresh copy tagged "recall" with split='train' and id ``recall:<original id>``.
+    """
+    if not history or k <= 0 or not new_tasks:
+        return []
+    new_tok = [_tokens(t.intent) for t in new_tasks]
+    seen = {t.id for t in new_tasks}  # ids that must not be recalled (again)
+    scored = []
+    for h in history:
+        ht = _tokens(h.intent)
+        if h.id in seen or not ht:
+            continue
+        seen.add(h.id)  # first occurrence wins; later duplicates are dropped
+        sim = max(len(ht & nt) / len(ht | nt) for nt in new_tok)  # ht is non-empty
+        if sim > 0.0:
+            scored.append((sim, h.id, h))
+    scored.sort(key=lambda x: (-x[0], x[1]))
+    return [
+        TaskRecord(  # copy so the source archive is untouched
+            id=f"recall:{h.id}", project=h.project, intent=h.intent,
+            context_excerpt=h.context_excerpt, reference_kind=h.reference_kind,
+            reference=h.reference, judge=dict(h.judge), system=h.system,
+            tags=list(h.tags) + ["recall"], split="train", origin="real",
+            derived_from=h.id,
+        ) for _sim, _id, h in scored[:k]
+    ]
+
+
+# ── the shared nightly consolidation step ─────────────────────────────────────
+
+def dream_consolidate(
+    backend,
+    tasks: List[TaskRecord],
+    skill: str,
+    memory: str,
+    *,
+    history_tasks: Optional[List[TaskRecord]] = None,
+    recall_k: int = 0,
+    dream_rollouts: int = 1,
+    dream_factor: int = 0,
+    edit_budget: int = 4,
+    gate_metric: str = "mixed",
+    gate_mixed_weight: float = 0.5,
+    gate_mode: str = "on",
+    evolve_skill: bool = True,
+    evolve_memory: bool = True,
+    night: int = 1,
+) -> ConsolidationResult:
+    """Recall similar past tasks, dream synthetic variants, then run one
+    consolidation epoch (gated per ``gate_mode``) over the enlarged training pool.
+
+    ``tasks`` is tonight's split-tagged pool (train + val). Recall is keyed on the
+    train tasks, and any archived task whose id is in tonight's pool — val included
+    — is skipped so the gate's val slice never leaks into training. Variants are
+    dreamed from non-dream train tasks, recalled ones included. With ``recall_k=0``,
+    ``dream_factor=0`` the pool is passed to ``consolidate`` unchanged.
+    """
+    pool = list(tasks)
+    if recall_k > 0 and history_tasks:
+        tonight = {t.id for t in tasks}
+        past = [h for h in history_tasks if h.id not in tonight]
+        pool += recall_similar([t for t in tasks if t.split == "train"], past, recall_k)
+    if dream_factor > 0:
+        seeds = [t for t in pool if t.split == "train" and t.origin != "dream"]
+        pool += dream_augment(seeds, factor=dream_factor)
+    return consolidate(
+        backend, pool, skill, memory, edit_budget=edit_budget, gate_metric=gate_metric,
+        gate_mixed_weight=gate_mixed_weight, gate_mode=gate_mode, rollouts_k=dream_rollouts,
+        evolve_skill=evolve_skill, evolve_memory=evolve_memory, night=night,
+    )
diff --git a/skillopt_sleep/state.py b/skillopt_sleep/state.py
index 1909246..1e16157 100644
--- a/skillopt_sleep/state.py
+++ b/skillopt_sleep/state.py
@@ -28,6 +28,7 @@ DEFAULT_STATE: Dict[str, Any] = {
"last_harvest": {}, # project -> iso timestamp of last harvested record
"slow_memory": "", # cross-night consolidated lessons (meta-skill analogue)
"history": [], # list of per-night summaries
+ "task_archive": [], # capped list of past mined tasks (for associative recall)
}
@@ -81,3 +82,15 @@ class SleepState:
def record_night(self, summary: Dict[str, Any]) -> None:
self.data.setdefault("history", []).append(summary)
+
+ # ── task archive (associative-recall memory) ──────────────────────────
+    def task_archive(self) -> list:
+        """Return a shallow copy of the archived task dicts (empty if none stored)."""
+        return [*self.data.get("task_archive", [])]
+
+    def add_to_archive(self, task_dicts: list, cap: int = 300) -> None:
+        """Append tonight's tasks, then trim to the newest ``cap`` (``cap <= 0`` keeps none)."""
+        arc = self.data.setdefault("task_archive", [])
+        arc.extend(task_dicts)
+        # trim oldest in place; a plain arc[-cap:] would keep everything when cap == 0
+        del arc[: max(0, len(arc) - cap)]