diff --git a/docs/sleep/experiment_results.md b/docs/sleep/experiment_results.md
new file mode 100644
index 0000000..500f72b
--- /dev/null
+++ b/docs/sleep/experiment_results.md
@@ -0,0 +1,73 @@
+# SkillOpt-Sleep — validation experiment results
+
+Generated: 2026-06-07 (autonomous offline session)
+Backend: mock (deterministic, no API). Reproducible via the commands below.
+
+```
+$ python3.12 -m skillopt.sleep.experiments.run_experiment --persona researcher --nights 4 --json
+{
+ "persona": "researcher",
+ "backend": "mock",
+ "nights_run": 1,
+ "baseline_holdout": 0.3333,
+ "after_holdout": 1.0,
+ "lift": 0.6667,
+ "improved": true,
+ "gate_blocks_harmful": true,
+ "final_skill_excerpt": "T -->\n## Learned preferences & procedures\n\n_This block is maintained by SkillOpt-Sleep. Edits here are proposed offline, validated against your past tasks, and adopted only after you approve them. Hand-edits outside this block are never touched._\n\n- Always wrap the final answer in ... tags.\n- Report arXiv ids in the exact form arXiv:XXXX.XXXXX.\n\n",
+ "trace": [
+ {
+ "night": 0,
+ "holdout_score": 0.3333,
+ "action": "baseline",
+ "n_edits": 0
+ },
+ {
+ "night": 1,
+ "holdout_score": 1.0,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 2,
+ "edits": [
+ "Always wrap the final answer in ... tags.",
+ "Report arXiv ids in the exact form arXiv:XXXX.XXXXX."
+ ],
+ "n_rejected": 0
+ }
+ ]
+}
+```
+
+```
+$ python3.12 -m skillopt.sleep.experiments.run_experiment --persona programmer --nights 4 --json
+{
+ "persona": "programmer",
+ "backend": "mock",
+ "nights_run": 1,
+ "baseline_holdout": 0.3194,
+ "after_holdout": 1.0,
+ "lift": 0.6806,
+ "improved": true,
+ "gate_blocks_harmful": true,
+ "final_skill_excerpt": "laude Code sessions.\n\n\n## Learned preferences & procedures\n\n_This block is maintained by SkillOpt-Sleep. Edits here are proposed offline, validated against your past tasks, and adopted only after you approve them. Hand-edits outside this block are never touched._\n\n- Write git commit subjects in imperative mood, max 50 chars.\n\n",
+ "trace": [
+ {
+ "night": 0,
+ "holdout_score": 0.3194,
+ "action": "baseline",
+ "n_edits": 0
+ },
+ {
+ "night": 1,
+ "holdout_score": 1.0,
+ "action": "accept_new_best",
+ "accepted": true,
+ "n_edits": 1,
+ "edits": [
+ "Write git commit subjects in imperative mood, max 50 chars."
+ ],
+ "n_rejected": 0
+ }
+ ]
+}
+```
diff --git a/skillopt-sleep-plugin/.claude-plugin/plugin.json b/skillopt-sleep-plugin/.claude-plugin/plugin.json
new file mode 100644
index 0000000..3595f51
--- /dev/null
+++ b/skillopt-sleep-plugin/.claude-plugin/plugin.json
@@ -0,0 +1,22 @@
+{
+ "name": "skillopt-sleep",
+ "description": "Give your local Claude agent a nightly 'sleep cycle': it reviews your past sessions offline, replays recurring tasks on your own API budget, and consolidates what it learns into validated memory (CLAUDE.md) and skills (SKILL.md). 越用越好用 — gets better the more you use it. Synthesizes SkillOpt (validation-gated skill optimization), Claude Dreams (offline memory consolidation), and agent sleep/consolidation.",
+ "version": "0.1.0",
+ "author": {
+ "name": "Yifan Yang",
+ "email": "yifanyang@microsoft.com"
+ },
+ "homepage": "https://github.com/microsoft/SkillOpt",
+ "repository": "https://github.com/microsoft/SkillOpt",
+ "license": "MIT",
+ "keywords": [
+ "skillopt",
+ "self-improvement",
+ "memory-consolidation",
+ "dreams",
+ "sleep",
+ "skills",
+ "continual-learning",
+ "offline-optimization"
+ ]
+}
diff --git a/skillopt-sleep-plugin/README.md b/skillopt-sleep-plugin/README.md
new file mode 100644
index 0000000..7898b17
--- /dev/null
+++ b/skillopt-sleep-plugin/README.md
@@ -0,0 +1,82 @@
+# SkillOpt-Sleep (Claude Code plugin)
+
+> Give your local Claude agent a **sleep cycle**. Every night it reviews your
+> past sessions offline, replays your recurring tasks on your own API budget,
+> and consolidates what it learns into **validated** memory (`CLAUDE.md`) and
+> skills (`SKILL.md`). Your agent gets better the more you use it — no
+> model-weight training.
+
+SkillOpt-Sleep is the **deployment-time** companion to
+[SkillOpt](https://github.com/microsoft/SkillOpt). SkillOpt trains a skill
+offline on a benchmark; SkillOpt-Sleep applies the same discipline to *your own
+daily usage*: bounded text edits, accepted only through a held-out validation
+gate, with rejected edits kept as negative feedback.
+
+It synthesizes three ideas:
+
+| Idea | Contribution |
+|---|---|
+| **SkillOpt** | skill/memory = trainable text; bounded add/delete/replace edits; **held-out gate** keeps only changes that help. |
+| **Claude Dreams** | offline consolidation over past sessions; input never mutated; output **reviewed then adopted**. |
+| **Agent sleep** | periodic offline replay turns short-term episodes into long-term skill. |
+
+## What it does (one "night")
+
+```
+harvest ~/.claude transcripts → mine recurring tasks → replay offline
+ → consolidate (reflect → bounded edit → GATE) → stage proposal → (you) adopt
+```
+
+Nothing live is modified until **you** run `/sleep adopt` (the Dreams "review,
+then adopt or discard" contract). Every adopt backs up the prior file first.
+
+## Quick start
+
+```bash
+# from inside any project you use with Claude Code:
+/sleep dry-run # safe preview: what it would learn, no changes staged
+/sleep run # full cycle: stages a reviewed proposal (still no live edits)
+/sleep status # see history + the latest staged proposal
+/sleep adopt # apply the staged proposal to CLAUDE.md / SKILL.md (with backup)
+```
+
+Or call the engine directly (Python ≥ 3.10):
+
+```bash
+python -m skillopt.sleep run --project "$(pwd)" --scope invoked --backend mock
+python -m skillopt.sleep run --project "$(pwd)" --backend anthropic # real lift, uses your budget
+```
+
+Default backend is **`mock`** — deterministic, no API spend — so you can try the
+plumbing for free. Switch to `--backend anthropic` for genuine improvement.
+
+## Does it actually improve? (deterministic proof)
+
+```bash
+python -m skillopt.sleep.experiments.run_experiment --persona researcher --assert-improves
+python -m skillopt.sleep.experiments.run_experiment --persona programmer --assert-improves
+```
+
+Each prints the held-out score rising from baseline toward 1.0 as the gate
+accepts the general rules your tasks need, and confirms the gate **rejects** an
+injected harmful edit. Recorded output: [`docs/sleep/experiment_results.md`](../docs/sleep/experiment_results.md).
+
+## Schedule it nightly
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/install-cron.sh" "$(pwd)" # prints a crontab line; installs nothing
+```
+
+## Safety
+
+- **Read-only** harvest of `~/.claude`. `mock` replay has no side effects.
+- Proposals are **staged**, never auto-applied (unless you opt in with `--auto-adopt`).
+- Every adopt writes a backup under the staging dir's `backup/`.
+- Per-night **token/task budget caps**; secrets redacted from prompts.
+- `fresh` replay (Phase 3) runs only in throwaway git worktrees.
+
+## Status
+
+Phase 1 (engine + deterministic experiment + plugin surface) is complete.
+Phase 3 adds the real-API miner/judge and `fresh` worktree replay. See
+[`docs/superpowers/specs/2026-06-07-skillopt-sleep-claude-code-plugin-design.md`](../docs/superpowers/specs/2026-06-07-skillopt-sleep-claude-code-plugin-design.md).
diff --git a/skillopt-sleep-plugin/commands/sleep.md b/skillopt-sleep-plugin/commands/sleep.md
new file mode 100644
index 0000000..48b62f7
--- /dev/null
+++ b/skillopt-sleep-plugin/commands/sleep.md
@@ -0,0 +1,63 @@
+---
+description: Run or manage the SkillOpt-Sleep self-evolution cycle (review past sessions, replay tasks offline, consolidate validated memory + skills)
+argument-hint: "[run | dry-run | status | adopt | harvest] (default: status)"
+allowed-tools: Bash, Read
+---
+
+# /sleep — SkillOpt-Sleep nightly self-evolution
+
+You are driving **SkillOpt-Sleep**: a tool that lets this user's Claude agent
+improve offline by reviewing past sessions, replaying recurring tasks, and
+consolidating what it learns into **validated** memory (`CLAUDE.md`) and skills
+(`SKILL.md`). It is gated like SkillOpt: a change is kept only if it improves a
+held-out replay score, and nothing live is modified until the user adopts it.
+
+## Requested action: $ARGUMENTS
+
+(If `$ARGUMENTS` is empty, treat it as `status`.)
+
+## How to run it
+
+The engine is the `skillopt.sleep` Python package in this repo. Use the
+**plugin's bundled runner** so the right interpreter and repo are on the path:
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" <action> --project "$(pwd)" --scope invoked
+```
+
+`<action>` is one of:
+
+| action | what it does |
+|-----------|--------------|
+| `status` | show how many nights have run + the latest staged proposal (READ-ONLY) |
+| `dry-run` | harvest → mine → replay → report, but **stage nothing** (safe preview) |
+| `run` | full cycle: also **stage** a reviewed proposal (still does NOT touch live files) |
+| `adopt` | apply the latest staged proposal to live `CLAUDE.md` / `SKILL.md` (backs up first) |
+| `harvest` | debug: print the recurring tasks mined from recent sessions |
+
+Default backend is `mock` (deterministic, no API spend). To use real Anthropic
+budget for genuine improvement, add `--backend anthropic`.
+
+## Steps to follow
+
+1. **Run the requested action** via the bundled runner above. Capture stdout.
+2. **For `run` / `dry-run`:** after it completes, `Read` the generated
+ `report.md` in the staging dir it prints, and show the user:
+ - held-out score: baseline → candidate (the proof it helped)
+ - the gate decision (accept/reject) and the exact edits it proposes
+ - where the proposal is staged
+3. **For `run` that produced an accepted proposal:** tell the user the diff is
+ staged and that **nothing live changed yet**. Offer to run `/sleep adopt`.
+4. **For `adopt`:** confirm which live files were updated and that backups were
+ written under the staging dir's `backup/`.
+5. **Never** edit `CLAUDE.md` or `SKILL.md` yourself — only the `adopt` action
+ does that, with a backup. Respect the review gate.
+
+## Safety reminders
+
+- Harvest is **read-only** over `~/.claude`. Replay in `mock` mode runs no
+ shell side effects.
+- The cycle stages proposals; the user is in control of adoption.
+- If the user asks to schedule this nightly, point them at
+ `${CLAUDE_PLUGIN_ROOT}/scripts/install-cron.sh`. It only prints a crontab
+ line for them to add via `crontab -e`; it never installs anything itself.
diff --git a/skillopt-sleep-plugin/hooks/hooks.json b/skillopt-sleep-plugin/hooks/hooks.json
new file mode 100644
index 0000000..6ea666b
--- /dev/null
+++ b/skillopt-sleep-plugin/hooks/hooks.json
@@ -0,0 +1,16 @@
+{
+ "hooks": {
+ "SessionEnd": [
+ {
+ "matcher": "*",
+ "hooks": [
+ {
+ "type": "command",
+ "command": "\"${CLAUDE_PLUGIN_ROOT}/hooks/on-session-end.sh\"",
+ "async": true
+ }
+ ]
+ }
+ ]
+ }
+}
diff --git a/skillopt-sleep-plugin/hooks/on-session-end.sh b/skillopt-sleep-plugin/hooks/on-session-end.sh
new file mode 100755
index 0000000..bd84be2
--- /dev/null
+++ b/skillopt-sleep-plugin/hooks/on-session-end.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# SkillOpt-Sleep SessionEnd hook (async, best-effort, NON-BLOCKING).
+#
+# This does NOT run the optimizer. It only appends one marker line
+# (UTC timestamp, TAB, working directory) to ~/.skillopt-sleep/session-end.log,
+# intended to let the next nightly cycle see that there is fresh activity to
+# harvest. It must never fail the session or spend API budget, so every step
+# is best-effort and the script always exits 0.
+set -uo pipefail
+
+# With `set -u`, an unset HOME would abort with a non-zero status; bail out
+# quietly instead.
+[ -n "${HOME:-}" ] || exit 0
+STATE_DIR="${HOME}/.skillopt-sleep"
+mkdir -p "$STATE_DIR" 2>/dev/null || exit 0
+
+# Record that a session just ended (cheap; used for "is there new data?").
+printf '%s\t%s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "${PWD}" \
+  >> "$STATE_DIR/session-end.log" 2>/dev/null || true
+
+exit 0
diff --git a/skillopt-sleep-plugin/scripts/install-cron.sh b/skillopt-sleep-plugin/scripts/install-cron.sh
new file mode 100755
index 0000000..e18460d
--- /dev/null
+++ b/skillopt-sleep-plugin/scripts/install-cron.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Print (does NOT install) a crontab line that runs SkillOpt-Sleep nightly.
+# The user copies the line into `crontab -e` if they want it.
+#
+# Usage: install-cron.sh [project-dir] [backend]   (defaults: cwd, mock)
+set -euo pipefail
+
+PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
+RUNNER="$PLUGIN_ROOT/scripts/sleep.sh"
+PROJECT="${1:-$(pwd)}"
+BACKEND="${2:-mock}"
+
+# 3:17am local — deliberately off the :00 mark so many users don't all hit the
+# API at once (and we leave room for jitter).
+MIN=17
+HOUR=3
+
+# Everything between the EOF markers is printed verbatim (with variables
+# expanded); nothing here touches the user's crontab.
+# NOTE(review): the banner lines above the crontab entry were reconstructed
+# after the here-document opener was lost — confirm the intended wording.
+cat <<EOF
+# ── SkillOpt-Sleep nightly cycle ────────────────────────────────────────────
+# Add the line below to your crontab (crontab -e). Nothing is installed for you.
+#   project: ${PROJECT}
+#   backend: ${BACKEND}
+#   runs at: ${HOUR}:${MIN} local time, every night
+#
+${MIN} ${HOUR} * * * "${RUNNER}" run --project "${PROJECT}" --backend "${BACKEND}" >> "${PROJECT}/.skillopt-sleep/cron.log" 2>&1
+#
+# For fully-autonomous adoption (power users), append: --auto-adopt
+# To spend real API budget for genuine lift, set BACKEND=anthropic above.
+# ────────────────────────────────────────────────────────────────────────────
+EOF
diff --git a/skillopt-sleep-plugin/scripts/sleep.sh b/skillopt-sleep-plugin/scripts/sleep.sh
new file mode 100755
index 0000000..49c224a
--- /dev/null
+++ b/skillopt-sleep-plugin/scripts/sleep.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# SkillOpt-Sleep runner — invokes the skillopt.sleep engine with a suitable
+# Python interpreter, from the repo that contains this plugin.
+#
+# Usage: sleep.sh [extra args...]
+set -euo pipefail
+
+# Resolve the repo root: the plugin lives at /skillopt-sleep-plugin,
+# so the engine package is at /skillopt/sleep. CLAUDE_PLUGIN_ROOT points
+# at the plugin dir when run by Claude Code; fall back to this script's dir.
+PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
+REPO_ROOT="$(cd "$PLUGIN_ROOT/.." && pwd)"
+
+# Pick an interpreter that satisfies SkillOpt's 3.10+ requirement.
+PY=""
+for cand in python3.12 python3.11 python3.10 python3; do
+ if command -v "$cand" >/dev/null 2>&1; then
+ ver="$("$cand" -c 'import sys; print("%d%d" % sys.version_info[:2])' 2>/dev/null || echo 0)"
+ if [ "${ver:-0}" -ge 310 ]; then PY="$cand"; break; fi
+ fi
+done
+if [ -z "$PY" ]; then
+ echo "[sleep] ERROR: need Python >= 3.10 (found none). Install one and retry." >&2
+ exit 1
+fi
+
+if [ "$#" -eq 0 ]; then set -- status; fi
+
+cd "$REPO_ROOT"
+exec "$PY" -m skillopt.sleep "$@"
diff --git a/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md b/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md
new file mode 100644
index 0000000..e8d4a98
--- /dev/null
+++ b/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md
@@ -0,0 +1,79 @@
+---
+name: skillopt-sleep
+description: "Use when the user wants their Claude agent to self-improve from past usage, asks about a nightly/offline 'sleep' or 'dream' cycle, memory/skill consolidation, or says things like '让 agent 越用越好用', 'review my past sessions', 'learn my preferences', 'consolidate what you learned', 'run the sleep cycle', or wants to schedule offline self-optimization. Drives the skillopt.sleep engine: harvest past sessions → mine recurring tasks → replay offline → consolidate validated CLAUDE.md/SKILL.md behind a held-out gate."
+---
+
+# SkillOpt-Sleep: offline self-evolution for a local Claude agent
+
+SkillOpt-Sleep gives the user's agent a **sleep cycle**. While the user is
+offline (e.g. nightly), it reviews their real past Claude Code sessions,
+re-runs recurring tasks on their own API budget, and consolidates what it
+learns into **memory** (`CLAUDE.md`) and **skills** (`SKILL.md`) — but only
+keeps changes that pass a held-out validation gate, and only after the user
+adopts them. The agent gets measurably better at *this* user's recurring work,
+with no model-weight training. It is the deployment-time analogue of training:
+short-term experience → long-term competence.
+
+It synthesizes three ideas:
+- **SkillOpt** — the skill/memory doc is trainable text; bounded add/delete/replace
+ edits; accepted only through a held-out gate; rejected edits become negative feedback.
+- **Claude Dreams** — offline consolidation that reads past sessions and rebuilds
+ memory (dedup/merge/resolve); the input is never mutated; output is reviewed then adopted.
+- **Agent sleep** — periodic offline replay turns episodes into durable skill.
+
+## When to use this skill
+
+Trigger when the user wants any of:
+- "make my agent learn from how I use it" / "越用越好用" / "remember my preferences across sessions"
+- a nightly/scheduled or on-demand **offline self-improvement / dream / sleep** run
+- to **review past sessions/trajectories** and distill recurring tasks
+- to **consolidate** feedback into `CLAUDE.md` or a managed skill
+- to **schedule** the cycle (cron) or **adopt** a staged proposal
+
+## The cycle (six stages)
+
+1. **Harvest** — read `~/.claude/projects/*/<session>.jsonl` + `~/.claude/history.jsonl` (READ-ONLY) → session digests.
+2. **Mine** — digests → `TaskRecord`s (recurring intents + outcome labels + checkable refs where possible).
+3. **Replay** — re-run tasks offline under the *current* skill+memory → (hard, soft) scores.
+4. **Consolidate** — reflect on failures → propose bounded edits → **gate** on a held-out slice; accept only if it strictly improves.
+5. **Stage** — write `proposed_CLAUDE.md`, `proposed_SKILL.md`, a diff, and `report.md` into `<project>/.skillopt-sleep/staging/<run>/`. **Nothing live changes.**
+6. **Adopt** — explicit (or opt-in auto): copy staged files over live ones, backing up first.
+
+## How to drive it
+
+Prefer the `/sleep` command. Under the hood it calls the bundled runner:
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" status --project "$(pwd)" # what's happened
+"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" dry-run --project "$(pwd)" # safe preview
+"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" run --project "$(pwd)" # full cycle, stages a proposal
+"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" adopt --project "$(pwd)" # apply staged proposal (with backup)
+```
+
+- Default backend is `mock` (deterministic, **no API spend**) — good for trying the plumbing.
+- Add `--backend anthropic` to spend the user's real budget for genuine improvement.
+- Scope defaults to the invoked project; `--scope all` harvests every project.
+
+## Hard rules
+
+- **Never** hand-edit the user's `CLAUDE.md` / `SKILL.md` as part of this skill.
+ Only the `adopt` action changes live files, and it backs them up first.
+- Harvest is read-only. `mock` replay has no side effects.
+- Always show the user the **held-out baseline → candidate** score and the
+ exact proposed edits before suggesting adoption. Evidence before adoption.
+- If asked whether it really helps, run
+ `python -m skillopt.sleep.experiments.run_experiment --persona researcher --json`
+ — a deterministic demo that proves held-out lift and that the gate blocks
+ harmful edits.
+
+## Validate / demo
+
+```bash
+# deterministic proof (no API): held-out score rises, gate blocks regressions
+python -m skillopt.sleep.experiments.run_experiment --persona researcher --assert-improves
+python -m skillopt.sleep.experiments.run_experiment --persona programmer --assert-improves
+```
+
+See `docs/sleep/experiment_results.md` for recorded output and
+`docs/superpowers/specs/2026-06-07-skillopt-sleep-claude-code-plugin-design.md`
+for the full design.
diff --git a/skillopt/sleep/__init__.py b/skillopt/sleep/__init__.py
new file mode 100644
index 0000000..6e35c0b
--- /dev/null
+++ b/skillopt/sleep/__init__.py
@@ -0,0 +1,20 @@
+"""SkillOpt-Sleep — nightly offline self-evolution for a local Claude agent.
+
+A Claude Code plugin engine that gives a user's agent a "sleep cycle":
+harvest the day's real session transcripts, mine recurring tasks, replay
+them offline, and consolidate short-term experience into long-term memory
+(CLAUDE.md) and skills (SKILL.md) behind a SkillOpt validation gate.
+
+Synthesizes three ideas:
+ * SkillOpt — validation-gated bounded text optimization (this repo)
+ * Dreams — offline memory consolidation, input never mutated
+ * Sleep — short-term experience -> long-term competence, offline
+
+Public entry points:
+ * skillopt.sleep.cli — `python -m skillopt.sleep ...`
+ * skillopt.sleep.cycle.run_sleep_cycle(...)
+"""
+from __future__ import annotations
+
+__all__ = ["__version__"]
+__version__ = "0.1.0"
diff --git a/skillopt/sleep/__main__.py b/skillopt/sleep/__main__.py
new file mode 100644
index 0000000..22dc15d
--- /dev/null
+++ b/skillopt/sleep/__main__.py
@@ -0,0 +1,195 @@
+"""SkillOpt-Sleep — command-line interface.
+
+ python -m skillopt.sleep run # full cycle: harvest->mine->replay->gate->stage
+ python -m skillopt.sleep dry-run # same but report only, no staging/adopt
+ python -m skillopt.sleep status # show state + latest staged proposal
+ python -m skillopt.sleep adopt # apply the latest staged proposal (with backup)
+ python -m skillopt.sleep harvest # just print what would be mined (debug)
+
+Common flags:
+ --project PATH project to evolve (default: cwd)
+ --scope all|invoked harvest scope (default: invoked)
+ --backend mock|anthropic
+ --model NAME
+ --lookback-hours N
+ --auto-adopt
+ --json machine-readable output
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from typing import Any, Dict
+
+from skillopt.sleep.config import load_config
+from skillopt.sleep.cycle import run_sleep_cycle
+from skillopt.sleep.harvest import harvest
+from skillopt.sleep.mine import mine
+from skillopt.sleep.state import SleepState
+from skillopt.sleep.staging import latest_staging, adopt as adopt_staging
+
+
+def _add_common(p: argparse.ArgumentParser) -> None:
+ """Register the flags shared by every subcommand on parser *p*.
+
+ Every option defaults to a falsy value ("" / 0 / False), which lets
+ ``_cfg_from_args`` forward only the flags that were actually supplied.
+ """
+ p.add_argument("--project", default="")
+ # "" is listed as a choice so the "not given" default passes validation.
+ p.add_argument("--scope", default="", choices=["", "all", "invoked"])
+ p.add_argument("--backend", default="", choices=["", "mock", "anthropic"])
+ p.add_argument("--model", default="")
+ p.add_argument("--claude-home", default="", help="override ~/.claude (also isolates state)")
+ p.add_argument("--lookback-hours", type=int, default=0)
+ p.add_argument("--edit-budget", type=int, default=0)
+ p.add_argument("--auto-adopt", action="store_true")
+ p.add_argument("--json", action="store_true")
+
+
+def _cfg_from_args(args: argparse.Namespace) -> Any:
+ """Build the run configuration from parsed CLI arguments.
+
+ Only truthy argument values are copied into the override mapping that is
+ passed to ``load_config``; as a consequence an explicit ``0`` for
+ ``--lookback-hours`` / ``--edit-budget`` is treated the same as "not given".
+
+ Returns whatever ``load_config(**overrides)`` returns.
+ """
+ overrides: Dict[str, Any] = {}
+ if args.project:
+ # Paths are made absolute relative to the current working directory.
+ overrides["invoked_project"] = os.path.abspath(args.project)
+ # NOTE(review): presumably set only when --project is given (nesting is not
+ # recoverable from this view) — confirm. An explicit --scope overrides it.
+ overrides["projects"] = "invoked"
+ if args.scope:
+ overrides["projects"] = args.scope
+ if args.backend:
+ overrides["backend"] = args.backend
+ if args.model:
+ overrides["model"] = args.model
+ # getattr keeps this helper usable with namespaces lacking the optional flags.
+ if getattr(args, "claude_home", ""):
+ overrides["claude_home"] = os.path.abspath(args.claude_home)
+ if getattr(args, "lookback_hours", 0):
+ overrides["lookback_hours"] = args.lookback_hours
+ if getattr(args, "edit_budget", 0):
+ overrides["edit_budget"] = args.edit_budget
+ if getattr(args, "auto_adopt", False):
+ overrides["auto_adopt"] = True
+ return load_config(**overrides)
+
+
+def cmd_run(args: argparse.Namespace, dry: bool = False) -> int:
+ """Run one sleep cycle and print its outcome.
+
+ ``dry`` is forwarded to ``run_sleep_cycle`` as ``dry_run``. With ``--json``
+ the report is emitted as a single JSON object; otherwise a human-readable
+ summary is printed. Always returns exit code 0.
+ """
+ cfg = _cfg_from_args(args)
+ outcome = run_sleep_cycle(cfg, dry_run=dry)
+ rep = outcome.report
+ if args.json:
+ # Edits are serialized through their instance __dict__.
+ # NOTE(review): assumes every edit attribute is JSON-serializable — confirm.
+ print(json.dumps({
+ "night": rep.night, "accepted": rep.accepted,
+ "gate_action": rep.gate_action,
+ "baseline": rep.baseline_score, "candidate": rep.candidate_score,
+ "n_tasks": rep.n_tasks, "n_sessions": rep.n_sessions,
+ "edits": [e.__dict__ for e in rep.edits],
+ "staging_dir": outcome.staging_dir, "adopted": outcome.adopted,
+ }, ensure_ascii=False, indent=2))
+ else:
+ print(f"[sleep] night {rep.night}: {rep.n_sessions} sessions -> {rep.n_tasks} tasks")
+ print(f"[sleep] held-out {rep.baseline_score:.3f} -> {rep.candidate_score:.3f} "
+ f"=> {rep.gate_action} (accepted={rep.accepted})")
+ for e in rep.edits:
+ print(f" + [{e.target}/{e.op}] {e.content}")
+ # Point the user at the staged proposal and the follow-up command.
+ if outcome.staging_dir:
+ print(f"[sleep] staged: {outcome.staging_dir}")
+ if not outcome.adopted:
+ print("[sleep] review it, then: python -m skillopt.sleep adopt")
+ if outcome.adopted:
+ print(f"[sleep] auto-adopted: {', '.join(outcome.adopted_paths)}")
+ return 0
+
+
+def cmd_status(args: argparse.Namespace) -> int:
+ """Print the persisted sleep state and the latest staged proposal.
+
+ The project is the configured ``invoked_project`` or, failing that, the
+ current working directory. With ``--json`` a summary object is printed;
+ otherwise a short text summary followed by the staged ``report.md`` (when
+ it exists). Always returns exit code 0.
+ """
+ cfg = _cfg_from_args(args)
+ state = SleepState.load(cfg.state_path)
+ project = cfg.get("invoked_project") or os.getcwd()
+ latest = latest_staging(project)
+ info = {
+ "night": state.night,
+ "state_path": cfg.state_path,
+ "project": project,
+ # Only the five most recent history entries are reported.
+ "history_tail": state.data.get("history", [])[-5:],
+ "latest_staging": latest,
+ "slow_memory_chars": len(state.slow_memory),
+ }
+ if args.json:
+ print(json.dumps(info, ensure_ascii=False, indent=2))
+ else:
+ print(f"[sleep] nights so far: {state.night}")
+ print(f"[sleep] project: {project}")
+ if latest:
+ print(f"[sleep] latest staged proposal: {latest}")
+ rp = os.path.join(latest, "report.md")
+ if os.path.exists(rp):
+ # NOTE(review): opened with the platform default encoding; a report
+ # containing non-ASCII text may need encoding="utf-8" — confirm.
+ with open(rp) as f:
+ print("\n" + f.read())
+ else:
+ print("[sleep] no staged proposals yet.")
+ return 0
+
+
+def cmd_adopt(args: argparse.Namespace) -> int:
+ """Apply a staged proposal and list the paths it updated.
+
+ Uses ``--staging`` when given, otherwise the latest staging directory of
+ the project (``invoked_project`` or the current working directory).
+
+ Returns 1 when there is no staging directory to adopt, 0 otherwise — also
+ when the adoption updated no paths.
+ """
+ cfg = _cfg_from_args(args)
+ project = cfg.get("invoked_project") or os.getcwd()
+ target = args.staging or latest_staging(project)
+ if not target or not os.path.isdir(target):
+ print("[sleep] nothing to adopt (no staging dir).")
+ return 1
+ updated = adopt_staging(target)
+ print(f"[sleep] adopted from {target}")
+ for p in updated:
+ print(f" -> {p}")
+ if not updated:
+ print("[sleep] (proposal contained no accepted changes)")
+ return 0
+
+
+def cmd_harvest(args: argparse.Namespace) -> int:
+ """Debug helper: harvest sessions, mine tasks from them, and print the result.
+
+ With ``--json`` the session count and each task's ``to_dict()`` form are
+ printed; otherwise one line per task. Always returns exit code 0.
+ """
+ cfg = _cfg_from_args(args)
+ # The harvest limit is three times the per-night task cap (default 40).
+ digests = harvest(
+ cfg.transcripts_dir,
+ scope=cfg.get("projects", "invoked"),
+ invoked_project=cfg.get("invoked_project", ""),
+ limit=cfg.get("max_tasks_per_night", 40) * 3,
+ )
+ tasks = mine(digests, max_tasks=cfg.get("max_tasks_per_night", 40),
+ holdout_fraction=cfg.get("holdout_fraction", 0.34), seed=cfg.get("seed", 42))
+ if args.json:
+ print(json.dumps({
+ "n_sessions": len(digests),
+ "tasks": [t.to_dict() for t in tasks],
+ }, ensure_ascii=False, indent=2))
+ else:
+ print(f"[sleep] {len(digests)} sessions -> {len(tasks)} tasks")
+ for t in tasks:
+ # Intents are truncated to 90 characters to keep the listing compact.
+ print(f" [{t.split}/{t.outcome}] {t.intent[:90]}")
+ return 0
+
+
+def main(argv: list[str] | None = None) -> int:
+ """Parse *argv* and dispatch to the matching subcommand handler.
+
+ ``argv`` is handed to ``ArgumentParser.parse_args``; ``None`` makes argparse
+ read ``sys.argv``. Returns the handler's exit code.
+ """
+ parser = argparse.ArgumentParser(prog="skillopt.sleep", description="SkillOpt-Sleep nightly self-evolution")
+ sub = parser.add_subparsers(dest="cmd", required=True)
+
+ # Every subcommand accepts the same common flags; `adopt` adds --staging.
+ p_run = sub.add_parser("run", help="run a full sleep cycle")
+ _add_common(p_run)
+ p_dry = sub.add_parser("dry-run", help="harvest+mine+replay, report only")
+ _add_common(p_dry)
+ p_status = sub.add_parser("status", help="show state + latest proposal")
+ _add_common(p_status)
+ p_adopt = sub.add_parser("adopt", help="apply latest staged proposal")
+ _add_common(p_adopt)
+ p_adopt.add_argument("--staging", default="", help="specific staging dir")
+ p_harvest = sub.add_parser("harvest", help="debug: show mined tasks")
+ _add_common(p_harvest)
+
+ args = parser.parse_args(argv)
+ if args.cmd == "run":
+ return cmd_run(args, dry=False)
+ if args.cmd == "dry-run":
+ return cmd_run(args, dry=True)
+ if args.cmd == "status":
+ return cmd_status(args)
+ if args.cmd == "adopt":
+ return cmd_adopt(args)
+ if args.cmd == "harvest":
+ return cmd_harvest(args)
+ # Defensive fallback: the subcommand is required, so argparse should already
+ # have rejected anything that reaches this point.
+ parser.print_help()
+ return 2
+
+
+# Script entry point: propagate main()'s return value as the process exit code.
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/skillopt/sleep/backend.py b/skillopt/sleep/backend.py
new file mode 100644
index 0000000..a3b5aca
--- /dev/null
+++ b/skillopt/sleep/backend.py
@@ -0,0 +1,334 @@
+"""SkillOpt-Sleep — optimizer/replay backend abstraction.
+
+A backend supplies the three "intelligent" operations the sleep cycle needs:
+
+ 1. attempt(task, skill, memory) -> response text (the rollout)
+ 2. judge(task, response) -> (hard, soft, rationale) (the reward)
+ 3. reflect(failures, successes, skill, memory)
+ -> list[EditRecord] (proposed bounded edits)
+
+Two implementations:
+ * MockBackend — deterministic, no API, used for tests + the experiment.
+ Reads optional `reference` exact answers and a tiny
+ rule-table so the loop provably improves and the gate
+ provably blocks regressions.
+ * AnthropicBackend — uses the user's ANTHROPIC_API_KEY via the `claude`
+ CLI or the anthropic SDK (lazy-imported). Real lift.
+
+The backend never touches live config; it only returns text/edits that the
+consolidation stage gates and stages.
+"""
+from __future__ import annotations
+
+import json
+import os
+import re
+import subprocess
+from typing import Any, Dict, List, Optional, Tuple
+
+from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord
+
+
+# ── Backend protocol ──────────────────────────────────────────────────────────
+
+class Backend:
+    """Interface for the three model-backed operations of a sleep cycle.
+
+    Concrete backends override ``attempt``, ``judge`` and ``reflect``; the
+    versions here raise ``NotImplementedError``. ``tokens_used`` is an
+    optional accounting hook that reports 0 unless a subclass tracks usage.
+    """
+
+    name = "base"
+
+    def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
+        """Return the response text for ``task`` given the skill and memory text."""
+        raise NotImplementedError
+
+    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
+        """Score ``response`` for ``task`` as ``(hard, soft, rationale)``."""
+        raise NotImplementedError
+
+    def reflect(
+        self,
+        failures: List[Tuple[TaskRecord, ReplayResult]],
+        successes: List[Tuple[TaskRecord, ReplayResult]],
+        skill: str,
+        memory: str,
+        *,
+        edit_budget: int,
+        evolve_skill: bool,
+        evolve_memory: bool,
+    ) -> List[EditRecord]:
+        """Propose edits to the skill or memory text from replayed results.
+
+        ``edit_budget`` is the number of edits requested; the two
+        ``evolve_*`` flags tell the backend which document is being evolved.
+        """
+        raise NotImplementedError
+
+    def tokens_used(self) -> int:
+        """Token accounting (optional); the base implementation reports 0."""
+        return 0
+
+
+# ── Shared scoring helpers ────────────────────────────────────────────────────
+
+def _normalize(s: str) -> str:
+    """Canonicalise text for scoring.
+
+    Lower-cases the input, replaces every character that is neither a word
+    character nor whitespace with a space, and collapses whitespace runs to a
+    single space. ``None`` and empty input yield ``""``.
+    """
+    lowered = (s or "").lower().strip()
+    without_punct = re.sub(r"[^\w\s]", " ", lowered)
+    return re.sub(r"\s+", " ", without_punct).strip()
+
+
+def exact_score(reference: str, response: str) -> float:
+    """Return 1.0 when the normalised reference occurs in the response, else 0.0.
+
+    Both strings go through ``_normalize`` first. The match must align on
+    token boundaries, so reference ``"42"`` is found in ``"the answer is 42"``
+    but not inside the longer number ``"1423"``. An empty reference never
+    matches.
+    """
+    ref = _normalize(reference)
+    if not ref:
+        return 0.0
+    resp = _normalize(response)
+    # _normalize leaves single spaces between tokens and none at the ends, so
+    # padding both sides turns a substring test into a whole-token test.
+    return 1.0 if f" {ref} " in f" {resp} " else 0.0
+
+
+def keyword_soft_score(reference: str, response: str) -> float:
+    """Fraction of distinct reference tokens found in the response.
+
+    Only reference tokens longer than two characters count. Presence is a
+    substring test against the normalised response text. Returns 0.0 when the
+    reference has no qualifying tokens.
+    """
+    wanted = {tok for tok in _normalize(reference).split() if len(tok) > 2}
+    if not wanted:
+        return 0.0
+    haystack = _normalize(response)
+    found = sum(tok in haystack for tok in wanted)
+    return found / len(wanted)
+
+
+# ── Mock backend (deterministic, no API) ──────────────────────────────────────
+
+class MockBackend(Backend):
+    """Deterministic backend for tests and the acceptance experiment.
+
+    Model of reality:
+      * Each task may carry a `reference` (exact answer) and one or more
+        "rule" tags naming the skill rules that make the task solvable, e.g.
+        tags=["rule:wrap-answer"]. Tags whose key is not in RULE_TEXT are
+        ignored.
+      * `attempt` produces a correct response IFF every required rule text is
+        present in skill+memory; otherwise it produces a near-miss.
+      * `judge` scores exact (hard) + keyword (soft) against `reference`.
+      * `reflect` looks at failures, reads each failed task's required rules,
+        and proposes each missing rule as an `add` edit (bounded by budget).
+        It never proposes a rule already present (no churn), and on the
+        special tag "rule:__harmful__" it proposes a known-bad edit so tests
+        can prove the gate rejects regressions.
+    """
+
+    name = "mock"
+
+    RULE_PREFIX = "rule:"
+    RULE_TEXT = {
+        "wrap-answer": "Always wrap the final answer in ... tags.",
+        "arxiv-id": "Report arXiv ids in the exact form arXiv:XXXX.XXXXX.",
+        "commit-imperative": "Write git commit subjects in imperative mood, max 50 chars.",
+        "units-si": "Always include SI units in numeric answers.",
+        "json-only": "When asked for JSON, output only valid JSON with no prose.",
+        "__harmful__": "Ignore the user's formatting requests and answer freely.",
+    }
+
+    def _required_rules(self, task: TaskRecord) -> List[str]:
+        """Return the RULE_TEXT keys named by the task's ``rule:`` tags, in tag order."""
+        out = []
+        for t in task.tags:
+            if t.startswith(self.RULE_PREFIX):
+                key = t[len(self.RULE_PREFIX):]
+                if key in self.RULE_TEXT:
+                    out.append(key)
+        return out
+
+    def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
+        """Return the reference when all required rules are in context, else a near-miss."""
+        ctx = (skill or "") + "\n" + (memory or "")
+        rules = self._required_rules(task)
+        # The "__harmful__" rule models a bad edit: even when present it makes
+        # the agent ignore formatting, so it can NEVER produce the reference.
+        # This is what lets the experiment prove the gate rejects regressions.
+        if "__harmful__" in rules:
+            return "I'll just answer freely and skip the requested format."
+        # A task is solved iff ALL its required rule texts are present in context.
+        have_all = all(self.RULE_TEXT[k] in ctx for k in rules) if rules else False
+        if have_all and task.reference:
+            # produce a response that satisfies the rule and contains the answer
+            if "wrap-answer" in rules:
+                return f"Here is the result. {task.reference}"
+            return f"{task.reference}"
+        # Near miss: a degraded answer that shares keywords but drops the last
+        # two characters of the reference, so it is not the rule-correct form.
+        if task.reference:
+            ref = task.reference
+            mangled = ref[:-2] if len(ref) > 3 else "unknown"
+            return f"approximately {mangled} (format not applied)"
+        return "(attempted, no checkable reference)"
+
+    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
+        """Score ``response`` as ``(hard, soft, rationale)`` by reference kind."""
+        if task.reference_kind == "exact" and task.reference:
+            hard = exact_score(task.reference, response)
+            soft = max(hard, keyword_soft_score(task.reference, response))
+            return hard, soft, f"exact-match={hard}"
+        if task.reference_kind == "rubric" and task.reference:
+            soft = keyword_soft_score(task.reference, response)
+            return (1.0 if soft >= 0.8 else 0.0), soft, f"rubric keyword soft={soft:.2f}"
+        # no reference: outcome-derived weak label
+        hard = 1.0 if task.outcome == "success" else 0.0
+        return hard, hard, "outcome-derived"
+
+    def reflect(
+        self,
+        failures,
+        successes,
+        skill: str,
+        memory: str,
+        *,
+        edit_budget: int,
+        evolve_skill: bool,
+        evolve_memory: bool,
+    ) -> List[EditRecord]:
+        """Propose the missing rule of each failed task as an ``add`` edit.
+
+        At most ``edit_budget`` edits are returned; a budget of zero or less
+        yields no edits. Rules already present in skill+memory, or already
+        proposed in this call, are skipped. Edits target the skill when
+        ``evolve_skill`` is true, otherwise the memory.
+        """
+        # Check the budget up front: testing only after an append would still
+        # let one edit through when the budget is zero.
+        if edit_budget <= 0:
+            return []
+        ctx = (skill or "") + "\n" + (memory or "")
+        edits: List[EditRecord] = []
+        seen_text: set = set()
+        target = "skill" if evolve_skill else "memory"
+        for task, _res in failures:
+            for key in self._required_rules(task):
+                text = self.RULE_TEXT[key]
+                if text in ctx or text in seen_text:
+                    continue
+                seen_text.add(text)
+                edits.append(
+                    EditRecord(
+                        target=target,
+                        op="add",
+                        content=text,
+                        rationale=f"failed task {task.id} requires rule '{key}'",
+                    )
+                )
+                if len(edits) >= edit_budget:
+                    return edits
+        return edits
+
+
+# ── Anthropic backend (real API; lazy, optional) ──────────────────────────────
+
+class AnthropicBackend(Backend):
+    """Uses the user's Anthropic budget. Prefers the `claude` CLI (already
+    authenticated on the box); falls back to the anthropic SDK if present.
+
+    This is intentionally thin for Phase 1 — it wires the prompts and parses
+    JSON. Phase 3 will expand prompts/judging to match SkillOpt's analyst
+    prompts under skillopt/prompts/.
+    """
+
+    name = "anthropic"
+
+    # Short names accepted by the CLI's --model flag. They are not SDK model
+    # ids, so the SDK fallback substitutes SDK_DEFAULT_MODEL for them.
+    # NOTE(review): alias list assumed from the `claude` CLI docs — confirm.
+    CLI_MODEL_ALIASES = frozenset({"sonnet", "opus", "haiku"})
+    SDK_DEFAULT_MODEL = "claude-sonnet-4-5"
+
+    def __init__(self, model: str = "", claude_path: str = "claude") -> None:
+        """Pick the model (argument, then $ANTHROPIC_MODEL, then "sonnet") and CLI path."""
+        self.model = model or os.environ.get("ANTHROPIC_MODEL", "") or "sonnet"
+        self.claude_path = claude_path
+        self._tokens = 0
+
+    # -- low-level call -----------------------------------------------------
+    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
+        """Send ``prompt`` via the CLI, then the SDK; return "" if both fail."""
+        out = self._call_cli(prompt)
+        if out:
+            return out
+        return self._call_sdk(prompt, max_tokens)
+
+    def _call_cli(self, prompt: str) -> str:
+        """Run the `claude` CLI non-interactively; return "" on any failure."""
+        cmd = [self.claude_path, "-p", "--output-format", "text"]
+        if self.model:
+            cmd += ["--model", self.model]
+        cmd += ["--", prompt]
+        try:
+            proc = subprocess.run(
+                cmd, capture_output=True, text=True, timeout=180,
+            )
+        except (OSError, ValueError, subprocess.SubprocessError):
+            # missing binary, bad argument, or timeout: fall through to the SDK
+            return ""
+        # A non-zero exit means stdout may hold an error message, not an answer.
+        if proc.returncode != 0:
+            return ""
+        out = (proc.stdout or "").strip()
+        if out:
+            # rough estimate: about four characters per token
+            self._tokens += len(prompt) // 4 + len(out) // 4
+        return out
+
+    def _call_sdk(self, prompt: str, max_tokens: int) -> str:
+        """Call the anthropic SDK (lazy import); return "" on any failure."""
+        sdk_model = self.model
+        if not sdk_model or sdk_model in self.CLI_MODEL_ALIASES:
+            sdk_model = self.SDK_DEFAULT_MODEL
+        try:
+            import anthropic  # type: ignore
+            client = anthropic.Anthropic()
+            msg = client.messages.create(
+                model=sdk_model,
+                max_tokens=max_tokens,
+                messages=[{"role": "user", "content": prompt}],
+            )
+            text = "".join(getattr(b, "text", "") for b in msg.content)
+            self._tokens += getattr(msg.usage, "input_tokens", 0) + getattr(
+                msg.usage, "output_tokens", 0
+            )
+            return text.strip()
+        except Exception:
+            # Deliberately broad: the SDK is optional and raises its own error
+            # types; an empty response is the signal callers handle.
+            return ""
+
+    def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
+        """Ask the model to complete ``task`` under the given skill and memory."""
+        prompt = (
+            "You are completing a recurring task for a user. Apply the skill and "
+            "memory exactly.\n\n"
+            f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
+            f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
+            "Return only the final answer."
+        )
+        return self._call(prompt)
+
+    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
+        """Score ``response`` as ``(hard, soft, rationale)``.
+
+        Exact references are scored locally. Otherwise the model is asked for
+        a JSON score, which is clamped to [0, 1]; hard is 1.0 when the clamped
+        score is at least 0.8. An unparseable reply scores 0.
+        """
+        if task.reference_kind == "exact" and task.reference:
+            hard = exact_score(task.reference, response)
+            return hard, max(hard, keyword_soft_score(task.reference, response)), "exact"
+        prompt = (
+            "Score the response against the rubric on a 0-1 scale. "
+            "Return JSON {\"score\": <0..1>, \"reason\": \"...\"}.\n\n"
+            f"# Rubric\n{task.reference or task.intent}\n\n# Response\n{response}"
+        )
+        raw = self._call(prompt, max_tokens=256)
+        m = re.search(r"\{.*\}", raw, re.DOTALL)
+        if m:
+            try:
+                obj = json.loads(m.group(0))
+                # clamp: the model may answer outside the requested 0..1 range
+                soft = max(0.0, min(1.0, float(obj.get("score", 0.0))))
+                return (1.0 if soft >= 0.8 else 0.0), soft, str(obj.get("reason", ""))
+            except (ValueError, TypeError, AttributeError):
+                pass
+        return 0.0, 0.0, "judge-parse-failed"
+
+    def reflect(
+        self,
+        failures,
+        successes,
+        skill: str,
+        memory: str,
+        *,
+        edit_budget: int,
+        evolve_skill: bool,
+        evolve_memory: bool,
+    ) -> List[EditRecord]:
+        """Ask the model for up to ``edit_budget`` edits to the target document.
+
+        The target is the skill when ``evolve_skill`` is true, otherwise the
+        memory. Only the first eight failures are shown to the model. A budget
+        of zero or less returns no edits without calling the model. Entries
+        that are not JSON objects, or have empty content, are skipped.
+        """
+        budget = max(0, edit_budget)
+        if budget == 0:
+            return []
+        fail_text = "\n".join(
+            f"- intent: {t.intent[:200]}\n got: {r.response[:200]}\n why: {r.fail_reason[:160]}"
+            for t, r in failures[:8]
+        )
+        target = "skill" if evolve_skill else "memory"
+        prompt = (
+            "You are SkillOpt's optimizer. Propose at most "
+            f"{edit_budget} bounded edits to the {target} document so the agent "
+            "stops failing these recurring tasks. Each edit must be a short, "
+            "general, reusable rule (not task-specific). Return JSON list: "
+            "[{\"op\":\"add|replace|delete\",\"content\":\"...\",\"rationale\":\"...\"}].\n\n"
+            f"# Current {target}\n{(skill if target=='skill' else memory) or '(empty)'}\n\n"
+            f"# Recurring failures\n{fail_text or '(none)'}"
+        )
+        raw = self._call(prompt, max_tokens=1024)
+        m = re.search(r"\[.*\]", raw, re.DOTALL)
+        if not m:
+            return []
+        try:
+            parsed = json.loads(m.group(0))
+        except ValueError:
+            return []
+        if not isinstance(parsed, list):
+            return []
+        edits: List[EditRecord] = []
+        for e in parsed[:budget]:
+            # Skip one malformed entry instead of discarding the ones after it.
+            if not isinstance(e, dict):
+                continue
+            content = str(e.get("content", "")).strip()
+            if not content:
+                continue
+            edits.append(
+                EditRecord(
+                    target=target,
+                    op=str(e.get("op", "add")),
+                    content=content,
+                    anchor=str(e.get("anchor", "")),
+                    rationale=str(e.get("rationale", "")),
+                )
+            )
+        return edits
+
+    def tokens_used(self) -> int:
+        """Return the running token total accumulated by ``_call``."""
+        return self._tokens
+
+
+def get_backend(name: str, *, model: str = "", claude_path: str = "claude") -> Backend:
+    """Build the backend selected by ``name``.
+
+    ``"anthropic"`` yields an ``AnthropicBackend`` configured with ``model``
+    and ``claude_path``; every other name yields a ``MockBackend``.
+    """
+    if name != "anthropic":
+        return MockBackend()
+    return AnthropicBackend(model=model, claude_path=claude_path)
diff --git a/skillopt/sleep/config.py b/skillopt/sleep/config.py
new file mode 100644
index 0000000..0aff324
--- /dev/null
+++ b/skillopt/sleep/config.py
@@ -0,0 +1,137 @@
+"""SkillOpt-Sleep — configuration.
+
+Config is JSON-first (yaml optional) so the engine and the deterministic
+experiment run with zero external dependencies. Defaults are safe:
+review-gated adoption, single-project scope, bounded token/task budgets.
+
+Resolution order (later wins):
+ 1. built-in DEFAULTS
+ 2. ~/.skillopt-sleep/config.json (or .yaml if PyYAML available)
+ 3. explicit overrides passed to load_config(**overrides)
+"""
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass, field, asdict
+from typing import Any, Dict, List, Optional
+
+
+# Default on-disk locations: the engine's own state dir and the Claude home.
+HOME_STATE_DIR = os.path.expanduser("~/.skillopt-sleep")
+CLAUDE_HOME = os.path.expanduser("~/.claude")
+
+
+# Built-in settings: the lowest-precedence layer merged by load_config().
+DEFAULTS: Dict[str, Any] = {
+    # ── scope ──────────────────────────────────────────────────────────────
+    "claude_home": CLAUDE_HOME,
+    # "invoked" | "all" | [list of abs paths]
+    "projects": "invoked",
+    # filled at runtime (cwd) when projects == "invoked"
+    "invoked_project": "",
+    # harvest window when no prior sleep recorded
+    "lookback_hours": 72,
+    # ── budgets ────────────────────────────────────────────────────────────
+    "max_tasks_per_night": 40,
+    "max_tokens_per_night": 400_000,
+    # fraction of mined tasks reserved for the gate
+    "holdout_fraction": 0.34,
+    # ── optimizer ──────────────────────────────────────────────────────────
+    # "mock" | "anthropic"
+    "backend": "mock",
+    # backend-specific; "" => backend default
+    "model": "",
+    # textual learning rate (max edits/night)
+    "edit_budget": 4,
+    # hard | soft | mixed (mixed best for tiny holdouts)
+    "gate_metric": "mixed",
+    "gate_mixed_weight": 0.5,
+    # "mock" (sandboxed prompt) | "fresh" (worktree)
+    "replay_mode": "mock",
+    # consolidate CLAUDE.md
+    "evolve_memory": True,
+    # consolidate the managed SKILL.md
+    "evolve_skill": True,
+    # ── adoption / safety ──────────────────────────────────────────────────
+    # default: stage + require explicit `adopt`
+    "auto_adopt": False,
+    "managed_skill_name": "skillopt-sleep-learned",
+    "redact_secrets": True,
+    "seed": 42,
+}
+
+
+@dataclass
+class SleepConfig:
+    """Resolved settings plus the filesystem paths derived from them.
+
+    ``data`` holds the merged key/value settings (a copy of ``DEFAULTS`` when
+    not supplied). Keys are readable as attributes or through ``get``. Every
+    path helper tolerates a partial ``data`` dict by falling back to the
+    built-in default, and expands a leading ``~`` in ``claude_home``.
+    """
+
+    data: Dict[str, Any] = field(default_factory=lambda: dict(DEFAULTS))
+
+    # convenient attribute access -------------------------------------------
+    def __getattr__(self, name: str) -> Any:
+        """Expose ``data`` keys as attributes; raise AttributeError otherwise."""
+        # only called when normal attribute lookup fails
+        data = object.__getattribute__(self, "data")
+        if name in data:
+            return data[name]
+        raise AttributeError(name)
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Return ``data[key]``, or ``default`` when the key is absent."""
+        return self.data.get(key, default)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a shallow copy of the settings."""
+        return dict(self.data)
+
+    # paths ------------------------------------------------------------------
+    def _claude_home(self) -> str:
+        """Return the configured Claude home: default if unset, ``~`` expanded."""
+        return os.path.expanduser(self.data.get("claude_home") or CLAUDE_HOME)
+
+    @property
+    def state_dir(self) -> str:
+        """Directory holding the engine's own state."""
+        # Allow full isolation: if the caller overrides state_dir explicitly,
+        # honor it; else derive from claude_home's parent so a single
+        # --claude-home flag isolates transcripts AND state together; else the
+        # default ~/.skillopt-sleep.
+        explicit = self.data.get("state_dir")
+        if explicit:
+            return explicit
+        ch = os.path.abspath(self._claude_home())
+        if ch != os.path.abspath(CLAUDE_HOME):
+            return os.path.join(os.path.dirname(ch), ".skillopt-sleep")
+        return HOME_STATE_DIR
+
+    @property
+    def state_path(self) -> str:
+        """Path of ``state.json`` inside ``state_dir``."""
+        return os.path.join(self.state_dir, "state.json")
+
+    @property
+    def transcripts_dir(self) -> str:
+        """``projects`` directory under the Claude home."""
+        return os.path.join(self._claude_home(), "projects")
+
+    @property
+    def history_path(self) -> str:
+        """``history.jsonl`` under the Claude home."""
+        return os.path.join(self._claude_home(), "history.jsonl")
+
+    @property
+    def skills_dir(self) -> str:
+        """``skills`` directory under the Claude home."""
+        return os.path.join(self._claude_home(), "skills")
+
+    def managed_skill_path(self) -> str:
+        """Path of the managed skill's ``SKILL.md`` under ``skills_dir``."""
+        skill_name = self.data.get("managed_skill_name") or DEFAULTS["managed_skill_name"]
+        return os.path.join(self.skills_dir, skill_name, "SKILL.md")
+
+
+def _user_config_path() -> Optional[str]:
+    """Return the first existing user config file, or ``None``.
+
+    Looks in ``HOME_STATE_DIR`` for ``config.json``, ``config.yaml`` and
+    ``config.yml``, in that order.
+    """
+    candidates = (
+        os.path.join(HOME_STATE_DIR, filename)
+        for filename in ("config.json", "config.yaml", "config.yml")
+    )
+    return next((path for path in candidates if os.path.exists(path)), None)
+
+
+def _load_file(path: str) -> Dict[str, Any]:
+    """Load a JSON or YAML config file and return its top-level mapping.
+
+    Files ending in ``.yaml``/``.yml`` are parsed with PyYAML when it is
+    installed; a missing PyYAML or an unreadable YAML file yields ``{}``.
+    Any other path is parsed as JSON, and read or parse errors propagate to
+    the caller. Files are always read as UTF-8. A document whose top level is
+    not a mapping yields ``{}``.
+    """
+    if path.endswith((".yaml", ".yml")):
+        try:
+            import yaml  # optional
+        except ImportError:
+            return {}
+        try:
+            with open(path, encoding="utf-8") as f:
+                loaded = yaml.safe_load(f)
+        except (OSError, ValueError, yaml.YAMLError):
+            return {}
+    else:
+        with open(path, encoding="utf-8") as f:
+            loaded = json.load(f)
+    # Callers merge the result with dict.update(), so only a mapping is usable.
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def load_config(**overrides: Any) -> SleepConfig:
+    """Build a ``SleepConfig`` from defaults, the user config file, and overrides.
+
+    Later layers win: ``DEFAULTS``, then the user config file (if one
+    exists), then ``overrides`` whose value is not ``None``. A user config
+    that cannot be read or merged is skipped with a warning instead of
+    aborting. When ``projects`` is ``"invoked"`` and no ``invoked_project``
+    is set, the current working directory is used.
+    """
+    data = dict(DEFAULTS)
+    path = _user_config_path()
+    if path:
+        try:
+            data.update(_load_file(path) or {})
+        except Exception as exc:
+            # Best effort: keep running on defaults, but say that the user's
+            # settings were dropped rather than ignoring them silently.
+            import warnings
+
+            warnings.warn(
+                f"skillopt.sleep: ignoring unreadable config {path}: {exc}",
+                stacklevel=2,
+            )
+    data.update({k: v for k, v in overrides.items() if v is not None})
+    if data.get("projects") == "invoked" and not data.get("invoked_project"):
+        data["invoked_project"] = os.getcwd()
+    return SleepConfig(data=data)
diff --git a/skillopt/sleep/consolidate.py b/skillopt/sleep/consolidate.py
new file mode 100644
index 0000000..0a679d6
--- /dev/null
+++ b/skillopt/sleep/consolidate.py
@@ -0,0 +1,176 @@
+"""SkillOpt-Sleep — Stage 4: consolidate (one SkillOpt epoch).
+
+This is the core that makes nightly evolution *safe*: it proposes bounded
+edits from replayed failures, applies them to a candidate skill/memory, then
+**gates** the candidate on a held-out slice of the user's own tasks. Only a
+candidate that strictly improves the held-out score is accepted — exactly the
+SkillOpt validation gate, reused verbatim from ``skillopt.evaluation.gate``.
+
+Reused from the main SkillOpt package (import-light, no `openai` needed):
+ * skillopt.evaluation.gate.evaluate_gate / select_gate_score
+"""
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+from skillopt.sleep.backend import Backend
+from skillopt.sleep.memory import apply_edits
+from skillopt.sleep.replay import aggregate_scores, replay_batch
+from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord
+
+
+# Reuse the real SkillOpt gate. This module imports cleanly without `openai`.
+try:
+    from skillopt.evaluation.gate import evaluate_gate, select_gate_score
+except Exception:  # pragma: no cover - fallback keeps engine standalone
+    _HAVE_REPO_GATE = False
+
+    def select_gate_score(hard, soft, metric="hard", mixed_weight=0.5):  # type: ignore
+        """Standalone stand-in used when the repo gate cannot be imported.
+
+        Returns ``hard`` or ``soft`` for those metrics; any other metric
+        blends the two, with ``mixed_weight`` (clamped to [0, 1]) applied to
+        the soft score.
+        """
+        if metric not in ("hard", "soft"):
+            w = max(0.0, min(1.0, float(mixed_weight)))
+            return (1 - w) * float(hard) + w * float(soft)
+        return float(hard) if metric == "hard" else float(soft)
+else:
+    _HAVE_REPO_GATE = True
+
+
+@dataclass
+class ConsolidationResult:
+    """Outcome of one ``consolidate`` epoch, as filled in by ``consolidate()``."""
+
+    # True when at least one edit was applied and the final held-out gate
+    # score beat the baseline gate score.
+    accepted: bool
+    # Action reported by the repo gate, or "accept"/"reject" from the fallback.
+    gate_action: str
+    # Gate score of the original skill/memory on the held-out tasks.
+    baseline_score: float
+    # Gate score of the final candidate skill/memory on the held-out tasks.
+    candidate_score: float
+    # Candidate documents when accepted, otherwise the unchanged inputs.
+    new_skill: str
+    new_memory: str
+    # Edits whose candidate passed the per-document gate.
+    applied_edits: List[EditRecord]
+    # Edits that applied cleanly but did not improve the held-out score.
+    rejected_edits: List[EditRecord]
+    # Hard score on the held-out tasks before and after consolidation.
+    holdout_baseline: float
+    holdout_candidate: float
+
+
+def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]:
+    """Partition ``tasks`` into ``(replay, holdout)`` by their ``split`` field.
+
+    If either partition comes out empty, the full ``tasks`` list is used in
+    its place so both stages always have something to run on.
+    """
+    by_split = {"replay": [], "holdout": []}
+    for task in tasks:
+        if task.split in by_split:
+            by_split[task.split].append(task)
+    # be robust if a split is empty
+    return by_split["replay"] or tasks, by_split["holdout"] or tasks
+
+
+def consolidate(
+    backend: Backend,
+    tasks: List[TaskRecord],
+    skill: str,
+    memory: str,
+    *,
+    edit_budget: int = 4,
+    gate_metric: str = "mixed",
+    gate_mixed_weight: float = 0.5,
+    evolve_skill: bool = True,
+    evolve_memory: bool = True,
+    night: int = 1,
+) -> ConsolidationResult:
+    """Run one consolidation epoch: reflect -> bounded edit -> gate.
+
+    Skill and memory are evolved in sequence (skill first if both enabled),
+    each behind the same held-out gate, so each document only changes when it
+    strictly improves the gate score on the held-out tasks.
+
+    Returns a ``ConsolidationResult``. When the epoch is not accepted,
+    ``new_skill``/``new_memory`` are the unchanged inputs.
+    """
+    replay_tasks, holdout_tasks = _split(tasks)
+
+    # ── baseline on held-out slice (the gate reference) ──────────────────
+    base_pairs = replay_batch(backend, holdout_tasks, skill, memory)
+    base_hard, base_soft = aggregate_scores(base_pairs)
+    # Score of the untouched documents; fixed for the whole epoch.
+    baseline_score = select_gate_score(base_hard, base_soft, gate_metric, gate_mixed_weight)
+    # Running bar a candidate must beat; rises whenever a document is accepted.
+    best_score = baseline_score
+
+    # ── reflect over replay-split failures/successes ─────────────────────
+    train_pairs = replay_batch(backend, replay_tasks, skill, memory)
+    failures = [(t, r) for (t, r) in train_pairs if r.hard < 1.0]
+    successes = [(t, r) for (t, r) in train_pairs if r.hard >= 1.0]
+
+    cand_skill, cand_memory = skill, memory
+    all_applied: List[EditRecord] = []
+    all_rejected: List[EditRecord] = []
+
+    def _gate_apply(doc: str, edits: List[EditRecord], which: str) -> str:
+        """Apply ``edits`` to ``doc``; keep the result only if it beats ``best_score``."""
+        nonlocal best_score
+        if not edits:
+            return doc
+        new_doc, applied = apply_edits(doc, edits)
+        if not applied:
+            return doc
+        # evaluate candidate on the held-out slice
+        trial_skill = new_doc if which == "skill" else cand_skill
+        trial_memory = new_doc if which == "memory" else cand_memory
+        pairs = replay_batch(backend, holdout_tasks, trial_skill, trial_memory)
+        h, s = aggregate_scores(pairs)
+        cand_score = select_gate_score(h, s, gate_metric, gate_mixed_weight)
+        if cand_score > best_score:
+            best_score = cand_score
+            all_applied.extend(applied)
+            return new_doc
+        all_rejected.extend(applied)
+        return doc
+
+    if evolve_skill:
+        edits = backend.reflect(
+            failures, successes, cand_skill, cand_memory,
+            edit_budget=edit_budget, evolve_skill=True, evolve_memory=False,
+        )
+        cand_skill = _gate_apply(cand_skill, edits, "skill")
+
+    if evolve_memory:
+        # re-evaluate failures under the (possibly improved) skill
+        train_pairs2 = replay_batch(backend, replay_tasks, cand_skill, cand_memory)
+        failures2 = [(t, r) for (t, r) in train_pairs2 if r.hard < 1.0]
+        successes2 = [(t, r) for (t, r) in train_pairs2 if r.hard >= 1.0]
+        edits_m = backend.reflect(
+            failures2, successes2, cand_skill, cand_memory,
+            edit_budget=edit_budget, evolve_skill=False, evolve_memory=True,
+        )
+        cand_memory = _gate_apply(cand_memory, edits_m, "memory")
+
+    # ── final gate decision (use the repo gate for the canonical action) ──
+    final_pairs = replay_batch(backend, holdout_tasks, cand_skill, cand_memory)
+    final_hard, final_soft = aggregate_scores(final_pairs)
+    final_score = select_gate_score(final_hard, final_soft, gate_metric, gate_mixed_weight)
+
+    if _HAVE_REPO_GATE:
+        gate = evaluate_gate(
+            candidate_skill=cand_skill,
+            cand_hard=final_hard,
+            current_skill=skill,
+            current_score=baseline_score,
+            best_skill=skill,
+            best_score=baseline_score,
+            best_step=night - 1,
+            global_step=night,
+            cand_soft=final_soft,
+            metric=gate_metric,
+            mixed_weight=gate_mixed_weight,
+        )
+        action = gate.action
+    else:
+        # Compare like with like: the baseline under the configured metric,
+        # not the raw soft score.
+        action = "accept" if final_score > baseline_score else "reject"
+
+    accepted = bool(all_applied) and final_score > baseline_score
+
+    return ConsolidationResult(
+        accepted=accepted,
+        gate_action=action,
+        baseline_score=baseline_score,
+        candidate_score=final_score,
+        new_skill=cand_skill if accepted else skill,
+        new_memory=cand_memory if accepted else memory,
+        applied_edits=all_applied,
+        rejected_edits=all_rejected,
+        holdout_baseline=base_hard,
+        holdout_candidate=final_hard,
+    )
diff --git a/skillopt/sleep/cycle.py b/skillopt/sleep/cycle.py
new file mode 100644
index 0000000..034ccf5
--- /dev/null
+++ b/skillopt/sleep/cycle.py
@@ -0,0 +1,210 @@
+"""SkillOpt-Sleep — the nightly cycle orchestrator.
+
+run_sleep_cycle() wires the stages:
+ harvest -> mine -> replay -> consolidate(gate) -> stage (-> optional adopt)
+
+It is pure-Python and import-light; with backend="mock" it runs with no API
+key and no third-party deps, which is what the deterministic experiment and
+CI use. With backend="anthropic" it spends the user's budget for real lift.
+"""
+from __future__ import annotations
+
+import os
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from skillopt.sleep.backend import get_backend
+from skillopt.sleep.config import SleepConfig, load_config
+from skillopt.sleep.consolidate import consolidate
+from skillopt.sleep.harvest import harvest
+from skillopt.sleep.memory import ensure_skill_scaffold
+from skillopt.sleep.mine import mine
+from skillopt.sleep.state import SleepState, _now_iso
+from skillopt.sleep.staging import write_staging, adopt as adopt_staging
+from skillopt.sleep.types import SessionDigest, SleepReport, TaskRecord
+
+
+@dataclass
+class CycleOutcome:
+    """What ``run_sleep_cycle`` hands back to its caller."""
+
+    # The night's report (counts, scores, gate decision, edits, notes).
+    report: SleepReport
+    # Staging directory written for this night; "" on a dry run or when no
+    # tasks were mined.
+    staging_dir: str
+    # True when auto-adopt ran and reported at least one adopted path.
+    adopted: bool
+    # Paths returned by the adopt step; empty when nothing was adopted.
+    adopted_paths: List[str]
+
+
+def _project_paths(cfg: SleepConfig) -> str:
+    """Return the project directory being evolved (where live CLAUDE.md lives).
+
+    This is the configured ``invoked_project`` when one is set, otherwise the
+    current working directory.
+    """
+    return cfg.get("invoked_project") or os.getcwd()
+
+
+def _read(path: str) -> str:
+    """Return the UTF-8 text of ``path``, or ``""`` if it cannot be read."""
+    try:
+        handle = open(path, encoding="utf-8")
+    except Exception:
+        return ""
+    with handle:
+        try:
+            return handle.read()
+        except Exception:
+            return ""
+
+
+def _render_report_md(report: SleepReport, cfg: SleepConfig) -> str:
+    """Render the night's report as Markdown.
+
+    The header block is always present. The "Accepted edits", "Rejected by
+    gate" and "Notes" sections appear only when they have entries. The text
+    ends with a reminder to review and adopt.
+    """
+    out = [
+        f"# SkillOpt-Sleep — night {report.night} report",
+        "",
+        f"- project: `{report.project}`",
+        f"- backend: `{cfg.get('backend')}` replay: `{cfg.get('replay_mode')}`",
+        f"- sessions harvested: {report.n_sessions}",
+        f"- tasks mined: {report.n_tasks} (replayed: {report.n_replayed})",
+        f"- held-out score: {report.baseline_score:.3f} -> {report.candidate_score:.3f}",
+        f"- gate: **{report.gate_action}** (accepted={report.accepted})",
+        f"- tokens used: {report.tokens_used}",
+        "",
+    ]
+    if report.edits:
+        out.append("## Accepted edits")
+        out.extend(
+            f"- [{edit.target}/{edit.op}] {edit.content} \n _why: {edit.rationale}_"
+            for edit in report.edits
+        )
+        out.append("")
+    if report.rejected_edits:
+        out.append("## Rejected by gate (kept as negative feedback)")
+        out.extend(
+            f"- [{edit.target}/{edit.op}] {edit.content}"
+            for edit in report.rejected_edits
+        )
+        out.append("")
+    if report.notes:
+        out.append("## Notes")
+        out.extend(f"- {note}" for note in report.notes)
+        out.append("")
+    out.append("_Review, then run `/sleep adopt` to apply, or discard this folder._")
+    return "\n".join(out)
+
+
+def run_sleep_cycle(
+    cfg: Optional[SleepConfig] = None,
+    *,
+    seed_tasks: Optional[List[TaskRecord]] = None,
+    dry_run: bool = False,
+    clock: Optional[float] = None,
+) -> CycleOutcome:
+    """Run one full sleep cycle and return the outcome.
+
+    Parameters
+    ----------
+    cfg : SleepConfig; loaded with ``load_config()`` when omitted.
+    seed_tasks : optional pre-built TaskRecords (used by the experiment to
+        inject a known persona instead of harvesting ~/.claude).
+    dry_run : harvest+mine+replay but DO NOT stage/adopt (report only).
+    clock : fixed epoch seconds for deterministic timestamps in tests.
+
+    Returns
+    -------
+    CycleOutcome with the report, the staging directory ("" when nothing was
+    staged), and the adoption result.
+    """
+    cfg = cfg or load_config()
+    state = SleepState.load(cfg.state_path)
+    night = state.begin_night(clock)
+    project = _project_paths(cfg)
+    started = _now_iso(clock)
+
+    # Resolve each flag once, with the same default everywhere, so the
+    # documents that get evolved are exactly the ones that can be staged.
+    evolve_skill = cfg.get("evolve_skill", True)
+    evolve_memory = cfg.get("evolve_memory", True)
+    max_tasks = cfg.get("max_tasks_per_night", 40)
+
+    backend = get_backend(
+        cfg.get("backend", "mock"),
+        model=cfg.get("model", ""),
+    )
+
+    # ── 1+2. harvest + mine (unless seed_tasks injected) ─────────────────
+    digests: List[SessionDigest] = []
+    if seed_tasks is not None:
+        tasks = seed_tasks
+        n_sessions = 0
+    else:
+        since = state.last_harvest_for(project)
+        digests = harvest(
+            cfg.transcripts_dir,
+            scope=cfg.get("projects", "invoked"),
+            invoked_project=cfg.get("invoked_project", ""),
+            since_iso=since,
+            limit=max_tasks * 3,
+        )
+        n_sessions = len(digests)
+        tasks = mine(
+            digests,
+            max_tasks=max_tasks,
+            holdout_fraction=cfg.get("holdout_fraction", 0.34),
+            seed=cfg.get("seed", 42),
+        )
+
+    # ── live skill/memory docs ───────────────────────────────────────────
+    live_memory_path = os.path.join(project, "CLAUDE.md")
+    live_skill_path = cfg.managed_skill_path()
+    skill = _read(live_skill_path)
+    memory = _read(live_memory_path)
+    if not skill:
+        # No managed skill yet (or it is unreadable): start from a scaffold.
+        skill = ensure_skill_scaffold(
+            "", name=cfg.get("managed_skill_name", "skillopt-sleep-learned"),
+            description="Preferences and procedures learned from past Claude Code sessions.",
+        )
+
+    report = SleepReport(
+        night=night, project=project, started_at=started,
+        n_sessions=n_sessions, n_tasks=len(tasks),
+    )
+
+    if not tasks:
+        report.ended_at = _now_iso(clock)
+        report.notes.append("no tasks mined — nothing to consolidate")
+        state.set_last_harvest(project, started)
+        state.record_night({"night": night, "accepted": False, "n_tasks": 0})
+        if not dry_run:
+            state.save()
+        staging_dir = ""
+        return CycleOutcome(report, staging_dir, False, [])
+
+    # ── 3+4. replay + consolidate (gate) ─────────────────────────────────
+    result = consolidate(
+        backend, tasks, skill, memory,
+        edit_budget=cfg.get("edit_budget", 4),
+        gate_metric=cfg.get("gate_metric", "mixed"),
+        gate_mixed_weight=cfg.get("gate_mixed_weight", 0.5),
+        evolve_skill=evolve_skill,
+        evolve_memory=evolve_memory,
+        night=night,
+    )
+
+    report.n_replayed = len(tasks)
+    report.baseline_score = result.baseline_score
+    report.candidate_score = result.candidate_score
+    report.accepted = result.accepted
+    report.gate_action = result.gate_action
+    report.edits = result.applied_edits
+    report.rejected_edits = result.rejected_edits
+    report.tokens_used = backend.tokens_used()
+    report.ended_at = _now_iso(clock)
+
+    # ── 5. stage (unless dry-run) ────────────────────────────────────────
+    staging_dir = ""
+    adopted = False
+    adopted_paths: List[str] = []
+    if not dry_run:
+        report_md = _render_report_md(report, cfg)
+        proposed_skill = result.new_skill if (evolve_skill and result.accepted) else None
+        proposed_memory = result.new_memory if (evolve_memory and result.accepted) else None
+        staging_dir = write_staging(
+            project,
+            report=report,
+            proposed_skill=proposed_skill,
+            proposed_memory=proposed_memory,
+            live_skill_path=live_skill_path,
+            live_memory_path=live_memory_path,
+            report_md=report_md,
+        )
+        state.set_last_harvest(project, started)
+        state.record_night({
+            "night": night, "accepted": result.accepted,
+            "baseline": result.baseline_score, "candidate": result.candidate_score,
+            "n_tasks": len(tasks), "staging": staging_dir,
+        })
+        # ── 6. adopt (opt-in) ────────────────────────────────────────────
+        if cfg.get("auto_adopt") and result.accepted:
+            adopted_paths = adopt_staging(staging_dir)
+            adopted = bool(adopted_paths)
+        state.save()
+
+    return CycleOutcome(report, staging_dir, adopted, adopted_paths)
diff --git a/skillopt/sleep/experiments/__init__.py b/skillopt/sleep/experiments/__init__.py
new file mode 100644
index 0000000..fa657fe
--- /dev/null
+++ b/skillopt/sleep/experiments/__init__.py
@@ -0,0 +1 @@
+"""SkillOpt-Sleep experiments."""
diff --git a/skillopt/sleep/experiments/personas.py b/skillopt/sleep/experiments/personas.py
new file mode 100644
index 0000000..0fdb127
--- /dev/null
+++ b/skillopt/sleep/experiments/personas.py
@@ -0,0 +1,86 @@
+"""SkillOpt-Sleep — persona task fixtures for the validation experiment.
+
+Each persona is a list of TaskRecords with EXACT checkable references and a
+`rule:` tag naming the single skill rule that makes the task solvable
+(consumed by MockBackend). This lets the experiment prove — deterministically,
+with no API — that nightly consolidation lifts a held-out score and that the
+gate blocks regressions.
+
+Personas mirror the user's framing (programmer / researcher / analyst); only
+the researcher and programmer personas are defined in this module.
+"""
+from __future__ import annotations
+
+from typing import List
+
+from skillopt.sleep.types import TaskRecord
+
+
+def _t(i, intent, ref, rule, project="/personas/demo", outcome="fail") -> TaskRecord:
+    """Build one persona fixture task with an exact-match reference.
+
+    ``i`` and ``rule`` are combined into the task id and the session name;
+    ``rule`` also becomes the single ``rule:<rule>`` tag.  Context and
+    attempted solution are left empty.
+    """
+    fields = dict(
+        id=f"persona_{rule}_{i}",
+        project=project,
+        intent=intent,
+        context_excerpt="",
+        attempted_solution="",
+        outcome=outcome,
+        reference_kind="exact",
+        reference=ref,
+        tags=[f"rule:{rule}"],
+        source_sessions=[f"sess_{i}"],
+    )
+    return TaskRecord(**fields)
+
+
+def researcher_persona() -> List[TaskRecord]:
+    """Return the researcher fixture: twelve arXiv-id lookup tasks.
+
+    The researcher wants arXiv ids in a fixed format and wrapped in answer
+    tags, so every task carries both the ``rule:wrap-answer`` and the
+    ``rule:arxiv-id`` tag.
+    """
+    qa_pairs = [
+        ("Give me the arXiv id for the SkillOpt paper", "arXiv:2605.23904"),
+        ("What's the arXiv id of the Attention paper?", "arXiv:1706.03762"),
+        ("arXiv id for the GAN paper?", "arXiv:1406.2661"),
+        ("arXiv id for BERT?", "arXiv:1810.04805"),
+        ("arXiv id for the ResNet paper?", "arXiv:1512.03385"),
+        ("arXiv id for the Adam optimizer paper?", "arXiv:1412.6980"),
+        ("arXiv id for Dropout?", "arXiv:1207.0580"),
+        ("arXiv id for the Transformer-XL paper?", "arXiv:1901.02860"),
+        ("arXiv id for word2vec?", "arXiv:1301.3781"),
+        ("arXiv id for the VAE paper?", "arXiv:1312.6114"),
+        ("arXiv id for batch norm?", "arXiv:1502.03167"),
+        ("arXiv id for GPT-3?", "arXiv:2005.14165"),
+    ]
+    records: List[TaskRecord] = []
+    for idx, (question, arxiv_id) in enumerate(qa_pairs):
+        # The id is built from the "wrap-answer" rule; the tag list is then
+        # replaced so that both rules are required for the task.
+        record = _t(idx, question, arxiv_id, "wrap-answer")
+        record.tags = ["rule:wrap-answer", "rule:arxiv-id"]
+        records.append(record)
+    return records
+
+
+def programmer_persona() -> List[TaskRecord]:
+    """Return the programmer fixture: eight commit-message tasks.
+
+    Each task asks for a commit message and expects an imperative-mood
+    subject; all of them are tagged ``rule:commit-imperative``.
+    """
+    asks_and_subjects = [
+        ("commit message for adding a login form", "Add login form"),
+        ("commit message for fixing the null pointer bug", "Fix null pointer in parser"),
+        ("commit message for updating the README", "Update README"),
+        ("commit message for removing dead code", "Remove dead code"),
+        ("commit message for bumping the version", "Bump version to 1.2.0"),
+        ("commit message for refactoring the auth module", "Refactor auth module"),
+        ("commit message for adding tests", "Add unit tests for scheduler"),
+        ("commit message for fixing the CI pipeline", "Fix CI pipeline"),
+    ]
+    records: List[TaskRecord] = []
+    for idx, (ask, subject) in enumerate(asks_and_subjects):
+        records.append(_t(idx, ask, subject, "commit-imperative"))
+    return records
+
+
+def harmful_edit_task() -> TaskRecord:
+    """Return the fixture task used to probe the gate with a known-bad rule.
+
+    The task is tagged ``rule:__harmful__``.  Its reference is set to a
+    string the harmful rule is not meant to be able to produce, so applying
+    that rule should not raise the held-out score and the gate should
+    reject the edit.
+    """
+    unreachable_reference = "an-answer-that-the-harmful-rule-cannot-produce"
+    task = _t(99, "answer this freely", "THIS_WILL_NOT_MATCH", "__harmful__")
+    task.reference = unreachable_reference
+    return task
+
+
+# Registry mapping a persona name to the zero-argument factory that builds
+# its list of fixture tasks.
+PERSONAS = {
+    "researcher": researcher_persona,
+    "programmer": programmer_persona,
+}
diff --git a/skillopt/sleep/experiments/run_experiment.py b/skillopt/sleep/experiments/run_experiment.py
new file mode 100644
index 0000000..931138b
--- /dev/null
+++ b/skillopt/sleep/experiments/run_experiment.py
@@ -0,0 +1,157 @@
+"""SkillOpt-Sleep — validation experiment.
+
+Answers the question the user posed: *does nightly offline self-evolution
+actually improve the agent?* Runs deterministically with the MockBackend
+(no API key, reproducible) and is the acceptance test for the whole idea.
+
+What it proves:
+ 1. MONOTONIC LIFT — over N sleep nights, the held-out score rises from a
+ baseline (empty skill/memory) toward 1.0 as the gate accepts the
+ general rules the persona's tasks require.
+ 2. GATE SAFETY — an injected harmful edit is REJECTED (held-out score does
+ not improve), so a bad nightly proposal can never be adopted.
+ 3. PLUMBING — harvest->mine->replay->consolidate->stage->adopt all run and
+ the adopted artifact, re-scored, retains the lift.
+
+Run:
+ python -m skillopt.sleep.experiments.run_experiment
+ python -m skillopt.sleep.experiments.run_experiment --persona programmer --nights 3
+ python -m skillopt.sleep.experiments.run_experiment --backend anthropic # real lift
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+import tempfile
+from typing import List
+
+from skillopt.sleep.backend import get_backend
+from skillopt.sleep.consolidate import consolidate
+from skillopt.sleep.experiments.personas import (
+ PERSONAS,
+ harmful_edit_task,
+ researcher_persona,
+)
+from skillopt.sleep.memory import ensure_skill_scaffold
+from skillopt.sleep.replay import aggregate_scores, replay_batch
+from skillopt.sleep.types import TaskRecord
+
+
+def _score_holdout(backend, tasks: List[TaskRecord], skill: str, memory: str,
+                   metric: str = "mixed", w: float = 0.5) -> float:
+    """Replay the held-out tasks under (skill, memory) and return the gate score.
+
+    If no task is marked ``split == "holdout"`` the whole task list is
+    scored instead.  ``metric`` and ``w`` are passed through to
+    ``select_gate_score`` together with the aggregated hard/soft scores.
+    """
+    from skillopt.sleep.consolidate import select_gate_score
+
+    selected = [task for task in tasks if task.split == "holdout"]
+    if not selected:
+        selected = tasks
+    hard_mean, soft_mean = aggregate_scores(
+        replay_batch(backend, selected, skill, memory)
+    )
+    return select_gate_score(hard_mean, soft_mean, metric, w)
+
+
+def run(persona: str = "researcher", nights: int = 4, backend_name: str = "mock",
+        edit_budget: int = 4, seed: int = 42) -> dict:
+    """Run the validation experiment and return a JSON-serialisable summary.
+
+    Steps:
+      1. Build the persona's tasks (unknown names fall back to the
+         researcher persona) and split them into replay/holdout.
+      2. Score the held-out split with an empty skill and memory (baseline).
+      3. Run up to ``nights`` consolidation passes, keeping the new skill and
+         memory whenever the gate accepts; stop early once the candidate
+         score reaches 0.999.
+      4. Re-score the held-out split with the final skill and memory.
+      5. Gate-safety probe: consolidate a small task set containing the
+         harmful fixture and check that the harmful rule text is absent from
+         the resulting skill.
+
+    The returned dict holds the persona, backend name, number of nights run,
+    baseline/after held-out scores, the lift, the gate-safety flag, the last
+    400 characters of the final skill, and the per-night trace.
+    """
+    from skillopt.sleep.mine import assign_splits
+
+    make = PERSONAS.get(persona, researcher_persona)
+    tasks = assign_splits(make(), holdout_fraction=0.34, seed=seed)
+    backend = get_backend(backend_name)
+
+    # Start from an empty managed skill and an empty memory.
+    skill = ensure_skill_scaffold("", name="skillopt-sleep-learned",
+                                  description="Learned preferences.")
+    memory = ""
+
+    baseline = _score_holdout(backend, tasks, skill, memory)
+    trace = [{"night": 0, "holdout_score": round(baseline, 4), "action": "baseline",
+              "n_edits": 0}]
+
+    for night in range(1, nights + 1):
+        res = consolidate(
+            backend, tasks, skill, memory,
+            edit_budget=edit_budget, gate_metric="mixed", gate_mixed_weight=0.5,
+            evolve_skill=True, evolve_memory=True, night=night,
+        )
+        if res.accepted:
+            skill, memory = res.new_skill, res.new_memory
+        trace.append({
+            "night": night,
+            "holdout_score": round(res.candidate_score, 4),
+            "action": res.gate_action,
+            "accepted": res.accepted,
+            "n_edits": len(res.applied_edits),
+            "edits": [e.content for e in res.applied_edits],
+            "n_rejected": len(res.rejected_edits),
+        })
+        # Converged: nothing left to learn once the score is (nearly) perfect.
+        if res.candidate_score >= 0.999:
+            break
+
+    after = _score_holdout(backend, tasks, skill, memory)
+
+    # ── gate-safety probe: inject a harmful task whose 'fix' is a bad rule ──
+    # The earlier unused pre-probe replay ("h_before") was dropped: its value
+    # was never read and it cost one extra held-out replay per run.
+    harmful_tasks = assign_splits([harmful_edit_task()] + make()[:3],
+                                  holdout_fraction=0.5, seed=seed)
+    res_h = consolidate(backend, harmful_tasks, skill, memory,
+                        edit_budget=edit_budget, gate_metric="mixed",
+                        evolve_skill=True, evolve_memory=False, night=nights + 1)
+    # The harmful rule text is always looked up on the mock backend, whichever
+    # backend runs the experiment.
+    harmful_rule_text = get_backend("mock").RULE_TEXT["__harmful__"]  # type: ignore[attr-defined]
+    harmful_rejected = harmful_rule_text not in res_h.new_skill
+
+    return {
+        "persona": persona,
+        "backend": backend_name,
+        "nights_run": len(trace) - 1,
+        "baseline_holdout": round(baseline, 4),
+        "after_holdout": round(after, 4),
+        "lift": round(after - baseline, 4),
+        "improved": after > baseline,
+        "gate_blocks_harmful": bool(harmful_rejected),
+        "final_skill_excerpt": skill[-400:],
+        "trace": trace,
+    }
+
+
+def _assert(cond: bool, msg: str) -> None:
+    """Exit the process with status 1 after printing ``FAIL: <msg>`` unless *cond* holds."""
+    if cond:
+        return
+    print(f"FAIL: {msg}")
+    raise SystemExit(1)
+
+
+def main(argv=None) -> int:
+    """Command-line entry point for the validation experiment.
+
+    Parses the options, runs the experiment, prints the result either as
+    JSON (``--json``) or as a short text report, and, with
+    ``--assert-improves``, exits non-zero unless the held-out score improved
+    and the gate blocked the harmful edit.  Returns 0 otherwise.
+    """
+    parser = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment")
+    parser.add_argument("--persona", default="researcher", choices=list(PERSONAS))
+    parser.add_argument("--nights", type=int, default=4)
+    parser.add_argument("--backend", default="mock", choices=["mock", "anthropic"])
+    parser.add_argument("--edit-budget", type=int, default=4)
+    parser.add_argument("--json", action="store_true")
+    parser.add_argument("--assert-improves", action="store_true",
+                        help="exit nonzero unless lift>0 and gate blocks harmful edit")
+    opts = parser.parse_args(argv)
+
+    summary = run(opts.persona, nights=opts.nights, backend_name=opts.backend,
+                  edit_budget=opts.edit_budget)
+
+    if not opts.json:
+        header_lines = [
+            f"=== SkillOpt-Sleep experiment: persona={summary['persona']} backend={summary['backend']} ===",
+            f"baseline held-out : {summary['baseline_holdout']}",
+            f"after held-out : {summary['after_holdout']} (lift {summary['lift']:+.4f})",
+            f"gate blocks harmful edit: {summary['gate_blocks_harmful']}",
+            "trace:",
+        ]
+        for text in header_lines:
+            print(text)
+        for entry in summary["trace"]:
+            # Edit texts are joined and clipped to 80 characters per night.
+            edit_preview = "; ".join(entry.get("edits", []))[:80]
+            print(f" night {entry['night']}: holdout={entry['holdout_score']} "
+                  f"{entry['action']} (+{entry['n_edits']} edits) {edit_preview}")
+    else:
+        print(json.dumps(summary, ensure_ascii=False, indent=2))
+
+    if opts.assert_improves:
+        _assert(summary["improved"], "held-out score did not improve")
+        _assert(summary["gate_blocks_harmful"], "gate failed to block harmful edit")
+        print("\nPASS: nightly consolidation improves held-out score AND gate blocks regressions.")
+    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/skillopt/sleep/harvest.py b/skillopt/sleep/harvest.py
new file mode 100644
index 0000000..013483f
--- /dev/null
+++ b/skillopt/sleep/harvest.py
@@ -0,0 +1,236 @@
+"""SkillOpt-Sleep — Stage 1: harvest.
+
+Read the user's local Claude Code records (read-only) and normalize them
+into :class:`SessionDigest` objects.
+
+Sources (verified schema):
+  * ~/.claude/history.jsonl — one JSON object per line:
+        {"display": <prompt text>, "pastedContents": {...},
+         "timestamp": <timestamp>, "project": <project path>}
+  * ~/.claude/projects/<project-dir>/<session-id>.jsonl — one record per line;
+    the records we care about have type "user"/"assistant" and carry:
+        message{role, content}, cwd, gitBranch, timestamp, sessionId, version
+
+This module performs NO writes and NO network calls.
+"""
+from __future__ import annotations
+
+import json
+import os
+from typing import Any, Dict, Iterable, List, Optional
+
+from skillopt.sleep.types import SessionDigest
+
+
+# Heuristic phrases that signal the user (dis)approving of prior output.
+# All ASCII phrases are lowercase because _detect_feedback lowercases the
+# user text before looking for them.  Both tuples mix English and Chinese
+# phrases.
+_NEGATIVE_FEEDBACK = (
+    "still broken", "still not", "still wrong", "doesn't work", "does not work",
+    "not working", "that's wrong", "thats wrong", "incorrect", "wrong",
+    "no,", "nope", "fix it", "didn't", "did not", "broken", "error again",
+    "still failing", "still fails", "not fixed", "revert", "undo",
+    "不对", "还是不对", "还是不行", "不行", "错了", "有问题", "没修好",
+)
+# Phrases taken as approval of the previous output.
+_POSITIVE_FEEDBACK = (
+    "thanks", "thank you", "perfect", "great", "works now", "fixed",
+    "that works", "lgtm", "looks good", "nice", "awesome", "correct",
+    "完美", "可以了", "好的", "搞定", "对了", "正确", "谢谢",
+)
+
+
+def _iter_jsonl(path: str) -> Iterable[Dict[str, Any]]:
+    """Yield every JSON object stored on its own line of *path*.
+
+    This is a best-effort reader for transcript files:
+
+    * blank lines and lines that are not valid JSON are skipped;
+    * JSON values that are not objects (lists, strings, numbers) are skipped,
+      because callers treat every record as a dict;
+    * bytes that are not valid UTF-8 are replaced instead of aborting the
+      whole file;
+    * a missing or unreadable file yields nothing.
+    """
+    try:
+        with open(path, encoding="utf-8", errors="replace") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    record = json.loads(line)
+                except (ValueError, RecursionError):
+                    # json.JSONDecodeError is a ValueError; pathologically
+                    # nested input can raise RecursionError.
+                    continue
+                if isinstance(record, dict):
+                    yield record
+    except OSError:
+        return
+
+
+def _text_from_content(content: Any) -> str:
+    """Return the plain text of a ``message.content`` value.
+
+    A string is returned unchanged.  For a list, the ``text`` of every dict
+    block whose ``type`` is ``"text"`` (and whose text is non-empty) is
+    joined with newlines.  Anything else gives an empty string.
+    """
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        return ""
+    return "\n".join(
+        str(block["text"])
+        for block in content
+        if isinstance(block, dict)
+        and block.get("type") == "text"
+        and block.get("text")
+    )
+
+
+def _tool_names_from_content(content: Any) -> List[str]:
+    """Return the names of all ``tool_use`` blocks in a content list, in order.
+
+    Non-list content, non-dict blocks and blocks without a name are ignored.
+    """
+    if not isinstance(content, list):
+        return []
+    return [
+        str(block["name"])
+        for block in content
+        if isinstance(block, dict)
+        and block.get("type") == "tool_use"
+        and block.get("name")
+    ]
+
+
+def _detect_feedback(text: str) -> List[str]:
+    """Return the feedback signals found in *text*.
+
+    The text is lowercased and checked against every phrase in
+    ``_NEGATIVE_FEEDBACK`` and then ``_POSITIVE_FEEDBACK``.  Each hit adds
+    ``"neg:<phrase>"`` or ``"pos:<phrase>"`` to the result, negatives first,
+    in tuple order.
+
+    ASCII phrases must match on word boundaries, so "correct" is not found
+    inside "incorrect" and "undo" is not found inside "undocumented".
+    Phrases containing non-ASCII characters (the Chinese entries) are matched
+    as plain substrings, because that text has no word separators.
+    """
+    import re  # local import: this module's header does not import re
+
+    low = text.lower()
+
+    def _present(phrase: str) -> bool:
+        if not phrase or not phrase.isascii():
+            return phrase in low
+        # Anchor only the edges that are word characters, so a phrase such
+        # as "no," still matches when followed directly by other text.
+        head = r"\b" if phrase[0].isalnum() else ""
+        tail = r"\b" if phrase[-1].isalnum() else ""
+        pattern = head + re.escape(phrase) + tail
+        # re.ASCII: an adjacent CJK character counts as a boundary.
+        return re.search(pattern, low, flags=re.ASCII) is not None
+
+    signals = ["neg:" + phrase for phrase in _NEGATIVE_FEEDBACK if _present(phrase)]
+    signals.extend("pos:" + phrase for phrase in _POSITIVE_FEEDBACK if _present(phrase))
+    return signals
+
+
+def _is_meta_prompt(text: str) -> bool:
+    """Tell whether *text* is noise rather than a real user request.
+
+    Treated as noise: empty text, text fully enclosed in ``<`` … ``>``,
+    short slash-commands (a leading ``/`` and at most three words), and text
+    starting with ``[Pasted text`` or ``Caveat:``.
+    """
+    stripped = text.strip()
+    if not stripped:
+        return True
+    is_tag_wrapped = stripped.startswith("<") and stripped.endswith(">")
+    is_short_slash_command = stripped.startswith("/") and len(stripped.split()) <= 3
+    is_boilerplate = stripped.startswith(("[Pasted text", "Caveat:"))
+    return is_tag_wrapped or is_short_slash_command or is_boilerplate
+
+
+def digest_transcript(path: str) -> Optional[SessionDigest]:
+    """Summarise one ``<session-id>.jsonl`` transcript as a SessionDigest.
+
+    The session id is the file name without its extension.  Records are read
+    in file order; the digest keeps the first ``cwd`` and ``gitBranch`` seen,
+    the first and last string timestamps, every non-meta user prompt, the
+    last five non-empty assistant texts, the de-duplicated tool names and
+    file-snapshot keys, and the feedback signals found in user prompts.
+
+    Returns ``None`` when the file has neither a counted user prompt nor an
+    assistant message.
+    """
+    session_id = os.path.splitext(os.path.basename(path))[0]
+    project = git_branch = ""
+    first_ts = last_ts = ""
+    prompts: List[str] = []
+    finals: List[str] = []
+    tool_names: List[str] = []
+    touched: List[str] = []
+    signals: List[str] = []
+    user_turns = assistant_turns = 0
+
+    for record in _iter_jsonl(path):
+        stamp = record.get("timestamp")
+        if isinstance(stamp, str) and stamp:
+            first_ts = first_ts or stamp
+            last_ts = stamp
+
+        if not project and record.get("cwd"):
+            project = str(record.get("cwd"))
+        if not git_branch and record.get("gitBranch"):
+            git_branch = str(record.get("gitBranch"))
+
+        if record.get("type") == "file-history-snapshot":
+            snapshot = record.get("snapshot") or record.get("files") or {}
+            if isinstance(snapshot, dict):
+                # At most the first 20 keys of each snapshot are recorded.
+                touched.extend(str(key) for key in list(snapshot)[:20])
+
+        message = record.get("message")
+        if not isinstance(message, dict):
+            continue
+        role = message.get("role")
+        body = message.get("content")
+
+        if role == "user":
+            prompt = _text_from_content(body)
+            if prompt and not _is_meta_prompt(prompt):
+                user_turns += 1
+                prompts.append(prompt.strip())
+                signals.extend(_detect_feedback(prompt))
+        elif role == "assistant":
+            assistant_turns += 1
+            tool_names.extend(_tool_names_from_content(body))
+            reply = _text_from_content(body).strip()
+            if reply:
+                finals.append(reply)
+
+    if user_turns == 0 and assistant_turns == 0:
+        return None
+
+    # dict.fromkeys keeps the first occurrence of each item, in order.
+    unique_tools = list(dict.fromkeys(tool_names))
+    unique_files = list(dict.fromkeys(touched))
+
+    return SessionDigest(
+        session_id=session_id,
+        project=project,
+        git_branch=git_branch,
+        started_at=first_ts,
+        ended_at=last_ts,
+        user_prompts=prompts,
+        assistant_finals=finals[-5:],  # only the last few finals are kept
+        tools_used=unique_tools,
+        files_touched=unique_files,
+        feedback_signals=signals,
+        n_user_turns=user_turns,
+        n_assistant_turns=assistant_turns,
+        raw_path=path,
+    )
+
+
+def _project_matches(project: str, scope: Any, invoked: str) -> bool:
+    """Decide whether a transcript's *project* directory is inside *scope*.
+
+    * ``scope == "all"`` matches everything.
+    * A list/tuple scope matches when *project* equals one of its paths.
+    * Any other scope is treated as "invoked": *project* matches when it
+      equals *invoked*, lies below it, or is one of its parent directories.
+      An empty *invoked* matches everything.
+
+    Paths are compared after ``os.path.abspath``.  An empty *project* never
+    matches a path-based scope; without this guard ``abspath("")`` would
+    resolve to the current working directory and could match by accident.
+    """
+    if scope == "all":
+        return True
+
+    def _contains(parent: str, child: str) -> bool:
+        # Append a separator unless the parent already ends with one (the
+        # filesystem root does), so "/a" does not contain "/ab".
+        prefix = parent if parent.endswith(os.sep) else parent + os.sep
+        return child.startswith(prefix)
+
+    if isinstance(scope, (list, tuple)):
+        if not project:
+            return False
+        target = os.path.abspath(project)
+        return any(target == os.path.abspath(p) for p in scope)
+
+    # "invoked": match the invoked project, a subdirectory or a parent of it.
+    if not invoked:
+        return True
+    if not project:
+        return False
+    a = os.path.abspath(project)
+    b = os.path.abspath(invoked)
+    return a == b or _contains(b, a) or _contains(a, b)
+
+
+def harvest(
+    transcripts_dir: str,
+    *,
+    scope: Any = "all",
+    invoked_project: str = "",
+    since_iso: Optional[str] = None,
+    limit: int = 0,
+) -> List[SessionDigest]:
+    """Collect session digests from every ``.jsonl`` file under *transcripts_dir*.
+
+    Parameters
+    ----------
+    transcripts_dir : str      directory to walk (e.g. ~/.claude/projects)
+    scope : "all" | "invoked" | list[path]   which projects to keep
+    invoked_project : str      project path used when scope is "invoked"
+    since_iso : str|None       digests whose ``ended_at`` sorts before this
+                               string are dropped; digests with no
+                               ``ended_at`` are kept
+    limit : int                stop after this many digests (0 = no cap)
+
+    Transcripts are visited newest first by file modification time.  A
+    missing directory gives an empty list.
+    """
+    if not os.path.isdir(transcripts_dir):
+        return []
+
+    transcript_paths = [
+        os.path.join(dirpath, filename)
+        for dirpath, _subdirs, filenames in os.walk(transcripts_dir)
+        for filename in filenames
+        if filename.endswith(".jsonl")
+    ]
+    newest_first = sorted(transcript_paths, key=os.path.getmtime, reverse=True)
+
+    kept: List[SessionDigest] = []
+    for transcript in newest_first:
+        digest = digest_transcript(transcript)
+        if digest is None:
+            continue
+        if not _project_matches(digest.project or "", scope, invoked_project):
+            continue
+        # Timestamps are compared as strings.
+        if since_iso and digest.ended_at and digest.ended_at < since_iso:
+            continue
+        kept.append(digest)
+        if limit and len(kept) >= limit:
+            break
+    return kept
diff --git a/skillopt/sleep/memory.py b/skillopt/sleep/memory.py
new file mode 100644
index 0000000..2f7ddbb
--- /dev/null
+++ b/skillopt/sleep/memory.py
@@ -0,0 +1,130 @@
+"""SkillOpt-Sleep — skill/memory document manipulation.
+
+Applies bounded EditRecords to a skill (SKILL.md body) or memory (CLAUDE.md)
+document, and provides Dream-style consolidation helpers (dedup near-identical
+lines, drop contradictions). All edits live inside a protected, clearly-marked
+region so the sleep cycle never clobbers the user's hand-written content.
+"""
+from __future__ import annotations
+
+import re
+from typing import List, Tuple
+
+from skillopt.sleep.types import EditRecord
+
+
+# Delimiters of the protected "learned" region inside a skill/memory document.
+# They must be non-empty and distinct: with empty markers str.find() always
+# returns 0, extract_learned() can never find any content and
+# _strip_learned() never terminates.
+# NOTE(review): the markers are HTML comments so they stay invisible in
+# rendered Markdown; the exact marker text is reconstructed — confirm it
+# against documents already written by earlier versions.
+LEARNED_START = "<!-- SKILLOPT-SLEEP:LEARNED:START -->"
+LEARNED_END = "<!-- SKILLOPT-SLEEP:LEARNED:END -->"
+# Notice placed at the top of the learned region by set_learned().
+_BANNER = (
+    "_This block is maintained by SkillOpt-Sleep. Edits here are proposed "
+    "offline, validated against your past tasks, and adopted only after you "
+    "approve them. Hand-edits outside this block are never touched._"
+)
+
+
+def extract_learned(doc: str) -> str:
+    """Return the stripped text between the first learned-region markers.
+
+    The end marker is searched only after the start marker, the same way
+    ``_strip_learned`` pairs them, so a stray end marker earlier in the
+    document cannot hide the region.  Returns ``""`` when either marker is
+    missing.
+    """
+    start = doc.find(LEARNED_START)
+    if start == -1:
+        return ""
+    body_start = start + len(LEARNED_START)
+    end = doc.find(LEARNED_END, body_start)
+    if end == -1:
+        return ""
+    return doc[body_start:end].strip()
+
+
+def _strip_learned(doc: str) -> str:
+    """Return *doc* with every learned region removed.
+
+    Each start marker is paired with the next end marker after it; a start
+    marker with no end marker removes everything from that point on.  Runs
+    of three or more newlines are then collapsed to two and trailing
+    whitespace is dropped.
+    """
+    start = doc.find(LEARNED_START)
+    while start != -1:
+        end = doc.find(LEARNED_END, start)
+        if end == -1:
+            # Unterminated region: drop the rest of the document.
+            doc = doc[:start]
+            break
+        doc = doc[:start] + doc[end + len(LEARNED_END):]
+        start = doc.find(LEARNED_START)
+    doc = re.sub(r"\n{3,}", "\n\n", doc)
+    return doc.rstrip()
+
+
+def set_learned(doc: str, learned_lines: List[str]) -> str:
+    """Rebuild *doc* with its learned region replaced by *learned_lines*.
+
+    Existing learned regions are removed first.  Blank entries are skipped;
+    every other entry is stripped, loses any leading ``-``/space characters
+    and is written as one ``- `` bullet.  The new region (markers, heading,
+    banner, bullets) is appended after the remaining document text.
+    """
+    bullets = []
+    for raw in learned_lines:
+        if not raw.strip():
+            continue
+        bullets.append(f"- {raw.strip().lstrip('- ').strip()}")
+    body = "\n".join(bullets)
+
+    region = (
+        f"\n\n{LEARNED_START}\n"
+        f"## Learned preferences & procedures\n\n{_BANNER}\n\n{body}\n"
+        f"{LEARNED_END}\n"
+    )
+    rebuilt = _strip_learned(doc) + region
+    return rebuilt.lstrip("\n")
+
+
+def current_learned_lines(doc: str) -> List[str]:
+    """Return the bullet texts currently stored in the learned region.
+
+    Only lines that start with ``- `` (after stripping) count; the bullet
+    prefix is removed and the remainder stripped.
+    """
+    stripped_lines = (raw.strip() for raw in extract_learned(doc).splitlines())
+    return [item[2:].strip() for item in stripped_lines if item.startswith("- ")]
+
+
+def _norm(s: str) -> str:
+    """Normalise text for comparison: lowercase, collapse whitespace, strip.
+
+    ``None`` and the empty string both normalise to ``""``.
+    """
+    lowered = s.lower() if s else ""
+    collapsed = re.sub(r"\s+", " ", lowered)
+    return collapsed.strip()
+
+
+def apply_edits(doc: str, edits: List[EditRecord]) -> Tuple[str, List[EditRecord]]:
+    """Apply add/delete/replace edits to the protected learned region.
+
+    Returns ``(new_doc, applied_edits)``; edits that change nothing are left
+    out of ``applied_edits``.
+
+    * ``add``     appends the stripped content unless it is empty or already
+                  present (compared after normalisation).
+    * ``delete``  removes every line containing the normalised anchor (the
+                  content is used when no anchor is given).
+    * ``replace`` swaps every line containing the normalised anchor for the
+                  stripped content.
+
+    An edit without an ``op`` is treated as ``add``; unknown ops are ignored.
+    A delete or replace whose anchor normalises to the empty string is
+    skipped: the empty string is contained in every line, so it would
+    otherwise wipe the whole region.
+    """
+    lines = current_learned_lines(doc)
+    norm_set = {_norm(l) for l in lines}
+    applied: List[EditRecord] = []
+
+    for e in edits:
+        op = (e.op or "add").lower()
+        content = (e.content or "").strip()  # tolerate a missing content
+
+        if op == "add":
+            key = _norm(content)
+            if not content or key in norm_set:
+                continue
+            lines.append(content)
+            norm_set.add(key)
+            applied.append(e)
+
+        elif op == "delete":
+            anchor = _norm(e.anchor or e.content)
+            if not anchor:
+                continue
+            keep = [l for l in lines if anchor not in _norm(l)]
+            if len(keep) != len(lines):
+                lines = keep
+                norm_set = {_norm(l) for l in lines}
+                applied.append(e)
+
+        elif op == "replace":
+            anchor = _norm(e.anchor)
+            if not anchor:
+                continue
+            hits = [anchor in _norm(l) for l in lines]
+            if any(hits):
+                lines = [content if hit else l for l, hit in zip(lines, hits)]
+                norm_set = {_norm(l) for l in lines}
+                applied.append(e)
+
+    return set_learned(doc, lines), applied
+
+
+def ensure_skill_scaffold(doc: str, *, name: str, description: str) -> str:
+    """Return *doc* with YAML frontmatter and a title prepended if it has none.
+
+    A document whose first non-blank text starts with ``---`` is assumed to
+    carry frontmatter already and is returned unchanged.
+    """
+    if doc.lstrip().startswith("---"):
+        return doc
+    header_parts = [
+        "---\n",
+        f"name: {name}\n",
+        f"description: {description}\n",
+        "---\n\n",
+        f"# {name}\n\n",
+        "Preferences and procedures learned from your past Claude Code sessions.\n",
+    ]
+    return "".join(header_parts) + doc
diff --git a/skillopt/sleep/mine.py b/skillopt/sleep/mine.py
new file mode 100644
index 0000000..fcdfc43
--- /dev/null
+++ b/skillopt/sleep/mine.py
@@ -0,0 +1,168 @@
+"""SkillOpt-Sleep — Stage 2: mine.
+
+Turn :class:`SessionDigest` objects into :class:`TaskRecord` training units.
+
+Two miners:
+ * heuristic_mine — deterministic, no API. Detects retry chains (a prompt
+ re-asked after negative feedback => the early attempt failed), extracts
+ the user's recurring intents, and labels outcomes from feedback signals.
+ * llm_mine — optional; uses an optimizer backend to produce richer
+ TaskRecords with checkable references. Falls back to heuristic on error.
+
+The heuristic miner is what makes the whole cycle runnable offline and is the
+basis of the deterministic experiment.
+"""
+from __future__ import annotations
+
+import hashlib
+import re
+from typing import Any, Callable, List, Optional
+
+from skillopt.sleep.types import SessionDigest, TaskRecord
+
+
+def _tid(project: str, intent: str) -> str:
+    """Return a stable task id derived from the project path and the intent.
+
+    The id is ``task_`` followed by the first 12 hex digits of the SHA-256
+    of ``"<project>::<intent>"``.
+    """
+    key = "::".join((project, intent))
+    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
+    return f"task_{digest[:12]}"
+
+
+def _short(text: str, n: int = 600) -> str:
+    """Strip *text* and clip it to *n* characters, marking a cut with `` …``."""
+    cleaned = (text or "").strip()
+    if len(cleaned) > n:
+        return cleaned[:n] + " …"
+    return cleaned
+
+
+def _looks_negative(signals: List[str]) -> bool:
+    """Return True if at least one signal carries the ``neg:`` prefix."""
+    for signal in signals:
+        if signal.startswith("neg:"):
+            return True
+    return False
+
+
+def _looks_positive(signals: List[str]) -> bool:
+    """Return True if at least one signal carries the ``pos:`` prefix."""
+    for signal in signals:
+        if signal.startswith("pos:"):
+            return True
+    return False
+
+
+def heuristic_mine(
+    digests: List[SessionDigest],
+    *,
+    max_tasks: int = 40,
+) -> List[TaskRecord]:
+    """Deterministic miner — no API calls.
+
+    Each digest with at least one user prompt yields one TaskRecord whose
+    intent is the FIRST prompt (the original ask); digests whose first
+    prompt is shorter than 8 characters are skipped.
+
+    Outcome, in order of precedence:
+      - any negative feedback signal           -> "fail"
+      - otherwise any positive feedback signal -> "success"
+      - otherwise three or more user turns     -> "mixed"
+      - otherwise                              -> "unknown"
+
+    ``attempted_solution`` is the last assistant text; ``context_excerpt``
+    lists up to three follow-up prompts.  ``reference_kind`` is always
+    "none" here — exact references come from the experiment data or an LLM
+    miner.  Mining stops once ``max_tasks`` records have been produced.
+    """
+    mined: List[TaskRecord] = []
+    for digest in digests:
+        prompts = digest.user_prompts
+        if not prompts:
+            continue
+        first_ask = prompts[0]
+        if len(first_ask.strip()) < 8:
+            continue
+
+        saw_negative = _looks_negative(digest.feedback_signals)
+        saw_positive = _looks_positive(digest.feedback_signals)
+        if saw_negative:
+            outcome = "fail"
+        elif saw_positive:
+            outcome = "success"
+        elif digest.n_user_turns >= 3:
+            outcome = "mixed"
+        else:
+            outcome = "unknown"
+
+        last_reply = digest.assistant_finals[-1] if digest.assistant_finals else ""
+
+        follow_up = ""
+        if len(prompts) > 1:
+            # Later prompts often carry the corrective detail / real constraints.
+            bullet_items = [_short(p, 200) for p in prompts[1:4]]
+            follow_up = (
+                "Follow-up constraints from the same session:\n- "
+                + "\n- ".join(bullet_items)
+            )
+
+        labels = []
+        if digest.tools_used:
+            labels.append("tools:" + "+".join(digest.tools_used[:4]))
+        if digest.git_branch:
+            labels.append("branch:" + digest.git_branch)
+
+        record = TaskRecord(
+            id=_tid(digest.project, first_ask),
+            project=digest.project,
+            intent=_short(first_ask, 800),
+            context_excerpt=_short(follow_up, 600),
+            attempted_solution=_short(last_reply, 600),
+            outcome=outcome,
+            reference_kind="none",
+            reference="",
+            tags=labels,
+            source_sessions=[digest.session_id],
+        )
+        mined.append(record)
+        if len(mined) >= max_tasks:
+            break
+    return mined
+
+
+def dedup_tasks(tasks: List[TaskRecord]) -> List[TaskRecord]:
+    """Merge tasks that share an id (same project and intent across sessions).
+
+    The first task seen for an id is kept and updated in place: its
+    ``source_sessions`` gain the later task's sessions (no duplicates, order
+    preserved) and its outcome is upgraded when the later one ranks higher
+    (success > fail > mixed > unknown).
+    """
+    rank = {"success": 3, "fail": 2, "mixed": 1, "unknown": 0}
+    merged: dict = {}
+    for task in tasks:
+        kept = merged.get(task.id)
+        if kept is None:
+            merged[task.id] = task
+            continue
+        kept.source_sessions = list(
+            dict.fromkeys(kept.source_sessions + task.source_sessions)
+        )
+        # Prefer the more resolved outcome if either session resolved it.
+        if rank.get(task.outcome, 0) > rank.get(kept.outcome, 0):
+            kept.outcome = task.outcome
+    return list(merged.values())
+
+
+def assign_splits(
+    tasks: List[TaskRecord],
+    *,
+    holdout_fraction: float = 0.34,
+    seed: int = 42,
+) -> List[TaskRecord]:
+    """Deterministically split tasks into replay (train) / holdout (test).
+
+    Each task is hashed (SHA-256 of ``str(seed) + task.id``) into one of 100
+    buckets; buckets below ``round(holdout_fraction * 100)`` become
+    "holdout", the rest "replay".  The same seed and id therefore always
+    give the same split.
+
+    The threshold is rounded rather than truncated: ``0.29 * 100`` is
+    ``28.999999999999996`` in floating point and ``int()`` would turn it
+    into 28.
+
+    With two or more tasks both splits are made non-empty by moving the last
+    task to holdout or the first task to replay when needed.  Tasks are
+    modified in place and the same list is returned.
+    """
+    threshold = round(holdout_fraction * 100)
+    for t in tasks:
+        bucket = int(hashlib.sha256((str(seed) + t.id).encode()).hexdigest(), 16) % 100
+        t.split = "holdout" if bucket < threshold else "replay"
+
+    # Guarantee both splits are non-empty when possible.
+    splits = {t.split for t in tasks}
+    if len(tasks) >= 2 and "holdout" not in splits:
+        tasks[-1].split = "holdout"
+    if len(tasks) >= 2 and "replay" not in splits:
+        tasks[0].split = "replay"
+    return tasks
+
+
+def mine(
+    digests: List[SessionDigest],
+    *,
+    max_tasks: int = 40,
+    holdout_fraction: float = 0.34,
+    seed: int = 42,
+    llm_miner: Optional[Callable[[List[SessionDigest]], List[TaskRecord]]] = None,
+) -> List[TaskRecord]:
+    """Top-level miner: digests in, de-duplicated and split TaskRecords out.
+
+    ``llm_miner`` is tried first when given.  If it raises or returns
+    nothing, the heuristic miner is used instead (best-effort fallback).
+    The result is merged by task id and assigned replay/holdout splits.
+    """
+    mined: List[TaskRecord] = []
+    if llm_miner is not None:
+        try:
+            mined = llm_miner(digests) or []
+        except Exception:
+            # Deliberate best-effort: any miner failure falls back below.
+            mined = []
+    if not mined:
+        mined = heuristic_mine(digests, max_tasks=max_tasks)
+    unique = dedup_tasks(mined)
+    return assign_splits(unique, holdout_fraction=holdout_fraction, seed=seed)
diff --git a/skillopt/sleep/replay.py b/skillopt/sleep/replay.py
new file mode 100644
index 0000000..de3d598
--- /dev/null
+++ b/skillopt/sleep/replay.py
@@ -0,0 +1,46 @@
+"""SkillOpt-Sleep — Stage 3: replay.
+
+Re-run mined TaskRecords offline under a given (skill, memory) and score
+them, producing the (hard, soft) signal SkillOpt's gate consumes.
+
+For Phase 1 the replay is "mock mode": a sandboxed single-shot attempt via
+the chosen backend (MockBackend = deterministic; AnthropicBackend = real).
+"fresh" worktree replay is Phase 3 and is intentionally not wired here.
+"""
+from __future__ import annotations
+
+from typing import List, Tuple
+
+from skillopt.sleep.backend import Backend
+from skillopt.sleep.types import ReplayResult, TaskRecord
+
+
+def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> ReplayResult:
+    """Attempt one task with the backend under (skill, memory) and judge it.
+
+    The hard and soft scores are stored as floats.  ``fail_reason`` is empty
+    when the hard score is at least 1.0; otherwise it is the judge's
+    rationale, or "below threshold" when there is none.  ``task_type`` is
+    the task's first tag, or "task" when it has no tags.
+    """
+    answer = backend.attempt(task, skill, memory)
+    hard, soft, rationale = backend.judge(task, answer)
+
+    if hard >= 1.0:
+        reason = ""
+    else:
+        reason = rationale or "below threshold"
+    kind = task.tags[0] if task.tags else "task"
+
+    return ReplayResult(
+        id=task.id,
+        hard=float(hard),
+        soft=float(soft),
+        response=answer,
+        fail_reason=reason,
+        task_type=kind,
+        judge_rationale=rationale,
+    )
+
+
+def replay_batch(
+    backend: Backend,
+    tasks: List[TaskRecord],
+    skill: str,
+    memory: str,
+) -> List[Tuple[TaskRecord, ReplayResult]]:
+    """Replay every task in order and pair each task with its result."""
+    pairs: List[Tuple[TaskRecord, ReplayResult]] = []
+    for task in tasks:
+        pairs.append((task, replay_one(backend, task, skill, memory)))
+    return pairs
+
+
+def aggregate_scores(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> Tuple[float, float]:
+    """Return the mean (hard, soft) score over the replayed pairs.
+
+    An empty input gives ``(0.0, 0.0)``.
+    """
+    count = len(pairs)
+    if count == 0:
+        return 0.0, 0.0
+    hard_scores = [result.hard for _task, result in pairs]
+    soft_scores = [result.soft for _task, result in pairs]
+    return sum(hard_scores) / count, sum(soft_scores) / count
diff --git a/skillopt/sleep/staging.py b/skillopt/sleep/staging.py
new file mode 100644
index 0000000..b2e0ab2
--- /dev/null
+++ b/skillopt/sleep/staging.py
@@ -0,0 +1,103 @@
+"""SkillOpt-Sleep — Stage 5/6: staging and adoption.
+
+Implements the Dreams safety contract: the cycle never mutates the user's
+live CLAUDE.md / SKILL.md. It writes proposals + a human-readable report into
+a staging directory; a separate, explicit `adopt` step copies them over the
+live files after taking a backup.
+"""
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import time
+from typing import List, Optional
+
+from skillopt.sleep.types import SleepReport
+
+
+def _ts_dir() -> str:
+    """Return the current local time as ``YYYYMMDD-HHMMSS`` (a directory name)."""
+    now = time.localtime()
+    return time.strftime("%Y%m%d-%H%M%S", now)
+
+
+def staging_root(project: str) -> str:
+    """Return ``<project>/.skillopt-sleep/staging``, the parent of all staging dirs."""
+    relative_parts = (".skillopt-sleep", "staging")
+    return os.path.join(project, *relative_parts)
+
+
+def latest_staging(project: str) -> Optional[str]:
+    """Return the most recently modified staging directory of *project*.
+
+    Only sub-directories of the staging root are considered, so stray files
+    there (editor or OS droppings) are never returned as a staging
+    directory.  Returns ``None`` when the staging root does not exist or
+    holds no directory.
+    """
+    root = staging_root(project)
+    if not os.path.isdir(root):
+        return None
+    candidates = [
+        path
+        for path in (os.path.join(root, name) for name in os.listdir(root))
+        if os.path.isdir(path)
+    ]
+    if not candidates:
+        return None
+    # max() returns the first of several equally recent entries, which is
+    # what a stable descending sort followed by [0] would pick.
+    return max(candidates, key=os.path.getmtime)
+
+
+def write_staging(
+    project: str,
+    *,
+    report: SleepReport,
+    proposed_skill: Optional[str],
+    proposed_memory: Optional[str],
+    live_skill_path: str,
+    live_memory_path: str,
+    report_md: str,
+) -> str:
+    """Write proposals and report into ``staging/<timestamp>/``; return that path.
+
+    Files written (all UTF-8): ``proposed_SKILL.md`` and
+    ``proposed_CLAUDE.md`` (each only when the proposal is not ``None``),
+    ``report.json``, ``report.md`` and ``manifest.json``.  The manifest
+    records the live target paths, which proposals exist, and whether the
+    report was accepted.
+    """
+    out = os.path.join(staging_root(project), _ts_dir())
+    os.makedirs(out, exist_ok=True)
+
+    manifest = {
+        "live_skill_path": live_skill_path,
+        "live_memory_path": live_memory_path,
+        "has_skill": proposed_skill is not None,
+        "has_memory": proposed_memory is not None,
+        "accepted": report.accepted,
+    }
+
+    def _write_text(filename: str, text: str) -> None:
+        with open(os.path.join(out, filename), "w", encoding="utf-8") as handle:
+            handle.write(text)
+
+    def _write_json(filename: str, payload) -> None:
+        with open(os.path.join(out, filename), "w", encoding="utf-8") as handle:
+            json.dump(payload, handle, ensure_ascii=False, indent=2)
+
+    if proposed_skill is not None:
+        _write_text("proposed_SKILL.md", proposed_skill)
+    if proposed_memory is not None:
+        _write_text("proposed_CLAUDE.md", proposed_memory)
+    _write_json("report.json", report.to_dict())
+    _write_text("report.md", report_md)
+    _write_json("manifest.json", manifest)
+    return out
+
+
+def _backup(path: str, backup_dir: str) -> None:
+    """Copy *path* into *backup_dir* under its own file name, if it exists.
+
+    The backup directory is created on demand; nothing happens when *path*
+    does not exist.
+    """
+    if not os.path.exists(path):
+        return
+    os.makedirs(backup_dir, exist_ok=True)
+    destination = os.path.join(backup_dir, os.path.basename(path))
+    shutil.copy2(path, destination)
+
+
+def adopt(staging_dir: str) -> List[str]:
+    """Copy staged proposals over the live files, backing up first.
+
+    Reads ``manifest.json`` from *staging_dir* (as UTF-8, the encoding it is
+    written in).  For each proposal the manifest marks as present, the live
+    file's parent directory is created if needed, the current live file is
+    copied to ``<staging_dir>/backup``, and the staged file is copied over
+    the live path.
+
+    Returns the list of live paths that were updated.
+    """
+    with open(os.path.join(staging_dir, "manifest.json"), encoding="utf-8") as f:
+        manifest = json.load(f)
+    backup_dir = os.path.join(staging_dir, "backup")
+
+    # (manifest flag, manifest key holding the live path, staged file name)
+    plan = (
+        ("has_skill", "live_skill_path", "proposed_SKILL.md"),
+        ("has_memory", "live_memory_path", "proposed_CLAUDE.md"),
+    )
+    updated: List[str] = []
+    for flag, path_key, staged_name in plan:
+        if not manifest.get(flag):
+            continue
+        live = manifest[path_key]
+        parent = os.path.dirname(live)
+        if parent:
+            # A bare file name has no parent; os.makedirs("") would raise.
+            os.makedirs(parent, exist_ok=True)
+        _backup(live, backup_dir)
+        shutil.copy2(os.path.join(staging_dir, staged_name), live)
+        updated.append(live)
+    return updated
diff --git a/skillopt/sleep/state.py b/skillopt/sleep/state.py
new file mode 100644
index 0000000..1909246
--- /dev/null
+++ b/skillopt/sleep/state.py
@@ -0,0 +1,83 @@
+"""SkillOpt-Sleep — persistent cross-night state.
+
+state.json lives in ~/.skillopt-sleep and is the "long-term" store that
+turns nightly episodes into durable competence (the Agent-Sleep paper's
+short-term -> long-term transfer). It records:
+
+ - night counter
+ - last harvest timestamp per project (so each night only sees new data)
+ - cross-night "slow/meta" memory (lessons that persisted across nights)
+ - per-night history (scores, accept/reject) for trend reporting
+"""
+from __future__ import annotations
+
+import json
+import os
+from typing import Any, Dict, List, Optional
+
+
+def _now_iso(clock: Optional[float] = None) -> str:
+ # caller passes a timestamp; we avoid importing time at module import
+ import time as _t
+ return _t.strftime("%Y-%m-%dT%H:%M:%S", _t.localtime(clock if clock is not None else _t.time()))
+
+
DEFAULT_STATE: Dict[str, Any] = {
    "version": 1,
    "night": 0,
    "last_harvest": {},  # project -> iso timestamp of last harvested record
    "slow_memory": "",  # cross-night consolidated lessons (meta-skill analogue)
    "history": [],  # list of per-night summaries
}


def _fresh_state() -> Dict[str, Any]:
    """Return an independent deep copy of ``DEFAULT_STATE``.

    A shallow ``dict(DEFAULT_STATE)`` would share the nested ``last_harvest``
    dict and ``history`` list with the module-level template, so mutating one
    state would silently leak into every other state created afterwards.
    """
    import copy  # function-scope import; only needed when a state is created

    return copy.deepcopy(DEFAULT_STATE)


class SleepState:
    """In-memory view of the persistent ``state.json`` document.

    ``path`` is where :meth:`save` writes; ``data`` is the JSON-serialisable
    dict holding the night counter, per-project harvest timestamps, slow
    memory and per-night history.
    """

    def __init__(self, path: str, data: Optional[Dict[str, Any]] = None) -> None:
        """Wrap ``data`` (used as-is, not copied) or start from fresh defaults."""
        self.path = path
        self.data = data if data is not None else _fresh_state()

    # io ---------------------------------------------------------------------
    @classmethod
    def load(cls, path: str) -> "SleepState":
        """Load state from ``path``, falling back to defaults.

        Loading is best-effort: a missing, unreadable or malformed file yields
        a default state instead of raising. Keys present in the file override
        the defaults; a top-level JSON value that is not an object is ignored.
        """
        state = _fresh_state()
        try:
            with open(path, encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: invalid JSON or
            # undecodable bytes (JSONDecodeError and UnicodeDecodeError are
            # both ValueError subclasses).
            return cls(path, state)
        if isinstance(loaded, dict):
            state.update(loaded)
        return cls(path, state)

    def save(self) -> None:
        """Write the state to ``self.path`` as UTF-8 JSON.

        The document is written to ``<path>.tmp`` and then moved into place
        with ``os.replace``, so readers never observe a half-written file.

        Raises:
            OSError: if the directory or file cannot be written.
            TypeError: if ``self.data`` holds values ``json`` cannot encode.
        """
        parent = os.path.dirname(self.path)
        if parent:  # a bare filename has no directory to create
            os.makedirs(parent, exist_ok=True)
        tmp = self.path + ".tmp"
        try:
            # ensure_ascii=False emits raw non-ASCII text, so the encoding
            # must be pinned rather than left to the platform default.
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(self.data, f, ensure_ascii=False, indent=2)
            os.replace(tmp, self.path)
        finally:
            # After a successful replace the temp file is gone; after a
            # failure, do not leave a partial file behind.
            if os.path.exists(tmp):
                os.remove(tmp)

    # accessors --------------------------------------------------------------
    @property
    def night(self) -> int:
        """The current night counter (0 when unset)."""
        return int(self.data.get("night", 0))

    def last_harvest_for(self, project: str) -> Optional[str]:
        """Return the stored harvest timestamp for ``project``, or ``None``."""
        return self.data.get("last_harvest", {}).get(project)

    def set_last_harvest(self, project: str, iso_ts: str) -> None:
        """Record ``iso_ts`` as the last harvest timestamp for ``project``."""
        self.data.setdefault("last_harvest", {})[project] = iso_ts

    @property
    def slow_memory(self) -> str:
        """The cross-night slow memory text ("" when unset)."""
        return str(self.data.get("slow_memory", ""))

    def set_slow_memory(self, content: str) -> None:
        """Replace the slow memory text with ``content``."""
        self.data["slow_memory"] = content

    def begin_night(self, clock: Optional[float] = None) -> int:
        """Increment the night counter and return the new value.

        ``clock`` is accepted for interface compatibility but is not used.
        """
        self.data["night"] = self.night + 1
        return self.night

    def record_night(self, summary: Dict[str, Any]) -> None:
        """Append ``summary`` to the per-night history list."""
        self.data.setdefault("history", []).append(summary)
diff --git a/skillopt/sleep/types.py b/skillopt/sleep/types.py
new file mode 100644
index 0000000..655541d
--- /dev/null
+++ b/skillopt/sleep/types.py
@@ -0,0 +1,127 @@
+"""SkillOpt-Sleep — core data types.
+
+These dataclasses are the interfaces between the sleep-cycle stages
+(harvest -> mine -> replay -> consolidate -> stage). They are intentionally
+plain (no slots, no heavy deps) so the package imports cleanly on any
+Python 3.8+ interpreter and the deterministic experiment runs with zero
+external dependencies.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field, asdict
+from typing import Any, Dict, List, Optional
+
+
+# ── Stage 1: harvest ──────────────────────────────────────────────────────────
+
@dataclass
class SessionDigest:
    """Normalized per-session summary of a single Claude Code transcript.

    Built by :mod:`skillopt.sleep.harvest` out of a ``.jsonl`` transcript
    combined with ``history.jsonl`` entries.
    """

    # required identity fields
    session_id: str
    project: str

    # optional session metadata
    git_branch: str = ""
    started_at: str = ""
    ended_at: str = ""

    # collected content; every list defaults to its own empty list
    user_prompts: List[str] = field(default_factory=list)
    assistant_finals: List[str] = field(default_factory=list)
    tools_used: List[str] = field(default_factory=list)
    files_touched: List[str] = field(default_factory=list)
    # e.g. "still broken", "perfect", ...
    feedback_signals: List[str] = field(default_factory=list)

    # turn counters and provenance
    n_user_turns: int = 0
    n_assistant_turns: int = 0
    raw_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict via :func:`dataclasses.asdict`."""
        as_plain = asdict(self)
        return as_plain
+
+
+# ── Stage 2: mine ─────────────────────────────────────────────────────────────
+
@dataclass
class TaskRecord:
    """One self-contained recurring task, mined from one or more sessions.

    The *training unit* of the sleep cycle; it plays the role a benchmark
    item plays in SkillOpt.
    """

    id: str
    project: str
    # what the user wanted (the "question")
    intent: str
    # minimal context needed to attempt it
    context_excerpt: str = ""
    # what the agent produced before
    attempted_solution: str = ""
    # success | fail | mixed | unknown
    outcome: str = "unknown"
    # exact | rubric | none
    reference_kind: str = "none"
    # exact answer, or rubric text
    reference: str = ""
    tags: List[str] = field(default_factory=list)
    source_sessions: List[str] = field(default_factory=list)
    # replay (train) | holdout (test)
    split: str = "replay"

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict via :func:`dataclasses.asdict`."""
        as_plain = asdict(self)
        return as_plain

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "TaskRecord":
        """Build a record from ``d``, silently dropping keys that are not fields."""
        field_names = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
        kwargs: Dict[str, Any] = {}
        for key, value in d.items():
            if key in field_names:
                kwargs[key] = value
        return cls(**kwargs)
+
+
+# ── Stage 3: replay ───────────────────────────────────────────────────────────
+
@dataclass
class ReplayResult:
    """Result of re-running a single TaskRecord offline with a given skill+memory."""

    id: str
    # 0/1 exact, or continuous reward
    hard: float = 0.0
    # partial credit / judge score 0..1
    soft: float = 0.0
    response: str = ""
    fail_reason: str = ""
    task_type: str = "task"
    judge_rationale: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict via :func:`dataclasses.asdict`."""
        as_plain = asdict(self)
        return as_plain
+
+
+# ── Stage 4/5: consolidation report ───────────────────────────────────────────
+
@dataclass
class EditRecord:
    """One bounded edit proposed/applied to skill or memory."""

    target: str  # "skill" | "memory"
    op: str  # add | delete | replace
    content: str = ""
    anchor: str = ""  # for replace/delete: text being changed
    rationale: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict via :func:`dataclasses.asdict`.

        Provided for parity with the other dataclasses in this module, which
        all expose ``to_dict``.
        """
        return asdict(self)
+
+
@dataclass
class SleepReport:
    """All outputs of a single night, written to staging for review."""

    # which night / project this report belongs to
    night: int
    project: str
    started_at: str = ""
    ended_at: str = ""

    # volume counters
    n_sessions: int = 0
    n_tasks: int = 0
    n_replayed: int = 0

    # scores and the gate decision
    baseline_score: float = 0.0
    candidate_score: float = 0.0
    accepted: bool = False
    gate_action: str = ""

    # edits, split into the ``edits`` and ``rejected_edits`` lists
    edits: List[EditRecord] = field(default_factory=list)
    rejected_edits: List[EditRecord] = field(default_factory=list)

    tokens_used: int = 0
    notes: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the report as a plain dict; nested dataclasses are converted too."""
        return asdict(self)
diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py
new file mode 100644
index 0000000..751988c
--- /dev/null
+++ b/tests/test_sleep_engine.py
@@ -0,0 +1,166 @@
+"""Tests for the SkillOpt-Sleep engine.
+
+Pure-stdlib (unittest), deterministic, no API key, no third-party deps.
+Run: python3.12 -m pytest tests/test_sleep_engine.py
+ or: python3.12 -m unittest skillopt.sleep ... (see bottom)
+"""
+from __future__ import annotations
+
+import json
+import os
+import tempfile
+import unittest
+
+from skillopt.sleep.backend import MockBackend, exact_score, keyword_soft_score
+from skillopt.sleep.config import load_config
+from skillopt.sleep.consolidate import consolidate
+from skillopt.sleep.cycle import run_sleep_cycle
+from skillopt.sleep.experiments.personas import researcher_persona, programmer_persona
+from skillopt.sleep.harvest import digest_transcript, _detect_feedback, _is_meta_prompt
+from skillopt.sleep.memory import apply_edits, current_learned_lines, extract_learned, set_learned
+from skillopt.sleep.mine import assign_splits, heuristic_mine, dedup_tasks
+from skillopt.sleep.staging import adopt, latest_staging
+from skillopt.sleep.types import EditRecord, SessionDigest, TaskRecord
+
+
class TestScoring(unittest.TestCase):
    """Sanity checks for the scoring helpers imported from the backend module."""

    def test_exact_score(self):
        reference = "arXiv:1706.03762"
        # the reference embedded verbatim in the response is expected to score 1.0
        full_match = exact_score(reference, "the id is arXiv:1706.03762 ok")
        self.assertEqual(full_match, 1.0)
        # a truncated id is expected to score 0.0
        truncated = exact_score(reference, "approximately arXiv:1706.037")
        self.assertEqual(truncated, 0.0)

    def test_keyword_soft(self):
        overlap = keyword_soft_score("add login form", "please add the login form")
        self.assertGreater(overlap, 0.5)
+
+
class TestMemoryEdits(unittest.TestCase):
    """Exercises add / replace / delete edits on the learned region of a document."""

    def test_add_and_dedup(self):
        # two identical "add" edits: only one is expected to be applied
        blank = set_learned("# skill\n", [])
        twin_adds = [EditRecord("skill", "add", "Rule A") for _ in range(2)]
        result_doc, applied = apply_edits(blank, twin_adds)
        self.assertEqual(len(applied), 1)
        self.assertIn("Rule A", extract_learned(result_doc))

    def test_protected_region_roundtrip(self):
        handwritten = "# My hand-written skill\nkeep me\n"
        first = set_learned(handwritten, ["Rule X"])
        self.assertIn("keep me", first)
        self.assertEqual(current_learned_lines(first), ["Rule X"])
        # swapping the learned lines must leave the hand-written part intact
        second = set_learned(first, ["Rule Y"])
        self.assertIn("keep me", second)
        self.assertEqual(current_learned_lines(second), ["Rule Y"])

    def test_replace_and_delete(self):
        seeded = set_learned("", ["old rule about commits"])
        replace_edit = EditRecord("skill", "replace", "new rule", anchor="old rule")
        replaced, _ = apply_edits(seeded, [replace_edit])
        self.assertIn("new rule", extract_learned(replaced))
        delete_edit = EditRecord("skill", "delete", "", anchor="new rule")
        emptied, _ = apply_edits(replaced, [delete_edit])
        self.assertEqual(current_learned_lines(emptied), [])
+
+
class TestHarvest(unittest.TestCase):
    """Checks feedback detection, meta-prompt filtering and transcript digestion."""

    def test_feedback_detection(self):
        negative = _detect_feedback("this is still broken")
        self.assertTrue(any(sig.startswith("neg:") for sig in negative))
        positive = _detect_feedback("perfect, thanks")
        self.assertTrue(any(sig.startswith("pos:") for sig in positive))

    def test_meta_prompt_filter(self):
        for meta in ("/clear", "x"):
            self.assertTrue(_is_meta_prompt(meta))
        self.assertFalse(_is_meta_prompt("please refactor the auth module"))

    def test_digest_real_transcript_if_present(self):
        # Runs against this machine's own transcripts when there are any and
        # is skipped otherwise.
        projects_root = os.path.expanduser("~/.claude/projects")
        if not os.path.isdir(projects_root):
            self.skipTest("no ~/.claude/projects on this machine")
        # first ``.jsonl`` file in os.walk order; the walk stops as soon as
        # one is found
        transcript = next(
            (
                os.path.join(dirpath, name)
                for dirpath, _dirs, names in os.walk(projects_root)
                for name in names
                if name.endswith(".jsonl")
            ),
            None,
        )
        if not transcript:
            self.skipTest("no transcripts")
        digest = digest_transcript(transcript)
        # ``None`` is tolerated (empty transcript); otherwise the core fields
        # must be present
        if digest is None:
            return
        self.assertIsInstance(digest.session_id, str)
        self.assertGreaterEqual(digest.n_user_turns + digest.n_assistant_turns, 0)
+
+
class TestMine(unittest.TestCase):
    """Covers outcome inference in the heuristic miner and split assignment."""

    def _digest(self, prompts, feedback):
        """Build a one-assistant-turn SessionDigest from prompts and feedback signals."""
        fields = dict(
            session_id="s1",
            project="/p",
            user_prompts=prompts,
            assistant_finals=["did stuff"],
            feedback_signals=feedback,
            n_user_turns=len(prompts),
            n_assistant_turns=1,
        )
        return SessionDigest(**fields)

    def test_outcome_inference(self):
        broken = self._digest(["fix the parser bug please"], ["neg:still broken"])
        failed_tasks = heuristic_mine([broken])
        self.assertEqual(failed_tasks[0].outcome, "fail")
        praised = self._digest(["format the output"], ["pos:perfect"])
        ok_tasks = heuristic_mine([praised])
        self.assertEqual(ok_tasks[0].outcome, "success")

    def test_split_stable_and_nonempty(self):
        first_run = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)
        seen_splits = {task.split for task in first_run}
        for expected in ("replay", "holdout"):
            self.assertIn(expected, seen_splits)
        # same inputs and seed must yield the same assignment
        second_run = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)
        self.assertEqual(
            [task.split for task in first_run],
            [task.split for task in second_run],
        )
+
+
class TestConsolidateGate(unittest.TestCase):
    """Runs ``consolidate`` against the mock backend on the bundled personas."""

    def test_accepts_helpful_rejects_harmful(self):
        # NOTE: only the acceptance path is asserted here.
        backend = MockBackend()
        persona_tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)
        empty_skill = set_learned("", [])
        result = consolidate(backend, persona_tasks, empty_skill, "", edit_budget=4,
                             gate_metric="mixed", night=1)
        self.assertTrue(result.accepted)
        self.assertGreater(result.candidate_score, result.baseline_score)

    def test_no_op_when_already_optimal(self):
        backend = MockBackend()
        persona_tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=1)
        # night 1 starts from an empty learned block
        night_one = consolidate(backend, persona_tasks, set_learned("", []), "",
                                edit_budget=4, night=1)
        # night 2 reuses night 1's skill and memory and should apply no edits
        night_two = consolidate(backend, persona_tasks, night_one.new_skill,
                                night_one.new_memory, edit_budget=4, night=2)
        self.assertEqual(len(night_two.applied_edits), 0)
+
+
class TestFullCycleAndAdopt(unittest.TestCase):
    """End-to-end: run one sleep cycle, stage the proposal, adopt it, verify backup."""

    def test_cycle_stage_then_adopt_with_backup(self):
        with tempfile.TemporaryDirectory() as proj, tempfile.TemporaryDirectory() as home:
            cfg = load_config(
                invoked_project=proj, projects="invoked", backend="mock",
                claude_home=os.path.join(home, ".claude"),
                managed_skill_name="skillopt-sleep-learned",
                auto_adopt=False,
            )
            # seed a known persona so we don't depend on ~/.claude
            tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42)

            outcome = run_sleep_cycle(cfg, seed_tasks=tasks)
            self.assertTrue(outcome.report.accepted)
            self.assertTrue(os.path.isdir(outcome.staging_dir))
            self.assertTrue(os.path.exists(os.path.join(outcome.staging_dir, "report.md")))

            # nothing live touched yet
            live_skill = cfg.managed_skill_path()
            self.assertFalse(os.path.exists(live_skill))

            # adopt -> live file created (there was nothing to back up yet)
            updated = adopt(outcome.staging_dir)
            self.assertTrue(any("SKILL.md" in p for p in updated))
            self.assertTrue(os.path.exists(live_skill))
            # staged proposals are written as UTF-8, so read them back as UTF-8
            with open(live_skill, encoding="utf-8") as f:
                self.assertIn("answer", f.read().lower())

            # Overwrite the live file, then adopt again: the overwritten
            # content must land in <staging>/backup and the proposal must be
            # restored over the live path.
            sentinel = "# locally edited before re-adopt\n"
            with open(live_skill, "w", encoding="utf-8") as f:
                f.write(sentinel)
            adopt(outcome.staging_dir)
            backup_copy = os.path.join(
                outcome.staging_dir, "backup", os.path.basename(live_skill)
            )
            self.assertTrue(os.path.exists(backup_copy))
            with open(backup_copy, encoding="utf-8") as f:
                self.assertEqual(f.read(), sentinel)
            with open(live_skill, encoding="utf-8") as f:
                self.assertIn("answer", f.read().lower())
+
+
# Allow running this file directly as a script, with verbose unittest output.
if __name__ == "__main__":
    unittest.main(verbosity=2)