diff --git a/docs/sleep/experiment_results.md b/docs/sleep/experiment_results.md new file mode 100644 index 0000000..500f72b --- /dev/null +++ b/docs/sleep/experiment_results.md @@ -0,0 +1,73 @@ +# SkillOpt-Sleep — validation experiment results + +Generated: 2026-06-07 (autonomous offline session) +Backend: mock (deterministic, no API). Reproducible via the commands below. + +``` +$ python3.12 -m skillopt.sleep.experiments.run_experiment --persona researcher --nights 4 --json +{ + "persona": "researcher", + "backend": "mock", + "nights_run": 1, + "baseline_holdout": 0.3333, + "after_holdout": 1.0, + "lift": 0.6667, + "improved": true, + "gate_blocks_harmful": true, + "final_skill_excerpt": "T -->\n## Learned preferences & procedures\n\n_This block is maintained by SkillOpt-Sleep. Edits here are proposed offline, validated against your past tasks, and adopted only after you approve them. Hand-edits outside this block are never touched._\n\n- Always wrap the final answer in ... tags.\n- Report arXiv ids in the exact form arXiv:XXXX.XXXXX.\n\n", + "trace": [ + { + "night": 0, + "holdout_score": 0.3333, + "action": "baseline", + "n_edits": 0 + }, + { + "night": 1, + "holdout_score": 1.0, + "action": "accept_new_best", + "accepted": true, + "n_edits": 2, + "edits": [ + "Always wrap the final answer in ... tags.", + "Report arXiv ids in the exact form arXiv:XXXX.XXXXX." + ], + "n_rejected": 0 + } + ] +} +``` + +``` +$ python3.12 -m skillopt.sleep.experiments.run_experiment --persona programmer --nights 4 --json +{ + "persona": "programmer", + "backend": "mock", + "nights_run": 1, + "baseline_holdout": 0.3194, + "after_holdout": 1.0, + "lift": 0.6806, + "improved": true, + "gate_blocks_harmful": true, + "final_skill_excerpt": "laude Code sessions.\n\n\n## Learned preferences & procedures\n\n_This block is maintained by SkillOpt-Sleep. Edits here are proposed offline, validated against your past tasks, and adopted only after you approve them. 
Hand-edits outside this block are never touched._\n\n- Write git commit subjects in imperative mood, max 50 chars.\n\n", + "trace": [ + { + "night": 0, + "holdout_score": 0.3194, + "action": "baseline", + "n_edits": 0 + }, + { + "night": 1, + "holdout_score": 1.0, + "action": "accept_new_best", + "accepted": true, + "n_edits": 1, + "edits": [ + "Write git commit subjects in imperative mood, max 50 chars." + ], + "n_rejected": 0 + } + ] +} +``` diff --git a/skillopt-sleep-plugin/.claude-plugin/plugin.json b/skillopt-sleep-plugin/.claude-plugin/plugin.json new file mode 100644 index 0000000..3595f51 --- /dev/null +++ b/skillopt-sleep-plugin/.claude-plugin/plugin.json @@ -0,0 +1,22 @@ +{ + "name": "skillopt-sleep", + "description": "Give your local Claude agent a nightly 'sleep cycle': it reviews your past sessions offline, replays recurring tasks on your own API budget, and consolidates what it learns into validated memory (CLAUDE.md) and skills (SKILL.md).越用越好用 — gets better the more you use it. Synthesizes SkillOpt (validation-gated skill optimization), Claude Dreams (offline memory consolidation), and agent sleep/consolidation.", + "version": "0.1.0", + "author": { + "name": "Yifan Yang", + "email": "yifanyang@microsoft.com" + }, + "homepage": "https://github.com/microsoft/SkillOpt", + "repository": "https://github.com/microsoft/SkillOpt", + "license": "MIT", + "keywords": [ + "skillopt", + "self-improvement", + "memory-consolidation", + "dreams", + "sleep", + "skills", + "continual-learning", + "offline-optimization" + ] +} diff --git a/skillopt-sleep-plugin/README.md b/skillopt-sleep-plugin/README.md new file mode 100644 index 0000000..7898b17 --- /dev/null +++ b/skillopt-sleep-plugin/README.md @@ -0,0 +1,82 @@ +# SkillOpt-Sleep (Claude Code plugin) + +> Give your local Claude agent a **sleep cycle**. 
Every night it reviews your +> past sessions offline, replays your recurring tasks on your own API budget, +> and consolidates what it learns into **validated** memory (`CLAUDE.md`) and +> skills (`SKILL.md`). Your agent gets better the more you use it — no +> model-weight training. + +SkillOpt-Sleep is the **deployment-time** companion to +[SkillOpt](https://github.com/microsoft/SkillOpt). SkillOpt trains a skill +offline on a benchmark; SkillOpt-Sleep applies the same discipline to *your own +daily usage*: bounded text edits, accepted only through a held-out validation +gate, with rejected edits kept as negative feedback. + +It synthesizes three ideas: + +| Idea | Contribution | +|---|---| +| **SkillOpt** | skill/memory = trainable text; bounded add/delete/replace edits; **held-out gate** keeps only changes that help. | +| **Claude Dreams** | offline consolidation over past sessions; input never mutated; output **reviewed then adopted**. | +| **Agent sleep** | periodic offline replay turns short-term episodes into long-term skill. | + +## What it does (one "night") + +``` +harvest ~/.claude transcripts → mine recurring tasks → replay offline + → consolidate (reflect → bounded edit → GATE) → stage proposal → (you) adopt +``` + +Nothing live is modified until **you** run `/sleep adopt` (the Dreams "review, +then adopt or discard" contract). Every adopt backs up the prior file first. 
+ +## Quick start + +```bash +# from inside any project you use with Claude Code: +/sleep dry-run # safe preview: what it would learn, no changes staged +/sleep run # full cycle: stages a reviewed proposal (still no live edits) +/sleep status # see history + the latest staged proposal +/sleep adopt # apply the staged proposal to CLAUDE.md / SKILL.md (with backup) +``` + +Or call the engine directly (Python ≥ 3.10): + +```bash +python -m skillopt.sleep run --project "$(pwd)" --scope invoked --backend mock +python -m skillopt.sleep run --project "$(pwd)" --backend anthropic # real lift, uses your budget +``` + +Default backend is **`mock`** — deterministic, no API spend — so you can try the +plumbing for free. Switch to `--backend anthropic` for genuine improvement. + +## Does it actually improve? (deterministic proof) + +```bash +python -m skillopt.sleep.experiments.run_experiment --persona researcher --assert-improves +python -m skillopt.sleep.experiments.run_experiment --persona programmer --assert-improves +``` + +Each prints the held-out score rising from baseline toward 1.0 as the gate +accepts the general rules your tasks need, and confirms the gate **rejects** an +injected harmful edit. Recorded output: [`docs/sleep/experiment_results.md`](../docs/sleep/experiment_results.md). + +## Schedule it nightly + +```bash +"${CLAUDE_PLUGIN_ROOT}/scripts/install-cron.sh" "$(pwd)" # prints a crontab line; installs nothing +``` + +## Safety + +- **Read-only** harvest of `~/.claude`. `mock` replay has no side effects. +- Proposals are **staged**, never auto-applied (unless you opt in with `--auto-adopt`). +- Every adopt writes a backup under the staging dir's `backup/`. +- Per-night **token/task budget caps**; secrets redacted from prompts. +- `fresh` replay (Phase 3) runs only in throwaway git worktrees. + +## Status + +Phase 1 (engine + deterministic experiment + plugin surface) is complete. +Phase 3 adds the real-API miner/judge and `fresh` worktree replay. 
See +[`docs/superpowers/specs/2026-06-07-skillopt-sleep-claude-code-plugin-design.md`](../docs/superpowers/specs/2026-06-07-skillopt-sleep-claude-code-plugin-design.md). diff --git a/skillopt-sleep-plugin/commands/sleep.md b/skillopt-sleep-plugin/commands/sleep.md new file mode 100644 index 0000000..48b62f7 --- /dev/null +++ b/skillopt-sleep-plugin/commands/sleep.md @@ -0,0 +1,63 @@ +--- +description: Run or manage the SkillOpt-Sleep self-evolution cycle (review past sessions, replay tasks offline, consolidate validated memory + skills) +argument-hint: "[run | dry-run | status | adopt | harvest] (default: status)" +allowed-tools: Bash, Read +--- + +# /sleep — SkillOpt-Sleep nightly self-evolution + +You are driving **SkillOpt-Sleep**: a tool that lets this user's Claude agent +improve offline by reviewing past sessions, replaying recurring tasks, and +consolidating what it learns into **validated** memory (`CLAUDE.md`) and skills +(`SKILL.md`). It is gated like SkillOpt: a change is kept only if it improves a +held-out replay score, and nothing live is modified until the user adopts it. + +## Requested action: $ARGUMENTS + +(If `$ARGUMENTS` is empty, treat it as `status`.) + +## How to run it + +The engine is the `skillopt.sleep` Python package in this repo. 
Use the +**plugin's bundled runner** so the right interpreter and repo are on the path: + +```bash +"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" <action> --project "$(pwd)" --scope invoked +``` + +`<action>` is one of: + +| action | what it does | +|-----------|--------------| +| `status` | show how many nights have run + the latest staged proposal (READ-ONLY) | +| `dry-run` | harvest → mine → replay → report, but **stage nothing** (safe preview) | +| `run` | full cycle: also **stage** a reviewed proposal (still does NOT touch live files) | +| `adopt` | apply the latest staged proposal to live `CLAUDE.md` / `SKILL.md` (backs up first) | +| `harvest` | debug: print the recurring tasks mined from recent sessions | + +Default backend is `mock` (deterministic, no API spend). To use real Anthropic +budget for genuine improvement, add `--backend anthropic`. + +## Steps to follow + +1. **Run the requested action** via the bundled runner above. Capture stdout. +2. **For `run` / `dry-run`:** after it completes, `Read` the generated + `report.md` in the staging dir it prints, and show the user: + - held-out score: baseline → candidate (the proof it helped) + - the gate decision (accept/reject) and the exact edits it proposes + - where the proposal is staged +3. **For `run` that produced an accepted proposal:** tell the user the diff is + staged and that **nothing live changed yet**. Offer to run `/sleep adopt`. +4. **For `adopt`:** confirm which live files were updated and that backups were + written under the staging dir's `backup/`. +5. **Never** edit `CLAUDE.md` or `SKILL.md` yourself — only the `adopt` action + does that, with a backup. Respect the review gate. + +## Safety reminders + +- Harvest is **read-only** over `~/.claude`. Replay in `mock` mode runs no + shell side effects. +- The cycle stages proposals; the user is in control of adoption. 
+- If the user asks to schedule this nightly, point them at + `${CLAUDE_PLUGIN_ROOT}/scripts/install-cron.sh` (prints a crontab line; does + not install anything without confirmation). diff --git a/skillopt-sleep-plugin/hooks/hooks.json b/skillopt-sleep-plugin/hooks/hooks.json new file mode 100644 index 0000000..6ea666b --- /dev/null +++ b/skillopt-sleep-plugin/hooks/hooks.json @@ -0,0 +1,16 @@ +{ + "hooks": { + "SessionEnd": [ + { + "matcher": "*", + "hooks": [ + { + "type": "command", + "command": "\"${CLAUDE_PLUGIN_ROOT}/hooks/on-session-end.sh\"", + "async": true + } + ] + } + ] + } +} diff --git a/skillopt-sleep-plugin/hooks/on-session-end.sh b/skillopt-sleep-plugin/hooks/on-session-end.sh new file mode 100755 index 0000000..bd84be2 --- /dev/null +++ b/skillopt-sleep-plugin/hooks/on-session-end.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# SkillOpt-Sleep SessionEnd hook (async, best-effort, NON-BLOCKING). +# +# This does NOT run the optimizer. It only appends a tiny marker so the next +# nightly cycle knows there is fresh activity to harvest, and (optionally) +# nudges the user once that a sleep cycle is available. It must never fail the +# session or spend API budget. +set -uo pipefail + +PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}" +STATE_DIR="${HOME}/.skillopt-sleep" +mkdir -p "$STATE_DIR" 2>/dev/null || exit 0 + +# Record that a session just ended (cheap; used for "is there new data?"). +printf '%s\t%s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "${PWD}" \ + >> "$STATE_DIR/session-end.log" 2>/dev/null || true + +exit 0 diff --git a/skillopt-sleep-plugin/scripts/install-cron.sh b/skillopt-sleep-plugin/scripts/install-cron.sh new file mode 100755 index 0000000..e18460d --- /dev/null +++ b/skillopt-sleep-plugin/scripts/install-cron.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Print (does NOT install) a crontab line that runs SkillOpt-Sleep nightly. +# The user copies the line into `crontab -e` if they want it. 
+set -euo pipefail
+
+PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
+RUNNER="$PLUGIN_ROOT/scripts/sleep.sh"
+PROJECT="${1:-$(pwd)}"
+BACKEND="${2:-mock}"
+
+# 3:17am local — deliberately off the :00 mark so many users don't all hit the
+# API at once (and we leave room for jitter).
+MIN=17
+HOUR=3
+
+cat <<EOF
+# ────────────────────────────────────────────────────────────────────────────
+# SkillOpt-Sleep nightly cycle. Nothing has been installed.
+# To schedule it, run "crontab -e" and paste the entry below.
+#
+#   project: ${PROJECT}
+#   backend: ${BACKEND}
+#
+${MIN} ${HOUR} * * * "${RUNNER}" run --project "${PROJECT}" --backend "${BACKEND}" >> "${PROJECT}/.skillopt-sleep/cron.log" 2>&1
+#
+# For fully-autonomous adoption (power users), append: --auto-adopt
+# To spend real API budget for genuine lift, set BACKEND=anthropic above.
+# ────────────────────────────────────────────────────────────────────────────
+EOF
diff --git a/skillopt-sleep-plugin/scripts/sleep.sh b/skillopt-sleep-plugin/scripts/sleep.sh
new file mode 100755
index 0000000..49c224a
--- /dev/null
+++ b/skillopt-sleep-plugin/scripts/sleep.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# SkillOpt-Sleep runner — invokes the skillopt.sleep engine with a suitable
+# Python interpreter, from the repo that contains this plugin.
+#
+# Usage: sleep.sh <action> [extra args...]
+set -euo pipefail
+
+# Resolve the repo root: the plugin lives at <repo>/skillopt-sleep-plugin,
+# so the engine package is at <repo>/skillopt/sleep. CLAUDE_PLUGIN_ROOT points
+# at the plugin dir when run by Claude Code; fall back to this script's dir.
+PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
+REPO_ROOT="$(cd "$PLUGIN_ROOT/.." && pwd)"
+
+# Pick an interpreter that satisfies SkillOpt's 3.10+ requirement.
+PY=""
+for cand in python3.12 python3.11 python3.10 python3; do
+  if command -v "$cand" >/dev/null 2>&1; then
+    ver="$("$cand" -c 'import sys; print("%d%d" % sys.version_info[:2])' 2>/dev/null || echo 0)"
+    if [ "${ver:-0}" -ge 310 ]; then PY="$cand"; break; fi
+  fi
+done
+if [ -z "$PY" ]; then
+  echo "[sleep] ERROR: need Python >= 3.10 (found none). Install one and retry."
>&2 + exit 1 +fi + +if [ "$#" -eq 0 ]; then set -- status; fi + +cd "$REPO_ROOT" +exec "$PY" -m skillopt.sleep "$@" diff --git a/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md b/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md new file mode 100644 index 0000000..e8d4a98 --- /dev/null +++ b/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md @@ -0,0 +1,79 @@ +--- +name: skillopt-sleep +description: "Use when the user wants their Claude agent to self-improve from past usage, asks about a nightly/offline 'sleep' or 'dream' cycle, memory/skill consolidation, or says things like '让 agent 越用越好用', 'review my past sessions', 'learn my preferences', 'consolidate what you learned', 'run the sleep cycle', or wants to schedule offline self-optimization. Drives the skillopt.sleep engine: harvest past sessions → mine recurring tasks → replay offline → consolidate validated CLAUDE.md/SKILL.md behind a held-out gate." +--- + +# SkillOpt-Sleep: offline self-evolution for a local Claude agent + +SkillOpt-Sleep gives the user's agent a **sleep cycle**. While the user is +offline (e.g. nightly), it reviews their real past Claude Code sessions, +re-runs recurring tasks on their own API budget, and consolidates what it +learns into **memory** (`CLAUDE.md`) and **skills** (`SKILL.md`) — but only +keeps changes that pass a held-out validation gate, and only after the user +adopts them. The agent gets measurably better at *this* user's recurring work, +with no model-weight training. It is the deployment-time analogue of training: +short-term experience → long-term competence. + +It synthesizes three ideas: +- **SkillOpt** — the skill/memory doc is trainable text; bounded add/delete/replace + edits; accepted only through a held-out gate; rejected edits become negative feedback. +- **Claude Dreams** — offline consolidation that reads past sessions and rebuilds + memory (dedup/merge/resolve); the input is never mutated; output is reviewed then adopted. 
+- **Agent sleep** — periodic offline replay turns episodes into durable skill. + +## When to use this skill + +Trigger when the user wants any of: +- "make my agent learn from how I use it" / "越用越好用" / "remember my preferences across sessions" +- a nightly/scheduled or on-demand **offline self-improvement / dream / sleep** run +- to **review past sessions/trajectories** and distill recurring tasks +- to **consolidate** feedback into `CLAUDE.md` or a managed skill +- to **schedule** the cycle (cron) or **adopt** a staged proposal + +## The cycle (six stages) + +1. **Harvest** — read `~/.claude/projects/*/<session>.jsonl` + `~/.claude/history.jsonl` (READ-ONLY) → session digests. +2. **Mine** — digests → `TaskRecord`s (recurring intents + outcome labels + checkable refs where possible). +3. **Replay** — re-run tasks offline under the *current* skill+memory → (hard, soft) scores. +4. **Consolidate** — reflect on failures → propose bounded edits → **gate** on a held-out slice; accept only if it strictly improves. +5. **Stage** — write `proposed_CLAUDE.md`, `proposed_SKILL.md`, a diff, and `report.md` into `<project>/.skillopt-sleep/staging/<night>/`. **Nothing live changes.** +6. **Adopt** — explicit (or opt-in auto): copy staged files over live ones, backing up first. + +## How to drive it + +Prefer the `/sleep` command. Under the hood it calls the bundled runner: + +```bash +"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" status --project "$(pwd)" # what's happened +"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" dry-run --project "$(pwd)" # safe preview +"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" run --project "$(pwd)" # full cycle, stages a proposal +"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" adopt --project "$(pwd)" # apply staged proposal (with backup) +``` + +- Default backend is `mock` (deterministic, **no API spend**) — good for trying the plumbing. +- Add `--backend anthropic` to spend the user's real budget for genuine improvement. +- Scope defaults to the invoked project; `--scope all` harvests every project. 
+
+## Hard rules
+
+- **Never** hand-edit the user's `CLAUDE.md` / `SKILL.md` as part of this skill.
+  Only the `adopt` action changes live files, and it backs them up first.
+- Harvest is read-only. `mock` replay has no side effects.
+- Always show the user the **held-out baseline → candidate** score and the
+  exact proposed edits before suggesting adoption. Evidence before adoption.
+- If asked whether it really helps, run
+  `python -m skillopt.sleep.experiments.run_experiment --persona researcher --json`
+  — a deterministic demo that proves held-out lift and that the gate blocks
+  harmful edits.
+
+## Validate / demo
+
+```bash
+# deterministic proof (no API): held-out score rises, gate blocks regressions
+python -m skillopt.sleep.experiments.run_experiment --persona researcher --assert-improves
+python -m skillopt.sleep.experiments.run_experiment --persona programmer --assert-improves
+```
+
+See `docs/sleep/experiment_results.md` for recorded output and
+`docs/superpowers/specs/2026-06-07-skillopt-sleep-claude-code-plugin-design.md`
+for the full design.
diff --git a/skillopt/sleep/__init__.py b/skillopt/sleep/__init__.py
new file mode 100644
index 0000000..6e35c0b
--- /dev/null
+++ b/skillopt/sleep/__init__.py
@@ -0,0 +1,20 @@
+"""SkillOpt-Sleep — nightly offline self-evolution for a local Claude agent.
+
+A Claude Code plugin engine that gives a user's agent a "sleep cycle":
+harvest the day's real session transcripts, mine recurring tasks, replay
+them offline, and consolidate short-term experience into long-term memory
+(CLAUDE.md) and skills (SKILL.md) behind a SkillOpt validation gate.
+
+Synthesizes three ideas:
+  * SkillOpt — validation-gated bounded text optimization (this repo)
+  * Dreams   — offline memory consolidation, input never mutated
+  * Sleep    — short-term experience -> long-term competence, offline
+
+Public entry points:
+  * skillopt.sleep.cli — `python -m skillopt.sleep ...`
+  * skillopt.sleep.cycle.run_sleep_cycle(...)
+"""
+from __future__ import annotations
+
+__all__ = ["__version__"]
+__version__ = "0.1.0"
diff --git a/skillopt/sleep/__main__.py b/skillopt/sleep/__main__.py
new file mode 100644
index 0000000..22dc15d
--- /dev/null
+++ b/skillopt/sleep/__main__.py
@@ -0,0 +1,223 @@
+"""SkillOpt-Sleep — command-line interface.
+
+    python -m skillopt.sleep run        # full cycle: harvest->mine->replay->gate->stage
+    python -m skillopt.sleep dry-run    # same but report only, no staging/adopt
+    python -m skillopt.sleep status     # show state + latest staged proposal
+    python -m skillopt.sleep adopt      # apply the latest staged proposal (with backup)
+    python -m skillopt.sleep harvest    # just print what would be mined (debug)
+
+Common flags:
+    --project PATH         project to evolve (default: cwd)
+    --scope all|invoked    harvest scope (default: invoked)
+    --backend mock|anthropic
+    --model NAME
+    --lookback-hours N
+    --auto-adopt
+    --json                 machine-readable output
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from typing import Any, Dict
+
+from skillopt.sleep.config import load_config
+from skillopt.sleep.cycle import run_sleep_cycle
+from skillopt.sleep.harvest import harvest
+from skillopt.sleep.mine import mine
+from skillopt.sleep.state import SleepState
+from skillopt.sleep.staging import latest_staging, adopt as adopt_staging
+
+
+def _add_common(p: argparse.ArgumentParser) -> None:
+    """Register the flags shared by every subcommand on parser *p*.
+
+    Empty-string / zero defaults stand for "not given"; ``_cfg_from_args``
+    forwards only the values that were actually set.
+    """
+    p.add_argument("--project", default="")
+    p.add_argument("--scope", default="", choices=["", "all", "invoked"])
+    p.add_argument("--backend", default="", choices=["", "mock", "anthropic"])
+    p.add_argument("--model", default="")
+    p.add_argument("--claude-home", default="", help="override ~/.claude (also isolates state)")
+    p.add_argument("--lookback-hours", type=int, default=0)
+    p.add_argument("--edit-budget", type=int, default=0)
+    p.add_argument("--auto-adopt", action="store_true")
+    p.add_argument("--json", action="store_true")
+
+
+def _cfg_from_args(args) -> Any:
+    """Turn parsed CLI *args* into ``load_config`` keyword overrides.
+
+    Only truthy values are forwarded. ``--project`` also switches the scope
+    to ``"invoked"``; an explicit ``--scope`` is applied afterwards and wins.
+    """
+    overrides: Dict[str, Any] = {}
+    if args.project:
+        overrides["invoked_project"] = os.path.abspath(args.project)
+        overrides["projects"] = "invoked"
+    if args.scope:
+        overrides["projects"] = args.scope
+    if args.backend:
+        overrides["backend"] = args.backend
+    if args.model:
+        overrides["model"] = args.model
+    if getattr(args, "claude_home", ""):
+        overrides["claude_home"] = os.path.abspath(args.claude_home)
+    if getattr(args, "lookback_hours", 0):
+        overrides["lookback_hours"] = args.lookback_hours
+    if getattr(args, "edit_budget", 0):
+        overrides["edit_budget"] = args.edit_budget
+    if getattr(args, "auto_adopt", False):
+        overrides["auto_adopt"] = True
+    return load_config(**overrides)
+
+
+def cmd_run(args, dry: bool = False) -> int:
+    """Run one sleep cycle (``dry_run=dry``) and print its report.
+
+    Emits JSON with ``--json``, otherwise a short text summary. Always
+    returns 0.
+    """
+    cfg = _cfg_from_args(args)
+    outcome = run_sleep_cycle(cfg, dry_run=dry)
+    rep = outcome.report
+    if args.json:
+        print(json.dumps({
+            "night": rep.night, "accepted": rep.accepted,
+            "gate_action": rep.gate_action,
+            "baseline": rep.baseline_score, "candidate": rep.candidate_score,
+            "n_tasks": rep.n_tasks, "n_sessions": rep.n_sessions,
+            "edits": [e.__dict__ for e in rep.edits],
+            "staging_dir": outcome.staging_dir, "adopted": outcome.adopted,
+        }, ensure_ascii=False, indent=2))
+    else:
+        print(f"[sleep] night {rep.night}: {rep.n_sessions} sessions -> {rep.n_tasks} tasks")
+        print(f"[sleep] held-out {rep.baseline_score:.3f} -> {rep.candidate_score:.3f} "
+              f"=> {rep.gate_action} (accepted={rep.accepted})")
+        for e in rep.edits:
+            print(f" + [{e.target}/{e.op}] {e.content}")
+        if outcome.staging_dir:
+            print(f"[sleep] staged: {outcome.staging_dir}")
+            if not outcome.adopted:
+                print("[sleep] review it, then: python -m skillopt.sleep adopt")
+        if outcome.adopted:
+            print(f"[sleep] auto-adopted: {', '.join(outcome.adopted_paths)}")
+    return 0
+
+
+def cmd_status(args) -> int:
+    """Print the night counter and the latest staged proposal, if any.
+
+    In text mode ``report.md`` from the staging dir is echoed when present.
+    Always returns 0.
+    """
+    cfg = _cfg_from_args(args)
+    state = SleepState.load(cfg.state_path)
+    project = cfg.get("invoked_project") or os.getcwd()
+    latest = latest_staging(project)
+    info = {
+        "night": state.night,
+        "state_path": cfg.state_path,
+        "project": project,
+        "history_tail": state.data.get("history", [])[-5:],
+        "latest_staging": latest,
+        "slow_memory_chars": len(state.slow_memory),
+    }
+    if args.json:
+        print(json.dumps(info, ensure_ascii=False, indent=2))
+    else:
+        print(f"[sleep] nights so far: {state.night}")
+        print(f"[sleep] project: {project}")
+        if latest:
+            print(f"[sleep] latest staged proposal: {latest}")
+            rp = os.path.join(latest, "report.md")
+            if os.path.exists(rp):
+                # Decode explicitly: the locale default may not be UTF-8, and a
+                # stray byte must not crash a read-only status command.
+                with open(rp, encoding="utf-8", errors="replace") as f:
+                    print("\n" + f.read())
+        else:
+            print("[sleep] no staged proposals yet.")
+    return 0
+
+
+def cmd_adopt(args) -> int:
+    """Apply ``--staging`` (or the project's latest staging dir) to live files.
+
+    Returns 1 when no staging directory exists, otherwise 0.
+    """
+    cfg = _cfg_from_args(args)
+    project = cfg.get("invoked_project") or os.getcwd()
+    target = args.staging or latest_staging(project)
+    if not target or not os.path.isdir(target):
+        print("[sleep] nothing to adopt (no staging dir).")
+        return 1
+    updated = adopt_staging(target)
+    print(f"[sleep] adopted from {target}")
+    for p in updated:
+        print(f" -> {p}")
+    if not updated:
+        print("[sleep] (proposal contained no accepted changes)")
+    return 0
+
+
+def cmd_harvest(args) -> int:
+    """Debug helper: harvest sessions, mine tasks, and print the result."""
+    cfg = _cfg_from_args(args)
+    digests = harvest(
+        cfg.transcripts_dir,
+        scope=cfg.get("projects", "invoked"),
+        invoked_project=cfg.get("invoked_project", ""),
+        limit=cfg.get("max_tasks_per_night", 40) * 3,
+    )
+    tasks = mine(digests, max_tasks=cfg.get("max_tasks_per_night", 40),
+                 holdout_fraction=cfg.get("holdout_fraction", 0.34), seed=cfg.get("seed", 42))
+    if args.json:
+        print(json.dumps({
+            "n_sessions": len(digests),
+            "tasks": [t.to_dict() for t in tasks],
+        }, ensure_ascii=False, indent=2))
+    else:
+        print(f"[sleep] {len(digests)} sessions -> {len(tasks)} tasks")
+        for t in tasks:
+            print(f" [{t.split}/{t.outcome}] {t.intent[:90]}")
+    return 0
+
+
+def main(argv=None) -> int:
+    """Parse *argv*, dispatch to the subcommand, and return its exit code."""
+    parser = argparse.ArgumentParser(prog="skillopt.sleep", description="SkillOpt-Sleep nightly self-evolution")
+    sub =
parser.add_subparsers(dest="cmd", required=True) + + p_run = sub.add_parser("run", help="run a full sleep cycle") + _add_common(p_run) + p_dry = sub.add_parser("dry-run", help="harvest+mine+replay, report only") + _add_common(p_dry) + p_status = sub.add_parser("status", help="show state + latest proposal") + _add_common(p_status) + p_adopt = sub.add_parser("adopt", help="apply latest staged proposal") + _add_common(p_adopt) + p_adopt.add_argument("--staging", default="", help="specific staging dir") + p_harvest = sub.add_parser("harvest", help="debug: show mined tasks") + _add_common(p_harvest) + + args = parser.parse_args(argv) + if args.cmd == "run": + return cmd_run(args, dry=False) + if args.cmd == "dry-run": + return cmd_run(args, dry=True) + if args.cmd == "status": + return cmd_status(args) + if args.cmd == "adopt": + return cmd_adopt(args) + if args.cmd == "harvest": + return cmd_harvest(args) + parser.print_help() + return 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skillopt/sleep/backend.py b/skillopt/sleep/backend.py new file mode 100644 index 0000000..a3b5aca --- /dev/null +++ b/skillopt/sleep/backend.py @@ -0,0 +1,334 @@ +"""SkillOpt-Sleep — optimizer/replay backend abstraction. + +A backend supplies the three "intelligent" operations the sleep cycle needs: + + 1. attempt(task, skill, memory) -> response text (the rollout) + 2. judge(task, response) -> (hard, soft, rationale) (the reward) + 3. reflect(failures, successes, skill, memory) + -> list[EditRecord] (proposed bounded edits) + +Two implementations: + * MockBackend — deterministic, no API, used for tests + the experiment. + Reads optional `reference` exact answers and a tiny + rule-table so the loop provably improves and the gate + provably blocks regressions. + * AnthropicBackend — uses the user's ANTHROPIC_API_KEY via the `claude` + CLI or the anthropic SDK (lazy-imported). Real lift. 
+ +The backend never touches live config; it only returns text/edits that the +consolidation stage gates and stages. +""" +from __future__ import annotations + +import json +import os +import re +import subprocess +from typing import Any, Dict, List, Optional, Tuple + +from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord + + +# ── Backend protocol ────────────────────────────────────────────────────────── + +class Backend: + name = "base" + + def attempt(self, task: TaskRecord, skill: str, memory: str) -> str: + raise NotImplementedError + + def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]: + raise NotImplementedError + + def reflect( + self, + failures: List[Tuple[TaskRecord, ReplayResult]], + successes: List[Tuple[TaskRecord, ReplayResult]], + skill: str, + memory: str, + *, + edit_budget: int, + evolve_skill: bool, + evolve_memory: bool, + ) -> List[EditRecord]: + raise NotImplementedError + + # token accounting (optional) + def tokens_used(self) -> int: + return 0 + + +# ── Shared scoring helpers ──────────────────────────────────────────────────── + +def _normalize(s: str) -> str: + s = (s or "").lower().strip() + s = re.sub(r"[^\w\s]", " ", s) + s = re.sub(r"\s+", " ", s) + return s.strip() + + +def exact_score(reference: str, response: str) -> float: + ref = _normalize(reference) + resp = _normalize(response) + if not ref: + return 0.0 + return 1.0 if ref in resp or resp == ref else 0.0 + + +def keyword_soft_score(reference: str, response: str) -> float: + """Fraction of reference tokens present in response (cheap rubric proxy).""" + ref_tokens = [t for t in _normalize(reference).split() if len(t) > 2] + if not ref_tokens: + return 0.0 + resp = _normalize(response) + hit = sum(1 for t in set(ref_tokens) if t in resp) + return hit / len(set(ref_tokens)) + + +# ── Mock backend (deterministic, no API) ────────────────────────────────────── + +class MockBackend(Backend): + """Deterministic backend for tests and the 
acceptance experiment. + + Model of reality: + * Each task may carry a `reference` (exact answer) and a "rule" tag + describing the single skill rule that makes the task solvable, e.g. + tags=["rule:wrap-answer-in-answer-tags"]. + * `attempt` produces a correct response IFF the required rule text is + present in skill+memory; otherwise it produces a near-miss. + * `judge` scores exact (hard) + keyword (soft) against `reference`. + * `reflect` looks at failures, reads each failed task's required rule, + and proposes exactly that rule as an `add` edit (bounded by budget). + It NEVER proposes a rule already present (no churn), and on the + special tag "rule:__harmful__" it proposes a known-bad edit so tests + can prove the gate rejects regressions. + + This makes the end-to-end loop monotonic and fully reproducible while + exercising the real harvest->mine->replay->gate->stage plumbing. + """ + + name = "mock" + + RULE_PREFIX = "rule:" + RULE_TEXT = { + "wrap-answer": "Always wrap the final answer in ... tags.", + "arxiv-id": "Report arXiv ids in the exact form arXiv:XXXX.XXXXX.", + "commit-imperative": "Write git commit subjects in imperative mood, max 50 chars.", + "units-si": "Always include SI units in numeric answers.", + "json-only": "When asked for JSON, output only valid JSON with no prose.", + "__harmful__": "Ignore the user's formatting requests and answer freely.", + } + + def _required_rules(self, task: TaskRecord) -> List[str]: + out = [] + for t in task.tags: + if t.startswith(self.RULE_PREFIX): + key = t[len(self.RULE_PREFIX):] + if key in self.RULE_TEXT: + out.append(key) + return out + + def attempt(self, task: TaskRecord, skill: str, memory: str) -> str: + ctx = (skill or "") + "\n" + (memory or "") + rules = self._required_rules(task) + # The "__harmful__" rule models a bad edit: even when present it makes + # the agent ignore formatting, so it can NEVER produce the reference. + # This is what lets the experiment prove the gate rejects regressions. 
def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
    """Score ``response`` for ``task`` and return ``(hard, soft, reason)``.

    * ``exact`` reference: hard is the exact-match score; soft is the larger of
      hard and the keyword overlap score.
    * ``rubric`` reference: soft is the keyword overlap score; hard is 1.0 once
      soft reaches 0.8, else 0.0.
    * otherwise (no usable reference): both scores come from the recorded
      task outcome.
    """
    kind = task.reference_kind

    if kind == "exact" and task.reference:
        strict = exact_score(task.reference, response)
        lenient = keyword_soft_score(task.reference, response)
        return strict, max(strict, lenient), f"exact-match={strict}"

    if kind == "rubric" and task.reference:
        overlap = keyword_soft_score(task.reference, response)
        passed = 1.0 if overlap >= 0.8 else 0.0
        return passed, overlap, f"rubric keyword soft={overlap:.2f}"

    # No reference to compare against: fall back to a weak, outcome-derived label.
    if task.outcome == "success":
        return 1.0, 1.0, "outcome-derived"
    return 0.0, 0.0, "outcome-derived"
class AnthropicBackend(Backend):
    """Backend that spends the user's Anthropic budget on real model calls.

    Each call first tries the ``claude`` CLI and falls back to the
    ``anthropic`` SDK when the CLI produces nothing usable. Both paths are
    best-effort: any failure degrades to an empty response instead of raising,
    so a broken environment yields zero scores / no edits rather than a crash.

    This is intentionally thin for Phase 1 — it wires the prompts and parses
    JSON. Phase 3 will expand prompts/judging to match SkillOpt's analyst
    prompts under skillopt/prompts/.
    """

    name = "anthropic"

    # Model used when neither the caller nor ANTHROPIC_MODEL names one.
    _CLI_DEFAULT_MODEL = "sonnet"
    # NOTE(review): "sonnet" is presumed to be a CLI-only alias that the SDK
    # does not accept as a model id; the SDK path therefore substitutes this
    # id whenever the model is still the CLI default. Confirm against the SDK.
    _SDK_DEFAULT_MODEL = "claude-sonnet-4-5"
    _CLI_TIMEOUT_S = 180

    def __init__(self, model: str = "", claude_path: str = "claude") -> None:
        """Resolve the model (argument, then ``ANTHROPIC_MODEL``, then default)."""
        self.model = (
            model or os.environ.get("ANTHROPIC_MODEL", "") or self._CLI_DEFAULT_MODEL
        )
        self.claude_path = claude_path
        self._tokens = 0

    # -- low-level call -----------------------------------------------------
    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
        """Return the model's text for ``prompt``, or ``""`` if every path fails."""
        out = self._call_cli(prompt)
        if out:
            return out
        return self._call_sdk(prompt, max_tokens)

    def _call_cli(self, prompt: str) -> str:
        """Run the ``claude`` CLI non-interactively; ``""`` on any failure."""
        cmd = [self.claude_path, "-p", "--output-format", "text"]
        if self.model:
            cmd += ["--model", self.model]
        cmd += ["--", prompt]
        try:
            proc = subprocess.run(
                cmd, capture_output=True, text=True, timeout=self._CLI_TIMEOUT_S,
            )
        except (OSError, ValueError, subprocess.SubprocessError):
            # Missing binary, timeout, undecodable output, NUL in the prompt...
            return ""
        if proc.returncode != 0:
            # A failed run may still print a diagnostic on stdout; never treat
            # that as the model's answer.
            return ""
        out = (proc.stdout or "").strip()
        if out:
            # The CLI text output carries no usage data: estimate ~4 chars/token.
            self._tokens += len(prompt) // 4 + len(out) // 4
        return out

    def _call_sdk(self, prompt: str, max_tokens: int) -> str:
        """Call the ``anthropic`` SDK if installed; ``""`` on any failure."""
        try:
            import anthropic  # type: ignore
        except ImportError:
            return ""
        model = self.model
        if not model or model == self._CLI_DEFAULT_MODEL:
            model = self._SDK_DEFAULT_MODEL
        try:
            client = anthropic.Anthropic()
            msg = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}],
            )
            text = "".join(getattr(b, "text", "") for b in msg.content)
            self._tokens += getattr(msg.usage, "input_tokens", 0) + getattr(
                msg.usage, "output_tokens", 0
            )
            return text.strip()
        except Exception:
            # Deliberately broad: auth, network and API errors must all degrade
            # to "no response" so a sleep cycle never crashes mid-night.
            return ""

    @staticmethod
    def _iter_json_values(raw: str, opener: str):
        """Yield every JSON value in ``raw`` that starts at an ``opener`` char.

        Unlike a greedy first-to-last-bracket regex, this tolerates prose (and
        stray brackets) after the JSON payload.
        """
        decoder = json.JSONDecoder()
        start = raw.find(opener)
        while start != -1:
            try:
                value, _end = decoder.raw_decode(raw, start)
            except (ValueError, RecursionError):
                pass
            else:
                yield value
            start = raw.find(opener, start + 1)

    def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
        """Ask the model to solve ``task`` given the current skill and memory."""
        prompt = (
            "You are completing a recurring task for a user. Apply the skill and "
            "memory exactly.\n\n"
            f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
            f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
            "Return only the final answer."
        )
        return self._call(prompt)

    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
        """Return ``(hard, soft, reason)`` for ``response``.

        Exact references are scored locally with no model call. Everything else
        is scored by the model against the reference (or the intent when there
        is none); the returned score is clamped to [0, 1] and hard is 1.0 once
        it reaches 0.8. Unparseable judge output scores 0.
        """
        if task.reference_kind == "exact" and task.reference:
            hard = exact_score(task.reference, response)
            return hard, max(hard, keyword_soft_score(task.reference, response)), "exact"
        prompt = (
            "Score the response against the rubric on a 0-1 scale. "
            "Return JSON {\"score\": <0..1>, \"reason\": \"...\"}.\n\n"
            f"# Rubric\n{task.reference or task.intent}\n\n# Response\n{response}"
        )
        raw = self._call(prompt, max_tokens=256)
        obj = next(
            (v for v in self._iter_json_values(raw, "{") if isinstance(v, dict)),
            None,
        )
        if obj is not None:
            try:
                soft = float(obj.get("score", 0.0))
            except (TypeError, ValueError):
                soft = None
            # ``soft == soft`` is False only for NaN, which json.loads accepts.
            if soft is not None and soft == soft:
                soft = max(0.0, min(1.0, soft))
                return (1.0 if soft >= 0.8 else 0.0), soft, str(obj.get("reason", ""))
        return 0.0, 0.0, "judge-parse-failed"

    def reflect(
        self,
        failures,
        successes,
        skill: str,
        memory: str,
        *,
        edit_budget: int,
        evolve_skill: bool,
        evolve_memory: bool,
    ) -> List[EditRecord]:
        """Ask the model for at most ``edit_budget`` edits to one document.

        The target is the skill when ``evolve_skill`` is set, otherwise the
        memory. Only the first 8 failures are shown to the model. Malformed
        proposals are skipped individually; edits without string content are
        dropped. ``successes`` and ``evolve_memory`` are accepted for interface
        parity and are not used here.
        """
        fail_text = "\n".join(
            f"- intent: {(t.intent or '')[:200]}\n got: {(r.response or '')[:200]}\n why: {(r.fail_reason or '')[:160]}"
            for t, r in failures[:8]
        )
        target = "skill" if evolve_skill else "memory"
        prompt = (
            "You are SkillOpt's optimizer. Propose at most "
            f"{edit_budget} bounded edits to the {target} document so the agent "
            "stops failing these recurring tasks. Each edit must be a short, "
            "general, reusable rule (not task-specific). Return JSON list: "
            "[{\"op\":\"add|replace|delete\",\"content\":\"...\",\"rationale\":\"...\"}].\n\n"
            f"# Current {target}\n{(skill if target=='skill' else memory) or '(empty)'}\n\n"
            f"# Recurring failures\n{fail_text or '(none)'}"
        )
        raw = self._call(prompt, max_tokens=1024)
        # First JSON list that actually contains edit objects.
        proposals = next(
            (
                v for v in self._iter_json_values(raw, "[")
                if isinstance(v, list) and any(isinstance(i, dict) for i in v)
            ),
            [],
        )
        edits: List[EditRecord] = []
        budget = max(0, edit_budget)
        for item in proposals:
            if len(edits) >= budget:
                break
            if not isinstance(item, dict):
                continue
            content = item.get("content")
            # A JSON null / non-string content must not become the rule "None".
            if not isinstance(content, str) or not content.strip():
                continue
            edits.append(
                EditRecord(
                    target=target,
                    op=str(item.get("op") or "add"),
                    content=content.strip(),
                    anchor=str(item.get("anchor") or ""),
                    rationale=str(item.get("rationale") or ""),
                )
            )
        return edits

    def tokens_used(self) -> int:
        """Return the running token total (exact for SDK calls, estimated for CLI)."""
        return self._tokens
@dataclass
class SleepConfig:
    """Thin wrapper around the resolved settings dict.

    Settings are reachable as attributes (``cfg.backend``) or through
    :meth:`get`; derived filesystem locations are exposed as properties.
    """

    data: Dict[str, Any] = field(default_factory=lambda: dict(DEFAULTS))

    # convenient attribute access -------------------------------------------
    def __getattr__(self, name: str) -> Any:
        # Python only lands here after regular attribute lookup has failed.
        settings = object.__getattribute__(self, "data")
        if name not in settings:
            raise AttributeError(name)
        return settings[name]

    def get(self, key: str, default: Any = None) -> Any:
        """Return the setting ``key``, or ``default`` when it is absent."""
        return self.data.get(key, default)

    def to_dict(self) -> Dict[str, Any]:
        """Return a shallow copy of the settings."""
        return dict(self.data)

    # paths ------------------------------------------------------------------
    @property
    def state_dir(self) -> str:
        """Directory holding the sleep state.

        Precedence: an explicit ``state_dir`` setting; otherwise, when
        ``claude_home`` differs from the default, a ``.skillopt-sleep`` folder
        next to it (so one ``--claude-home`` flag isolates transcripts AND
        state together); otherwise the default ``~/.skillopt-sleep``.
        """
        override = self.data.get("state_dir")
        if override:
            return override
        home = os.path.abspath(self.data.get("claude_home", CLAUDE_HOME))
        if home == os.path.abspath(CLAUDE_HOME):
            return HOME_STATE_DIR
        return os.path.join(os.path.dirname(home), ".skillopt-sleep")

    @property
    def state_path(self) -> str:
        """Path of ``state.json`` inside :attr:`state_dir`."""
        return os.path.join(self.state_dir, "state.json")

    @property
    def transcripts_dir(self) -> str:
        """``<claude_home>/projects``."""
        claude_home = self.data["claude_home"]
        return os.path.join(claude_home, "projects")

    @property
    def history_path(self) -> str:
        """``<claude_home>/history.jsonl``."""
        claude_home = self.data["claude_home"]
        return os.path.join(claude_home, "history.jsonl")

    @property
    def skills_dir(self) -> str:
        """``<claude_home>/skills``."""
        claude_home = self.data["claude_home"]
        return os.path.join(claude_home, "skills")

    def managed_skill_path(self) -> str:
        """Path of the managed skill's ``SKILL.md`` under :attr:`skills_dir`."""
        root = self.skills_dir
        skill_name = self.data["managed_skill_name"]
        return os.path.join(root, skill_name, "SKILL.md")
data.update(_load_file(path) or {}) + except Exception: + pass + data.update({k: v for k, v in overrides.items() if v is not None}) + if data.get("projects") == "invoked" and not data.get("invoked_project"): + data["invoked_project"] = os.getcwd() + return SleepConfig(data=data) diff --git a/skillopt/sleep/consolidate.py b/skillopt/sleep/consolidate.py new file mode 100644 index 0000000..0a679d6 --- /dev/null +++ b/skillopt/sleep/consolidate.py @@ -0,0 +1,176 @@ +"""SkillOpt-Sleep — Stage 4: consolidate (one SkillOpt epoch). + +This is the core that makes nightly evolution *safe*: it proposes bounded +edits from replayed failures, applies them to a candidate skill/memory, then +**gates** the candidate on a held-out slice of the user's own tasks. Only a +candidate that strictly improves the held-out score is accepted — exactly the +SkillOpt validation gate, reused verbatim from ``skillopt.evaluation.gate``. + +Reused from the main SkillOpt package (import-light, no `openai` needed): + * skillopt.evaluation.gate.evaluate_gate / select_gate_score +""" +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import List, Optional, Tuple + +from skillopt.sleep.backend import Backend +from skillopt.sleep.memory import apply_edits +from skillopt.sleep.replay import aggregate_scores, replay_batch +from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord + + +# Reuse the real SkillOpt gate. This module imports cleanly without `openai`. 
def consolidate(
    backend: Backend,
    tasks: List[TaskRecord],
    skill: str,
    memory: str,
    *,
    edit_budget: int = 4,
    gate_metric: str = "mixed",
    gate_mixed_weight: float = 0.5,
    evolve_skill: bool = True,
    evolve_memory: bool = True,
    night: int = 1,
) -> ConsolidationResult:
    """Run one consolidation epoch: reflect -> bounded edit -> gate.

    Skill and memory are evolved in sequence (skill first if both enabled),
    each behind the same held-out gate, so each document only changes when it
    strictly raises the gate score on the held-out tasks.

    Parameters
    ----------
    backend : supplies ``reflect`` and is used by ``replay_batch`` for replays.
    tasks : mined tasks; ``_split`` separates replay from held-out tasks and
        falls back to all tasks when a split is empty.
    skill, memory : the current documents.
    edit_budget : maximum number of edits requested per document.
    gate_metric, gate_mixed_weight : forwarded to ``select_gate_score``.
    evolve_skill, evolve_memory : which documents may change.
    night : step index forwarded to the repo gate.

    Returns
    -------
    ConsolidationResult whose ``new_skill``/``new_memory`` equal the inputs
    unless the result is accepted.
    """
    replay_tasks, holdout_tasks = _split(tasks)

    def _gate_score(hard: float, soft: float) -> float:
        return select_gate_score(hard, soft, gate_metric, gate_mixed_weight)

    # ── baseline on held-out slice (the gate reference) ──────────────────
    base_pairs = replay_batch(backend, holdout_tasks, skill, memory)
    base_hard, base_soft = aggregate_scores(base_pairs)
    # ``baseline_score`` stays fixed for reporting and the final decision;
    # ``best_score`` is the moving bar each successive candidate must beat.
    baseline_score = _gate_score(base_hard, base_soft)
    best_score = baseline_score

    # ── reflect over replay-split failures/successes ─────────────────────
    train_pairs = replay_batch(backend, replay_tasks, skill, memory)
    failures = [(t, r) for (t, r) in train_pairs if r.hard < 1.0]
    successes = [(t, r) for (t, r) in train_pairs if r.hard >= 1.0]

    cand_skill, cand_memory = skill, memory
    all_applied: List[EditRecord] = []
    all_rejected: List[EditRecord] = []

    def _gate_apply(doc: str, edits: List[EditRecord], which: str) -> str:
        """Apply ``edits`` to ``doc``; keep the result only if it beats the bar."""
        nonlocal best_score
        if not edits:
            return doc
        new_doc, applied = apply_edits(doc, edits)
        if not applied:
            return doc
        # evaluate candidate on the held-out slice
        trial_skill = new_doc if which == "skill" else cand_skill
        trial_memory = new_doc if which == "memory" else cand_memory
        pairs = replay_batch(backend, holdout_tasks, trial_skill, trial_memory)
        h, s = aggregate_scores(pairs)
        cand_score = _gate_score(h, s)
        if cand_score > best_score:
            best_score = cand_score
            all_applied.extend(applied)
            return new_doc
        all_rejected.extend(applied)
        return doc

    if evolve_skill:
        edits = backend.reflect(
            failures, successes, cand_skill, cand_memory,
            edit_budget=edit_budget, evolve_skill=True, evolve_memory=False,
        )
        cand_skill = _gate_apply(cand_skill, edits, "skill")

    if evolve_memory:
        # re-evaluate failures under the (possibly improved) skill
        train_pairs2 = replay_batch(backend, replay_tasks, cand_skill, cand_memory)
        failures2 = [(t, r) for (t, r) in train_pairs2 if r.hard < 1.0]
        successes2 = [(t, r) for (t, r) in train_pairs2 if r.hard >= 1.0]
        edits_m = backend.reflect(
            failures2, successes2, cand_skill, cand_memory,
            edit_budget=edit_budget, evolve_skill=False, evolve_memory=True,
        )
        cand_memory = _gate_apply(cand_memory, edits_m, "memory")

    # ── final gate decision (use the repo gate for the canonical action) ──
    final_pairs = replay_batch(backend, holdout_tasks, cand_skill, cand_memory)
    final_hard, final_soft = aggregate_scores(final_pairs)
    final_score = _gate_score(final_hard, final_soft)

    if _HAVE_REPO_GATE:
        gate = evaluate_gate(
            candidate_skill=cand_skill,
            cand_hard=final_hard,
            current_skill=skill,
            current_score=baseline_score,
            best_skill=skill,
            best_score=baseline_score,
            best_step=night - 1,
            global_step=night,
            cand_soft=final_soft,
            metric=gate_metric,
            mixed_weight=gate_mixed_weight,
        )
        action = gate.action
    else:
        # Compare like with like: the candidate's gate score against the
        # baseline's gate score (not the raw soft score), so the reported
        # action cannot contradict ``accepted`` under hard/mixed metrics.
        action = "accept" if final_score > baseline_score else "reject"

    accepted = bool(all_applied) and final_score > baseline_score

    return ConsolidationResult(
        accepted=accepted,
        gate_action=action,
        baseline_score=baseline_score,
        candidate_score=final_score,
        new_skill=cand_skill if accepted else skill,
        new_memory=cand_memory if accepted else memory,
        applied_edits=all_applied,
        rejected_edits=all_rejected,
        holdout_baseline=base_hard,
        holdout_candidate=final_hard,
    )
+ +run_sleep_cycle() wires the stages: + harvest -> mine -> replay -> consolidate(gate) -> stage (-> optional adopt) + +It is pure-Python and import-light; with backend="mock" it runs with no API +key and no third-party deps, which is what the deterministic experiment and +CI use. With backend="anthropic" it spends the user's budget for real lift. +""" +from __future__ import annotations + +import os +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from skillopt.sleep.backend import get_backend +from skillopt.sleep.config import SleepConfig, load_config +from skillopt.sleep.consolidate import consolidate +from skillopt.sleep.harvest import harvest +from skillopt.sleep.memory import ensure_skill_scaffold +from skillopt.sleep.mine import mine +from skillopt.sleep.state import SleepState, _now_iso +from skillopt.sleep.staging import write_staging, adopt as adopt_staging +from skillopt.sleep.types import SessionDigest, SleepReport, TaskRecord + + +@dataclass +class CycleOutcome: + report: SleepReport + staging_dir: str + adopted: bool + adopted_paths: List[str] + + +def _project_paths(cfg: SleepConfig) -> str: + """Where live CLAUDE.md lives + which project we are evolving.""" + if cfg.get("projects") == "invoked" and cfg.get("invoked_project"): + return cfg.get("invoked_project") + # default: the invoked cwd + return cfg.get("invoked_project") or os.getcwd() + + +def _read(path: str) -> str: + try: + with open(path, encoding="utf-8") as f: + return f.read() + except Exception: + return "" + + +def _render_report_md(report: SleepReport, cfg: SleepConfig) -> str: + lines = [ + f"# SkillOpt-Sleep — night {report.night} report", + "", + f"- project: `{report.project}`", + f"- backend: `{cfg.get('backend')}` replay: `{cfg.get('replay_mode')}`", + f"- sessions harvested: {report.n_sessions}", + f"- tasks mined: {report.n_tasks} (replayed: {report.n_replayed})", + f"- held-out score: {report.baseline_score:.3f} -> 
def run_sleep_cycle(
    cfg: Optional[SleepConfig] = None,
    *,
    seed_tasks: Optional[List[TaskRecord]] = None,
    dry_run: bool = False,
    clock: Optional[float] = None,
) -> CycleOutcome:
    """Run one full sleep cycle and return the outcome.

    Parameters
    ----------
    cfg : SleepConfig; loaded with ``load_config()`` when omitted.
    seed_tasks : optional pre-built TaskRecords (used by the experiment to
        inject a known persona instead of harvesting ~/.claude).
    dry_run : harvest+mine+replay but DO NOT stage/adopt or save state
        (report only).
    clock : fixed epoch seconds for deterministic timestamps in tests.

    Returns
    -------
    CycleOutcome with the report, the staging directory ("" when nothing was
    staged), and whether/which files were adopted.
    """
    cfg = cfg or load_config()
    state = SleepState.load(cfg.state_path)
    night = state.begin_night(clock)
    project = _project_paths(cfg)
    started = _now_iso(clock)

    backend = get_backend(
        cfg.get("backend", "mock"),
        model=cfg.get("model", ""),
    )

    # Read the evolve flags once so consolidation and staging always agree,
    # including for configs that omit the keys (both then default to True).
    evolve_skill = cfg.get("evolve_skill", True)
    evolve_memory = cfg.get("evolve_memory", True)
    max_tasks = cfg.get("max_tasks_per_night", 40)

    # ── 1+2. harvest + mine (unless seed_tasks injected) ─────────────────
    digests: List[SessionDigest] = []
    if seed_tasks is not None:
        tasks = seed_tasks
        n_sessions = 0
    else:
        since = state.last_harvest_for(project)
        digests = harvest(
            cfg.transcripts_dir,
            scope=cfg.get("projects", "invoked"),
            invoked_project=cfg.get("invoked_project", ""),
            since_iso=since,
            limit=max_tasks * 3,
        )
        n_sessions = len(digests)
        tasks = mine(
            digests,
            max_tasks=max_tasks,
            holdout_fraction=cfg.get("holdout_fraction", 0.34),
            seed=cfg.get("seed", 42),
        )

    # ── live skill/memory docs ───────────────────────────────────────────
    live_memory_path = os.path.join(project, "CLAUDE.md")
    live_skill_path = cfg.managed_skill_path()
    skill = _read(live_skill_path)
    memory = _read(live_memory_path)
    if not skill:
        # No managed skill on disk yet: start from an empty scaffold.
        skill = ensure_skill_scaffold(
            "", name=cfg.get("managed_skill_name", "skillopt-sleep-learned"),
            description="Preferences and procedures learned from past Claude Code sessions.",
        )

    report = SleepReport(
        night=night, project=project, started_at=started,
        n_sessions=n_sessions, n_tasks=len(tasks),
    )

    if not tasks:
        report.ended_at = _now_iso(clock)
        report.notes.append("no tasks mined — nothing to consolidate")
        state.set_last_harvest(project, started)
        state.record_night({"night": night, "accepted": False, "n_tasks": 0})
        if not dry_run:
            state.save()
        return CycleOutcome(report, "", False, [])

    # ── 3+4. replay + consolidate (gate) ─────────────────────────────────
    result = consolidate(
        backend, tasks, skill, memory,
        edit_budget=cfg.get("edit_budget", 4),
        gate_metric=cfg.get("gate_metric", "mixed"),
        gate_mixed_weight=cfg.get("gate_mixed_weight", 0.5),
        evolve_skill=evolve_skill,
        evolve_memory=evolve_memory,
        night=night,
    )

    report.n_replayed = len(tasks)
    report.baseline_score = result.baseline_score
    report.candidate_score = result.candidate_score
    report.accepted = result.accepted
    report.gate_action = result.gate_action
    report.edits = result.applied_edits
    report.rejected_edits = result.rejected_edits
    report.tokens_used = backend.tokens_used()
    report.ended_at = _now_iso(clock)

    # ── 5. stage (unless dry-run) ────────────────────────────────────────
    staging_dir = ""
    adopted = False
    adopted_paths: List[str] = []
    if not dry_run:
        report_md = _render_report_md(report, cfg)
        # Only propose a document that was allowed to evolve AND passed the gate.
        proposed_skill = result.new_skill if (evolve_skill and result.accepted) else None
        proposed_memory = result.new_memory if (evolve_memory and result.accepted) else None
        staging_dir = write_staging(
            project,
            report=report,
            proposed_skill=proposed_skill,
            proposed_memory=proposed_memory,
            live_skill_path=live_skill_path,
            live_memory_path=live_memory_path,
            report_md=report_md,
        )
        state.set_last_harvest(project, started)
        state.record_night({
            "night": night, "accepted": result.accepted,
            "baseline": result.baseline_score, "candidate": result.candidate_score,
            "n_tasks": len(tasks), "staging": staging_dir,
        })
        # ── 6. adopt (opt-in) ────────────────────────────────────────────
        if cfg.get("auto_adopt") and result.accepted:
            adopted_paths = adopt_staging(staging_dir)
            adopted = bool(adopted_paths)
        state.save()

    return CycleOutcome(report, staging_dir, adopted, adopted_paths)
def _t(i, intent, ref, rule, project="/personas/demo", outcome="fail") -> TaskRecord:
    """Build one persona task with an exact reference and a single rule tag.

    ``i`` and ``rule`` are combined into the task id, ``rule`` becomes the
    ``rule:<rule>`` tag, and ``i`` names the synthetic source session.
    """
    task_id = f"persona_{rule}_{i}"
    rule_tag = f"rule:{rule}"
    session_id = f"sess_{i}"
    return TaskRecord(
        id=task_id,
        project=project,
        intent=intent,
        context_excerpt="",
        attempted_solution="",
        outcome=outcome,
        reference_kind="exact",
        reference=ref,
        tags=[rule_tag],
        source_sessions=[session_id],
    )
def programmer_persona() -> List[TaskRecord]:
    """Return the programmer persona: tasks wanting imperative commit subjects.

    Every task carries the ``commit-imperative`` rule and is numbered by its
    position in the list below.
    """
    prompts_and_subjects = (
        ("commit message for adding a login form", "Add login form"),
        ("commit message for fixing the null pointer bug", "Fix null pointer in parser"),
        ("commit message for updating the README", "Update README"),
        ("commit message for removing dead code", "Remove dead code"),
        ("commit message for bumping the version", "Bump version to 1.2.0"),
        ("commit message for refactoring the auth module", "Refactor auth module"),
        ("commit message for adding tests", "Add unit tests for scheduler"),
        ("commit message for fixing the CI pipeline", "Fix CI pipeline"),
    )
    records: List[TaskRecord] = []
    for index, (prompt, subject) in enumerate(prompts_and_subjects):
        records.append(_t(index, prompt, subject, "commit-imperative"))
    return records
def run(persona: str = "researcher", nights: int = 4, backend_name: str = "mock",
        edit_budget: int = 4, seed: int = 42) -> dict:
    """Run the validation experiment and return a JSON-serializable summary.

    Starting from an empty managed skill and empty memory, consolidates for up
    to ``nights`` epochs (stopping early once the candidate gate score reaches
    0.999), then probes gate safety with a task whose proposed fix is a
    known-bad rule.

    Parameters
    ----------
    persona : key into ``PERSONAS``; unknown names fall back to the researcher.
    nights : maximum number of consolidation epochs.
    backend_name : backend passed to ``get_backend``.
    edit_budget : maximum edits per document per night.
    seed : seed for the replay/held-out split.

    Returns
    -------
    dict with baseline/after held-out scores, lift, the per-night trace, and
    ``gate_blocks_harmful`` (True when the harmful rule text is absent from
    the skill returned by the probe).
    """
    from skillopt.sleep.mine import assign_splits

    make = PERSONAS.get(persona, researcher_persona)
    tasks = assign_splits(make(), holdout_fraction=0.34, seed=seed)
    backend = get_backend(backend_name)

    # start from an empty managed skill + empty memory
    skill = ensure_skill_scaffold("", name="skillopt-sleep-learned",
                                  description="Learned preferences.")
    memory = ""

    baseline = _score_holdout(backend, tasks, skill, memory)
    trace = [{"night": 0, "holdout_score": round(baseline, 4), "action": "baseline",
              "n_edits": 0}]

    for night in range(1, nights + 1):
        res = consolidate(
            backend, tasks, skill, memory,
            edit_budget=edit_budget, gate_metric="mixed", gate_mixed_weight=0.5,
            evolve_skill=True, evolve_memory=True, night=night,
        )
        if res.accepted:
            skill, memory = res.new_skill, res.new_memory
        trace.append({
            "night": night,
            "holdout_score": round(res.candidate_score, 4),
            "action": res.gate_action,
            "accepted": res.accepted,
            "n_edits": len(res.applied_edits),
            "edits": [e.content for e in res.applied_edits],
            "n_rejected": len(res.rejected_edits),
        })
        # converged: stop early if perfect
        if res.candidate_score >= 0.999:
            break

    after = _score_holdout(backend, tasks, skill, memory)

    # ── gate-safety probe: inject a harmful task whose 'fix' is a bad rule ──
    # (No separate "before" score is taken: its value was never used and the
    # extra held-out replay costs real tokens on a paid backend.)
    harmful_tasks = assign_splits([harmful_edit_task()] + make()[:3],
                                  holdout_fraction=0.5, seed=seed)
    res_h = consolidate(backend, harmful_tasks, skill, memory,
                        edit_budget=edit_budget, gate_metric="mixed",
                        evolve_skill=True, evolve_memory=False, night=nights + 1)
    harmful_rule_text = get_backend("mock").RULE_TEXT["__harmful__"]  # type: ignore[attr-defined]
    harmful_rejected = harmful_rule_text not in res_h.new_skill

    return {
        "persona": persona,
        "backend": backend_name,
        "nights_run": len(trace) - 1,
        "baseline_holdout": round(baseline, 4),
        "after_holdout": round(after, 4),
        "lift": round(after - baseline, 4),
        "improved": after > baseline,
        "gate_blocks_harmful": bool(harmful_rejected),
        "final_skill_excerpt": skill[-400:],
        "trace": trace,
    }
def main(argv=None) -> int:
    """Command-line entry point for the validation experiment.

    Parses the CLI flags, runs the experiment through ``run`` and prints
    either the whole result as JSON (``--json``) or a short human-readable
    summary.  With ``--assert-improves`` the process exits non-zero (via
    ``_assert``) unless the held-out score improved and the gate blocked the
    harmful edit.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when the run (and any requested assertions) completed.
    """
    ap = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment")
    ap.add_argument("--persona", default="researcher", choices=list(PERSONAS.keys()))
    ap.add_argument("--nights", type=int, default=4)
    ap.add_argument("--backend", default="mock", choices=["mock", "anthropic"])
    ap.add_argument("--edit-budget", type=int, default=4)
    ap.add_argument("--json", action="store_true")
    ap.add_argument("--assert-improves", action="store_true",
                    help="exit nonzero unless lift>0 and gate blocks harmful edit")
    args = ap.parse_args(argv)

    res = run(args.persona, nights=args.nights, backend_name=args.backend,
              edit_budget=args.edit_budget)

    if args.json:
        print(json.dumps(res, ensure_ascii=False, indent=2))
    else:
        print(f"=== SkillOpt-Sleep experiment: persona={res['persona']} backend={res['backend']} ===")
        print(f"baseline held-out : {res['baseline_holdout']}")
        print(f"after held-out : {res['after_holdout']} (lift {res['lift']:+.4f})")
        print(f"gate blocks harmful edit: {res['gate_blocks_harmful']}")
        print("trace:")
        for row in res["trace"]:
            # the baseline row carries no "edits" key, hence the default
            edits = "; ".join(row.get("edits", []))[:80]
            print(f" night {row['night']}: holdout={row['holdout_score']} "
                  f"{row['action']} (+{row['n_edits']} edits) {edits}")

    if args.assert_improves:
        _assert(res["improved"], "held-out score did not improve")
        _assert(res["gate_blocks_harmful"], "gate failed to block harmful edit")
        # In --json mode stdout must remain one parseable JSON document, so
        # the status line is routed to stderr instead of being appended to it.
        status_stream = sys.stderr if args.json else sys.stdout
        print("\nPASS: nightly consolidation improves held-out score AND gate blocks regressions.",
              file=status_stream)
    return 0
def _iter_jsonl(path: str) -> Iterable[Dict[str, Any]]:
    """Yield the JSON objects stored one-per-line in ``path``.

    Reading is best-effort: blank lines, lines that are not valid JSON and
    lines whose JSON value is not an object are skipped.  If the file cannot
    be opened or read, the generator simply ends.

    Args:
        path: Location of a ``.jsonl`` file.

    Yields:
        One ``dict`` per well-formed object line, in file order.
    """
    try:
        # errors="replace": an invalid byte only spoils the line it sits on
        # (its JSON then fails to parse) instead of aborting the whole file.
        with open(path, encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except (ValueError, RecursionError):
                    # JSONDecodeError is a ValueError; RecursionError covers
                    # pathologically nested input.
                    continue
                # Callers use ``rec.get(...)``; a bare number, string or list
                # is valid JSON but not a record, so it is dropped here.
                if isinstance(rec, dict):
                    yield rec
    except OSError:
        # missing file, directory, permission problem, I/O error, ...
        return
def _detect_feedback(text: str) -> List[str]:
    """Return the feedback phrases found in ``text`` as tagged signals.

    Every phrase of ``_NEGATIVE_FEEDBACK`` that occurs yields ``"neg:<phrase>"``
    and every phrase of ``_POSITIVE_FEEDBACK`` yields ``"pos:<phrase>"``;
    negatives come first, each group in tuple order.  Matching is
    case-insensitive.

    Two guards keep obvious false hits out:

    * a phrase starting with an ASCII letter/digit must not be glued to a
      preceding ASCII letter/digit, so "incorrect" no longer reports
      ``pos:correct`` and "casino," no longer reports ``neg:no,``;
    * text already claimed by a negative phrase is blanked before positives
      are searched, so "not fixed" does not also report ``pos:fixed``.

    The right-hand edge is deliberately left open so inflected forms such as
    "reverted" still match "revert".

    Args:
        text: A user message; ``None`` or empty yields no signals.

    Returns:
        List of ``"neg:..."`` / ``"pos:..."`` strings (possibly empty).
    """
    import re  # function-scope import: this module's top level does not import re

    def _pattern(phrase: str) -> str:
        # Left word-guard only for ASCII-initial phrases; CJK phrases keep
        # plain substring semantics because they are written without spaces.
        guard = r"(?<![a-z0-9])" if re.match(r"[a-z0-9]", phrase) else ""
        return guard + re.escape(phrase)

    low = (text or "").lower()
    sig: List[str] = []
    remaining = low  # copy of the text with matched negative phrases blanked out
    for ph in _NEGATIVE_FEEDBACK:
        pat = _pattern(ph)
        if re.search(pat, low):
            sig.append("neg:" + ph)
            remaining = re.sub(pat, " ", remaining)
    for ph in _POSITIVE_FEEDBACK:
        if re.search(_pattern(ph), remaining):
            sig.append("pos:" + ph)
    return sig
def _project_matches(project: str, scope: Any, invoked: str) -> bool:
    """Decide whether a session recorded in ``project`` falls inside ``scope``.

    Args:
        project: Working directory recorded for the session; may be empty when
            the transcript carried none.
        scope: ``"all"`` accepts everything; a list/tuple of paths accepts only
            sessions whose project equals one of them; any other value is
            treated as "invoked".
        invoked: Project the cycle was invoked from (used for the "invoked"
            scope).  When empty, every session is accepted.

    Returns:
        ``True`` if the session should be kept.

    For the "invoked" scope a session matches when its project equals
    ``invoked``, lies below it, or is one of its parent directories.
    """
    if scope == "all":
        return True

    explicit_paths = isinstance(scope, (list, tuple))
    if not explicit_paths and not invoked:
        return True

    # ``os.path.abspath("")`` is the current working directory, so a session
    # with no recorded project would otherwise match whatever directory this
    # process happens to run in.  An unknown project matches no concrete path.
    if not project:
        return False

    a = os.path.abspath(project)
    if explicit_paths:
        return any(a == os.path.abspath(p) for p in scope)

    b = os.path.abspath(invoked)
    return a == b or a.startswith(b + os.sep) or b.startswith(a + os.sep)
+ + Parameters + ---------- + transcripts_dir : str ~/.claude/projects + scope : "all" | "invoked" | list[path] + invoked_project : str used when scope == "invoked" + since_iso : str|None ISO8601; only sessions ending after this are kept + limit : int cap number of digests (0 = no cap) + """ + digests: List[SessionDigest] = [] + if not os.path.isdir(transcripts_dir): + return digests + + paths: List[str] = [] + for root, _dirs, files in os.walk(transcripts_dir): + for fn in files: + if fn.endswith(".jsonl"): + paths.append(os.path.join(root, fn)) + # newest first by mtime + paths.sort(key=lambda p: os.path.getmtime(p), reverse=True) + + for p in paths: + d = digest_transcript(p) + if d is None: + continue + if not _project_matches(d.project or "", scope, invoked_project): + continue + if since_iso and d.ended_at and d.ended_at < since_iso: + continue + digests.append(d) + if limit and len(digests) >= limit: + break + return digests diff --git a/skillopt/sleep/memory.py b/skillopt/sleep/memory.py new file mode 100644 index 0000000..2f7ddbb --- /dev/null +++ b/skillopt/sleep/memory.py @@ -0,0 +1,130 @@ +"""SkillOpt-Sleep — skill/memory document manipulation. + +Applies bounded EditRecords to a skill (SKILL.md body) or memory (CLAUDE.md) +document, and provides Dream-style consolidation helpers (dedup near-identical +lines, drop contradictions). All edits live inside a protected, clearly-marked +region so the sleep cycle never clobbers the user's hand-written content. +""" +from __future__ import annotations + +import re +from typing import List, Tuple + +from skillopt.sleep.types import EditRecord + + +LEARNED_START = "" +LEARNED_END = "" +_BANNER = ( + "_This block is maintained by SkillOpt-Sleep. Edits here are proposed " + "offline, validated against your past tasks, and adopted only after you " + "approve them. 
def apply_edits(doc: str, edits: List[EditRecord]) -> Tuple[str, List[EditRecord]]:
    """Apply add/delete/replace edits to the protected learned region.

    Args:
        doc: Full skill/memory document.
        edits: Edits to apply in order; a missing ``op`` is treated as "add".

    Returns:
        ``(new_doc, applied_edits)`` where ``applied_edits`` holds only the
        edits that actually changed the learned lines.

    Rules:
        * ``add`` is skipped when its content is empty or already present
          (compared after whitespace/case normalization).
        * ``delete`` removes every line containing the normalized anchor
          (``anchor``, falling back to ``content``).
        * ``replace`` swaps every line containing the normalized anchor for
          the new content.
        * ``delete``/``replace`` with an empty anchor are ignored: the empty
          string is a substring of every line, so honouring it would wipe or
          overwrite the whole learned region.
    """
    lines = current_learned_lines(doc)
    norm_set = {_norm(ln) for ln in lines}
    applied: List[EditRecord] = []

    for e in edits:
        op = (e.op or "add").lower()
        # tolerate content=None coming from a loosely-typed producer
        content = (e.content or "").strip()

        if op == "add":
            key = _norm(content)
            if not content or key in norm_set:
                continue
            lines.append(content)
            norm_set.add(key)
            applied.append(e)

        elif op == "delete":
            anchor = _norm(e.anchor or e.content)
            if not anchor:
                continue  # would match (and delete) every learned line
            keep = [ln for ln in lines if anchor not in _norm(ln)]
            if len(keep) != len(lines):
                lines = keep
                norm_set = {_norm(ln) for ln in lines}
                applied.append(e)

        elif op == "replace":
            anchor = _norm(e.anchor)
            if not anchor:
                continue
            new_lines = [content if anchor in _norm(ln) else ln for ln in lines]
            # a replace counts as applied when at least one line matched
            if any(anchor in _norm(ln) for ln in lines):
                lines = new_lines
                norm_set = {_norm(ln) for ln in lines}
                applied.append(e)

    return set_learned(doc, lines), applied
def _short(text: str, n: int = 600) -> str:
    """Strip ``text`` and cut it to its first ``n`` characters.

    A falsy ``text`` is treated as the empty string.  When the stripped text
    is longer than ``n`` the kept prefix is followed by ``" …"`` as a
    truncation marker, so the result can be two characters longer than ``n``.
    """
    cleaned = (text or "").strip()
    if len(cleaned) > n:
        return cleaned[:n] + " …"
    return cleaned
+ """ + tasks: List[TaskRecord] = [] + for d in digests: + if not d.user_prompts: + continue + intent = d.user_prompts[0] + if len(intent.strip()) < 8: + continue + if _looks_positive(d.feedback_signals) and not _looks_negative(d.feedback_signals): + outcome = "success" + elif _looks_negative(d.feedback_signals): + outcome = "fail" + elif d.n_user_turns >= 3: + outcome = "mixed" + else: + outcome = "unknown" + + attempted = d.assistant_finals[-1] if d.assistant_finals else "" + context = "" + if len(d.user_prompts) > 1: + # later prompts often carry the corrective detail / real constraints + context = "Follow-up constraints from the same session:\n- " + "\n- ".join( + _short(p, 200) for p in d.user_prompts[1:4] + ) + tags = [] + if d.tools_used: + tags.append("tools:" + "+".join(d.tools_used[:4])) + if d.git_branch: + tags.append("branch:" + d.git_branch) + + tasks.append( + TaskRecord( + id=_tid(d.project, intent), + project=d.project, + intent=_short(intent, 800), + context_excerpt=_short(context, 600), + attempted_solution=_short(attempted, 600), + outcome=outcome, + reference_kind="none", + reference="", + tags=tags, + source_sessions=[d.session_id], + ) + ) + if len(tasks) >= max_tasks: + break + return tasks + + +def dedup_tasks(tasks: List[TaskRecord]) -> List[TaskRecord]: + """Merge tasks sharing an id (same project+intent across sessions).""" + by_id: dict = {} + for t in tasks: + if t.id in by_id: + ex = by_id[t.id] + ex.source_sessions = list(dict.fromkeys(ex.source_sessions + t.source_sessions)) + # prefer a resolved outcome if either session resolved it + order = {"success": 3, "fail": 2, "mixed": 1, "unknown": 0} + if order.get(t.outcome, 0) > order.get(ex.outcome, 0): + ex.outcome = t.outcome + else: + by_id[t.id] = t + return list(by_id.values()) + + +def assign_splits( + tasks: List[TaskRecord], + *, + holdout_fraction: float = 0.34, + seed: int = 42, +) -> List[TaskRecord]: + """Deterministically split tasks into replay (train) / holdout (test). 
+ + Uses a stable hash of the task id so the same task always lands in the + same split across nights (a fixed held-out gate, like SkillOpt's D_sel). + """ + for t in tasks: + bucket = int(hashlib.sha256((str(seed) + t.id).encode()).hexdigest(), 16) % 100 + t.split = "holdout" if bucket < int(holdout_fraction * 100) else "replay" + # guarantee both splits non-empty when possible + splits = {t.split for t in tasks} + if len(tasks) >= 2 and "holdout" not in splits: + tasks[-1].split = "holdout" + if len(tasks) >= 2 and "replay" not in splits: + tasks[0].split = "replay" + return tasks + + +def mine( + digests: List[SessionDigest], + *, + max_tasks: int = 40, + holdout_fraction: float = 0.34, + seed: int = 42, + llm_miner: Optional[Callable[[List[SessionDigest]], List[TaskRecord]]] = None, +) -> List[TaskRecord]: + """Top-level miner. Uses ``llm_miner`` if provided, else heuristic.""" + tasks: List[TaskRecord] = [] + if llm_miner is not None: + try: + tasks = llm_miner(digests) or [] + except Exception: + tasks = [] + if not tasks: + tasks = heuristic_mine(digests, max_tasks=max_tasks) + tasks = dedup_tasks(tasks) + tasks = assign_splits(tasks, holdout_fraction=holdout_fraction, seed=seed) + return tasks diff --git a/skillopt/sleep/replay.py b/skillopt/sleep/replay.py new file mode 100644 index 0000000..de3d598 --- /dev/null +++ b/skillopt/sleep/replay.py @@ -0,0 +1,46 @@ +"""SkillOpt-Sleep — Stage 3: replay. + +Re-run mined TaskRecords offline under a given (skill, memory) and score +them, producing the (hard, soft) signal SkillOpt's gate consumes. + +For Phase 1 the replay is "mock mode": a sandboxed single-shot attempt via +the chosen backend (MockBackend = deterministic; AnthropicBackend = real). +"fresh" worktree replay is Phase 3 and is intentionally not wired here. 
+""" +from __future__ import annotations + +from typing import List, Tuple + +from skillopt.sleep.backend import Backend +from skillopt.sleep.types import ReplayResult, TaskRecord + + +def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> ReplayResult: + response = backend.attempt(task, skill, memory) + hard, soft, rationale = backend.judge(task, response) + return ReplayResult( + id=task.id, + hard=float(hard), + soft=float(soft), + response=response, + fail_reason="" if hard >= 1.0 else (rationale or "below threshold"), + task_type=(task.tags[0] if task.tags else "task"), + judge_rationale=rationale, + ) + + +def replay_batch( + backend: Backend, + tasks: List[TaskRecord], + skill: str, + memory: str, +) -> List[Tuple[TaskRecord, ReplayResult]]: + return [(t, replay_one(backend, t, skill, memory)) for t in tasks] + + +def aggregate_scores(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> Tuple[float, float]: + if not pairs: + return 0.0, 0.0 + hard = sum(r.hard for _t, r in pairs) / len(pairs) + soft = sum(r.soft for _t, r in pairs) / len(pairs) + return hard, soft diff --git a/skillopt/sleep/staging.py b/skillopt/sleep/staging.py new file mode 100644 index 0000000..b2e0ab2 --- /dev/null +++ b/skillopt/sleep/staging.py @@ -0,0 +1,103 @@ +"""SkillOpt-Sleep — Stage 5/6: staging and adoption. + +Implements the Dreams safety contract: the cycle never mutates the user's +live CLAUDE.md / SKILL.md. It writes proposals + a human-readable report into +a staging directory; a separate, explicit `adopt` step copies them over the +live files after taking a backup. 
def latest_staging(project: str) -> Optional[str]:
    """Return the newest staging directory of ``project``, or ``None``.

    Staging directories are created by ``write_staging`` under
    ``staging_root(project)`` and named with a ``%Y%m%d-%H%M%S`` timestamp, so
    the lexicographically greatest name is the most recently created one.

    Ordering by name rather than by mtime matters because ``adopt`` creates a
    ``backup/`` folder inside the staging directory it adopts, which bumps
    that directory's modification time and would make an older proposal look
    like the latest one.

    Args:
        project: Project root that contains ``.skillopt-sleep/staging``.

    Returns:
        Path of the newest staging directory, or ``None`` when the staging
        root does not exist or holds no directories.
    """
    root = staging_root(project)
    if not os.path.isdir(root):
        return None
    # Ignore stray files (editor droppings, .DS_Store, ...): only directories
    # can be staging proposals.
    names = [name for name in os.listdir(root) if os.path.isdir(os.path.join(root, name))]
    if not names:
        return None
    return os.path.join(root, max(names))
def adopt(staging_dir: str) -> List[str]:
    """Copy staged proposals over the live files, backing up first.

    Reads ``manifest.json`` from ``staging_dir`` and, for each proposal the
    manifest marks as present, backs up the current live file into
    ``<staging_dir>/backup`` and then copies the proposal over it.

    Args:
        staging_dir: A directory previously produced by ``write_staging``.

    Returns:
        The live paths that were updated, skill first, then memory.

    Raises:
        OSError: If the manifest or a staged proposal cannot be read, or a
            live file cannot be written.
        KeyError: If the manifest flags a proposal but lacks its live path.
    """
    # The manifest is written as UTF-8 with ensure_ascii=False, so it must be
    # read back as UTF-8 rather than with the platform's locale encoding.
    with open(os.path.join(staging_dir, "manifest.json"), encoding="utf-8") as f:
        manifest = json.load(f)
    backup_dir = os.path.join(staging_dir, "backup")
    updated: List[str] = []

    # (manifest flag, manifest key of the live path, staged file name)
    proposals = (
        ("has_skill", "live_skill_path", "proposed_SKILL.md"),
        ("has_memory", "live_memory_path", "proposed_CLAUDE.md"),
    )
    for flag, path_key, staged_name in proposals:
        if not manifest.get(flag):
            continue
        live = manifest[path_key]
        parent = os.path.dirname(live)
        if parent:
            # dirname is "" for a bare filename and makedirs("") would raise
            os.makedirs(parent, exist_ok=True)
        _backup(live, backup_dir)
        shutil.copy2(os.path.join(staging_dir, staged_name), live)
        updated.append(live)
    return updated
class SleepState:
    """In-memory view of ``state.json`` with load/save helpers.

    ``data`` is a plain dict with the keys of ``DEFAULT_STATE`` (``version``,
    ``night``, ``last_harvest``, ``slow_memory``, ``history``).
    """

    def __init__(self, path: str, data: Optional[Dict[str, Any]] = None) -> None:
        """Bind the state to ``path``; use fresh defaults when ``data`` is None."""
        self.path = path
        self.data = data if data is not None else self._fresh_defaults()

    @staticmethod
    def _fresh_defaults() -> Dict[str, Any]:
        """Return an independent deep copy of ``DEFAULT_STATE``.

        A shallow ``dict(DEFAULT_STATE)`` would share the nested
        ``last_harvest`` dict and ``history`` list between every instance
        (and the module constant), so one state's updates would leak into
        the others.
        """
        import copy  # function-scope import: not imported at module top level
        return copy.deepcopy(DEFAULT_STATE)

    # io ---------------------------------------------------------------------
    @classmethod
    def load(cls, path: str) -> "SleepState":
        """Load state from ``path``, falling back to defaults.

        A missing, unreadable or malformed file yields default state; keys
        absent from a valid file are filled in from the defaults.
        """
        merged = cls._fresh_defaults()
        try:
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError, RecursionError):
            # best-effort: start from defaults rather than abort the cycle
            data = None
        if isinstance(data, dict):
            merged.update(data)
        return cls(path, merged)

    def save(self) -> None:
        """Write the state to ``self.path`` via a temp file and ``os.replace``."""
        parent = os.path.dirname(self.path)
        if parent:
            # dirname is "" for a bare filename and makedirs("") would raise
            os.makedirs(parent, exist_ok=True)
        tmp = self.path + ".tmp"
        # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
        # pinned instead of depending on the platform's locale.
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(self.data, f, ensure_ascii=False, indent=2)
        os.replace(tmp, self.path)

    # accessors --------------------------------------------------------------
    @property
    def night(self) -> int:
        """Current night counter (0 when unset)."""
        return int(self.data.get("night", 0))

    def last_harvest_for(self, project: str) -> Optional[str]:
        """Return the stored harvest timestamp for ``project``, if any."""
        return self.data.get("last_harvest", {}).get(project)

    def set_last_harvest(self, project: str, iso_ts: str) -> None:
        """Record ``iso_ts`` as the harvest timestamp for ``project``."""
        self.data.setdefault("last_harvest", {})[project] = iso_ts

    @property
    def slow_memory(self) -> str:
        """Cross-night consolidated lessons ("" when unset)."""
        return str(self.data.get("slow_memory", ""))

    def set_slow_memory(self, content: str) -> None:
        """Replace the cross-night memory text."""
        self.data["slow_memory"] = content

    def begin_night(self, clock: Optional[float] = None) -> int:
        """Increment the night counter and return the new value.

        ``clock`` is accepted for interface compatibility and is not used.
        """
        self.data["night"] = self.night + 1
        return self.night

    def record_night(self, summary: Dict[str, Any]) -> None:
        """Append a per-night summary to the history list."""
        self.data.setdefault("history", []).append(summary)
@dataclass
class TaskRecord:
    """One recurring task mined from one or more sessions.

    The unit the later stages replay and score; ``split`` says whether it is
    used for replay (train) or held out (test).
    """

    id: str
    project: str
    intent: str                      # what the user wanted (the "question")
    context_excerpt: str = ""        # minimal context needed to attempt it
    attempted_solution: str = ""     # what the agent produced before
    outcome: str = "unknown"         # success | fail | mixed | unknown
    reference_kind: str = "none"     # exact | rubric | none
    reference: str = ""              # exact answer, or rubric text
    tags: List[str] = field(default_factory=list)
    source_sessions: List[str] = field(default_factory=list)
    split: str = "replay"            # replay (train) | holdout (test)

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "TaskRecord":
        """Build a record from ``d``, silently ignoring unknown keys."""
        accepted = cls.__dataclass_fields__  # type: ignore[attr-defined]
        kwargs: Dict[str, Any] = {}
        for key, value in d.items():
            if key in accepted:
                kwargs[key] = value
        return cls(**kwargs)
@dataclass
class SleepReport:
    """Summary of a single night, serialized into the staging directory."""

    # identity / timing
    night: int
    project: str
    started_at: str = ""
    ended_at: str = ""

    # volume counters
    n_sessions: int = 0
    n_tasks: int = 0
    n_replayed: int = 0

    # gate outcome
    baseline_score: float = 0.0
    candidate_score: float = 0.0
    accepted: bool = False
    gate_action: str = ""

    # edits considered this night
    edits: List["EditRecord"] = field(default_factory=list)
    rejected_edits: List["EditRecord"] = field(default_factory=list)

    tokens_used: int = 0
    notes: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the report as a plain dict; nested dataclasses are converted too."""
        return asdict(self)
class TestScoring(unittest.TestCase):
    """Checks for the ``exact_score`` and ``keyword_soft_score`` helpers."""

    def test_exact_score(self):
        reference = "arXiv:1706.03762"
        # response that contains the full reference is expected to score 1.0
        self.assertEqual(exact_score(reference, "the id is arXiv:1706.03762 ok"), 1.0)
        # response with a truncated id is expected to score 0.0
        self.assertEqual(exact_score(reference, "approximately arXiv:1706.037"), 0.0)

    def test_keyword_soft(self):
        soft = keyword_soft_score("add login form", "please add the login form")
        self.assertGreater(soft, 0.5)
set_learned("", ["old rule about commits"]) + doc, _ = apply_edits(doc, [EditRecord("skill", "replace", "new rule", anchor="old rule")]) + self.assertIn("new rule", extract_learned(doc)) + doc, _ = apply_edits(doc, [EditRecord("skill", "delete", "", anchor="new rule")]) + self.assertEqual(current_learned_lines(doc), []) + + +class TestHarvest(unittest.TestCase): + def test_feedback_detection(self): + self.assertTrue(any(s.startswith("neg:") for s in _detect_feedback("this is still broken"))) + self.assertTrue(any(s.startswith("pos:") for s in _detect_feedback("perfect, thanks"))) + + def test_meta_prompt_filter(self): + self.assertTrue(_is_meta_prompt("/clear")) + self.assertTrue(_is_meta_prompt("x")) + self.assertFalse(_is_meta_prompt("please refactor the auth module")) + + def test_digest_real_transcript_if_present(self): + # uses the live machine's transcripts when available; skips otherwise + base = os.path.expanduser("~/.claude/projects") + if not os.path.isdir(base): + self.skipTest("no ~/.claude/projects on this machine") + found = None + for root, _d, files in os.walk(base): + for fn in files: + if fn.endswith(".jsonl"): + found = os.path.join(root, fn) + break + if found: + break + if not found: + self.skipTest("no transcripts") + d = digest_transcript(found) + # may be None for empty transcripts; if not, it must have core fields + if d is not None: + self.assertIsInstance(d.session_id, str) + self.assertGreaterEqual(d.n_user_turns + d.n_assistant_turns, 0) + + +class TestMine(unittest.TestCase): + def _digest(self, prompts, feedback): + return SessionDigest( + session_id="s1", project="/p", user_prompts=prompts, + assistant_finals=["did stuff"], feedback_signals=feedback, + n_user_turns=len(prompts), n_assistant_turns=1, + ) + + def test_outcome_inference(self): + fail = heuristic_mine([self._digest(["fix the parser bug please"], ["neg:still broken"])]) + self.assertEqual(fail[0].outcome, "fail") + ok = heuristic_mine([self._digest(["format the output"], 
["pos:perfect"])]) + self.assertEqual(ok[0].outcome, "success") + + def test_split_stable_and_nonempty(self): + tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42) + splits = {t.split for t in tasks} + self.assertIn("replay", splits) + self.assertIn("holdout", splits) + # stable across calls + again = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42) + self.assertEqual([t.split for t in tasks], [t.split for t in again]) + + +class TestConsolidateGate(unittest.TestCase): + def test_accepts_helpful_rejects_harmful(self): + be = MockBackend() + tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42) + res = consolidate(be, tasks, set_learned("", []), "", edit_budget=4, + gate_metric="mixed", night=1) + self.assertTrue(res.accepted) + self.assertGreater(res.candidate_score, res.baseline_score) + + def test_no_op_when_already_optimal(self): + be = MockBackend() + tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=1) + # first night learns the rule + r1 = consolidate(be, tasks, set_learned("", []), "", edit_budget=4, night=1) + # second night on the learned skill should find nothing to add + r2 = consolidate(be, tasks, r1.new_skill, r1.new_memory, edit_budget=4, night=2) + self.assertEqual(len(r2.applied_edits), 0) + + +class TestFullCycleAndAdopt(unittest.TestCase): + def test_cycle_stage_then_adopt_with_backup(self): + with tempfile.TemporaryDirectory() as proj, tempfile.TemporaryDirectory() as home: + cfg = load_config( + invoked_project=proj, projects="invoked", backend="mock", + claude_home=os.path.join(home, ".claude"), + managed_skill_name="skillopt-sleep-learned", + auto_adopt=False, + ) + # seed a known persona so we don't depend on ~/.claude + tasks = assign_splits(researcher_persona(), holdout_fraction=0.34, seed=42) + + outcome = run_sleep_cycle(cfg, seed_tasks=tasks) + self.assertTrue(outcome.report.accepted) + self.assertTrue(os.path.isdir(outcome.staging_dir)) + 
self.assertTrue(os.path.exists(os.path.join(outcome.staging_dir, "report.md"))) + + # nothing live touched yet + live_skill = cfg.managed_skill_path() + self.assertFalse(os.path.exists(live_skill)) + + # adopt -> live file created, backup dir exists + updated = adopt(outcome.staging_dir) + self.assertTrue(any("SKILL.md" in p for p in updated)) + self.assertTrue(os.path.exists(live_skill)) + with open(live_skill) as f: + self.assertIn("answer", f.read().lower()) + + +if __name__ == "__main__": + unittest.main(verbosity=2)