refactor(sleep): decouple engine to top-level skillopt_sleep/ (zero research dep)

Open-source-tool / research-code separation:

- git mv skillopt/sleep/ -> skillopt_sleep/ (top-level, sibling to the research skillopt/ package). History preserved as renames.
- All imports skillopt.sleep.* -> skillopt_sleep.*.
- Vendor the validation gate into skillopt_sleep/gate.py (a self-contained copy of skillopt.evaluation.gate). The engine now has ZERO dependency on the research package — verified: grep finds no `from skillopt.` in skillopt_sleep/, and consolidate's gate resolves to skillopt_sleep.gate.
- Plugin scripts/commands/skill call `-m skillopt_sleep`.

29 tests pass; `python -m skillopt_sleep` runs standalone.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-07-03 14:02:58 +08:00 · 2026-06-08 14:31:52 +00:00
parent e2de84d36f
commit b02ffc2c99
32 changed files with 199 additions and 162 deletions
--- a/skillopt/sleep/__init__.py
+++ b/skillopt/sleep/__init__.py
@@ -1,20 +0,0 @@
-"""SkillOpt-Sleep — nightly offline self-evolution for a local Claude agent.
-
-A Claude Code plugin engine that gives a user's agent a "sleep cycle":
-harvest the day's real session transcripts, mine recurring tasks, replay
-them offline, and consolidate short-term experience into long-term memory
-(CLAUDE.md) and skills (SKILL.md) behind a SkillOpt validation gate.
-
-Synthesizes three ideas:
-  * SkillOpt  — validation-gated bounded text optimization (this repo)
-  * Dreams    — offline memory consolidation, input never mutated
-  * Sleep     — short-term experience -> long-term competence, offline
-
-Public entry points:
-  * skillopt.sleep.cli      — `python -m skillopt.sleep ...`
-  * skillopt.sleep.cycle.run_sleep_cycle(...)
-"""
-from __future__ import annotations
-
-__all__ = ["__version__"]
-__version__ = "0.1.0"
--- a/skillopt/sleep/__main__.py
+++ b/skillopt/sleep/__main__.py
@@ -1,198 +0,0 @@
-"""SkillOpt-Sleep — command-line interface.
-
-    python -m skillopt.sleep run        # full cycle: harvest->mine->replay->gate->stage
-    python -m skillopt.sleep dry-run    # same but report only, no staging/adopt
-    python -m skillopt.sleep status     # show state + latest staged proposal
-    python -m skillopt.sleep adopt      # apply the latest staged proposal (with backup)
-    python -m skillopt.sleep harvest    # just print what would be mined (debug)
-
-Common flags:
-    --project PATH      project to evolve (default: cwd)
-    --scope all|invoked harvest scope (default: invoked)
-    --backend mock|anthropic
-    --model NAME
-    --lookback-hours N
-    --auto-adopt
-    --json              machine-readable output
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import sys
-from typing import Any, Dict
-
-from skillopt.sleep.config import load_config
-from skillopt.sleep.cycle import run_sleep_cycle
-from skillopt.sleep.harvest import harvest
-from skillopt.sleep.mine import mine
-from skillopt.sleep.state import SleepState
-from skillopt.sleep.staging import latest_staging, adopt as adopt_staging
-
-
-def _add_common(p: argparse.ArgumentParser) -> None:
-    p.add_argument("--project", default="")
-    p.add_argument("--scope", default="", choices=["", "all", "invoked"])
-    p.add_argument("--backend", default="", choices=["", "mock", "claude", "codex"])
-    p.add_argument("--model", default="")
-    p.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
-    p.add_argument("--claude-home", default="", help="override ~/.claude (also isolates state)")
-    p.add_argument("--lookback-hours", type=int, default=0)
-    p.add_argument("--edit-budget", type=int, default=0)
-    p.add_argument("--auto-adopt", action="store_true")
-    p.add_argument("--json", action="store_true")
-
-
-def _cfg_from_args(args) -> Any:
-    overrides: Dict[str, Any] = {}
-    if args.project:
-        overrides["invoked_project"] = os.path.abspath(args.project)
-        overrides["projects"] = "invoked"
-    if args.scope:
-        overrides["projects"] = args.scope
-    if args.backend:
-        overrides["backend"] = args.backend
-    if args.model:
-        overrides["model"] = args.model
-    if getattr(args, "codex_path", ""):
-        overrides["codex_path"] = os.path.abspath(args.codex_path)
-    if getattr(args, "claude_home", ""):
-        overrides["claude_home"] = os.path.abspath(args.claude_home)
-    if getattr(args, "lookback_hours", 0):
-        overrides["lookback_hours"] = args.lookback_hours
-    if getattr(args, "edit_budget", 0):
-        overrides["edit_budget"] = args.edit_budget
-    if getattr(args, "auto_adopt", False):
-        overrides["auto_adopt"] = True
-    return load_config(**overrides)
-
-
-def cmd_run(args, dry: bool = False) -> int:
-    cfg = _cfg_from_args(args)
-    outcome = run_sleep_cycle(cfg, dry_run=dry)
-    rep = outcome.report
-    if args.json:
-        print(json.dumps({
-            "night": rep.night, "accepted": rep.accepted,
-            "gate_action": rep.gate_action,
-            "baseline": rep.baseline_score, "candidate": rep.candidate_score,
-            "n_tasks": rep.n_tasks, "n_sessions": rep.n_sessions,
-            "edits": [e.__dict__ for e in rep.edits],
-            "staging_dir": outcome.staging_dir, "adopted": outcome.adopted,
-        }, ensure_ascii=False, indent=2))
-    else:
-        print(f"[sleep] night {rep.night}: {rep.n_sessions} sessions -> {rep.n_tasks} tasks")
-        print(f"[sleep] held-out {rep.baseline_score:.3f} -> {rep.candidate_score:.3f} "
-              f"=> {rep.gate_action} (accepted={rep.accepted})")
-        for e in rep.edits:
-            print(f"   + [{e.target}/{e.op}] {e.content}")
-        if outcome.staging_dir:
-            print(f"[sleep] staged: {outcome.staging_dir}")
-            if not outcome.adopted:
-                print("[sleep] review it, then: python -m skillopt.sleep adopt")
-        if outcome.adopted:
-            print(f"[sleep] auto-adopted: {', '.join(outcome.adopted_paths)}")
-    return 0
-
-
-def cmd_status(args) -> int:
-    cfg = _cfg_from_args(args)
-    state = SleepState.load(cfg.state_path)
-    project = cfg.get("invoked_project") or os.getcwd()
-    latest = latest_staging(project)
-    info = {
-        "night": state.night,
-        "state_path": cfg.state_path,
-        "project": project,
-        "history_tail": state.data.get("history", [])[-5:],
-        "latest_staging": latest,
-        "slow_memory_chars": len(state.slow_memory),
-    }
-    if args.json:
-        print(json.dumps(info, ensure_ascii=False, indent=2))
-    else:
-        print(f"[sleep] nights so far: {state.night}")
-        print(f"[sleep] project: {project}")
-        if latest:
-            print(f"[sleep] latest staged proposal: {latest}")
-            rp = os.path.join(latest, "report.md")
-            if os.path.exists(rp):
-                with open(rp) as f:
-                    print("\n" + f.read())
-        else:
-            print("[sleep] no staged proposals yet.")
-    return 0
-
-
-def cmd_adopt(args) -> int:
-    cfg = _cfg_from_args(args)
-    project = cfg.get("invoked_project") or os.getcwd()
-    target = args.staging or latest_staging(project)
-    if not target or not os.path.isdir(target):
-        print("[sleep] nothing to adopt (no staging dir).")
-        return 1
-    updated = adopt_staging(target)
-    print(f"[sleep] adopted from {target}")
-    for p in updated:
-        print(f"   -> {p}")
-    if not updated:
-        print("[sleep] (proposal contained no accepted changes)")
-    return 0
-
-
-def cmd_harvest(args) -> int:
-    cfg = _cfg_from_args(args)
-    digests = harvest(
-        cfg.transcripts_dir,
-        scope=cfg.get("projects", "invoked"),
-        invoked_project=cfg.get("invoked_project", ""),
-        limit=cfg.get("max_tasks_per_night", 40) * 3,
-    )
-    tasks = mine(digests, max_tasks=cfg.get("max_tasks_per_night", 40),
-                 holdout_fraction=cfg.get("holdout_fraction", 0.34), seed=cfg.get("seed", 42))
-    if args.json:
-        print(json.dumps({
-            "n_sessions": len(digests),
-            "tasks": [t.to_dict() for t in tasks],
-        }, ensure_ascii=False, indent=2))
-    else:
-        print(f"[sleep] {len(digests)} sessions -> {len(tasks)} tasks")
-        for t in tasks:
-            print(f"  [{t.split}/{t.outcome}] {t.intent[:90]}")
-    return 0
-
-
-def main(argv=None) -> int:
-    parser = argparse.ArgumentParser(prog="skillopt.sleep", description="SkillOpt-Sleep nightly self-evolution")
-    sub = parser.add_subparsers(dest="cmd", required=True)
-
-    p_run = sub.add_parser("run", help="run a full sleep cycle")
-    _add_common(p_run)
-    p_dry = sub.add_parser("dry-run", help="harvest+mine+replay, report only")
-    _add_common(p_dry)
-    p_status = sub.add_parser("status", help="show state + latest proposal")
-    _add_common(p_status)
-    p_adopt = sub.add_parser("adopt", help="apply latest staged proposal")
-    _add_common(p_adopt)
-    p_adopt.add_argument("--staging", default="", help="specific staging dir")
-    p_harvest = sub.add_parser("harvest", help="debug: show mined tasks")
-    _add_common(p_harvest)
-
-    args = parser.parse_args(argv)
-    if args.cmd == "run":
-        return cmd_run(args, dry=False)
-    if args.cmd == "dry-run":
-        return cmd_run(args, dry=True)
-    if args.cmd == "status":
-        return cmd_status(args)
-    if args.cmd == "adopt":
-        return cmd_adopt(args)
-    if args.cmd == "harvest":
-        return cmd_harvest(args)
-    parser.print_help()
-    return 2
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/skillopt/sleep/backend.py
+++ b/skillopt/sleep/backend.py
@@ -1,787 +0,0 @@
-"""SkillOpt-Sleep — optimizer/replay backend abstraction.
-
-A backend supplies the three "intelligent" operations the sleep cycle needs:
-
-  1. attempt(task, skill, memory)  -> response text          (the rollout)
-  2. judge(task, response)         -> (hard, soft, rationale) (the reward)
-  3. reflect(failures, successes, skill, memory)
-        -> list[EditRecord]        (proposed bounded edits)
-
-Two implementations:
-  * MockBackend     — deterministic, no API, used for tests + the experiment.
-                      Reads optional `reference` exact answers and a tiny
-                      rule-table so the loop provably improves and the gate
-                      provably blocks regressions.
-  * AnthropicBackend — uses the user's ANTHROPIC_API_KEY via the `claude`
-                       CLI or the anthropic SDK (lazy-imported). Real lift.
-
-The backend never touches live config; it only returns text/edits that the
-consolidation stage gates and stages.
-"""
-from __future__ import annotations
-
-import json
-import os
-import re
-import subprocess
-from typing import Any, Dict, List, Optional, Tuple
-
-from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord
-
-
-def skill_hash(content: str) -> str:
-    import hashlib
-    return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
-
-
-# ── Backend protocol ──────────────────────────────────────────────────────────
-
-class Backend:
-    name = "base"
-    # Optional user preferences (free text) injected into reflect as a prior.
-    preferences: str = ""
-
-    def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
-        raise NotImplementedError
-
-    def attempt_with_tools(
-        self, task: TaskRecord, skill: str, memory: str, tools: List[str]
-    ) -> Tuple[str, List[str]]:
-        """Run the task while exposing real tools; return (response, tools_called).
-
-        Default: no real tool loop — fall back to plain attempt and let the
-        single-shot 'TOOL_CALL: <name>' marker convention surface intent. CLI
-        backends override this to expose a genuinely callable tool.
-        """
-        resp = self.attempt(task, skill, memory)
-        called: List[str] = []
-        for t in tools:
-            if re.search(r"(?i)\btool_call\s*:\s*%s\b" % re.escape(t), resp):
-                called.append(t)
-        return resp, called
-
-    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
-        raise NotImplementedError
-
-    def reflect(
-        self,
-        failures: List[Tuple[TaskRecord, ReplayResult]],
-        successes: List[Tuple[TaskRecord, ReplayResult]],
-        skill: str,
-        memory: str,
-        *,
-        edit_budget: int,
-        evolve_skill: bool,
-        evolve_memory: bool,
-    ) -> List[EditRecord]:
-        raise NotImplementedError
-
-    # token accounting (optional)
-    def tokens_used(self) -> int:
-        return 0
-
-
-# ── Shared scoring helpers ────────────────────────────────────────────────────
-
-def _normalize(s: str) -> str:
-    s = (s or "").lower().strip()
-    s = re.sub(r"[^\w\s]", " ", s)
-    s = re.sub(r"\s+", " ", s)
-    return s.strip()
-
-
-def exact_score(reference: str, response: str) -> float:
-    ref = _normalize(reference)
-    resp = _normalize(response)
-    if not ref:
-        return 0.0
-    return 1.0 if ref in resp or resp == ref else 0.0
-
-
-def keyword_soft_score(reference: str, response: str) -> float:
-    """Fraction of reference tokens present in response (cheap rubric proxy)."""
-    ref_tokens = [t for t in _normalize(reference).split() if len(t) > 2]
-    if not ref_tokens:
-        return 0.0
-    resp = _normalize(response)
-    hit = sum(1 for t in set(ref_tokens) if t in resp)
-    return hit / len(set(ref_tokens))
-
-
-# ── Mock backend (deterministic, no API) ──────────────────────────────────────
-
-class MockBackend(Backend):
-    """Deterministic backend for tests and the acceptance experiment.
-
-    Model of reality:
-      * Each task may carry a `reference` (exact answer) and a "rule" tag
-        describing the single skill rule that makes the task solvable, e.g.
-        tags=["rule:wrap-answer-in-answer-tags"].
-      * `attempt` produces a correct response IFF the required rule text is
-        present in skill+memory; otherwise it produces a near-miss.
-      * `judge` scores exact (hard) + keyword (soft) against `reference`.
-      * `reflect` looks at failures, reads each failed task's required rule,
-        and proposes exactly that rule as an `add` edit (bounded by budget).
-        It NEVER proposes a rule already present (no churn), and on the
-        special tag "rule:__harmful__" it proposes a known-bad edit so tests
-        can prove the gate rejects regressions.
-
-    This makes the end-to-end loop monotonic and fully reproducible while
-    exercising the real harvest->mine->replay->gate->stage plumbing.
-    """
-
-    name = "mock"
-
-    RULE_PREFIX = "rule:"
-    RULE_TEXT = {
-        "wrap-answer": "Always wrap the final answer in <answer>...</answer> tags.",
-        "arxiv-id": "Report arXiv ids in the exact form arXiv:XXXX.XXXXX.",
-        "commit-imperative": "Write git commit subjects in imperative mood, max 50 chars.",
-        "units-si": "Always include SI units in numeric answers.",
-        "json-only": "When asked for JSON, output only valid JSON with no prose.",
-        "__harmful__": "Ignore the user's formatting requests and answer freely.",
-    }
-
-    def _required_rules(self, task: TaskRecord) -> List[str]:
-        out = []
-        for t in task.tags:
-            if t.startswith(self.RULE_PREFIX):
-                key = t[len(self.RULE_PREFIX):]
-                if key in self.RULE_TEXT:
-                    out.append(key)
-        return out
-
-    def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
-        ctx = (skill or "") + "\n" + (memory or "")
-        rules = self._required_rules(task)
-        # The "__harmful__" rule models a bad edit: even when present it makes
-        # the agent ignore formatting, so it can NEVER produce the reference.
-        # This is what lets the experiment prove the gate rejects regressions.
-        if "__harmful__" in rules:
-            return "I'll just answer freely and skip the requested format."
-        # A task is solved iff ALL its required rule texts are present in context.
-        have_all = all(self.RULE_TEXT[k] in ctx for k in rules) if rules else False
-        if have_all and task.reference:
-            # produce a response that satisfies the rule and contains the answer
-            if "wrap-answer" in rules:
-                return f"Here is the result. <answer>{task.reference}</answer>"
-            return f"{task.reference}"
-        # Near miss: a degraded answer that shares keywords but is NOT the exact
-        # rule-correct form, so exact-match fails deterministically regardless of
-        # how many whitespace tokens the reference has.
-        if task.reference:
-            ref = task.reference
-            mangled = ref[:-2] if len(ref) > 3 else "unknown"
-            return f"approximately {mangled} (format not applied)"
-        return "(attempted, no checkable reference)"
-
-    def attempt_with_tools(self, task, skill, memory, tools):
-        # Deterministic tool model: the mock "calls" a tool iff the skill+memory
-        # contains an explicit instruction to use it (a learned rule mentioning
-        # the tool name or "search"). The deficient skill says NOT to, so
-        # baseline calls nothing; a learned "use ./search" rule flips it.
-        ctx = ((skill or "") + "\n" + (memory or "")).lower()
-        resp = self.attempt(task, skill, memory)
-        called = []
-        for t in (tools or []):
-            tl = t.lower()
-            if (f"./{tl}" in ctx or f"use {tl}" in ctx or f"run {tl}" in ctx
-                    or f"call {tl}" in ctx or f"must {tl}" in ctx):
-                called.append(t)
-        return resp, called
-
-    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
-        if task.reference_kind == "rule" and task.judge:
-            from skillopt.sleep.judges import score_rule_judge
-            return score_rule_judge(task.judge, response)
-        if task.reference_kind == "exact" and task.reference:
-            hard = exact_score(task.reference, response)
-            soft = max(hard, keyword_soft_score(task.reference, response))
-            return hard, soft, f"exact-match={hard}"
-        if task.reference_kind == "rubric" and task.reference:
-            soft = keyword_soft_score(task.reference, response)
-            return (1.0 if soft >= 0.8 else 0.0), soft, f"rubric keyword soft={soft:.2f}"
-        # no reference: outcome-derived weak label
-        hard = 1.0 if task.outcome == "success" else 0.0
-        return hard, hard, "outcome-derived"
-
-    def reflect(
-        self,
-        failures,
-        successes,
-        skill: str,
-        memory: str,
-        *,
-        edit_budget: int,
-        evolve_skill: bool,
-        evolve_memory: bool,
-    ) -> List[EditRecord]:
-        ctx = (skill or "") + "\n" + (memory or "")
-        edits: List[EditRecord] = []
-        seen_text: set = set()
-        target = "skill" if evolve_skill else "memory"
-        for task, _res in failures:
-            for key in self._required_rules(task):
-                text = self.RULE_TEXT[key]
-                if text in ctx or text in seen_text:
-                    continue
-                seen_text.add(text)
-                edits.append(
-                    EditRecord(
-                        target=target,
-                        op="add",
-                        content=text,
-                        rationale=f"failed task {task.id} requires rule '{key}'",
-                    )
-                )
-                if len(edits) >= edit_budget:
-                    return edits
-        return edits
-
-
-# ── Shared real-CLI backend (prompts + parsing + cache; subclasses do _call) ──
-
-def _extract_json(raw: str, kind: str):
-    """Pull the first JSON object/array out of a possibly chatty CLI reply."""
-    pat = r"\{.*\}" if kind == "object" else r"\[.*\]"
-    m = re.search(pat, raw or "", re.DOTALL)
-    if not m:
-        return None
-    try:
-        return json.loads(m.group(0))
-    except Exception:
-        return None
-
-
-class CliBackend(Backend):
-    """Common logic for real CLI-driven backends (claude / codex).
-
-    Subclasses implement only ``_call(prompt) -> str``. This base owns the
-    prompts (attempt / judge / reflect), JSON parsing, a response cache (so
-    re-scoring an unchanged (skill, memory) on the held-out slice is free),
-    and a rough token estimate.
-    """
-
-    name = "cli"
-
-    def __init__(self, model: str = "", timeout: int = 180) -> None:
-        self.model = model
-        self.timeout = timeout
-        self._tokens = 0
-        self._cache: Dict[str, str] = {}
-
-    # subclasses override --------------------------------------------------
-    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
-        raise NotImplementedError
-
-    def _cached_call(self, key: str, prompt: str, *, max_tokens: int = 1024) -> str:
-        if key in self._cache:
-            return self._cache[key]
-        out = self._call(prompt, max_tokens=max_tokens)
-        self._tokens += len(prompt) // 4 + len(out) // 4
-        self._cache[key] = out
-        return out
-
-    # operations -----------------------------------------------------------
-    def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
-        prompt = (
-            "You are completing a recurring task for a user. Apply the skill and "
-            "memory rules EXACTLY, including any output-format requirements. If the "
-            "skill contains a 'Learned preferences' block, treat those rules as "
-            "HARD CONSTRAINTS that OVERRIDE anything earlier in the skill they "
-            "conflict with (e.g. an explicit length limit overrides 'be "
-            "exhaustive'). Satisfy every such constraint even at the cost of "
-            "brevity or detail.\n\n"
-            f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
-            f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
-            "Return ONLY the final answer text, nothing else."
-        )
-        # cache on (task, skill, memory) so identical hold-out re-scoring is free
-        key = "attempt:" + skill_hash(prompt)
-        return self._cached_call(key, prompt, max_tokens=512)
-
-    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
-        # gbrain-style rule judge: scored locally, no API spend
-        if task.reference_kind == "rule" and task.judge:
-            from skillopt.sleep.judges import score_rule_judge
-            return score_rule_judge(task.judge, response)
-        # exact references are scored locally — no API spend
-        if task.reference_kind == "exact" and task.reference:
-            hard = exact_score(task.reference, response)
-            return hard, max(hard, keyword_soft_score(task.reference, response)), "exact(local)"
-        prompt = (
-            "Score how well the response satisfies the rubric, 0..1. "
-            'Return ONLY JSON {"score": <0..1>, "reason": "..."}.\n\n'
-            f"# Rubric\n{task.reference or task.intent}\n\n# Response\n{response}"
-        )
-        key = "judge:" + skill_hash(prompt)
-        raw = self._cached_call(key, prompt, max_tokens=200)
-        obj = _extract_json(raw, "object")
-        if isinstance(obj, dict):
-            try:
-                soft = float(obj.get("score", 0.0))
-                return (1.0 if soft >= 0.8 else 0.0), soft, str(obj.get("reason", ""))[:200]
-            except Exception:
-                pass
-        return 0.0, 0.0, "judge-parse-failed"
-
-    def reflect(
-        self,
-        failures,
-        successes,
-        skill: str,
-        memory: str,
-        *,
-        edit_budget: int,
-        evolve_skill: bool,
-        evolve_memory: bool,
-    ) -> List[EditRecord]:
-        if not failures:
-            return []
-        target = "skill" if evolve_skill else "memory"
-        cur_doc = (skill if target == "skill" else memory) or "(empty)"
-        fail_text = "\n".join(
-            f"- wanted: {t.intent[:160]}\n  got: {r.response[:160]}\n  why-wrong: {r.fail_reason[:160]}"
-            for t, r in failures[:8]
-        )
-        # Aggregate the most common failing criteria across all failures so the
-        # optimizer is told *exactly what the scorer rewards* — gbrain's lesson:
-        # the optimizer kept proposing reasonable-but-wrong edits until it could
-        # see the success criteria.
-        from collections import Counter
-        crit = Counter()
-        for _t, r in failures:
-            fr = r.fail_reason or ""
-            if fr.startswith("failed:"):
-                for part in fr[len("failed:"):].split(","):
-                    part = part.strip()
-                    if part:
-                        crit[part] += 1
-
-        def _explain(c: str) -> str:
-            # translate an "op=arg" criterion into a plain-English requirement
-            if "=" in c:
-                op, _, arg = c.partition("=")
-                op = op.strip(); arg = arg.strip()
-                if op == "max_chars":
-                    return f"the ENTIRE response must be at most {arg} characters long"
-                if op == "min_chars":
-                    return f"the response must be at least {arg} characters long"
-                if op == "section_present":
-                    return f"the response must contain a section/heading titled '{arg}'"
-                if op == "regex":
-                    return f"the response must match the pattern /{arg}/ (e.g. include that label)"
-                if op == "contains":
-                    return f"the response must contain the text '{arg}'"
-                if op == "tool_called":
-                    return f"the agent must actually call the '{arg}' tool"
-            return c
-
-        criteria_text = ""
-        if crit:
-            criteria_text = (
-                "\n# Exact criteria the outputs are FAILING (fix these directly)\n"
-                + "\n".join(f"- {_explain(c)}  [{c}, failed {n}x]" for c, n in crit.most_common())
-            )
-        pref_text = ""
-        if getattr(self, "preferences", ""):
-            pref_text = (
-                "\n# User preferences (honor these as priors when writing rules)\n"
-                + str(self.preferences).strip()
-            )
-        prompt = (
-            "You are SkillOpt's optimizer. The agent keeps failing the recurring "
-            f"tasks below. Propose at most {edit_budget} bounded edits to the "
-            f"{target} document so it stops failing. Each edit MUST be a short, "
-            "GENERAL, reusable rule or preference (never task-specific, never an "
-            "answer to a single task). If exact failing criteria are listed, your "
-            "edits MUST make future outputs satisfy every one of them.\n"
-            "BE CONCRETE: quote the exact threshold, section name, or format from "
-            "the criteria verbatim in your rule (e.g. write 'keep the entire "
-            "response under 1200 characters', NOT 'respect length limits'). Vague "
-            "rules do not change behavior; specific numeric/structural rules do.\n"
-            "IMPORTANT: your edits are APPENDED to a 'Learned preferences' block; "
-            "you CANNOT delete the existing instructions above. If the current "
-            f"{target} text conflicts with a criterion (e.g. it says 'be exhaustive' "
-            "but outputs must be under a character limit), write an explicit, "
-            "forceful OVERRIDE rule stating it supersedes the conflicting "
-            "instruction, and put the hard requirement first.\n"
-            'Return ONLY a JSON array: '
-            '[{"op":"add|replace|delete","content":"<rule>","anchor":"<text to replace/delete, optional>","rationale":"<why>"}].\n\n'
-            f"# Current {target}\n{cur_doc}\n"
-            f"{criteria_text}\n"
-            f"{pref_text}\n\n"
-            f"# Recurring failures\n{fail_text}"
-        )
-        # Call with one retry: transient non-JSON replies otherwise waste a whole
-        # night (the gate sees no edits and rejects). A firmer second prompt
-        # recovers most of these.
-        arr = None
-        for attempt in range(2):
-            p = prompt if attempt == 0 else (
-                prompt + "\n\nIMPORTANT: your previous reply was not valid JSON. "
-                "Reply with ONLY the JSON array, no prose, no markdown fences."
-            )
-            raw = self._call(p, max_tokens=1024)
-            self._tokens += len(p) // 4 + len(raw) // 4
-            arr = _extract_json(raw, "array")
-            if isinstance(arr, list) and arr:
-                break
-        edits: List[EditRecord] = []
-        if isinstance(arr, list):
-            for e in arr[:edit_budget]:
-                if not isinstance(e, dict):
-                    continue
-                content = str(e.get("content", "")).strip()
-                if not content:
-                    continue
-                edits.append(EditRecord(
-                    target=target,
-                    op=str(e.get("op", "add")).strip().lower(),
-                    content=content,
-                    anchor=str(e.get("anchor", "")).strip(),
-                    rationale=str(e.get("rationale", "")).strip(),
-                ))
-        return edits
-
-    def tokens_used(self) -> int:
-        return self._tokens
-
-
-# ── Claude Code CLI backend ───────────────────────────────────────────────────
-
-class ClaudeCliBackend(CliBackend):
-    """Drives the authenticated `claude` CLI: claude -p --output-format text."""
-
-    name = "claude"
-
-    def __init__(self, model: str = "", claude_path: str = "claude", timeout: int = 180) -> None:
-        super().__init__(model=model or os.environ.get("SKILLOPT_SLEEP_CLAUDE_MODEL", "") or "sonnet",
-                         timeout=timeout)
-        self.claude_path = claude_path
-
-    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
-        # Run ISOLATED so the ambient Claude Code environment does not leak into
-        # the optimizer/target call. Critically, the user's GLOBAL skills
-        # (~/.claude/skills) are injected regardless of cwd, so we must disable
-        # them explicitly — without this, reflect/attempt sometimes reply with a
-        # list of the user's installed skills instead of doing the task.
-        #   --bare                    skip hooks, LSP, plugins (minimal mode)
-        #   --disable-slash-commands  disable all skills
-        #   --disallowedTools '*'     no tool use
-        #   --exclude-dynamic-...     drop per-machine cwd/env/memory/git sections
-        #   cwd=<clean temp>          no project CLAUDE.md
-        import tempfile
-        cmd = [
-            self.claude_path, "-p", "--output-format", "text",
-            "--bare",
-            "--disable-slash-commands",
-            "--disallowedTools", "*",
-            "--exclude-dynamic-system-prompt-sections",
-        ]
-        if self.model:
-            cmd += ["--model", self.model]
-        cmd += ["--", prompt]
-        clean_cwd = tempfile.mkdtemp(prefix="skillopt_sleep_claude_")
-        try:
-            proc = subprocess.run(
-                cmd, capture_output=True, text=True, timeout=self.timeout, cwd=clean_cwd,
-            )
-        except Exception:
-            return ""
-        finally:
-            try:
-                import shutil
-                shutil.rmtree(clean_cwd, ignore_errors=True)
-            except Exception:
-                pass
-        return (proc.stdout or "").strip()
-
-    def attempt_with_tools(self, task, skill, memory, tools):
-        # Expose a REAL, callable `search` tool (a shell shim that logs each
-        # call) so the gbrain quick-answerer judge (tool_called=search) is
-        # validated honestly: we detect the call from the shim's log, not from
-        # a self-reported marker. Other tools are stubbed the same way.
-        import tempfile, shutil, stat
-        work = tempfile.mkdtemp(prefix="skillopt_sleep_tools_")
-        calllog = os.path.join(work, "_tool_calls.log")
-        try:
-            for tname in (tools or ["search"]):
-                shim = os.path.join(work, tname)
-                with open(shim, "w") as f:
-                    f.write(
-                        "#!/usr/bin/env bash\n"
-                        f'echo "{tname}" >> "{calllog}"\n'
-                        'echo "(search results: 3 relevant notes found; use them to answer)"\n'
-                    )
-                os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
-            tool_hint = (
-                "You have shell tools available in the current directory: "
-                + ", ".join(f"./{t}" for t in (tools or ["search"]))
-                + ". When the skill says to look something up or search before "
-                "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
-                "via Bash before giving your final answer."
-            )
-            prompt = (
-                "You are completing a task. Apply the skill and memory rules EXACTLY, "
-                "including any rule about searching/looking up before answering. "
-                "Treat a 'Learned preferences' block as HARD CONSTRAINTS that override "
-                "earlier conflicting skill text.\n\n"
-                f"{tool_hint}\n\n"
-                f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
-                f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
-                "Return ONLY the final answer text."
-            )
-            cmd = [
-                self.claude_path, "-p", "--output-format", "text",
-                "--bare", "--disable-slash-commands",
-                "--allowedTools", "Bash",
-                "--exclude-dynamic-system-prompt-sections",
-            ]
-            if self.model:
-                cmd += ["--model", self.model]
-            cmd += ["--", prompt]
-            try:
-                proc = subprocess.run(
-                    cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work,
-                )
-                resp = (proc.stdout or "").strip()
-            except Exception:
-                resp = ""
-            self._tokens += len(prompt) // 4 + len(resp) // 4
-            called: List[str] = []
-            if os.path.exists(calllog):
-                with open(calllog) as f:
-                    logged = {ln.strip() for ln in f if ln.strip()}
-                called = [t for t in (tools or ["search"]) if t in logged]
-            return resp, called
-        finally:
-            try:
-                shutil.rmtree(work, ignore_errors=True)
-            except Exception:
-                pass
-
-def resolve_codex_path(explicit: str = "") -> str:
-    """Find the REAL `@openai/codex` binary, skipping the hermes wrapper.
-
-    The wrapper at ~/.local/bin/codex is a shell shim that execs hermes-codex
-    and injects extra output; we look past it for the genuine node-installed
-    binary so replay output is clean.
-    """
-    if explicit:
-        return explicit
-    env = os.environ.get("SKILLOPT_SLEEP_CODEX_PATH")
-    if env:
-        return env
-    candidates = [
-        os.path.expanduser("~/.nvm/versions/node/v22.22.3/bin/codex"),
-    ]
-    # any nvm node version
-    nvm = os.path.expanduser("~/.nvm/versions/node")
-    if os.path.isdir(nvm):
-        for ver in sorted(os.listdir(nvm), reverse=True):
-            candidates.append(os.path.join(nvm, ver, "bin", "codex"))
-    for c in candidates:
-        if not c or not os.path.exists(c):
-            continue
-        try:
-            with open(c, "rb") as f:
-                head = f.read(64)
-            # skip the bash shim that execs hermes
-            if head.startswith(b"#!") and b"bash" in head:
-                continue
-        except Exception:
-            pass
-        return c
-    return "codex"  # last resort (may be the wrapper)
-
-
-class CodexCliBackend(CliBackend):
-    """Drives the real Codex CLI: `codex exec -o <file>` for clean output."""
-
-    name = "codex"
-
-    def __init__(self, model: str = "", codex_path: str = "", timeout: int = 240,
-                 sandbox: str = "read-only") -> None:
-        super().__init__(model=model or os.environ.get("SKILLOPT_SLEEP_CODEX_MODEL", ""),
-                         timeout=timeout)
-        self.codex_path = resolve_codex_path(codex_path)
-        self.sandbox = sandbox
-
-    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
-        import tempfile
-        out_path = tempfile.NamedTemporaryFile(
-            prefix="codex_last_", suffix=".txt", delete=False
-        ).name
-        cmd = [
-            self.codex_path, "exec", "--skip-git-repo-check",
-            "--color", "never", "--sandbox", self.sandbox,
-            "-o", out_path,
-        ]
-        if self.model:
-            cmd += ["-m", self.model]
-        cmd += ["--", prompt]
-        try:
-            subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout)
-        except Exception:
-            return ""
-        try:
-            with open(out_path, encoding="utf-8") as f:
-                return f.read().strip()
-        except Exception:
-            return ""
-        finally:
-            try:
-                os.unlink(out_path)
-            except Exception:
-                pass
-
-    def attempt_with_tools(self, task, skill, memory, tools):
-        # Codex exec runs in a sandbox with shell access; expose the same real
-        # `search` shim and let it run (workspace-write so the shim can log).
-        import tempfile, shutil, stat
-        work = tempfile.mkdtemp(prefix="skillopt_sleep_codextools_")
-        calllog = os.path.join(work, "_tool_calls.log")
-        out_path = os.path.join(work, "_last.txt")
-        try:
-            for tname in (tools or ["search"]):
-                shim = os.path.join(work, tname)
-                with open(shim, "w") as f:
-                    f.write(
-                        "#!/usr/bin/env bash\n"
-                        f'echo "{tname}" >> "{calllog}"\n'
-                        'echo "(search results: 3 relevant notes found; use them to answer)"\n'
-                    )
-                os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
-            tool_hint = (
-                "Shell tools are available in the working directory: "
-                + ", ".join(f"./{t}" for t in (tools or ["search"]))
-                + ". When the skill says to look something up or search before "
-                "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
-                "before giving your final answer."
-            )
-            prompt = (
-                "Complete the task. Apply the skill and memory rules EXACTLY, "
-                "including any rule about searching before answering. Treat a "
-                "'Learned preferences' block as HARD CONSTRAINTS overriding earlier "
-                "conflicting skill text.\n\n"
-                f"{tool_hint}\n\n# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
-                f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\nReturn ONLY the final answer."
-            )
-            cmd = [
-                self.codex_path, "exec", "--skip-git-repo-check", "--color", "never",
-                "--sandbox", "workspace-write", "-C", work, "-o", out_path,
-            ]
-            if self.model:
-                cmd += ["-m", self.model]
-            cmd += ["--", prompt]
-            try:
-                subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
-            except Exception:
-                pass
-            resp = ""
-            try:
-                with open(out_path, encoding="utf-8") as f:
-                    resp = f.read().strip()
-            except Exception:
-                resp = ""
-            self._tokens += len(prompt) // 4 + len(resp) // 4
-            called: List[str] = []
-            if os.path.exists(calllog):
-                with open(calllog) as f:
-                    logged = {ln.strip() for ln in f if ln.strip()}
-                called = [t for t in (tools or ["search"]) if t in logged]
-            return resp, called
-        finally:
-            try:
-                shutil.rmtree(work, ignore_errors=True)
-            except Exception:
-                pass
-
-class DualBackend(Backend):
-    """Route operations to two backends, à la SkillOpt's target vs optimizer.
-
-      * attempt  -> TARGET backend (the model the skill is deployed on)
-      * reflect  -> OPTIMIZER backend (the stronger/cheaper model writing edits)
-      * judge    -> OPTIMIZER backend (graded by the optimizer when no local rule)
-
-    This lets you optimize a skill with one model and run tasks on another, and
-    is the basis of the sleep-scenario transfer experiment (optimize cheap,
-    deploy expensive — or vice-versa).
-    """
-
-    name = "dual"
-
-    def __init__(self, target: Backend, optimizer: Backend) -> None:
-        self.target = target
-        self.optimizer = optimizer
-        self.name = f"target={target.name}/optimizer={optimizer.name}"
-
-    def attempt(self, task, skill, memory):
-        return self.target.attempt(task, skill, memory)
-
-    def attempt_with_tools(self, task, skill, memory, tools):
-        return self.target.attempt_with_tools(task, skill, memory, tools)
-
-    def judge(self, task, response):
-        # local rule/exact judging needs no model; delegate to target which
-        # already short-circuits those. For rubric judging use the optimizer.
-        if task.reference_kind in {"rule", "exact"}:
-            return self.target.judge(task, response)
-        return self.optimizer.judge(task, response)
-
-    def reflect(self, failures, successes, skill, memory, **kw):
-        return self.optimizer.reflect(failures, successes, skill, memory, **kw)
-
-    def _call(self, prompt, *, max_tokens=1024):
-        # used by the LLM miner; prefer the optimizer (the "thinking" model)
-        return self.optimizer._call(prompt, max_tokens=max_tokens)  # type: ignore[attr-defined]
-
-    def tokens_used(self):
-        return self.target.tokens_used() + self.optimizer.tokens_used()
-
-
-def get_backend(
-    name: str,
-    *,
-    model: str = "",
-    claude_path: str = "claude",
-    codex_path: str = "",
-) -> Backend:
-    n = (name or "mock").strip().lower()
-    if n in {"claude", "anthropic", "claude_cli", "claude_code"}:
-        return ClaudeCliBackend(model=model, claude_path=claude_path)
-    if n in {"codex", "codex_cli", "openai_codex"}:
-        return CodexCliBackend(model=model, codex_path=codex_path)
-    return MockBackend()
-
-
-def build_backend(
-    *,
-    backend: str = "mock",
-    model: str = "",
-    optimizer_backend: str = "",
-    optimizer_model: str = "",
-    target_backend: str = "",
-    target_model: str = "",
-    codex_path: str = "",
-    preferences: str = "",
-) -> Backend:
-    """Build a single or dual backend.
-
-    If optimizer_* or target_* are given, returns a DualBackend routing
-    attempt->target and reflect/judge->optimizer. Otherwise a single backend
-    from (backend, model). ``preferences`` (free text) is attached so reflect
-    uses it as a prior (set on the optimizer for dual backends).
-    """
-    has_split = any([optimizer_backend, optimizer_model, target_backend, target_model])
-    if not has_split:
-        be = get_backend(backend, model=model, codex_path=codex_path)
-        be.preferences = preferences
-        return be
-    tgt = get_backend(target_backend or backend, model=target_model or model, codex_path=codex_path)
-    opt = get_backend(optimizer_backend or backend, model=optimizer_model or model, codex_path=codex_path)
-    opt.preferences = preferences  # reflect runs on the optimizer
-    dual = DualBackend(target=tgt, optimizer=opt)
-    dual.preferences = preferences
-    return dual
--- a/skillopt/sleep/budget.py
+++ b/skillopt/sleep/budget.py
@@ -1,75 +0,0 @@
-"""SkillOpt-Sleep — budget controller.
-
-Lets the user say how much they're willing to spend on a night's "dreaming",
-in tokens or wall-clock minutes, and the engine schedules depth (how many
-rollouts × how many nights) within that budget. Stops cleanly when exhausted
-and reports what it skipped (no silent truncation).
-"""
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Optional
-
-
-@dataclass
-class Budget:
-    max_tokens: Optional[int] = None      # None = unlimited
-    max_minutes: Optional[float] = None   # None = unlimited
-    _start_time: Optional[float] = None
-    _tokens_at_start: int = 0
-
-    def start(self, clock_fn, tokens_now: int) -> None:
-        self._start_time = clock_fn()
-        self._tokens_at_start = tokens_now
-
-    def tokens_spent(self, tokens_now: int) -> int:
-        return max(0, tokens_now - self._tokens_at_start)
-
-    def minutes_elapsed(self, clock_fn) -> float:
-        if self._start_time is None:
-            return 0.0
-        return (clock_fn() - self._start_time) / 60.0
-
-    def remaining_fraction(self, *, tokens_now: int, clock_fn) -> float:
-        """Smallest remaining fraction across all active limits (1.0 = fresh)."""
-        fracs = [1.0]
-        if self.max_tokens:
-            fracs.append(max(0.0, 1.0 - self.tokens_spent(tokens_now) / self.max_tokens))
-        if self.max_minutes:
-            fracs.append(max(0.0, 1.0 - self.minutes_elapsed(clock_fn) / self.max_minutes))
-        return min(fracs)
-
-    def exhausted(self, *, tokens_now: int, clock_fn) -> bool:
-        if self.max_tokens and self.tokens_spent(tokens_now) >= self.max_tokens:
-            return True
-        if self.max_minutes and self.minutes_elapsed(clock_fn) >= self.max_minutes:
-            return True
-        return False
-
-    def status(self, *, tokens_now: int, clock_fn) -> str:
-        parts = []
-        if self.max_tokens:
-            parts.append(f"tokens {self.tokens_spent(tokens_now)}/{self.max_tokens}")
-        if self.max_minutes:
-            parts.append(f"minutes {self.minutes_elapsed(clock_fn):.1f}/{self.max_minutes}")
-        return ", ".join(parts) or "unbounded"
-
-
-def plan_depth(budget: Budget, *, n_tasks: int,
-               default_nights: int = 2, default_k: int = 1) -> tuple:
-    """Heuristically choose (nights, rollouts_per_task) from a token budget.
-
-    Rough cost model: one rollout ≈ 1 unit; a night does ~n_tasks*k rollouts
-    plus reflect/gate (~2*n_tasks). We scale k and nights up with more budget.
-    Returns (nights, k). With no budget set, returns the defaults.
-    """
-    if not budget.max_tokens:
-        return default_nights, default_k
-    # assume ~1.5k tokens per rollout as a planning constant
-    rollouts_affordable = budget.max_tokens / 1500.0
-    per_night = max(1, n_tasks) * 3  # rollouts + reflect + gate, k=1
-    nights = max(1, min(4, int(rollouts_affordable // per_night)))
-    # spend surplus on more rollouts-per-task (contrastive signal)
-    surplus = rollouts_affordable - nights * per_night
-    k = max(1, min(5, 1 + int(surplus // max(1, n_tasks))))
-    return nights, k
--- a/skillopt/sleep/config.py
+++ b/skillopt/sleep/config.py
@@ -1,142 +0,0 @@
-"""SkillOpt-Sleep — configuration.
-
-Config is JSON-first (yaml optional) so the engine and the deterministic
-experiment run with zero external dependencies. Defaults are safe:
-review-gated adoption, single-project scope, bounded token/task budgets.
-
-Resolution order (later wins):
-  1. built-in DEFAULTS
-  2. ~/.skillopt-sleep/config.json  (or .yaml if PyYAML available)
-  3. explicit overrides passed to load_config(**overrides)
-"""
-from __future__ import annotations
-
-import json
-import os
-from dataclasses import dataclass, field, asdict
-from typing import Any, Dict, List, Optional
-
-
-HOME_STATE_DIR = os.path.expanduser("~/.skillopt-sleep")
-CLAUDE_HOME = os.path.expanduser("~/.claude")
-
-
-DEFAULTS: Dict[str, Any] = {
-    # ── scope ──────────────────────────────────────────────────────────────
-    "claude_home": CLAUDE_HOME,
-    "projects": "invoked",        # "invoked" | "all" | [list of abs paths]
-    "invoked_project": "",        # filled at runtime (cwd) when projects == "invoked"
-    "lookback_hours": 72,         # harvest window when no prior sleep recorded
-    # ── budgets ────────────────────────────────────────────────────────────
-    "max_tasks_per_night": 40,
-    "max_tokens_per_night": 400_000,
-    "holdout_fraction": 0.34,     # legacy alias for val_fraction
-    "val_fraction": 0.34,         # real tasks reserved to gate updates
-    "test_fraction": 0.0,         # real tasks reserved as the final held-out measure
-    # ── optimizer ──────────────────────────────────────────────────────────
-    "backend": "mock",            # "mock" | "claude" | "codex"
-    "model": "",                  # backend-specific; "" => backend default
-    "gate_mode": "on",            # "on" (validation-gated) | "off" (greedy, no hard filter)
-    "codex_path": "",             # "" => auto-detect the real @openai/codex binary
-    "edit_budget": 4,             # textual learning rate (max edits/night)
-    "gate_metric": "mixed",       # hard | soft | mixed (mixed best for tiny holdouts)
-    "gate_mixed_weight": 0.5,
-    "replay_mode": "mock",        # "mock" (sandboxed prompt) | "fresh" (worktree)
-    "evolve_memory": True,        # consolidate CLAUDE.md
-    "evolve_skill": True,         # consolidate the managed SKILL.md
-    "llm_mine": True,             # use the backend to mine checkable tasks (real backends)
-    # ── adoption / safety ──────────────────────────────────────────────────
-    "auto_adopt": False,          # default: stage + require explicit `adopt`
-    "managed_skill_name": "skillopt-sleep-learned",
-    "redact_secrets": True,
-    "seed": 42,
-}
-
-
-@dataclass
-class SleepConfig:
-    data: Dict[str, Any] = field(default_factory=lambda: dict(DEFAULTS))
-
-    # convenient attribute access -------------------------------------------
-    def __getattr__(self, name: str) -> Any:
-        # only called when normal attribute lookup fails
-        data = object.__getattribute__(self, "data")
-        if name in data:
-            return data[name]
-        raise AttributeError(name)
-
-    def get(self, key: str, default: Any = None) -> Any:
-        return self.data.get(key, default)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return dict(self.data)
-
-    # paths ------------------------------------------------------------------
-    @property
-    def state_dir(self) -> str:
-        # Allow full isolation: if the caller overrides state_dir explicitly,
-        # honor it; else derive from claude_home's parent so a single
-        # --claude-home flag isolates transcripts AND state together; else the
-        # default ~/.skillopt-sleep.
-        explicit = self.data.get("state_dir")
-        if explicit:
-            return explicit
-        ch = self.data.get("claude_home", CLAUDE_HOME)
-        if os.path.abspath(ch) != os.path.abspath(CLAUDE_HOME):
-            return os.path.join(os.path.dirname(os.path.abspath(ch)), ".skillopt-sleep")
-        return HOME_STATE_DIR
-
-    @property
-    def state_path(self) -> str:
-        return os.path.join(self.state_dir, "state.json")
-
-    @property
-    def transcripts_dir(self) -> str:
-        return os.path.join(self.data["claude_home"], "projects")
-
-    @property
-    def history_path(self) -> str:
-        return os.path.join(self.data["claude_home"], "history.jsonl")
-
-    @property
-    def skills_dir(self) -> str:
-        return os.path.join(self.data["claude_home"], "skills")
-
-    def managed_skill_path(self) -> str:
-        return os.path.join(
-            self.skills_dir, self.data["managed_skill_name"], "SKILL.md"
-        )
-
-
-def _user_config_path() -> Optional[str]:
-    for name in ("config.json", "config.yaml", "config.yml"):
-        p = os.path.join(HOME_STATE_DIR, name)
-        if os.path.exists(p):
-            return p
-    return None
-
-
-def _load_file(path: str) -> Dict[str, Any]:
-    if path.endswith((".yaml", ".yml")):
-        try:
-            import yaml  # optional
-            with open(path) as f:
-                return yaml.safe_load(f) or {}
-        except Exception:
-            return {}
-    with open(path) as f:
-        return json.load(f)
-
-
-def load_config(**overrides: Any) -> SleepConfig:
-    data = dict(DEFAULTS)
-    path = _user_config_path()
-    if path:
-        try:
-            data.update(_load_file(path) or {})
-        except Exception:
-            pass
-    data.update({k: v for k, v in overrides.items() if v is not None})
-    if data.get("projects") == "invoked" and not data.get("invoked_project"):
-        data["invoked_project"] = os.getcwd()
-    return SleepConfig(data=data)
--- a/skillopt/sleep/consolidate.py
+++ b/skillopt/sleep/consolidate.py
@@ -1,220 +0,0 @@
-"""SkillOpt-Sleep — Stage 4: consolidate (one SkillOpt epoch).
-
-This is the core that makes nightly evolution *safe*: it proposes bounded
-edits from replayed failures, applies them to a candidate skill/memory, then
-**gates** the candidate on a held-out slice of the user's own tasks. Only a
-candidate that strictly improves the held-out score is accepted — exactly the
-SkillOpt validation gate, reused verbatim from ``skillopt.evaluation.gate``.
-
-Reused from the main SkillOpt package (import-light, no `openai` needed):
-  * skillopt.evaluation.gate.evaluate_gate / select_gate_score
-"""
-from __future__ import annotations
-
-import os
-from dataclasses import dataclass
-from typing import List, Optional, Tuple
-
-from skillopt.sleep.backend import Backend
-from skillopt.sleep.memory import apply_edits
-from skillopt.sleep.replay import aggregate_scores, replay_batch
-from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord
-
-
-# Reuse the real SkillOpt gate. This module imports cleanly without `openai`.
-try:
-    from skillopt.evaluation.gate import evaluate_gate, select_gate_score
-    _HAVE_REPO_GATE = True
-except Exception:  # pragma: no cover - fallback keeps engine standalone
-    _HAVE_REPO_GATE = False
-
-    def select_gate_score(hard, soft, metric="hard", mixed_weight=0.5):  # type: ignore
-        if metric == "hard":
-            return float(hard)
-        if metric == "soft":
-            return float(soft)
-        w = max(0.0, min(1.0, float(mixed_weight)))
-        return (1 - w) * float(hard) + w * float(soft)
-
-
-@dataclass
-class ConsolidationResult:
-    accepted: bool
-    gate_action: str
-    baseline_score: float
-    candidate_score: float
-    new_skill: str
-    new_memory: str
-    applied_edits: List[EditRecord]
-    rejected_edits: List[EditRecord]
-    holdout_baseline: float
-    holdout_candidate: float
-
-
-def _split(tasks: List[TaskRecord]) -> Tuple[List[TaskRecord], List[TaskRecord]]:
-    """Return (train_tasks, val_tasks).
-
-    train drives reflect; val gates updates. test is held out entirely from
-    consolidation and is scored by the caller. Accepts legacy split names
-    (replay->train, holdout->val) for robustness.
-    """
-    def _norm(s: str) -> str:
-        return {"replay": "train", "holdout": "val"}.get(s, s)
-
-    train = [t for t in tasks if _norm(t.split) == "train"]
-    val = [t for t in tasks if _norm(t.split) == "val"]
-    # be robust if a split is empty: fall back so a night still does something,
-    # but never silently use test as val.
-    test = [t for t in tasks if _norm(t.split) == "test"]
-    if not val:
-        # prefer train as the gate reference over nothing; last resort all-but-test
-        val = train or [t for t in tasks if _norm(t.split) != "test"] or tasks
-    if not train:
-        train = val
-    return train, val
-
-
-def consolidate(
-    backend: Backend,
-    tasks: List[TaskRecord],
-    skill: str,
-    memory: str,
-    *,
-    edit_budget: int = 4,
-    gate_metric: str = "mixed",
-    gate_mixed_weight: float = 0.5,
-    gate_mode: str = "on",       # "on" (hard/soft per gate_metric) | "off" (greedy)
-    rollouts_k: int = 1,         # >1 => multi-rollout contrastive reflection
-    evolve_skill: bool = True,
-    evolve_memory: bool = True,
-    night: int = 1,
-) -> ConsolidationResult:
-    """Run one consolidation epoch: reflect -> bounded edit -> gate.
-
-    train tasks drive reflect; val tasks gate the update (test is held out by the
-    caller). With ``gate_mode='off'`` edits are accepted greedily (no val-improve
-    requirement) — the user opts out of hard filtering — but val scores are still
-    recorded so the report shows whether quality moved.
-
-    Skill and memory are evolved in sequence (skill first if both enabled).
-    """
-    train_tasks, val_tasks = _split(tasks)
-    gate_off = str(gate_mode).strip().lower() in {"off", "none", "false", "greedy"}
-
-    # ── baseline on the VAL slice (the gate reference) ────────────────────
-    base_pairs = replay_batch(backend, val_tasks, skill, memory)
-    base_hard, base_soft = aggregate_scores(base_pairs)
-    base_score = select_gate_score(base_hard, base_soft, gate_metric, gate_mixed_weight)
-
-    # ── reflect over TRAIN-split failures/successes ───────────────────────
-    train_pairs = replay_batch(backend, train_tasks, skill, memory)
-    failures = [(t, r) for (t, r) in train_pairs if r.hard < 1.0]
-    successes = [(t, r) for (t, r) in train_pairs if r.hard >= 1.0]
-
-    cand_skill, cand_memory = skill, memory
-    all_applied: List[EditRecord] = []
-    all_rejected: List[EditRecord] = []
-
-    def _gate_apply(doc: str, edits: List[EditRecord], which: str) -> str:
-        nonlocal cand_skill, cand_memory, base_score, all_applied, all_rejected
-        if not edits:
-            return doc
-        new_doc, applied = apply_edits(doc, edits)
-        if not applied:
-            return doc
-        # score the candidate on the VAL slice
-        trial_skill = new_doc if which == "skill" else cand_skill
-        trial_memory = new_doc if which == "memory" else cand_memory
-        pairs = replay_batch(backend, val_tasks, trial_skill, trial_memory)
-        h, s = aggregate_scores(pairs)
-        cand_score = select_gate_score(h, s, gate_metric, gate_mixed_weight)
-        # gate OFF: accept greedily (no regression check); gate ON: strict improve
-        if gate_off or cand_score > base_score:
-            base_score = max(base_score, cand_score)
-            all_applied.extend(applied)
-            return new_doc
-        all_rejected.extend(applied)
-        return doc
-
-    if evolve_skill:
-        if rollouts_k > 1:
-            # multi-rollout contrastive reflection: run each train task K times
-            # and distill a rule from the good-vs-bad contrast (the "脑补" signal).
-            from skillopt.sleep.rollout import multi_rollout, contrastive_reflect
-            sets = [multi_rollout(backend, t, cand_skill, cand_memory, k=rollouts_k)
-                    for t in train_tasks]
-            edits = contrastive_reflect(
-                backend, sets, cand_skill, cand_memory,
-                edit_budget=edit_budget, target="skill",
-            )
-            # fall back to single-shot reflect if contrast yielded nothing
-            if not edits:
-                edits = backend.reflect(
-                    failures, successes, cand_skill, cand_memory,
-                    edit_budget=edit_budget, evolve_skill=True, evolve_memory=False,
-                )
-        else:
-            edits = backend.reflect(
-                failures, successes, cand_skill, cand_memory,
-                edit_budget=edit_budget, evolve_skill=True, evolve_memory=False,
-            )
-        cand_skill = _gate_apply(cand_skill, edits, "skill")
-
-    if evolve_memory:
-        # re-evaluate failures under the (possibly improved) skill
-        train_pairs2 = replay_batch(backend, train_tasks, cand_skill, cand_memory)
-        failures2 = [(t, r) for (t, r) in train_pairs2 if r.hard < 1.0]
-        successes2 = [(t, r) for (t, r) in train_pairs2 if r.hard >= 1.0]
-        edits_m = backend.reflect(
-            failures2, successes2, cand_skill, cand_memory,
-            edit_budget=edit_budget, evolve_skill=False, evolve_memory=True,
-        )
-        cand_memory = _gate_apply(cand_memory, edits_m, "memory")
-
-    # ── final decision, scored on the VAL slice ───────────────────────────
-    final_pairs = replay_batch(backend, val_tasks, cand_skill, cand_memory)
-    final_hard, final_soft = aggregate_scores(final_pairs)
-    final_score = select_gate_score(final_hard, final_soft, gate_metric, gate_mixed_weight)
-    base_gate_score = select_gate_score(base_hard, base_soft, gate_metric, gate_mixed_weight)
-
-    if gate_off:
-        # greedy mode: keep whatever edits we applied; report quality movement
-        accepted = bool(all_applied)
-        if final_score > base_gate_score:
-            action = "greedy_improved"
-        elif final_score < base_gate_score:
-            action = "greedy_regressed"
-        else:
-            action = "greedy_flat" if all_applied else "greedy_noop"
-    elif _HAVE_REPO_GATE:
-        gate = evaluate_gate(
-            candidate_skill=cand_skill,
-            cand_hard=final_hard,
-            current_skill=skill,
-            current_score=base_gate_score,
-            best_skill=skill,
-            best_score=base_gate_score,
-            best_step=night - 1,
-            global_step=night,
-            cand_soft=final_soft,
-            metric=gate_metric,
-            mixed_weight=gate_mixed_weight,
-        )
-        action = gate.action
-        accepted = bool(all_applied) and final_score > base_gate_score
-    else:
-        action = "accept" if final_score > base_gate_score else "reject"
-        accepted = bool(all_applied) and final_score > base_gate_score
-
-    return ConsolidationResult(
-        accepted=accepted,
-        gate_action=action,
-        baseline_score=base_gate_score,
-        candidate_score=final_score,
-        new_skill=cand_skill if accepted else skill,
-        new_memory=cand_memory if accepted else memory,
-        applied_edits=all_applied,
-        rejected_edits=all_rejected,
-        holdout_baseline=base_hard,
-        holdout_candidate=final_hard,
-    )
--- a/skillopt/sleep/cycle.py
+++ b/skillopt/sleep/cycle.py
@@ -1,223 +0,0 @@
-"""SkillOpt-Sleep — the nightly cycle orchestrator.
-
-run_sleep_cycle() wires the stages:
-    harvest -> mine -> replay -> consolidate(gate) -> stage  (-> optional adopt)
-
-It is pure-Python and import-light; with backend="mock" it runs with no API
-key and no third-party deps, which is what the deterministic experiment and
-CI use. With backend="anthropic" it spends the user's budget for real lift.
-"""
-from __future__ import annotations
-
-import os
-import time
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
-
-from skillopt.sleep.backend import get_backend
-from skillopt.sleep.config import SleepConfig, load_config
-from skillopt.sleep.consolidate import consolidate
-from skillopt.sleep.harvest import harvest
-from skillopt.sleep.memory import ensure_skill_scaffold
-from skillopt.sleep.mine import mine
-from skillopt.sleep.state import SleepState, _now_iso
-from skillopt.sleep.staging import write_staging, adopt as adopt_staging
-from skillopt.sleep.types import SessionDigest, SleepReport, TaskRecord
-
-
-@dataclass
-class CycleOutcome:
-    """Result bundle returned by ``run_sleep_cycle``."""
-
-    # The per-night SleepReport populated during the cycle.
-    report: SleepReport
-    # Directory returned by write_staging; "" when nothing was staged
-    # (dry run, or no tasks were mined).
-    staging_dir: str
-    # True only when the adopt step ran and returned at least one path.
-    adopted: bool
-    # Paths returned by the adopt step; empty when adoption did not run.
-    adopted_paths: List[str]
-
-
-def _project_paths(cfg: SleepConfig) -> str:
-    """Return the directory of the project being evolved.
-
-    The ``invoked_project`` config value wins whenever it is set (truthy);
-    otherwise the current working directory is used. The former
-    ``projects == "invoked"`` special case returned exactly the value this
-    fallback expression yields, so the redundant branch has been removed.
-
-    Parameters
-    ----------
-    cfg : configuration object exposing ``get(key)``.
-
-    Returns
-    -------
-    The project directory; the live CLAUDE.md is looked up inside it.
-    """
-    return cfg.get("invoked_project") or os.getcwd()
-
-
-def _read(path: str) -> str:
-    """Return the UTF-8 text of ``path``, or "" if it cannot be read.
-
-    Any failure while opening or reading is treated as "no content"; this
-    helper is deliberately best-effort and never raises.
-    """
-    content = ""
-    try:
-        with open(path, encoding="utf-8") as handle:
-            content = handle.read()
-    except Exception:
-        # Missing/unreadable file: keep the empty default.
-        pass
-    return content
-
-
-def _render_report_md(report: SleepReport, cfg: SleepConfig) -> str:
-    """Render one night's report as a Markdown document.
-
-    The output has a fixed header (project, backend, counts, scores, gate
-    decision, tokens), followed by optional "Accepted edits", "Rejected by
-    gate" and "Notes" sections (each emitted only when non-empty), and a
-    closing reviewer hint. Lines are joined with "\\n".
-    """
-    backend_name = cfg.get('backend')
-    replay_mode = cfg.get('replay_mode')
-
-    md: List[str] = []
-    md.append(f"# SkillOpt-Sleep — night {report.night} report")
-    md.append("")
-    md.append(f"- project: `{report.project}`")
-    md.append(f"- backend: `{backend_name}`  replay: `{replay_mode}`")
-    md.append(f"- sessions harvested: {report.n_sessions}")
-    md.append(f"- tasks mined: {report.n_tasks}  (replayed: {report.n_replayed})")
-    md.append(f"- held-out score: {report.baseline_score:.3f} -> {report.candidate_score:.3f}")
-    md.append(f"- gate: **{report.gate_action}** (accepted={report.accepted})")
-    md.append(f"- tokens used: {report.tokens_used}")
-    md.append("")
-
-    if report.edits:
-        md.append("## Accepted edits")
-        md.extend(
-            f"- [{edit.target}/{edit.op}] {edit.content}  \n  _why: {edit.rationale}_"
-            for edit in report.edits
-        )
-        md.append("")
-
-    if report.rejected_edits:
-        md.append("## Rejected by gate (kept as negative feedback)")
-        md.extend(
-            f"- [{edit.target}/{edit.op}] {edit.content}"
-            for edit in report.rejected_edits
-        )
-        md.append("")
-
-    if report.notes:
-        md.append("## Notes")
-        md.extend(f"- {note}" for note in report.notes)
-        md.append("")
-
-    md.append("_Review, then run `/sleep adopt` to apply, or discard this folder._")
-    return "\n".join(md)
-
-
-def run_sleep_cycle(
-    cfg: Optional[SleepConfig] = None,
-    *,
-    seed_tasks: Optional[List[TaskRecord]] = None,
-    dry_run: bool = False,
-    clock: Optional[float] = None,
-) -> CycleOutcome:
-    """Run one full sleep cycle and return the outcome.
-
-    Stages, in order: harvest + mine (skipped when ``seed_tasks`` is given),
-    load the live skill/memory documents, consolidate behind the gate, then
-    stage the proposal and optionally adopt it.
-
-    Parameters
-    ----------
-    cfg : SleepConfig
-        Configuration; ``load_config()`` is used when omitted.
-    seed_tasks : optional pre-built TaskRecords (used by the experiment to
-        inject a known persona instead of harvesting ~/.claude).
-    dry_run : harvest+mine+replay but DO NOT stage/adopt (report only).
-        State is also not saved in this mode.
-    clock : fixed epoch seconds for deterministic timestamps in tests.
-
-    Returns
-    -------
-    CycleOutcome
-        The report, the staging directory ("" when nothing was staged),
-        and the adoption flag/paths.
-    """
-    cfg = cfg or load_config()
-    state = SleepState.load(cfg.state_path)
-    night = state.begin_night(clock)
-    project = _project_paths(cfg)
-    started = _now_iso(clock)
-
-    backend = get_backend(
-        cfg.get("backend", "mock"),
-        model=cfg.get("model", ""),
-        codex_path=cfg.get("codex_path", ""),
-    )
-
-    # ── 1+2. harvest + mine (unless seed_tasks injected) ─────────────────
-    digests: List[SessionDigest] = []
-    if seed_tasks is not None:
-        # Injected tasks bypass harvesting entirely, so no sessions are counted.
-        tasks = seed_tasks
-        n_sessions = 0
-    else:
-        since = state.last_harvest_for(project)
-        digests = harvest(
-            cfg.transcripts_dir,
-            scope=cfg.get("projects", "invoked"),
-            invoked_project=cfg.get("invoked_project", ""),
-            since_iso=since,
-            # Harvest up to three times the nightly task cap.
-            limit=cfg.get("max_tasks_per_night", 40) * 3,
-        )
-        n_sessions = len(digests)
-        # When a real backend is configured, use it to mine checkable tasks from
-        # the transcripts (rubric/rule judges); otherwise fall back to the
-        # heuristic miner (no API, no checkable reference).
-        llm_miner = None
-        if cfg.get("backend", "mock") != "mock" and cfg.get("llm_mine", True):
-            try:
-                from skillopt.sleep.llm_miner import make_llm_miner
-                llm_miner = make_llm_miner(backend, max_tasks=cfg.get("max_tasks_per_night", 40))
-            except Exception:
-                # Best-effort: any failure building the LLM miner falls back
-                # to mine() without one.
-                llm_miner = None
-        tasks = mine(
-            digests,
-            max_tasks=cfg.get("max_tasks_per_night", 40),
-            holdout_fraction=cfg.get("holdout_fraction", 0.34),
-            seed=cfg.get("seed", 42),
-            llm_miner=llm_miner,
-        )
-
-    # ── live skill/memory docs ───────────────────────────────────────────
-    live_memory_path = os.path.join(project, "CLAUDE.md")
-    live_skill_path = cfg.managed_skill_path()
-    skill = _read(live_skill_path)
-    memory = _read(live_memory_path)
-    if not skill:
-        # No readable skill file yet: start from a generated scaffold.
-        skill = ensure_skill_scaffold(
-            "", name=cfg.get("managed_skill_name", "skillopt-sleep-learned"),
-            description="Preferences and procedures learned from past Claude Code sessions.",
-        )
-
-    report = SleepReport(
-        night=night, project=project, started_at=started,
-        n_sessions=n_sessions, n_tasks=len(tasks),
-    )
-
-    if not tasks:
-        # Nothing to replay: record the night and return early without staging.
-        report.ended_at = _now_iso(clock)
-        report.notes.append("no tasks mined — nothing to consolidate")
-        state.set_last_harvest(project, started)
-        state.record_night({"night": night, "accepted": False, "n_tasks": 0})
-        if not dry_run:
-            state.save()
-        staging_dir = ""
-        return CycleOutcome(report, staging_dir, False, [])
-
-    # ── 3+4. replay + consolidate (gate) ─────────────────────────────────
-    result = consolidate(
-        backend, tasks, skill, memory,
-        edit_budget=cfg.get("edit_budget", 4),
-        gate_metric=cfg.get("gate_metric", "mixed"),
-        gate_mixed_weight=cfg.get("gate_mixed_weight", 0.5),
-        gate_mode=cfg.get("gate_mode", "on"),
-        evolve_skill=cfg.get("evolve_skill", True),
-        evolve_memory=cfg.get("evolve_memory", True),
-        night=night,
-    )
-
-    # Copy the consolidation result onto the report.
-    report.n_replayed = len(tasks)
-    report.baseline_score = result.baseline_score
-    report.candidate_score = result.candidate_score
-    report.accepted = result.accepted
-    report.gate_action = result.gate_action
-    report.edits = result.applied_edits
-    report.rejected_edits = result.rejected_edits
-    report.tokens_used = backend.tokens_used()
-    report.ended_at = _now_iso(clock)
-
-    # ── 5. stage (unless dry-run) ────────────────────────────────────────
-    staging_dir = ""
-    adopted = False
-    adopted_paths: List[str] = []
-    if not dry_run:
-        report_md = _render_report_md(report, cfg)
-        # A document is proposed only when its evolve flag is set AND the gate
-        # accepted the candidate; otherwise None is passed to write_staging.
-        # NOTE(review): these lookups pass no default, while consolidate() above
-        # is called with a default of True — confirm SleepConfig.get supplies
-        # one, otherwise a missing key suppresses the proposal.
-        proposed_skill = result.new_skill if (cfg.get("evolve_skill") and result.accepted) else None
-        proposed_memory = result.new_memory if (cfg.get("evolve_memory") and result.accepted) else None
-        staging_dir = write_staging(
-            project,
-            report=report,
-            proposed_skill=proposed_skill,
-            proposed_memory=proposed_memory,
-            live_skill_path=live_skill_path,
-            live_memory_path=live_memory_path,
-            report_md=report_md,
-        )
-        state.set_last_harvest(project, started)
-        state.record_night({
-            "night": night, "accepted": result.accepted,
-            "baseline": result.baseline_score, "candidate": result.candidate_score,
-            "n_tasks": len(tasks), "staging": staging_dir,
-        })
-        # ── 6. adopt (opt-in) ────────────────────────────────────────────
-        if cfg.get("auto_adopt") and result.accepted:
-            adopted_paths = adopt_staging(staging_dir)
-            adopted = bool(adopted_paths)
-        state.save()
-
-    return CycleOutcome(report, staging_dir, adopted, adopted_paths)
--- a/skillopt/sleep/experiments/init.py
+++ b/skillopt/sleep/experiments/init.py
@@ -1 +0,0 @@
-"""SkillOpt-Sleep experiments."""
--- a/skillopt/sleep/experiments/gbrain_bench.py
+++ b/skillopt/sleep/experiments/gbrain_bench.py
@@ -1,119 +0,0 @@
-"""SkillOpt-Sleep — gbrain-evals benchmark adapter.
-
-Loads gbrain-evals' `skillopt-v1` benchmark (deficient skills + train/held-out
-task sets with rule-based judges) into our TaskRecord format, so we can run the
-SkillOpt-Sleep cycle against the SAME suite gbrain publishes a scorecard for:
-
-  docs/benchmarks/2026-06-03-skillopt.md  — "4/4 skills 0 -> 1.00"
-
-Each gbrain seed dir has:
-  SKILL.md          — the deliberately deficient starting skill
-  benchmark.jsonl   — training tasks  {task_id, task, judge:{kind:"rule",checks}}
-  held-out.jsonl    — held-out tasks (same judge shape, unseen items)
-
-We map:
-  benchmark.jsonl -> TaskRecords with split="train" or split="val"
-  held-out.jsonl  -> TaskRecords with split="test"
-  judge           -> TaskRecord.judge (+ reference_kind="rule")
-
-This lets us reproduce gbrain's headline result with our engine and either the
-claude or codex backend, scoring locally via skillopt.sleep.judges (no judge API).
-"""
-from __future__ import annotations
-
-import json
-import os
-from typing import Dict, List, Optional, Tuple
-
-from skillopt.sleep.types import TaskRecord
-
-
-# Maps each seed name used by this adapter to the sub-directory that
-# load_seed() and available_seeds() look for under the data root.
-SEED_DIRS = {
-    "brief-writer": "seed-missing-structure",
-    "thorough-analyst": "seed-verbose",
-    "advisor": "seed-no-verdict",
-    "quick-answerer": "seed-no-brain-first",
-}
-
-
-def _load_jsonl(path: str) -> List[dict]:
-    """Read a JSON-lines file and return its decoded records.
-
-    A missing file yields an empty list. Blank lines are ignored and lines
-    that fail to decode are skipped silently.
-    """
-    records: List[dict] = []
-    if os.path.exists(path):
-        with open(path, encoding="utf-8") as handle:
-            for raw in handle:
-                text = raw.strip()
-                if not text:
-                    continue
-                try:
-                    records.append(json.loads(text))
-                except Exception:
-                    # Best-effort loader: drop the undecodable line.
-                    continue
-    return records
-
-
-def _to_task(rec: dict, *, seed: str, split: str) -> TaskRecord:
-    """Convert one benchmark record into a rule-judged TaskRecord.
-
-    The id is ``"<seed>:<task_id>"``, the project is ``"gbrain/<seed>"`` and
-    the record's ``judge`` mapping is carried over (``{}`` when absent or
-    falsy).
-    """
-    task_id = rec.get('task_id', '')
-    judge_spec = rec.get("judge", {}) or {}
-    intent_text = str(rec.get("task", ""))
-    return TaskRecord(
-        id=f"{seed}:{task_id}",
-        project=f"gbrain/{seed}",
-        intent=intent_text,
-        reference_kind="rule",
-        judge=judge_spec,
-        tags=[f"seed:{seed}"],
-        split=split,
-    )
-
-
-def load_seed(data_root: str, seed: str, *, val_fraction: float = 0.34,
-              split_seed: int = 42) -> Tuple[str, List[TaskRecord]]:
-    """Return (deficient_skill_md, tasks) for one gbrain seed.
-
-    Split mapping:
-      * ``held-out.jsonl``  -> ``test`` tasks
-      * ``benchmark.jsonl`` -> ``train`` or ``val``, decided per task by a
-        SHA-256 bucket of ``split_seed`` + task id, so the assignment is
-        deterministic
-    If no task lands in ``val``, the first ``train`` task is moved there.
-    The skill text is "" when ``SKILL.md`` does not exist.
-    """
-    import hashlib
-
-    seed_dir = os.path.join(data_root, SEED_DIRS.get(seed, seed))
-
-    skill_md = ""
-    skill_file = os.path.join(seed_dir, "SKILL.md")
-    if os.path.exists(skill_file):
-        with open(skill_file, encoding="utf-8") as handle:
-            skill_md = handle.read()
-
-    # Buckets 0..threshold-1 (out of 100) go to val.
-    threshold = int(round(val_fraction * 100))
-
-    def _bucket(task_id: str) -> int:
-        digest = hashlib.sha256((str(split_seed) + task_id).encode()).hexdigest()
-        return int(digest, 16) % 100
-
-    # benchmark pool -> train/val
-    pool: List[TaskRecord] = []
-    for rec in _load_jsonl(os.path.join(seed_dir, "benchmark.jsonl")):
-        task = _to_task(rec, seed=seed, split="train")
-        task.split = "val" if _bucket(task.id) < threshold else "train"
-        pool.append(task)
-
-    # held-out -> test
-    held_out = [
-        _to_task(rec, seed=seed, split="test")
-        for rec in _load_jsonl(os.path.join(seed_dir, "held-out.jsonl"))
-    ]
-    tasks = pool + held_out
-
-    # guarantee a non-empty val
-    if not any(t.split == "val" for t in tasks):
-        first_train = next((t for t in tasks if t.split == "train"), None)
-        if first_train is not None:
-            first_train.split = "val"
-    return skill_md, tasks
-
-
-def available_seeds(data_root: str) -> List[str]:
-    """List the seed names whose sub-directory exists under ``data_root``."""
-    found: List[str] = []
-    for name, subdir in SEED_DIRS.items():
-        if os.path.isdir(os.path.join(data_root, subdir)):
-            found.append(name)
-    return found
-
-
-def find_data_root(explicit: str = "") -> Optional[str]:
-    """Locate eval/data/skillopt-v1 from common clone locations.
-
-    ``explicit`` (when non-empty) is tried first, then three conventional
-    clone paths. Returns the first candidate that is an existing directory,
-    or ``None`` when none is.
-    """
-    fallbacks = (
-        os.path.expanduser("~/git/gbrain-evals/eval/data/skillopt-v1"),
-        "/tmp/gbrain-evals/eval/data/skillopt-v1",
-        os.path.expanduser("~/gbrain-evals/eval/data/skillopt-v1"),
-    )
-    candidates = ([explicit] if explicit else []) + list(fallbacks)
-    return next((c for c in candidates if c and os.path.isdir(c)), None)
--- a/skillopt/sleep/experiments/personas.py
+++ b/skillopt/sleep/experiments/personas.py
@@ -1,86 +0,0 @@
-"""SkillOpt-Sleep — persona task fixtures for the validation experiment.
-
-Each persona is a list of TaskRecords with EXACT checkable references and a
-`rule:<key>` tag naming the single skill rule that makes the task solvable
-(consumed by MockBackend). This lets the experiment prove — deterministically,
-with no API — that nightly consolidation lifts a held-out score and that the
-gate blocks regressions.
-
-Personas mirror the user's framing: programmer / researcher / analyst.
-"""
-from __future__ import annotations
-
-from typing import List
-
-from skillopt.sleep.types import TaskRecord
-
-
-def _t(i, intent, ref, rule, project="/personas/demo", outcome="fail") -> TaskRecord:
-    """Build one persona TaskRecord with an exact-match reference.
-
-    ``rule`` names the skill rule attached as the ``rule:<key>`` tag and is
-    also embedded in the task id together with the index ``i``.
-    """
-    fields = dict(
-        id=f"persona_{rule}_{i}",
-        project=project,
-        intent=intent,
-        context_excerpt="",
-        attempted_solution="",
-        outcome=outcome,
-        reference_kind="exact",
-        reference=ref,
-        tags=[f"rule:{rule}"],
-        source_sessions=[f"sess_{i}"],
-    )
-    return TaskRecord(**fields)
-
-
-def researcher_persona() -> List[TaskRecord]:
-    """Researcher who always wants arXiv ids wrapped in <answer> tags.
-
-    Every task carries two rule tags, ``rule:wrap-answer`` and
-    ``rule:arxiv-id``.
-    """
-    questions_and_ids = [
-        ("Give me the arXiv id for the SkillOpt paper", "arXiv:2605.23904"),
-        ("What's the arXiv id of the Attention paper?", "arXiv:1706.03762"),
-        ("arXiv id for the GAN paper?", "arXiv:1406.2661"),
-        ("arXiv id for BERT?", "arXiv:1810.04805"),
-        ("arXiv id for the ResNet paper?", "arXiv:1512.03385"),
-        ("arXiv id for the Adam optimizer paper?", "arXiv:1412.6980"),
-        ("arXiv id for Dropout?", "arXiv:1207.0580"),
-        ("arXiv id for the Transformer-XL paper?", "arXiv:1901.02860"),
-        ("arXiv id for word2vec?", "arXiv:1301.3781"),
-        ("arXiv id for the VAE paper?", "arXiv:1312.6114"),
-        ("arXiv id for batch norm?", "arXiv:1502.03167"),
-        ("arXiv id for GPT-3?", "arXiv:2005.14165"),
-    ]
-    tasks: List[TaskRecord] = []
-    for idx, (question, arxiv_id) in enumerate(questions_and_ids):
-        task = _t(idx, question, arxiv_id, "wrap-answer")
-        # Both rules required: format the id (arxiv-id) AND wrap in answer tags.
-        task.tags = ["rule:wrap-answer", "rule:arxiv-id"]
-        tasks.append(task)
-    return tasks
-
-
-def programmer_persona() -> List[TaskRecord]:
-    """Programmer who wants imperative-mood commit subjects.
-
-    Every task is tagged with the single rule ``commit-imperative``.
-    """
-    prompts_and_subjects = [
-        ("commit message for adding a login form", "Add login form"),
-        ("commit message for fixing the null pointer bug", "Fix null pointer in parser"),
-        ("commit message for updating the README", "Update README"),
-        ("commit message for removing dead code", "Remove dead code"),
-        ("commit message for bumping the version", "Bump version to 1.2.0"),
-        ("commit message for refactoring the auth module", "Refactor auth module"),
-        ("commit message for adding tests", "Add unit tests for scheduler"),
-        ("commit message for fixing the CI pipeline", "Fix CI pipeline"),
-    ]
-    tasks: List[TaskRecord] = []
-    for idx, (prompt, subject) in enumerate(prompts_and_subjects):
-        tasks.append(_t(idx, prompt, subject, "commit-imperative"))
-    return tasks
-
-
-def harmful_edit_task() -> TaskRecord:
-    """A task whose 'fix' is a known-bad rule; used to prove the gate rejects
-    regressions. The MockBackend proposes the harmful rule on this failure,
-    but applying it does NOT raise the held-out score, so the gate must reject.
-
-    The reference is passed straight to the builder; the earlier placeholder
-    value was overwritten immediately and never observed.
-    """
-    return _t(
-        99,
-        "answer this freely",
-        "an-answer-that-the-harmful-rule-cannot-produce",
-        "__harmful__",
-    )
-
-
-# Registry of persona name -> zero-argument factory returning that persona's
-# task list.
-PERSONAS = {
-    "researcher": researcher_persona,
-    "programmer": programmer_persona,
-}
--- a/skillopt/sleep/experiments/report.py
+++ b/skillopt/sleep/experiments/report.py
@@ -1,132 +0,0 @@
-"""SkillOpt-Sleep — turn a sweep JSONL into a presented Markdown scorecard.
-
-Usage:
-  python -m skillopt.sleep.experiments.report --in docs/sleep/sweep.jsonl \
-      --out docs/sleep/benchmark_report.md
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import sys
-from typing import Any, Dict, List
-
-
-def _load(path: str) -> List[Dict[str, Any]]:
-    """Load sweep rows from a JSON-lines file.
-
-    Parameters
-    ----------
-    path : location of the sweep JSONL file.
-
-    Returns
-    -------
-    The decoded rows in file order; an empty list when the file is missing.
-    Blank lines and lines that are not valid JSON are skipped.
-    """
-    rows: List[Dict[str, Any]] = []
-    if not os.path.exists(path):
-        return rows
-    # Read as UTF-8 explicitly: rows may hold non-ASCII text and the platform
-    # default encoding is locale-dependent.
-    with open(path, encoding="utf-8") as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            try:
-                rows.append(json.loads(line))
-            except ValueError:
-                # json.JSONDecodeError is a ValueError: skip the malformed
-                # line rather than aborting the whole report.
-                continue
-    return rows
-
-
-def _fmt_model(backend: str, model: str) -> str:
-    """Return the ``backend:model`` label, using "default" for an empty model."""
-    return f"{backend}:{model if model else 'default'}"
-
-
-def render(rows: List[Dict[str, Any]]) -> str:
-    """Render sweep result rows as a Markdown benchmark report.
-
-    Rows are grouped by ``cfg["kind"]``: ``direct``/``dual`` rows fill the
-    direct-improvement table, ``transfer`` rows fill the transfer table, and
-    every row carrying an ``"error"`` key is listed in its own section
-    (including rows that have no ``cfg`` at all).
-
-    Parameters
-    ----------
-    rows : decoded sweep JSONL records.
-
-    Returns
-    -------
-    The whole report as one newline-joined string.
-    """
-    direct = [r for r in rows if r.get("cfg", {}).get("kind") in ("direct", "dual") and "error" not in r]
-    transfer = [r for r in rows if r.get("cfg", {}).get("kind") == "transfer" and "error" not in r]
-    errors = [r for r in rows if "error" in r]
-
-    out: List[str] = []
-    out.append("# SkillOpt-Sleep — benchmark report")
-    out.append("")
-    out.append("Auto-generated from `sweep.jsonl`. Benchmark: "
-               "[gbrain-evals](https://github.com/garrytan/gbrain-evals) `skillopt-v1` "
-               "(deficient skills, train/held-out split, local rule judge — no judge-API).")
-    out.append("Held-out scores are computed by the harness, not the optimizer.")
-    out.append("")
-
-    # ── direct improvement table ──────────────────────────────────────────
-    out.append("## Direct improvement (optimize, then deploy)")
-    out.append("")
-    out.append("| Optimizer → Target | Seed | Held-out before | Held-out after | Nights | Tokens |")
-    out.append("|---|---|---|---|---|---|")
-    for r in direct:
-        c = r["cfg"]
-        if c.get("kind") == "dual":
-            # dual rows name the optimizer and the target separately
-            label = (f"{_fmt_model(c['optimizer_backend'], c.get('optimizer_model',''))}"
-                     f" → {_fmt_model(c['target_backend'], c.get('target_model',''))}")
-        else:
-            # direct rows optimize and deploy on the same backend/model
-            m = _fmt_model(c["backend"], c.get("model", ""))
-            label = f"{m} → {m}"
-        out.append(f"| {label} | {c['seed']} | "
-                   f"{r['baseline']:.2f} | **{r['after']:.2f}** | {c['nights']} | "
-                   f"{r.get('tokens','?')} |")
-    if direct:
-        n_imp = sum(1 for r in direct if r.get("improved"))
-        out.append("")
-        out.append(f"**{n_imp}/{len(direct)} configurations improved on held-out.**")
-    out.append("")
-
-    # ── transfer table ────────────────────────────────────────────────────
-    if transfer:
-        out.append("## Cross-model transfer (optimize on SOURCE, deploy frozen on TARGET)")
-        out.append("")
-        out.append("The price-difference story: spend cheap tokens optimizing overnight, "
-                   "then deploy the frozen skill on any model with no further optimization.")
-        out.append("")
-        out.append("| Source (optimizer) | Target (deploy) | Seed | Target baseline | Transferred | Gain |")
-        out.append("|---|---|---|---|---|---|")
-        for r in transfer:
-            c = r["cfg"]
-            s = _fmt_model(c["source_backend"], c.get("source_model", ""))
-            t = _fmt_model(c["target_backend"], c.get("target_model", ""))
-            out.append(f"| {s} | {t} | {c['seed']} | {r['baseline_target']:.2f} | "
-                       f"**{r['transferred']:.2f}** | {r['transfer_gain']:+.2f} |")
-        n_pos = sum(1 for r in transfer if r.get("transfer_gain", 0) > 0)
-        out.append("")
-        out.append(f"**{n_pos}/{len(transfer)} transfers were positive** "
-                   "(frozen skill helped a different model than it was optimized on).")
-        out.append("")
-
-    # ── errors (honest reporting) ─────────────────────────────────────────
-    if errors:
-        out.append("## Configs that errored (reported, not hidden)")
-        out.append("")
-        for r in errors:
-            # An error row may lack "cfg" (the filters above already allow
-            # for that); report it with an empty config instead of raising.
-            cfg = r.get("cfg", {})
-            out.append(f"- `{json.dumps(cfg)}` → {r['error']}")
-        out.append("")
-
-    out.append("## How to reproduce")
-    out.append("")
-    out.append("```bash")
-    out.append("git clone https://github.com/garrytan/gbrain-evals /tmp/gbrain-evals")
-    out.append("python -m skillopt.sleep.experiments.sweep --plan full \\")
-    out.append("    --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 --out docs/sleep/sweep.jsonl")
-    out.append("python -m skillopt.sleep.experiments.report \\")
-    out.append("    --in docs/sleep/sweep.jsonl --out docs/sleep/benchmark_report.md")
-    out.append("```")
-    out.append("")
-    return "\n".join(out)
-
-
-def main(argv=None) -> int:
-    """CLI entry point: read a sweep JSONL file and write the Markdown report.
-
-    Parameters
-    ----------
-    argv : argument list for argparse; ``None`` means ``sys.argv[1:]``.
-
-    Returns
-    -------
-    0 after the report is written, 1 when the input has no rows.
-    """
-    ap = argparse.ArgumentParser(description="Render SkillOpt-Sleep sweep report")
-    ap.add_argument("--in", dest="inp", default="docs/sleep/sweep.jsonl")
-    ap.add_argument("--out", default="docs/sleep/benchmark_report.md")
-    args = ap.parse_args(argv)
-
-    rows = _load(args.inp)
-    if not rows:
-        print(f"no rows in {args.inp}", file=sys.stderr)
-        return 1
-    md = render(rows)
-    # dirname is "" for a bare filename; fall back to the current directory.
-    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
-    # The report contains non-ASCII characters ("→", "—"), so write it as
-    # UTF-8 explicitly instead of relying on the locale default encoding.
-    with open(args.out, "w", encoding="utf-8") as f:
-        f.write(md)
-    print(f"wrote {args.out} ({len(rows)} rows)")
-    return 0
-
-
-# Script entry point: propagate main()'s return value as the exit status.
-if __name__ == "__main__":
-    sys.exit(main())
--- a/skillopt/sleep/experiments/run_experiment.py
+++ b/skillopt/sleep/experiments/run_experiment.py
@@ -1,178 +0,0 @@
-"""SkillOpt-Sleep — validation experiment.
-
-Answers the question the user posed: *does nightly offline self-evolution
-actually improve the agent?*  Runs deterministically with the MockBackend
-(no API key, reproducible) and is the acceptance test for the whole idea.
-
-What it proves:
-  1. MONOTONIC LIFT — over N sleep nights, the held-out score rises from a
-     baseline (empty skill/memory) toward 1.0 as the gate accepts the
-     general rules the persona's tasks require.
-  2. GATE SAFETY — an injected harmful edit is REJECTED (held-out score does
-     not improve), so a bad nightly proposal can never be adopted.
-  3. PLUMBING — harvest->mine->replay->consolidate->stage->adopt all run and
-     the adopted artifact, re-scored, retains the lift.
-
-Run:
-    python -m skillopt.sleep.experiments.run_experiment
-    python -m skillopt.sleep.experiments.run_experiment --persona programmer --nights 3
-    python -m skillopt.sleep.experiments.run_experiment --backend claude   # real lift
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import sys
-import tempfile
-from typing import List
-
-from skillopt.sleep.backend import get_backend
-from skillopt.sleep.consolidate import consolidate
-from skillopt.sleep.experiments.personas import (
-    PERSONAS,
-    harmful_edit_task,
-    researcher_persona,
-)
-from skillopt.sleep.memory import ensure_skill_scaffold
-from skillopt.sleep.replay import aggregate_scores, replay_batch
-from skillopt.sleep.types import TaskRecord
-
-
-def _score_holdout(backend, tasks: List[TaskRecord], skill: str, memory: str,
-                   metric: str = "mixed", w: float = 0.5) -> float:
-    """Replay the held-out tasks and return their gate score.
-
-    The persona experiment uses a 2-way split (train/val, no test), so tasks
-    whose split is "val" or "holdout" are scored; when there are none, all
-    tasks are scored instead.
-    """
-    from skillopt.sleep.consolidate import select_gate_score
-
-    held_out = [task for task in tasks if task.split in ("val", "holdout")]
-    if not held_out:
-        held_out = tasks
-    hard, soft = aggregate_scores(replay_batch(backend, held_out, skill, memory))
-    return select_gate_score(hard, soft, metric, w)
-
-
-def run(persona: str = "researcher", nights: int = 4, backend_name: str = "mock",
-        edit_budget: int = 4, seed: int = 42, model: str = "", codex_path: str = "",
-        limit_tasks: int = 0) -> dict:
-    """Run the persona validation experiment and return a result summary.
-
-    Scores the held-out tasks with an empty skill/memory, runs up to
-    ``nights`` consolidation rounds (stopping early once the candidate score
-    reaches 0.999), re-scores, and — for the mock backend only — probes
-    whether a known harmful rule ends up in the skill.
-
-    Parameters
-    ----------
-    persona : key into PERSONAS; unknown names fall back to the researcher.
-    nights : maximum number of consolidation rounds.
-    backend_name, model, codex_path : forwarded to ``get_backend``.
-    edit_budget : forwarded to ``consolidate``.
-    seed : seed used for the split assignment.
-    limit_tasks : when non-zero and smaller than the persona size, only the
-        first ``limit_tasks`` tasks are used.
-
-    Returns
-    -------
-    dict with baseline/after scores, lift, the per-night trace, token usage
-    and ``gate_blocks_harmful`` (``None`` for non-mock backends).
-    """
-    from skillopt.sleep.mine import assign_splits
-
-    make = PERSONAS.get(persona, researcher_persona)
-    items = make()
-    if limit_tasks and limit_tasks < len(items):
-        items = items[:limit_tasks]
-    tasks = assign_splits(items, holdout_fraction=0.34, seed=seed)
-    backend = get_backend(backend_name, model=model, codex_path=codex_path)
-    is_mock = (backend.name == "mock")
-
-    # start from an empty managed skill + empty memory
-    skill = ensure_skill_scaffold("", name="skillopt-sleep-learned",
-                                  description="Learned preferences.")
-    memory = ""
-
-    baseline = _score_holdout(backend, tasks, skill, memory)
-    # Night 0 is the un-evolved baseline row of the trace.
-    trace = [{"night": 0, "holdout_score": round(baseline, 4), "action": "baseline",
-              "n_edits": 0}]
-
-    for night in range(1, nights + 1):
-        res = consolidate(
-            backend, tasks, skill, memory,
-            edit_budget=edit_budget, gate_metric="mixed", gate_mixed_weight=0.5,
-            evolve_skill=True, evolve_memory=True, night=night,
-        )
-        # Only an accepted night carries its documents forward.
-        if res.accepted:
-            skill, memory = res.new_skill, res.new_memory
-        trace.append({
-            "night": night,
-            "holdout_score": round(res.candidate_score, 4),
-            "action": res.gate_action,
-            "accepted": res.accepted,
-            "n_edits": len(res.applied_edits),
-            "edits": [e.content for e in res.applied_edits],
-            "n_rejected": len(res.rejected_edits),
-        })
-        # converged: stop early if perfect
-        if res.candidate_score >= 0.999:
-            break
-
-    after = _score_holdout(backend, tasks, skill, memory)
-
-    # ── gate-safety probe (mock only; it relies on the mock's known bad rule) ──
-    harmful_rejected = None
-    if is_mock:
-        harmful_tasks = assign_splits([harmful_edit_task()] + make()[:3],
-                                      holdout_fraction=0.5, seed=seed)
-        # The score is discarded; the call only replays the probe tasks.
-        _ = _score_holdout(backend, harmful_tasks, skill, memory)
-        res_h = consolidate(backend, harmful_tasks, skill, memory,
-                            edit_budget=edit_budget, gate_metric="mixed",
-                            evolve_skill=True, evolve_memory=False, night=nights + 1)
-        harmful_rule_text = get_backend("mock").RULE_TEXT["__harmful__"]  # type: ignore[attr-defined]
-        # The probe passes when the harmful rule text is absent from the skill.
-        harmful_rejected = (harmful_rule_text not in res_h.new_skill)
-
-    result = {
-        "persona": persona,
-        "backend": backend.name,
-        "model": model or "(default)",
-        "n_tasks": len(tasks),
-        # The trace includes the night-0 baseline row, hence the -1.
-        "nights_run": len(trace) - 1,
-        "baseline_holdout": round(baseline, 4),
-        "after_holdout": round(after, 4),
-        "lift": round(after - baseline, 4),
-        "improved": after > baseline,
-        "gate_blocks_harmful": harmful_rejected,  # None for real backends
-        "tokens_used": backend.tokens_used(),
-        "final_skill_excerpt": skill[-500:],
-        "trace": trace,
-    }
-    return result
-
-
-def _assert(cond: bool, msg: str) -> None:
-    """Print ``FAIL: <msg>`` and exit with status 1 unless ``cond`` holds."""
-    if cond:
-        return
-    print(f"FAIL: {msg}")
-    raise SystemExit(1)
-
-
-def main(argv=None) -> int:
-    """CLI entry point for the validation experiment.
-
-    Parses the arguments, runs the experiment, prints either the JSON result
-    or a human-readable summary, and — with ``--assert-improves`` — exits
-    non-zero (via ``_assert``) when the held-out score did not improve or,
-    for the mock backend, when the gate let the harmful edit through.
-
-    Parameters
-    ----------
-    argv : argument list for argparse; ``None`` means ``sys.argv[1:]``.
-
-    Returns
-    -------
-    0 when the run completes without a failed assertion.
-    """
-    ap = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment")
-    ap.add_argument("--persona", default="researcher", choices=list(PERSONAS.keys()))
-    ap.add_argument("--nights", type=int, default=4)
-    ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex"])
-    ap.add_argument("--model", default="", help="backend model override")
-    ap.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
-    ap.add_argument("--edit-budget", type=int, default=4)
-    ap.add_argument("--limit-tasks", type=int, default=0, help="cap #tasks (control API cost)")
-    ap.add_argument("--json", action="store_true")
-    ap.add_argument("--assert-improves", action="store_true",
-                    help="exit nonzero unless lift>0 (and, for mock, gate blocks harmful edit)")
-    args = ap.parse_args(argv)
-
-    res = run(args.persona, nights=args.nights, backend_name=args.backend,
-              edit_budget=args.edit_budget, model=args.model,
-              codex_path=args.codex_path, limit_tasks=args.limit_tasks)
-
-    if args.json:
-        print(json.dumps(res, ensure_ascii=False, indent=2))
-    else:
-        print(f"=== SkillOpt-Sleep experiment: persona={res['persona']} "
-              f"backend={res['backend']} model={res['model']} ===")
-        print(f"tasks: {res['n_tasks']}   tokens(approx): {res['tokens_used']}")
-        print(f"baseline held-out : {res['baseline_holdout']}")
-        print(f"after  held-out   : {res['after_holdout']}   (lift {res['lift']:+.4f})")
-        # gate_blocks_harmful is None when the probe did not run (non-mock).
-        if res["gate_blocks_harmful"] is not None:
-            print(f"gate blocks harmful edit: {res['gate_blocks_harmful']}")
-        print("trace:")
-        for row in res["trace"]:
-            # The baseline row has no "edits" key; the joined text is cut to 80 chars.
-            edits = "; ".join(row.get("edits", []))[:80]
-            print(f"  night {row['night']}: holdout={row['holdout_score']} "
-                  f"{row['action']} (+{row['n_edits']} edits) {edits}")
-
-    if args.assert_improves:
-        _assert(res["improved"], "held-out score did not improve")
-        if res["gate_blocks_harmful"] is not None:
-            _assert(res["gate_blocks_harmful"], "gate failed to block harmful edit")
-            print("\nPASS: nightly consolidation improves held-out score AND gate blocks regressions.")
-        else:
-            print("\nPASS: nightly consolidation improves held-out score (real backend).")
-    return 0
-
-
-# Script entry point: propagate main()'s return value as the exit status.
-if __name__ == "__main__":
-    sys.exit(main())
--- a/skillopt/sleep/experiments/run_gbrain.py
+++ b/skillopt/sleep/experiments/run_gbrain.py
@@ -1,209 +0,0 @@
-"""SkillOpt-Sleep — run the gbrain-evals skillopt-v1 benchmark with our engine.
-
-Reproduces gbrain's "Result 1 — skills measurably improve" scorecard
-(docs/benchmarks/2026-06-03-skillopt.md) using SkillOpt-Sleep's
-consolidate() loop and either the claude or codex backend.
-
-For each deficient seed skill:
-  1. score the held-out tasks with the ORIGINAL skill            -> before
-  2. run N consolidation nights on the training tasks (gated)     -> evolve skill
-  3. score the held-out tasks with the EVOLVED skill             -> after
-
-Held-out scoring is done locally by the rule judge (no judge API). Only the
-agent's `attempt` (and the optimizer's `reflect`) spend tokens.
-
-Usage:
-    python -m skillopt.sleep.experiments.run_gbrain --backend mock
-    python -m skillopt.sleep.experiments.run_gbrain --backend claude --seeds brief-writer --nights 2
-    python -m skillopt.sleep.experiments.run_gbrain --backend codex  --data-root /tmp/gbrain-evals/eval/data/skillopt-v1
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from typing import Dict, List, Optional
-
-from skillopt.sleep.backend import build_backend, get_backend
-from skillopt.sleep.consolidate import consolidate, select_gate_score
-from skillopt.sleep.experiments.gbrain_bench import (
-    available_seeds,
-    find_data_root,
-    load_seed,
-)
-from skillopt.sleep.replay import aggregate_scores, replay_batch
-
-
-def _score(backend, tasks, skill, memory, split="test", metric="mixed", w=0.5):
-    sub = [t for t in tasks if t.split == split]
-    if not sub:  # fall back to val, then everything, so we never score on nothing
-        sub = [t for t in tasks if t.split == "val"] or tasks
-    pairs = replay_batch(backend, sub, skill, memory)
-    h, s = aggregate_scores(pairs)
-    return h, s, select_gate_score(h, s, metric, w)
-
-
-def run_seed(backend, seed: str, skill: str, tasks: List, *,
-             nights: int = 3, edit_budget: int = 4, gate_mode: str = "on",
-             slow_update: bool = True, rollouts_k: int = 1,
-             limit_replay: int = 0, limit_holdout: int = 0) -> dict:
-    memory = ""
-    # optionally cap each split to control API cost / latency.
-    # limit_replay caps train; limit_holdout caps BOTH val and test.
-    if limit_replay or limit_holdout:
-        train = [t for t in tasks if t.split == "train"]
-        val = [t for t in tasks if t.split == "val"]
-        test = [t for t in tasks if t.split == "test"]
-        if limit_replay:
-            train = train[:limit_replay]
-        if limit_holdout:
-            val = val[:limit_holdout]
-            test = test[:limit_holdout]
-        tasks = train + val + test
-    # final measure is TEST (the gbrain held-out set); val gates internally
-    bh, bs, bscore = _score(backend, tasks, skill, memory, split="test")
-    trace = [{"night": 0, "test_hard": round(bh, 3), "action": "baseline"}]
-    cur = skill
-    first_night_skill = skill
-    for night in range(1, nights + 1):
-        res = consolidate(
-            backend, tasks, cur, memory,
-            edit_budget=edit_budget, gate_metric="mixed", gate_mixed_weight=0.5,
-            gate_mode=gate_mode, rollouts_k=rollouts_k,
-            evolve_skill=True, evolve_memory=False, night=night,
-        )
-        if res.accepted:
-            cur = res.new_skill
-        if night == 1:
-            first_night_skill = cur
-        # report the TEST score each night (independent of the val gate)
-        th, _ts, _ = _score(backend, tasks, cur, memory, split="test")
-        trace.append({
-            "night": night,
-            "val_hard": round(res.holdout_candidate, 3),
-            "test_hard": round(th, 3),
-            "action": res.gate_action,
-            "accepted": res.accepted,
-            "edits": [e.content for e in res.applied_edits],
-        })
-        if th >= 0.999:
-            break
-
-    # ── SLOW UPDATE: consolidate cross-night experience into the protected
-    # long-term field. Runs regardless of gate mode (it is what preserves
-    # long-term memory even when the gate is OFF).
-    slow_text = None
-    if nights >= 2 and slow_update:
-        try:
-            from skillopt.sleep.slow_update import run_slow_update, replace_slow_field
-            val_tasks = [t for t in tasks if t.split == "val"] or tasks
-            prev_pairs = replay_batch(backend, val_tasks, first_night_skill, memory)
-            curr_pairs = replay_batch(backend, val_tasks, cur, memory)
-            slow_text = run_slow_update(
-                backend, prev_skill=first_night_skill, curr_skill=cur,
-                prev_pairs=[(t, r) for t, r in prev_pairs],
-                curr_pairs=[(t, r) for t, r in curr_pairs],
-            )
-            if slow_text:
-                cur = replace_slow_field(cur, slow_text)
-        except Exception:
-            slow_text = None
-
-    ah, as_, ascore = _score(backend, tasks, cur, memory, split="test")
-    return {
-        "seed": seed,
-        "held_out_before": round(bh, 3),
-        "held_out_after": round(ah, 3),
-        "improved": ah > bh,
-        "nights": len(trace) - 1,
-        "trace": trace,
-        "slow_update": slow_text,
-        "final_skill_tail": cur[-400:],
-    }
-
-
-def main(argv=None) -> int:
-    ap = argparse.ArgumentParser(description="Run gbrain-evals skillopt-v1 with SkillOpt-Sleep")
-    ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex"])
-    ap.add_argument("--model", default="")
-    ap.add_argument("--optimizer-backend", default="", help="route reflect/judge here (dual)")
-    ap.add_argument("--optimizer-model", default="")
-    ap.add_argument("--target-backend", default="", help="route attempt here (dual)")
-    ap.add_argument("--target-model", default="")
-    ap.add_argument("--codex-path", default="")
-    ap.add_argument("--data-root", default="", help="path to eval/data/skillopt-v1")
-    ap.add_argument("--seeds", default="", help="comma list; default = all available")
-    ap.add_argument("--nights", type=int, default=3)
-    ap.add_argument("--edit-budget", type=int, default=4)
-    ap.add_argument("--gate", default="on", choices=["on", "off", "hard", "soft"],
-                    help="on/hard/soft = validation-gated; off = greedy (no hard filter)")
-    ap.add_argument("--rollouts-k", type=int, default=1,
-                    help=">1 = multi-rollout contrastive reflection per task")
-    ap.add_argument("--budget-tokens", type=int, default=0,
-                    help="approx token budget; auto-plans nights x rollouts when set")
-    ap.add_argument("--budget-minutes", type=float, default=0.0)
-    ap.add_argument("--preferences", default="", help="free-text user preferences (prior for reflect)")
-    ap.add_argument("--limit-replay", type=int, default=0, help="cap #train tasks (cost control)")
-    ap.add_argument("--limit-holdout", type=int, default=0, help="cap #val and #test tasks (cost control)")
-    ap.add_argument("--json", action="store_true")
-    args = ap.parse_args(argv)
-
-    data_root = find_data_root(args.data_root)
-    if not data_root:
-        print("ERROR: could not find eval/data/skillopt-v1. Clone gbrain-evals and pass --data-root.",
-              file=sys.stderr)
-        return 2
-
-    seeds = [s.strip() for s in args.seeds.split(",") if s.strip()] or available_seeds(data_root)
-    backend = build_backend(
-        backend=args.backend, model=args.model,
-        optimizer_backend=args.optimizer_backend, optimizer_model=args.optimizer_model,
-        target_backend=args.target_backend, target_model=args.target_model,
-        codex_path=args.codex_path, preferences=args.preferences,
-    )
-
-    results = []
-    for seed in seeds:
-        skill, tasks = load_seed(data_root, seed)
-        if not tasks:
-            continue
-        # budget auto-planning: derive nights x rollouts_k from a token budget
-        nights, rollouts_k = args.nights, args.rollouts_k
-        if args.budget_tokens:
-            from skillopt.sleep.budget import Budget, plan_depth
-            n_train = len([t for t in tasks if t.split == "train"]) or len(tasks)
-            nights, rollouts_k = plan_depth(
-                Budget(max_tokens=args.budget_tokens), n_tasks=n_train,
-                default_nights=args.nights, default_k=args.rollouts_k,
-            )
-            if not args.json:
-                print(f"  [budget] {args.budget_tokens} tok -> nights={nights} rollouts_k={rollouts_k}")
-        r = run_seed(backend, seed, skill, tasks, nights=nights,
-                     edit_budget=args.edit_budget, rollouts_k=rollouts_k,
-                     gate_mode=("off" if args.gate == "off" else "on"),
-                     limit_replay=args.limit_replay, limit_holdout=args.limit_holdout)
-        results.append(r)
-        if not args.json:
-            print(f"  {seed:<18} held-out {r['held_out_before']:.2f} -> {r['held_out_after']:.2f}"
-                  f"  ({'IMPROVED' if r['improved'] else 'no change'}, {r['nights']} nights)")
-
-    n_improved = sum(1 for r in results if r["improved"])
-    summary = {
-        "benchmark": "gbrain-evals/skillopt-v1",
-        "backend": backend.name,
-        "model": args.model or "(default)",
-        "n_seeds": len(results),
-        "n_improved": n_improved,
-        "tokens_used": backend.tokens_used(),
-        "results": results,
-    }
-    if args.json:
-        print(json.dumps(summary, ensure_ascii=False, indent=2))
-    else:
-        print(f"\n=== {n_improved}/{len(results)} seeds improved on held-out "
-              f"(backend={backend.name}, ~{backend.tokens_used()} tokens) ===")
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/skillopt/sleep/experiments/run_transfer.py
+++ b/skillopt/sleep/experiments/run_transfer.py
@@ -1,155 +0,0 @@
-"""SkillOpt-Sleep — skill-transfer experiment (sleep scenario).
-
-Answers: "if I optimize a skill while the agent sleeps using a CHEAP model,
-does the learned skill still help an EXPENSIVE model at deploy time?" — and the
-reverse. This is the SkillOpt paper's cross-model transfer result, reproduced
-in the sleep setting, and it is the core price-difference value proposition:
-spend cheap tokens overnight, deploy the frozen skill anywhere.
-
-Protocol, per gbrain seed:
-  1. baseline_target = held-out score of the DEFICIENT skill, run on TARGET model
-  2. optimize the skill for N nights using the SOURCE model (attempt+reflect)
-  3. transferred = held-out score of the LEARNED skill, run on TARGET model,
-     with NO further optimization
-  4. (reference) direct = held-out score of a skill optimized AND run on TARGET
-
-Report baseline / direct / transferred, mirroring SkillOpt Table "transfer".
-
-Usage:
-  python -m skillopt.sleep.experiments.run_transfer \
-     --source-backend claude --source-model haiku \
-     --target-backend claude --target-model sonnet \
-     --seeds brief-writer --nights 2
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from typing import List, Optional
-
-from skillopt.sleep.backend import get_backend
-from skillopt.sleep.consolidate import consolidate, select_gate_score
-from skillopt.sleep.experiments.gbrain_bench import (
-    available_seeds, find_data_root, load_seed,
-)
-from skillopt.sleep.replay import aggregate_scores, replay_batch
-
-
-def _holdout_hard(backend, tasks, skill, memory="") -> float:
-    # transfer is measured on the true held-out TEST split
-    ho = [t for t in tasks if t.split == "test"]
-    if not ho:
-        ho = [t for t in tasks if t.split in ("val", "holdout")] or tasks
-    pairs = replay_batch(backend, ho, skill, memory)
-    h, _s = aggregate_scores(pairs)
-    return h
-
-
-def _optimize(backend, skill, tasks, *, nights, edit_budget) -> str:
-    cur = skill
-    for night in range(1, nights + 1):
-        res = consolidate(backend, tasks, cur, "",
-                          edit_budget=edit_budget, gate_metric="mixed",
-                          evolve_skill=True, evolve_memory=False, night=night)
-        if res.accepted:
-            cur = res.new_skill
-        if res.holdout_candidate >= 0.999:
-            break
-    return cur
-
-
-def run_seed(seed, skill, tasks, *, source, target, nights, edit_budget,
-             limit_replay, limit_holdout, do_direct=True) -> dict:
-    if limit_replay or limit_holdout:
-        train = [t for t in tasks if t.split == "train"]
-        val = [t for t in tasks if t.split == "val"]
-        test = [t for t in tasks if t.split == "test"]
-        if limit_replay:
-            train = train[:limit_replay]
-        if limit_holdout:
-            val = val[:limit_holdout]
-            test = test[:limit_holdout]
-        tasks = train + val + test
-
-    baseline_target = _holdout_hard(target, tasks, skill)
-
-    # optimize on SOURCE, evaluate frozen skill on TARGET
-    learned_on_source = _optimize(source, skill, tasks, nights=nights, edit_budget=edit_budget)
-    transferred = _holdout_hard(target, tasks, learned_on_source)
-
-    direct = None
-    if do_direct:
-        learned_on_target = _optimize(target, skill, tasks, nights=nights, edit_budget=edit_budget)
-        direct = _holdout_hard(target, tasks, learned_on_target)
-
-    return {
-        "seed": seed,
-        "baseline_target": round(baseline_target, 3),
-        "direct_target": (round(direct, 3) if direct is not None else None),
-        "transferred": round(transferred, 3),
-        "transfer_gain": round(transferred - baseline_target, 3),
-        "learned_skill_tail": learned_on_source[-300:],
-    }
-
-
-def main(argv=None) -> int:
-    ap = argparse.ArgumentParser(description="SkillOpt-Sleep cross-model transfer")
-    ap.add_argument("--source-backend", default="claude")
-    ap.add_argument("--source-model", default="haiku")
-    ap.add_argument("--target-backend", default="claude")
-    ap.add_argument("--target-model", default="sonnet")
-    ap.add_argument("--codex-path", default="")
-    ap.add_argument("--data-root", default="")
-    ap.add_argument("--seeds", default="brief-writer")
-    ap.add_argument("--nights", type=int, default=2)
-    ap.add_argument("--edit-budget", type=int, default=4)
-    ap.add_argument("--limit-replay", type=int, default=3)
-    ap.add_argument("--limit-holdout", type=int, default=3)
-    ap.add_argument("--no-direct", action="store_true", help="skip the direct reference (saves cost)")
-    ap.add_argument("--json", action="store_true")
-    args = ap.parse_args(argv)
-
-    data_root = find_data_root(args.data_root)
-    if not data_root:
-        print("ERROR: gbrain-evals skillopt-v1 data not found; pass --data-root", file=sys.stderr)
-        return 2
-
-    source = get_backend(args.source_backend, model=args.source_model, codex_path=args.codex_path)
-    target = get_backend(args.target_backend, model=args.target_model, codex_path=args.codex_path)
-
-    seeds = [s.strip() for s in args.seeds.split(",") if s.strip()] or available_seeds(data_root)
-    results = []
-    for seed in seeds:
-        skill, tasks = load_seed(data_root, seed)
-        if not tasks:
-            continue
-        r = run_seed(seed, skill, tasks, source=source, target=target,
-                     nights=args.nights, edit_budget=args.edit_budget,
-                     limit_replay=args.limit_replay, limit_holdout=args.limit_holdout,
-                     do_direct=not args.no_direct)
-        results.append(r)
-        if not args.json:
-            d = f" direct={r['direct_target']}" if r['direct_target'] is not None else ""
-            print(f"  {seed:<16} baseline={r['baseline_target']:.2f}"
-                  f" transferred={r['transferred']:.2f}{d}"
-                  f"  (gain {r['transfer_gain']:+.2f})")
-
-    summary = {
-        "experiment": "skillopt-sleep/transfer",
-        "source": f"{args.source_backend}:{args.source_model}",
-        "target": f"{args.target_backend}:{args.target_model}",
-        "tokens_source": source.tokens_used(),
-        "tokens_target": target.tokens_used(),
-        "results": results,
-    }
-    if args.json:
-        print(json.dumps(summary, ensure_ascii=False, indent=2))
-    else:
-        print(f"\n=== transfer {summary['source']} -> {summary['target']}: "
-              f"{sum(1 for r in results if r['transfer_gain'] > 0)}/{len(results)} positive ===")
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/skillopt/sleep/experiments/sweep.py
+++ b/skillopt/sleep/experiments/sweep.py
@@ -1,164 +0,0 @@
-"""SkillOpt-Sleep — benchmark sweep driver.
-
-Runs many (backend, model, seed, transfer-pair) configurations SEQUENTIALLY in
-one process, appending each result to a JSONL file as it finishes. Designed to
-run unattended in the background; safe to interrupt (already-written rows
-survive) and resume (skip configs whose row already exists).
-
-Then `report.py` turns the JSONL into a presented Markdown scorecard.
-
-Usage:
-  python -m skillopt.sleep.experiments.sweep --plan quick   --out docs/sleep/sweep.jsonl
-  python -m skillopt.sleep.experiments.sweep --plan full    --out docs/sleep/sweep.jsonl
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import sys
-import time
-from typing import Any, Dict, List
-
-from skillopt.sleep.backend import build_backend, get_backend
-from skillopt.sleep.experiments.gbrain_bench import find_data_root, load_seed
-from skillopt.sleep.experiments.run_gbrain import run_seed as bench_seed
-from skillopt.sleep.experiments.run_transfer import run_seed as transfer_seed
-
-
-# Plans: lists of config dicts. Kept small per-run to bound cost/latency.
-def _direct_cfg(backend, model, seed, nights=2):
-    return {"kind": "direct", "backend": backend, "model": model, "seed": seed, "nights": nights}
-
-
-def _dual_cfg(opt_backend, opt_model, tgt_backend, tgt_model, seed, nights=2):
-    # a 'direct' run on a DualBackend: strong optimizer proposes, weak target runs
-    return {"kind": "dual", "optimizer_backend": opt_backend, "optimizer_model": opt_model,
-            "target_backend": tgt_backend, "target_model": tgt_model, "seed": seed, "nights": nights}
-
-
-def _transfer_cfg(sb, sm, tb, tm, seed, nights=2):
-    return {"kind": "transfer", "source_backend": sb, "source_model": sm,
-            "target_backend": tb, "target_model": tm, "seed": seed, "nights": nights}
-
-
-PLANS: Dict[str, List[Dict[str, Any]]] = {
-    # one cheap seed each, both backends — fast sanity
-    "quick": [
-        _direct_cfg("claude", "haiku", "brief-writer", 1),
-        _direct_cfg("codex", "", "brief-writer", 2),
-    ],
-    # SkillOpt-faithful: STRONG optimizer (sonnet) proposes, WEAK target (haiku)
-    # runs — the reliable config. Plus Codex self-optimized. All 4 gbrain seeds,
-    # including quick-answerer (real tool loop).
-    "direct": [
-        _dual_cfg("claude", "sonnet", "claude", "haiku", "brief-writer"),
-        _dual_cfg("claude", "sonnet", "claude", "haiku", "advisor"),
-        _dual_cfg("claude", "sonnet", "claude", "haiku", "thorough-analyst"),
-        _dual_cfg("claude", "sonnet", "claude", "haiku", "quick-answerer"),
-        _direct_cfg("codex", "", "brief-writer"),
-        _direct_cfg("codex", "", "advisor"),
-        _direct_cfg("codex", "", "quick-answerer"),
-    ],
-    # the price-difference story: optimize cheap, deploy expensive (and reverse)
-    "transfer": [
-        _transfer_cfg("claude", "haiku", "claude", "sonnet", "brief-writer"),
-        _transfer_cfg("claude", "sonnet", "claude", "haiku", "brief-writer"),
-        _transfer_cfg("codex", "", "claude", "haiku", "brief-writer"),
-        _transfer_cfg("claude", "haiku", "codex", "", "brief-writer"),
-    ],
-}
-PLANS["full"] = PLANS["direct"] + PLANS["transfer"]
-
-
-def _cfg_key(c: Dict[str, Any]) -> str:
-    return json.dumps({k: c[k] for k in sorted(c)}, ensure_ascii=False)
-
-
-def _load_done(out_path: str) -> set:
-    done = set()
-    if os.path.exists(out_path):
-        with open(out_path) as f:
-            for line in f:
-                try:
-                    row = json.loads(line)
-                    if "cfg_key" in row:
-                        done.add(row["cfg_key"])
-                except Exception:
-                    pass
-    return done
-
-
-def _append(out_path: str, row: Dict[str, Any]) -> None:
-    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
-    with open(out_path, "a") as f:
-        f.write(json.dumps(row, ensure_ascii=False) + "\n")
-
-
-def run_one(cfg: Dict[str, Any], data_root: str, codex_path: str,
-            limit_replay: int, limit_holdout: int) -> Dict[str, Any]:
-    seed = cfg["seed"]
-    skill, tasks = load_seed(data_root, seed)
-    t0 = time.time()
-    if cfg["kind"] in ("direct", "dual"):
-        if cfg["kind"] == "dual":
-            be = build_backend(
-                optimizer_backend=cfg["optimizer_backend"], optimizer_model=cfg.get("optimizer_model", ""),
-                target_backend=cfg["target_backend"], target_model=cfg.get("target_model", ""),
-                codex_path=codex_path,
-            )
-        else:
-            be = get_backend(cfg["backend"], model=cfg.get("model", ""), codex_path=codex_path)
-        r = bench_seed(be, seed, skill, tasks, nights=cfg["nights"],
-                       limit_replay=limit_replay, limit_holdout=limit_holdout)
-        out = {"baseline": r["held_out_before"], "after": r["held_out_after"],
-               "improved": r["improved"], "tokens": be.tokens_used()}
-    else:
-        src = get_backend(cfg["source_backend"], model=cfg.get("source_model", ""), codex_path=codex_path)
-        tgt = get_backend(cfg["target_backend"], model=cfg.get("target_model", ""), codex_path=codex_path)
-        r = transfer_seed(seed, skill, tasks, source=src, target=tgt, nights=cfg["nights"],
-                          edit_budget=4, limit_replay=limit_replay, limit_holdout=limit_holdout,
-                          do_direct=False)
-        out = {"baseline_target": r["baseline_target"], "transferred": r["transferred"],
-               "transfer_gain": r["transfer_gain"],
-               "tokens": src.tokens_used() + tgt.tokens_used()}
-    out.update({"cfg": cfg, "cfg_key": _cfg_key(cfg), "elapsed_s": round(time.time() - t0, 1)})
-    return out
-
-
-def main(argv=None) -> int:
-    ap = argparse.ArgumentParser(description="SkillOpt-Sleep benchmark sweep")
-    ap.add_argument("--plan", default="quick", choices=list(PLANS.keys()))
-    ap.add_argument("--out", default="docs/sleep/sweep.jsonl")
-    ap.add_argument("--data-root", default="")
-    ap.add_argument("--codex-path", default="")
-    ap.add_argument("--limit-replay", type=int, default=3)
-    ap.add_argument("--limit-holdout", type=int, default=3)
-    args = ap.parse_args(argv)
-
-    data_root = find_data_root(args.data_root)
-    if not data_root:
-        print("ERROR: gbrain-evals data not found; pass --data-root", file=sys.stderr)
-        return 2
-
-    plan = PLANS[args.plan]
-    done = _load_done(args.out)
-    print(f"[sweep] plan={args.plan} configs={len(plan)} already_done={len(done)} -> {args.out}")
-    for i, cfg in enumerate(plan, 1):
-        key = _cfg_key(cfg)
-        if key in done:
-            print(f"[sweep] ({i}/{len(plan)}) skip (done): {cfg}")
-            continue
-        print(f"[sweep] ({i}/{len(plan)}) running: {cfg}", flush=True)
-        try:
-            row = run_one(cfg, data_root, args.codex_path, args.limit_replay, args.limit_holdout)
-        except Exception as e:  # never let one config kill the sweep
-            row = {"cfg": cfg, "cfg_key": key, "error": f"{type(e).__name__}: {e}"}
-        _append(args.out, row)
-        print(f"[sweep]   -> {json.dumps({k: v for k, v in row.items() if k not in ('cfg','cfg_key')})}", flush=True)
-    print(f"[sweep] done. rows in {args.out}: {len(_load_done(args.out))}")
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/skillopt/sleep/harvest.py
+++ b/skillopt/sleep/harvest.py
@@ -1,236 +0,0 @@
-"""SkillOpt-Sleep — Stage 1: harvest.
-
-Read the user's local Claude Code records (read-only) and normalize them
-into :class:`SessionDigest` objects.
-
-Sources (verified schema):
-  * ~/.claude/history.jsonl        — one JSON/line:
-        {"display": <prompt text>, "pastedContents": {...},
-         "timestamp": <epoch ms>, "project": <abs path>}
-  * ~/.claude/projects/<slug>/<sessionId>.jsonl — one record/line; the
-    records we care about have type "user"/"assistant" and carry:
-        message{role, content}, cwd, gitBranch, timestamp, sessionId, version
-
-This module performs NO writes and NO network calls.
-"""
-from __future__ import annotations
-
-import json
-import os
-from typing import Any, Dict, Iterable, List, Optional
-
-from skillopt.sleep.types import SessionDigest
-
-
-# Heuristic phrases that signal the user (dis)approving of prior output.
-_NEGATIVE_FEEDBACK = (
-    "still broken", "still not", "still wrong", "doesn't work", "does not work",
-    "not working", "that's wrong", "thats wrong", "incorrect", "wrong",
-    "no,", "nope", "fix it", "didn't", "did not", "broken", "error again",
-    "still failing", "still fails", "not fixed", "revert", "undo",
-    "不对", "还是不对", "还是不行", "不行", "错了", "有问题", "没修好",
-)
-_POSITIVE_FEEDBACK = (
-    "thanks", "thank you", "perfect", "great", "works now", "fixed",
-    "that works", "lgtm", "looks good", "nice", "awesome", "correct",
-    "完美", "可以了", "好的", "搞定", "对了", "正确", "谢谢",
-)
-
-
-def _iter_jsonl(path: str) -> Iterable[Dict[str, Any]]:
-    try:
-        with open(path, encoding="utf-8") as f:
-            for line in f:
-                line = line.strip()
-                if not line:
-                    continue
-                try:
-                    yield json.loads(line)
-                except Exception:
-                    continue
-    except (FileNotFoundError, IsADirectoryError, PermissionError):
-        return
-
-
-def _text_from_content(content: Any) -> str:
-    """Flatten a message.content (str or list of blocks) into text."""
-    if isinstance(content, str):
-        return content
-    if isinstance(content, list):
-        parts: List[str] = []
-        for b in content:
-            if isinstance(b, dict):
-                if b.get("type") == "text" and b.get("text"):
-                    parts.append(str(b["text"]))
-        return "\n".join(parts)
-    return ""
-
-
-def _tool_names_from_content(content: Any) -> List[str]:
-    names: List[str] = []
-    if isinstance(content, list):
-        for b in content:
-            if isinstance(b, dict) and b.get("type") == "tool_use" and b.get("name"):
-                names.append(str(b["name"]))
-    return names
-
-
-def _detect_feedback(text: str) -> List[str]:
-    low = text.lower()
-    sig: List[str] = []
-    for ph in _NEGATIVE_FEEDBACK:
-        if ph in low:
-            sig.append("neg:" + ph)
-    for ph in _POSITIVE_FEEDBACK:
-        if ph in low:
-            sig.append("pos:" + ph)
-    return sig
-
-
-def _is_meta_prompt(text: str) -> bool:
-    """Skip slash-commands / system noise that aren't real user intents."""
-    t = text.strip()
-    if not t:
-        return True
-    if t.startswith("<") and t.endswith(">"):
-        return True
-    if t.startswith("/") and len(t.split()) <= 3:
-        return True
-    if t.startswith("[Pasted text") or t.startswith("Caveat:"):
-        return True
-    return False
-
-
-def digest_transcript(path: str) -> Optional[SessionDigest]:
-    """Build a SessionDigest from one ``<sessionId>.jsonl`` transcript."""
-    session_id = os.path.splitext(os.path.basename(path))[0]
-    project = ""
-    git_branch = ""
-    started = ""
-    ended = ""
-    user_prompts: List[str] = []
-    assistant_finals: List[str] = []
-    tools: List[str] = []
-    files: List[str] = []
-    feedback: List[str] = []
-    n_user = 0
-    n_asst = 0
-
-    for rec in _iter_jsonl(path):
-        rtype = rec.get("type")
-        ts = rec.get("timestamp")
-        if isinstance(ts, str) and ts:
-            if not started:
-                started = ts
-            ended = ts
-        if rec.get("cwd") and not project:
-            project = str(rec.get("cwd"))
-        if rec.get("gitBranch") and not git_branch:
-            git_branch = str(rec.get("gitBranch"))
-        if rtype == "file-history-snapshot":
-            snap = rec.get("snapshot") or rec.get("files") or {}
-            if isinstance(snap, dict):
-                files.extend([str(k) for k in list(snap.keys())[:20]])
-        msg = rec.get("message")
-        if not isinstance(msg, dict):
-            continue
-        role = msg.get("role")
-        content = msg.get("content")
-        if role == "user":
-            text = _text_from_content(content)
-            if text and not _is_meta_prompt(text):
-                n_user += 1
-                user_prompts.append(text.strip())
-                feedback.extend(_detect_feedback(text))
-        elif role == "assistant":
-            n_asst += 1
-            tools.extend(_tool_names_from_content(content))
-            text = _text_from_content(content)
-            if text.strip():
-                assistant_finals.append(text.strip())
-
-    if n_user == 0 and n_asst == 0:
-        return None
-
-    # de-dup tools/files preserving order
-    def _dedup(xs: List[str]) -> List[str]:
-        seen = set()
-        out = []
-        for x in xs:
-            if x not in seen:
-                seen.add(x)
-                out.append(x)
-        return out
-
-    return SessionDigest(
-        session_id=session_id,
-        project=project,
-        git_branch=git_branch,
-        started_at=started,
-        ended_at=ended,
-        user_prompts=user_prompts,
-        assistant_finals=assistant_finals[-5:],  # last few finals are the useful ones
-        tools_used=_dedup(tools),
-        files_touched=_dedup(files),
-        feedback_signals=feedback,
-        n_user_turns=n_user,
-        n_assistant_turns=n_asst,
-        raw_path=path,
-    )
-
-
-def _project_matches(project: str, scope: Any, invoked: str) -> bool:
-    if scope == "all":
-        return True
-    if isinstance(scope, (list, tuple)):
-        return any(os.path.abspath(project) == os.path.abspath(p) for p in scope)
-    # "invoked": match the invoked project (or a subdir of it)
-    if not invoked:
-        return True
-    a = os.path.abspath(project)
-    b = os.path.abspath(invoked)
-    return a == b or a.startswith(b + os.sep) or b.startswith(a + os.sep)
-
-
-def harvest(
-    transcripts_dir: str,
-    *,
-    scope: Any = "all",
-    invoked_project: str = "",
-    since_iso: Optional[str] = None,
-    limit: int = 0,
-) -> List[SessionDigest]:
-    """Walk ~/.claude/projects and return digests matching scope/time.
-
-    Parameters
-    ----------
-    transcripts_dir : str    ~/.claude/projects
-    scope : "all" | "invoked" | list[path]
-    invoked_project : str    used when scope == "invoked"
-    since_iso : str|None      ISO8601; only sessions ending after this are kept
-    limit : int               cap number of digests (0 = no cap)
-    """
-    digests: List[SessionDigest] = []
-    if not os.path.isdir(transcripts_dir):
-        return digests
-
-    paths: List[str] = []
-    for root, _dirs, files in os.walk(transcripts_dir):
-        for fn in files:
-            if fn.endswith(".jsonl"):
-                paths.append(os.path.join(root, fn))
-    # newest first by mtime
-    paths.sort(key=lambda p: os.path.getmtime(p), reverse=True)
-
-    for p in paths:
-        d = digest_transcript(p)
-        if d is None:
-            continue
-        if not _project_matches(d.project or "", scope, invoked_project):
-            continue
-        if since_iso and d.ended_at and d.ended_at < since_iso:
-            continue
-        digests.append(d)
-        if limit and len(digests) >= limit:
-            break
-    return digests
--- a/skillopt/sleep/judges.py
+++ b/skillopt/sleep/judges.py
@@ -1,84 +0,0 @@
-"""SkillOpt-Sleep — rule-based judges (gbrain-evals compatible).
-
-Implements the programmatic check operators used by gbrain-evals'
-skillopt-v1 benchmark so we can score skill outputs locally, with NO judge
-API call:
-
-  * section_present <name>   — a markdown heading containing <name> exists
-  * regex <pattern>          — the pattern matches the response
-  * max_chars <n>            — response length <= n
-  * min_chars <n>            — response length >= n
-  * contains <text>          — substring present (case-insensitive)
-  * tool_called <name>       — a tool with <name> was invoked (needs a tool loop;
-                               in single-shot replay we approximate via an
-                               explicit "TOOL_CALL: <name>" marker the agent emits)
-
-A task whose judge is {"kind": "rule", "checks": [...]} passes (hard=1.0) iff
-ALL checks pass; soft = fraction of checks passed. This mirrors gbrain's
-all-checks-must-pass rule scoring and gives the gate a smooth signal.
-"""
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, List, Tuple
-
-
-def _section_present(response: str, name: str) -> bool:
-    # a markdown heading line (#, ##, ...) or bold line that contains `name`
-    pat = re.compile(
-        r"(?im)^\s{0,3}(#{1,6}\s*.*%s|\*\*.*%s.*\*\*\s*:?)\s*$" % (re.escape(name), re.escape(name))
-    )
-    if pat.search(response or ""):
-        return True
-    # also accept "Name:" style label at line start
-    label = re.compile(r"(?im)^\s*%s\s*:" % re.escape(name))
-    return bool(label.search(response or ""))
-
-
-def _check(op: str, arg: Any, response: str, tools_called: List[str]) -> bool:
-    r = response or ""
-    if op == "section_present":
-        return _section_present(r, str(arg))
-    if op == "regex":
-        try:
-            return bool(re.search(str(arg), r))
-        except re.error:
-            return False
-    if op == "max_chars":
-        return len(r) <= int(arg)
-    if op == "min_chars":
-        return len(r) >= int(arg)
-    if op == "contains":
-        return str(arg).lower() in r.lower()
-    if op == "tool_called":
-        name = str(arg).lower()
-        if any(name == t.lower() for t in tools_called):
-            return True
-        # single-shot approximation: the agent emits an explicit marker
-        return bool(re.search(r"(?i)\btool_call\s*:\s*%s\b" % re.escape(name), r))
-    # unknown op: do not block
-    return True
-
-
-def score_rule_judge(
-    judge: Dict[str, Any],
-    response: str,
-    tools_called: List[str] | None = None,
-) -> Tuple[float, float, str]:
-    """Return (hard, soft, rationale) for a gbrain-style rule judge."""
-    checks = (judge or {}).get("checks", []) or []
-    if not checks:
-        return 0.0, 0.0, "no checks"
-    tools_called = tools_called or []
-    passed = 0
-    failed_desc: List[str] = []
-    for c in checks:
-        ok = _check(c.get("op", ""), c.get("arg"), response, tools_called)
-        if ok:
-            passed += 1
-        else:
-            failed_desc.append(f"{c.get('op')}={c.get('arg')}")
-    soft = passed / len(checks)
-    hard = 1.0 if passed == len(checks) else 0.0
-    rationale = "all checks passed" if hard else "failed: " + ", ".join(failed_desc)
-    return hard, soft, rationale
--- a/skillopt/sleep/llm_miner.py
+++ b/skillopt/sleep/llm_miner.py
@@ -1,134 +0,0 @@
-"""SkillOpt-Sleep — LLM-backed task miner.
-
-The heuristic miner (mine.py) produces TaskRecords without a checkable
-reference, so real harvested transcripts can't show measurable lift. This
-module uses an optimizer backend to turn session digests into TaskRecords
-WITH a checkable rubric judge — the missing piece for real-data improvement.
-
-For each recurring intent it extracts:
-  * a clean, generalized `intent` (the reusable task, stripped of one-off specifics)
-  * a `rubric` (what a good answer must satisfy) -> stored as a rule judge of
-    `contains`/`regex`/`section_present` checks the local judge can score, OR a
-    free-text rubric scored by the backend's judge() when no programmatic check fits
-  * a preference signal (was the user satisfied?) to weight failures
-
-It is deliberately conservative: it only emits a task when it can name a
-concrete, checkable success criterion, so the gate has real signal. Tasks it
-can't make checkable are dropped (logged), not faked.
-"""
-from __future__ import annotations
-
-import json
-import re
-from typing import Any, Callable, Dict, List
-
-from skillopt.sleep.backend import Backend, _extract_json
-from skillopt.sleep.types import SessionDigest, TaskRecord
-
-
-_MINER_PROMPT = """You are mining a user's past AI-assistant sessions to find RECURRING tasks
-worth optimizing a skill for. From the session below, extract 0-3 reusable tasks.
-
-A good task is something the user asks for repeatedly or had to correct, where a
-GENERAL rule would help next time (formatting, structure, tool-use, conventions).
-Skip one-off or purely exploratory requests.
-
-For each task return:
-  - "intent": the reusable request, generalized (no one-off specifics)
-  - "checks": a list of programmatic success checks a grader can run on a future
-     answer. Each check is one of:
-        {"op":"section_present","arg":"<heading text>"}
-        {"op":"regex","arg":"<python regex the answer must match>"}
-        {"op":"contains","arg":"<substring the answer must contain>"}
-        {"op":"max_chars","arg":<int>}
-     Only include checks you are confident a GOOD answer must satisfy.
-  - "rubric": a one-sentence description of what a good answer looks like
-  - "satisfied": true/false — did the user seem satisfied with the assistant's answer?
-
-Return ONLY a JSON array (possibly empty). No prose.
-
-# Session
-project: __PROJECT__
-user prompts:
-__PROMPTS__
-assistant final (last):
-__FINAL__
-feedback signals: __FEEDBACK__
-"""
-
-
-def _digest_to_prompt(d: SessionDigest) -> str:
-    prompts = "\n".join(f"  - {p[:240]}" for p in d.user_prompts[:6]) or "  (none)"
-    final = (d.assistant_finals[-1][:400] if d.assistant_finals else "(none)")
-    return (
-        _MINER_PROMPT
-        .replace("__PROJECT__", d.project or "(unknown)")
-        .replace("__PROMPTS__", prompts)
-        .replace("__FINAL__", final)
-        .replace("__FEEDBACK__", ", ".join(d.feedback_signals[:6]) or "(none)")
-    )
-
-
-def _mk_task(d: SessionDigest, obj: Dict[str, Any], idx: int) -> TaskRecord | None:
-    intent = str(obj.get("intent", "")).strip()
-    if len(intent) < 8:
-        return None
-    checks = obj.get("checks") or []
-    rubric = str(obj.get("rubric", "")).strip()
-    satisfied = bool(obj.get("satisfied", False))
-
-    # keep only well-formed checks
-    clean_checks = []
-    for c in checks:
-        if isinstance(c, dict) and c.get("op") in {
-            "section_present", "regex", "contains", "max_chars", "min_chars",
-        }:
-            clean_checks.append({"op": c["op"], "arg": c.get("arg")})
-
-    import hashlib
-    tid = "llm_" + hashlib.sha256((d.project + intent).encode()).hexdigest()[:12]
-
-    if clean_checks:
-        return TaskRecord(
-            id=tid, project=d.project, intent=intent,
-            reference_kind="rule", judge={"kind": "rule", "checks": clean_checks},
-            outcome="success" if satisfied else "fail",
-            tags=["mined:llm"], source_sessions=[d.session_id],
-        )
-    if rubric:
-        return TaskRecord(
-            id=tid, project=d.project, intent=intent,
-            reference_kind="rubric", reference=rubric,
-            outcome="success" if satisfied else "fail",
-            tags=["mined:llm"], source_sessions=[d.session_id],
-        )
-    return None  # not checkable -> drop
-
-
-def make_llm_miner(
-    backend: Backend,
-    *,
-    max_sessions: int = 20,
-    max_tasks: int = 40,
-) -> Callable[[List[SessionDigest]], List[TaskRecord]]:
-    """Return an llm_miner(digests) -> list[TaskRecord] bound to a backend."""
-
-    def _miner(digests: List[SessionDigest]) -> List[TaskRecord]:
-        out: List[TaskRecord] = []
-        for d in digests[:max_sessions]:
-            if not d.user_prompts:
-                continue
-            raw = backend._call(_digest_to_prompt(d), max_tokens=800)  # type: ignore[attr-defined]
-            arr = _extract_json(raw, "array")
-            if not isinstance(arr, list):
-                continue
-            for i, obj in enumerate(arr[:3]):
-                if isinstance(obj, dict):
-                    t = _mk_task(d, obj, i)
-                    if t is not None:
-                        out.append(t)
-                if len(out) >= max_tasks:
-                    return out
-        return out
-
-    return _miner
--- a/skillopt/sleep/memory.py
+++ b/skillopt/sleep/memory.py
@@ -1,130 +0,0 @@
-"""SkillOpt-Sleep — skill/memory document manipulation.
-
-Applies bounded EditRecords to a skill (SKILL.md body) or memory (CLAUDE.md)
-document, and provides Dream-style consolidation helpers (dedup near-identical
-lines, drop contradictions). All edits live inside a protected, clearly-marked
-region so the sleep cycle never clobbers the user's hand-written content.
-"""
-from __future__ import annotations
-
-import re
-from typing import List, Tuple
-
-from skillopt.sleep.types import EditRecord
-
-
-LEARNED_START = "<!-- SKILLOPT-SLEEP:LEARNED START -->"
-LEARNED_END = "<!-- SKILLOPT-SLEEP:LEARNED END -->"
-_BANNER = (
-    "_This block is maintained by SkillOpt-Sleep. Edits here are proposed "
-    "offline, validated against your past tasks, and adopted only after you "
-    "approve them. Hand-edits outside this block are never touched._"
-)
-
-
-def extract_learned(doc: str) -> str:
-    s = doc.find(LEARNED_START)
-    e = doc.find(LEARNED_END)
-    if s == -1 or e == -1:
-        return ""
-    return doc[s + len(LEARNED_START):e].strip()
-
-
-def _strip_learned(doc: str) -> str:
-    while True:
-        s = doc.find(LEARNED_START)
-        if s == -1:
-            break
-        e = doc.find(LEARNED_END, s)
-        if e == -1:
-            doc = doc[:s]
-            break
-        doc = doc[:s] + doc[e + len(LEARNED_END):]
-    while "\n\n\n" in doc:
-        doc = doc.replace("\n\n\n", "\n\n")
-    return doc.rstrip()
-
-
-def set_learned(doc: str, learned_lines: List[str]) -> str:
-    """Replace the protected learned region with the given bullet lines."""
-    base = _strip_learned(doc)
-    body = "\n".join(f"- {ln.strip().lstrip('- ').strip()}" for ln in learned_lines if ln.strip())
-    block = (
-        f"\n\n{LEARNED_START}\n"
-        f"## Learned preferences & procedures\n\n{_BANNER}\n\n{body}\n"
-        f"{LEARNED_END}\n"
-    )
-    return (base + block).lstrip("\n")
-
-
-def current_learned_lines(doc: str) -> List[str]:
-    inner = extract_learned(doc)
-    lines: List[str] = []
-    for ln in inner.splitlines():
-        ln = ln.strip()
-        if ln.startswith("- "):
-            lines.append(ln[2:].strip())
-    return lines
-
-
-def _norm(s: str) -> str:
-    return re.sub(r"\s+", " ", (s or "").lower()).strip()
-
-
-def apply_edits(doc: str, edits: List[EditRecord]) -> Tuple[str, List[EditRecord]]:
-    """Apply add/delete/replace edits to the protected learned region.
-
-    Returns (new_doc, applied_edits). Dedups: an `add` whose content already
-    exists (normalized) is skipped. `delete`/`replace` match on normalized
-    anchor substring.
-    """
-    lines = current_learned_lines(doc)
-    norm_set = {_norm(l) for l in lines}
-    applied: List[EditRecord] = []
-
-    for e in edits:
-        op = (e.op or "add").lower()
-        if op == "add":
-            if _norm(e.content) in norm_set or not e.content.strip():
-                continue
-            lines.append(e.content.strip())
-            norm_set.add(_norm(e.content))
-            applied.append(e)
-        elif op == "delete":
-            anchor = _norm(e.anchor or e.content)
-            keep = [l for l in lines if anchor not in _norm(l)]
-            if len(keep) != len(lines):
-                lines = keep
-                norm_set = {_norm(l) for l in lines}
-                applied.append(e)
-        elif op == "replace":
-            anchor = _norm(e.anchor)
-            new_lines = []
-            changed = False
-            for l in lines:
-                if anchor and anchor in _norm(l):
-                    new_lines.append(e.content.strip())
-                    changed = True
-                else:
-                    new_lines.append(l)
-            if changed:
-                lines = new_lines
-                norm_set = {_norm(l) for l in lines}
-                applied.append(e)
-
-    return set_learned(doc, lines), applied
-
-
-def ensure_skill_scaffold(doc: str, *, name: str, description: str) -> str:
-    """Ensure a SKILL.md has YAML frontmatter so Claude Code loads it."""
-    if doc.lstrip().startswith("---"):
-        return doc
-    fm = (
-        "---\n"
-        f"name: {name}\n"
-        f"description: {description}\n"
-        "---\n\n"
-        f"# {name}\n\n"
-        "Preferences and procedures learned from your past Claude Code sessions.\n"
-    )
-    return fm + doc
--- a/skillopt/sleep/mine.py
+++ b/skillopt/sleep/mine.py
@@ -1,210 +0,0 @@
-"""SkillOpt-Sleep — Stage 2: mine.
-
-Turn :class:`SessionDigest` objects into :class:`TaskRecord` training units.
-
-Two miners:
-  * heuristic_mine  — deterministic, no API. Detects retry chains (a prompt
-    re-asked after negative feedback => the early attempt failed), extracts
-    the user's recurring intents, and labels outcomes from feedback signals.
-  * llm_mine        — optional; uses an optimizer backend to produce richer
-    TaskRecords with checkable references. Falls back to heuristic on error.
-
-The heuristic miner is what makes the whole cycle runnable offline and is the
-basis of the deterministic experiment.
-"""
-from __future__ import annotations
-
-import hashlib
-import re
-from typing import Any, Callable, List, Optional
-
-from skillopt.sleep.types import SessionDigest, TaskRecord
-
-
-def _tid(project: str, intent: str) -> str:
-    h = hashlib.sha256((project + "::" + intent).encode("utf-8")).hexdigest()[:12]
-    return "task_" + h
-
-
-def _short(text: str, n: int = 600) -> str:
-    text = (text or "").strip()
-    return text if len(text) <= n else text[:n] + " …"
-
-
-def _looks_negative(signals: List[str]) -> bool:
-    return any(s.startswith("neg:") for s in signals)
-
-
-def _looks_positive(signals: List[str]) -> bool:
-    return any(s.startswith("pos:") for s in signals)
-
-
-def heuristic_mine(
-    digests: List[SessionDigest],
-    *,
-    max_tasks: int = 40,
-) -> List[TaskRecord]:
-    """Deterministic miner — no API calls.
-
-    Strategy:
-      * Each session with >=1 real user prompt yields one TaskRecord whose
-        intent is the FIRST substantive prompt (the original ask).
-      * Outcome is inferred:
-          - negative feedback present and no later positive  -> "fail"
-          - positive feedback present                         -> "success"
-          - re-asks (multiple user turns) without resolution  -> "mixed"
-          - otherwise                                         -> "unknown"
-      * attempted_solution = the last assistant final (what was produced).
-      * reference_kind defaults to "none"; the consolidation step will use a
-        rubric judge for these. (Exact refs are added by the experiment data
-        or by the LLM miner when it can derive a checkable answer.)
-    """
-    tasks: List[TaskRecord] = []
-    for d in digests:
-        if not d.user_prompts:
-            continue
-        intent = d.user_prompts[0]
-        if len(intent.strip()) < 8:
-            continue
-        if _looks_positive(d.feedback_signals) and not _looks_negative(d.feedback_signals):
-            outcome = "success"
-        elif _looks_negative(d.feedback_signals):
-            outcome = "fail"
-        elif d.n_user_turns >= 3:
-            outcome = "mixed"
-        else:
-            outcome = "unknown"
-
-        attempted = d.assistant_finals[-1] if d.assistant_finals else ""
-        context = ""
-        if len(d.user_prompts) > 1:
-            # later prompts often carry the corrective detail / real constraints
-            context = "Follow-up constraints from the same session:\n- " + "\n- ".join(
-                _short(p, 200) for p in d.user_prompts[1:4]
-            )
-        tags = []
-        if d.tools_used:
-            tags.append("tools:" + "+".join(d.tools_used[:4]))
-        if d.git_branch:
-            tags.append("branch:" + d.git_branch)
-
-        tasks.append(
-            TaskRecord(
-                id=_tid(d.project, intent),
-                project=d.project,
-                intent=_short(intent, 800),
-                context_excerpt=_short(context, 600),
-                attempted_solution=_short(attempted, 600),
-                outcome=outcome,
-                reference_kind="none",
-                reference="",
-                tags=tags,
-                source_sessions=[d.session_id],
-            )
-        )
-        if len(tasks) >= max_tasks:
-            break
-    return tasks
-
-
-def dedup_tasks(tasks: List[TaskRecord]) -> List[TaskRecord]:
-    """Merge tasks sharing an id (same project+intent across sessions)."""
-    by_id: dict = {}
-    for t in tasks:
-        if t.id in by_id:
-            ex = by_id[t.id]
-            ex.source_sessions = list(dict.fromkeys(ex.source_sessions + t.source_sessions))
-            # prefer a resolved outcome if either session resolved it
-            order = {"success": 3, "fail": 2, "mixed": 1, "unknown": 0}
-            if order.get(t.outcome, 0) > order.get(ex.outcome, 0):
-                ex.outcome = t.outcome
-        else:
-            by_id[t.id] = t
-    return list(by_id.values())
-
-
-def assign_splits(
-    tasks: List[TaskRecord],
-    *,
-    val_fraction: float = 0.34,
-    test_fraction: float = 0.0,
-    holdout_fraction: float | None = None,  # legacy alias for val_fraction
-    seed: int = 42,
-) -> List[TaskRecord]:
-    """Deterministically split tasks into train / val / test.
-
-    Anti-overfitting contract (the user's design):
-      * ``val`` and ``test`` are drawn ONLY from REAL mined tasks (origin=='real')
-        and never overlap. val gates updates; test is the final held-out measure.
-      * ``train`` may include DREAM-augmented tasks (origin=='dream'); those are
-        NEVER placed in val/test.
-
-    A stable hash of the task id keeps the same real task in the same split across
-    nights (a fixed held-out gate, like SkillOpt's D_sel/D_test).
-
-    Back-compat: if ``test_fraction`` is 0 (default), this behaves like the old
-    two-way replay/holdout split — real tasks divide into train + val, no test.
-    ``holdout_fraction`` is accepted as an alias for ``val_fraction``.
-    """
-    if holdout_fraction is not None:
-        val_fraction = holdout_fraction
-
-    dream = [t for t in tasks if t.origin == "dream"]
-    real = [t for t in tasks if t.origin != "dream"]
-
-    # all dream tasks go to train, unconditionally
-    for t in dream:
-        t.split = "train"
-
-    val_cut = int(round(val_fraction * 100))
-    test_cut = val_cut + int(round(test_fraction * 100))
-    for t in real:
-        bucket = int(hashlib.sha256((str(seed) + t.id).encode()).hexdigest(), 16) % 100
-        if bucket < val_cut:
-            t.split = "val"
-        elif bucket < test_cut:
-            t.split = "test"
-        else:
-            t.split = "train"
-
-    # guarantee val (the gate) is non-empty when we have >=2 real tasks
-    real_splits = {t.split for t in real}
-    if len(real) >= 2 and "val" not in real_splits:
-        real[-1].split = "val"
-    # guarantee a train pool exists (dream or real) when possible
-    if not any(t.split == "train" for t in tasks) and len(real) >= 2:
-        real[0].split = "train"
-    # if test was requested but ended up empty with >=3 real tasks, carve one
-    if test_fraction > 0 and len(real) >= 3 and not any(t.split == "test" for t in real):
-        for t in real:
-            if t.split == "train":
-                t.split = "test"
-                break
-    return tasks
-
-
-def normalize_legacy_split(value: str) -> str:
-    """Map old split names to the new vocabulary."""
-    return {"replay": "train", "holdout": "val"}.get(value, value)
-
-
-def mine(
-    digests: List[SessionDigest],
-    *,
-    max_tasks: int = 40,
-    holdout_fraction: float = 0.34,
-    seed: int = 42,
-    llm_miner: Optional[Callable[[List[SessionDigest]], List[TaskRecord]]] = None,
-) -> List[TaskRecord]:
-    """Top-level miner. Uses ``llm_miner`` if provided, else heuristic."""
-    tasks: List[TaskRecord] = []
-    if llm_miner is not None:
-        try:
-            tasks = llm_miner(digests) or []
-        except Exception:
-            tasks = []
-    if not tasks:
-        tasks = heuristic_mine(digests, max_tasks=max_tasks)
-    tasks = dedup_tasks(tasks)
-    tasks = assign_splits(tasks, holdout_fraction=holdout_fraction, seed=seed)
-    return tasks
--- a/skillopt/sleep/replay.py
+++ b/skillopt/sleep/replay.py
@@ -1,118 +0,0 @@
-"""SkillOpt-Sleep — Stage 3: replay.
-
-Re-run mined TaskRecords offline under a given (skill, memory) and score
-them, producing the (hard, soft) signal SkillOpt's gate consumes.
-
-Single-shot text replay by default. Tasks whose rule judge requires a tool
-call (gbrain's `tool_called`) are run through the backend's real tool loop
-(attempt_with_tools), so tool use is verified honestly rather than self-reported.
-"""
-from __future__ import annotations
-
-from typing import List, Tuple
-
-from skillopt.sleep.backend import Backend
-from skillopt.sleep.types import ReplayResult, TaskRecord
-
-
-def _required_tools(task: TaskRecord) -> List[str]:
-    """Tool names a rule judge requires (op == 'tool_called')."""
-    if task.reference_kind != "rule" or not task.judge:
-        return []
-    tools = []
-    for c in task.judge.get("checks", []) or []:
-        if isinstance(c, dict) and c.get("op") == "tool_called" and c.get("arg"):
-            tools.append(str(c["arg"]))
-    return tools
-
-
-def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> ReplayResult:
-    import time
-    tools = _required_tools(task)
-    tools_called: List[str] = []
-    t0 = time.time()
-    tok_before = backend.tokens_used()
-    if tools:
-        response, tools_called = backend.attempt_with_tools(task, skill, memory, tools)
-    else:
-        response = backend.attempt(task, skill, memory)
-    latency_ms = (time.time() - t0) * 1000.0
-    tokens = max(0, backend.tokens_used() - tok_before)
-    # if the backend doesn't track tokens (e.g. mock), approximate from text length
-    if tokens == 0:
-        tokens = (len(skill) + len(memory) + len(task.intent) + len(response)) // 4
-
-    # rule judges may need the detected tool calls; score locally when possible
-    if task.reference_kind == "rule" and task.judge:
-        from skillopt.sleep.judges import score_rule_judge
-        hard, soft, rationale = score_rule_judge(task.judge, response, tools_called)
-    else:
-        hard, soft, rationale = backend.judge(task, response)
-
-    return ReplayResult(
-        id=task.id,
-        hard=float(hard),
-        soft=float(soft),
-        response=response,
-        fail_reason="" if hard >= 1.0 else (rationale or "below threshold"),
-        task_type=(task.tags[0] if task.tags else "task"),
-        judge_rationale=rationale,
-        tools_called=tools_called,
-        tokens=int(tokens),
-        latency_ms=round(latency_ms, 1),
-    )
-
-
-def replay_batch(
-    backend: Backend,
-    tasks: List[TaskRecord],
-    skill: str,
-    memory: str,
-) -> List[Tuple[TaskRecord, ReplayResult]]:
-    return [(t, replay_one(backend, t, skill, memory)) for t in tasks]
-
-
-def aggregate_scores(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> Tuple[float, float]:
-    if not pairs:
-        return 0.0, 0.0
-    hard = sum(r.hard for _t, r in pairs) / len(pairs)
-    soft = sum(r.soft for _t, r in pairs) / len(pairs)
-    return hard, soft
-
-
-def aggregate_cost(pairs: List[Tuple[TaskRecord, ReplayResult]]) -> Tuple[float, float]:
-    """Mean (tokens, latency_ms) per task — the cost objectives."""
-    if not pairs:
-        return 0.0, 0.0
-    tok = sum(r.tokens for _t, r in pairs) / len(pairs)
-    lat = sum(r.latency_ms for _t, r in pairs) / len(pairs)
-    return tok, lat
-
-
-def multi_objective_reward(
-    pairs: List[Tuple[TaskRecord, ReplayResult]],
-    *,
-    w_acc: float = 1.0,
-    w_tokens: float = 0.0,
-    w_latency: float = 0.0,
-    token_ref: float = 2000.0,
-    latency_ref_ms: float = 15000.0,
-) -> float:
-    """Weighted reward = accuracy↑, tokens↓, latency↓.
-
-    Cost terms are normalized against a reference and clamped to [0,1], so a
-    response at/under the reference cost contributes ~1.0 and an expensive one
-    less. Weights let the user trade off (default = accuracy only, backward
-    compatible).
-    """
-    if not pairs:
-        return 0.0
-    acc, _soft = aggregate_scores(pairs)
-    tok, lat = aggregate_cost(pairs)
-    tok_score = max(0.0, 1.0 - tok / max(1.0, token_ref)) if token_ref else 0.0
-    lat_score = max(0.0, 1.0 - lat / max(1.0, latency_ref_ms)) if latency_ref_ms else 0.0
-    total_w = w_acc + w_tokens + w_latency
-    if total_w <= 0:
-        return acc
-    return (w_acc * acc + w_tokens * tok_score + w_latency * lat_score) / total_w
-
--- a/skillopt/sleep/rollout.py
+++ b/skillopt/sleep/rollout.py
@@ -1,122 +0,0 @@
-"""SkillOpt-Sleep — multi-rollout + contrastive reflection ("脑补推演" core).
-
-The user's insight: let the agent re-run the SAME task many times, then look at
-which rollouts went well vs badly and distill a rule from the *contrast*. This
-is a much stronger learning signal than a single failure, and it is the essence
-of the offline "dream/imagination" process — train-time rollouts are synthetic,
-so doing many is fine.
-
-Pieces:
-  * multi_rollout   — run one task K times under (skill, memory), return scored attempts
-  * contrastive_reflect — given good vs bad attempts of the same tasks, ask the
-    optimizer what distinguishes them and propose a general rule
-
-Driven through the Backend abstraction (mock/claude/codex), import-light.
-"""
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
-
-from skillopt.sleep.backend import Backend, _extract_json
-from skillopt.sleep.replay import replay_one
-from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord
-
-
-@dataclass
-class RolloutSet:
-    """K scored attempts at one task under a fixed (skill, memory)."""
-    task: TaskRecord
-    attempts: List[ReplayResult] = field(default_factory=list)
-
-    @property
-    def best(self) -> Optional[ReplayResult]:
-        return max(self.attempts, key=lambda r: r.hard, default=None)
-
-    @property
-    def worst(self) -> Optional[ReplayResult]:
-        return min(self.attempts, key=lambda r: r.hard, default=None)
-
-    @property
-    def spread(self) -> float:
-        if not self.attempts:
-            return 0.0
-        hs = [r.hard for r in self.attempts]
-        return max(hs) - min(hs)
-
-    @property
-    def pass_rate(self) -> float:
-        if not self.attempts:
-            return 0.0
-        return sum(1 for r in self.attempts if r.hard >= 1.0) / len(self.attempts)
-
-
-def multi_rollout(
-    backend: Backend,
-    task: TaskRecord,
-    skill: str,
-    memory: str,
-    *,
-    k: int = 3,
-) -> RolloutSet:
-    """Run ``task`` K times. replay_one is deterministic for mock; for real
-    backends the model's own sampling yields variation across attempts."""
-    rs = RolloutSet(task=task)
-    for _ in range(max(1, k)):
-        rs.attempts.append(replay_one(backend, task, skill, memory))
-    return rs
-
-
-def contrastive_reflect(
-    backend: Backend,
-    rollout_sets: List[RolloutSet],
-    skill: str,
-    memory: str,
-    *,
-    edit_budget: int = 4,
-    target: str = "skill",
-) -> List[EditRecord]:
-    """Distill a rule from the contrast between good and bad attempts.
-
-    We pick tasks with the highest score *spread* (some attempts passed, some
-    failed) — those are the most informative — and show the optimizer a
-    high-scoring vs a low-scoring attempt of each, asking what general rule makes
-    the good behavior reliable.
-    """
-    informative = [rs for rs in rollout_sets if rs.spread > 0 and rs.best and rs.worst]
-    informative.sort(key=lambda rs: rs.spread, reverse=True)
-    informative = informative[:6]
-    if not informative:
-        return []
-
-    blocks = []
-    for rs in informative:
-        blocks.append(
-            f"## Task: {rs.task.intent[:160]}\n"
-            f"- GOOD attempt (score {rs.best.hard:.1f}): {rs.best.response[:200]}\n"
-            f"- BAD  attempt (score {rs.worst.hard:.1f}): {rs.worst.response[:200]}\n"
-            f"  (bad failed: {rs.worst.fail_reason[:100]})"
-        )
-    prompt = (
-        "You are SkillOpt's optimizer doing CONTRASTIVE reflection. For each task "
-        "below the agent was run multiple times; some attempts succeeded and some "
-        "failed. Identify what the GOOD attempts did that the BAD ones did not, "
-        f"and propose at most {edit_budget} SHORT, GENERAL, reusable rules for the "
-        f"{target} that would make the good behavior reliable every time. Quote "
-        "concrete thresholds/formats verbatim; do not paraphrase vaguely. "
-        'Return ONLY a JSON array: '
-        '[{"op":"add","content":"<rule>","rationale":"<what good did that bad didnt>"}].\n\n'
-        + "\n\n".join(blocks)
-    )
-    raw = backend._call(prompt, max_tokens=1024)  # type: ignore[attr-defined]
-    arr = _extract_json(raw, "array")
-    edits: List[EditRecord] = []
-    if isinstance(arr, list):
-        for e in arr[:edit_budget]:
-            if isinstance(e, dict) and str(e.get("content", "")).strip():
-                edits.append(EditRecord(
-                    target=target, op=str(e.get("op", "add")).strip().lower(),
-                    content=str(e["content"]).strip(),
-                    rationale=str(e.get("rationale", "")).strip(),
-                ))
-    return edits
--- a/skillopt/sleep/slow_update.py
+++ b/skillopt/sleep/slow_update.py
@@ -1,142 +0,0 @@
-"""SkillOpt-Sleep — slow update (cross-night long-term memory).
-
-This is the deployment-time analogue of SkillOpt's epoch-wise slow/meta update
-(paper §3.6). Step-level edits (consolidate) learn from one night's batch; the
-slow update learns across nights and writes a durable "longitudinal guidance"
-block into a PROTECTED field of the skill that step-level edits never touch.
-
-It reuses the exact protected-field marker convention from the main repo
-(``skillopt/optimizer/slow_update.py``) so the artifact is compatible:
-
-    <!-- SLOW_UPDATE_START --> ... <!-- SLOW_UPDATE_END -->
-
-Why it matters: even when the user turns the validation gate OFF (greedy mode),
-the slow update still runs at the end of the run, so short-term nightly
-experience is consolidated into long-term memory rather than lost. The cross-night
-content is carried in ``state.slow_memory``.
-
-Driven through the Backend abstraction (mock/claude/codex), so it stays
-import-light — no `openai` dependency.
-"""
-from __future__ import annotations
-
-import re
-from typing import List, Optional, Tuple
-
-from skillopt.sleep.backend import Backend, _extract_json
-from skillopt.sleep.types import ReplayResult, TaskRecord
-
-
-# HTML-comment sentinels that delimit the protected slow-update block inside a
-# skill document. The helpers in this module locate, strip and rewrite the text
-# that sits between these two markers.
-SLOW_UPDATE_START = "<!-- SLOW_UPDATE_START -->"
-SLOW_UPDATE_END = "<!-- SLOW_UPDATE_END -->"
-
-
-# ── protected-field helpers (mirror skillopt/optimizer/slow_update.py) ─────────
-
-def has_slow_field(skill: str) -> bool:
-    """Report whether ``skill`` contains both slow-update markers.
-
-    Only the presence of each marker is checked; their relative order is not.
-    """
-    return all(marker in skill for marker in (SLOW_UPDATE_START, SLOW_UPDATE_END))
-
-
-def extract_slow_field(skill: str) -> str:
-    """Return the text inside the first slow-update block, or ``""``.
-
-    Args:
-        skill: Full skill document that may contain a protected block.
-
-    Returns:
-        The whitespace-stripped text between the first ``SLOW_UPDATE_START``
-        marker and the first ``SLOW_UPDATE_END`` marker that follows it.
-        An empty string when either marker is missing.
-    """
-    start = skill.find(SLOW_UPDATE_START)
-    if start == -1:
-        return ""
-    body_start = start + len(SLOW_UPDATE_START)
-    # Look for the end marker only *after* the start marker (same rule as
-    # ``_strip_slow_fields``). Searching from index 0 would let a stray END
-    # marker earlier in the document hide a well-formed block.
-    end = skill.find(SLOW_UPDATE_END, body_start)
-    if end == -1:
-        return ""
-    return skill[body_start:end].strip()
-
-
-def _strip_slow_fields(skill: str) -> str:
-    """Remove every slow-update block from ``skill`` and tidy the remainder.
-
-    A start marker with no end marker after it truncates the document at that
-    start marker. Leftover end markers are deleted, runs of three or more
-    newlines are collapsed to two, and trailing whitespace is stripped.
-    """
-    start = skill.find(SLOW_UPDATE_START)
-    while start != -1:
-        end = skill.find(SLOW_UPDATE_END, start)
-        if end == -1:
-            # Unterminated block: drop everything from the start marker on.
-            skill = skill[:start]
-            break
-        skill = skill[:start] + skill[end + len(SLOW_UPDATE_END):]
-        start = skill.find(SLOW_UPDATE_START)
-    # Orphaned end markers (no matching start) are simply erased.
-    skill = skill.replace(SLOW_UPDATE_END, "")
-    # Collapse the blank-line runs left behind by the removed blocks.
-    skill = re.sub(r"\n{3,}", "\n\n", skill)
-    return skill.rstrip()
-
-
-def replace_slow_field(skill: str, content: str) -> str:
-    """Rewrite ``skill`` so it carries exactly one slow-update block.
-
-    Existing blocks are stripped first. When ``content`` is empty or only
-    whitespace the stripped document is returned with no block appended;
-    otherwise a single block holding the stripped ``content`` is appended.
-    """
-    stripped_skill = _strip_slow_fields(skill)
-    guidance = content.strip()
-    if guidance:
-        return f"{stripped_skill}\n\n{SLOW_UPDATE_START}\n{guidance}\n{SLOW_UPDATE_END}\n"
-    return stripped_skill
-
-
-# ── the slow-update synthesis ──────────────────────────────────────────────────
-
-def _summarize_pairs(
-    prev_pairs: List[Tuple[TaskRecord, ReplayResult]],
-    curr_pairs: List[Tuple[TaskRecord, ReplayResult]],
-) -> Tuple[str, dict]:
-    """Group adjacent-version outcomes into improved/regressed/persistent/stable.
-
-    Each current result is matched to the previous result with the same task
-    ``id``; tasks with no previous result are ignored. The ``hard`` scores of
-    the two results decide the category.
-
-    Args:
-        prev_pairs: ``(task, result)`` pairs obtained under the previous skill.
-        curr_pairs: ``(task, result)`` pairs obtained under the current skill.
-
-    Returns:
-        A ``(summary, counts)`` tuple. ``summary`` is a ``key=value`` header
-        line followed by up to eight bullet lines describing regressed or
-        persistently failing tasks. ``counts`` maps each category name to the
-        number of tasks that fell into it.
-    """
-    prev_by = {t.id: r for t, r in prev_pairs}
-    lines: List[str] = []
-    counts = {"improved": 0, "regressed": 0, "persistent_fail": 0, "stable_success": 0}
-    for t, r in curr_pairs:
-        p = prev_by.get(t.id)
-        if p is None:
-            continue
-        a, b = p.hard, r.hard
-        if b > a:
-            cat = "improved"
-        elif b < a:
-            cat = "regressed"
-        elif b >= 1.0:
-            cat = "stable_success"
-        else:
-            cat = "persistent_fail"
-        counts[cat] += 1
-        # Only the problem categories get a detail line, capped at eight.
-        if cat in ("regressed", "persistent_fail") and len(lines) < 8:
-            lines.append(f"- [{cat}] {t.intent[:120]} (why: {r.fail_reason[:80]})")
-    head = ", ".join(f"{k}={v}" for k, v in counts.items())
-    summary = head + ("\n" + "\n".join(lines) if lines else "")
-    return summary, counts
-
-
-def run_slow_update(
-    backend: Backend,
-    *,
-    prev_skill: str,
-    curr_skill: str,
-    prev_pairs: List[Tuple[TaskRecord, ReplayResult]],
-    curr_pairs: List[Tuple[TaskRecord, ReplayResult]],
-    prev_slow_content: str = "",
-) -> Optional[str]:
-    """Ask the backend for a durable cross-night guidance block.
-
-    The per-task outcomes under the previous and current skill are summarized
-    and, together with any prior guidance, sent to the backend as one prompt.
-    ``prev_skill`` and ``curr_skill`` are part of the signature but are not
-    read by this function.
-
-    Returns:
-        The ``guidance`` string from the backend's JSON reply; failing that,
-        the first 400 characters of the raw reply; or ``None`` when there is
-        nothing to learn (no regressions, no persistent failures, no prior
-        guidance) or the reply is empty.
-    """
-    outcome_summary, tally = _summarize_pairs(prev_pairs, curr_pairs)  # type: ignore[misc]
-    saw_failures = tally["regressed"] != 0 or tally["persistent_fail"] != 0
-    # Skip the backend call when nothing failed and there is no guidance to refine.
-    if not (saw_failures or prev_slow_content):
-        return None
-
-    instructions = (
-        "You are SkillOpt's SLOW UPDATE — the long-term memory pass that runs "
-        "across nights. Write a SHORT, durable guidance block (2-5 bullet "
-        "points) capturing the longitudinal lessons: behaviors that reliably "
-        "help and should be preserved, and regressions/persistent failures to "
-        "avoid. Keep it GENERAL and stable (not tied to one task). If prior "
-        "guidance is given, refine it rather than restate it.\n"
-        'Return ONLY JSON: {"guidance": "<bullet list as one string>"}.\n\n'
-    )
-    prior_guidance = prev_slow_content or "(none)"
-    prompt = (
-        instructions
-        + f"# Cross-night outcome summary\n{outcome_summary}\n\n"
-        + f"# Prior long-term guidance (refine this)\n{prior_guidance}"
-    )
-    reply = backend._call(prompt, max_tokens=600)  # type: ignore[attr-defined]
-    parsed = _extract_json(reply, "object")
-    guidance = str(parsed.get("guidance", "")).strip() if isinstance(parsed, dict) else ""
-    if guidance:
-        return guidance
-    # Fallback: the model answered in prose, so keep the first ~400 characters.
-    prose = (reply or "").strip()
-    if not prose:
-        return None
-    return prose[:400]
--- a/skillopt/sleep/staging.py
+++ b/skillopt/sleep/staging.py
@@ -1,103 +0,0 @@
-"""SkillOpt-Sleep — Stage 5/6: staging and adoption.
-
-Implements the Dreams safety contract: the cycle never mutates the user's
-live CLAUDE.md / SKILL.md. It writes proposals + a human-readable report into
-a staging directory; a separate, explicit `adopt` step copies them over the
-live files after taking a backup.
-"""
-from __future__ import annotations
-
-import json
-import os
-import shutil
-import time
-from typing import List, Optional
-
-from skillopt.sleep.types import SleepReport
-
-
-def _ts_dir() -> str:
-    """Build a directory name from the current local time (``YYYYmmdd-HHMMSS``)."""
-    now = time.localtime()
-    return time.strftime("%Y%m%d-%H%M%S", now)
-
-
-def staging_root(project: str) -> str:
-    """Return ``<project>/.skillopt-sleep/staging``, the parent of all staging runs."""
-    segments = (project, ".skillopt-sleep", "staging")
-    return os.path.join(*segments)
-
-
-def latest_staging(project: str) -> Optional[str]:
-    """Return the most recently modified staging directory for ``project``.
-
-    Args:
-        project: Project root whose staging area is inspected.
-
-    Returns:
-        Path of the sub-directory of the staging root with the newest
-        modification time, or ``None`` when the staging root does not exist
-        or holds no sub-directories.
-    """
-    root = staging_root(project)
-    if not os.path.isdir(root):
-        return None
-    # Only directories are staging runs; stray files dropped into the staging
-    # root (editor or OS metadata, for instance) must never be picked.
-    subs = sorted(
-        (
-            path
-            for path in (os.path.join(root, d) for d in os.listdir(root))
-            if os.path.isdir(path)
-        ),
-        key=os.path.getmtime,
-        reverse=True,
-    )
-    return subs[0] if subs else None
-
-
-def write_staging(
-    project: str,
-    *,
-    report: SleepReport,
-    proposed_skill: Optional[str],
-    proposed_memory: Optional[str],
-    live_skill_path: str,
-    live_memory_path: str,
-    report_md: str,
-) -> str:
-    """Persist one night's proposals and report under ``staging/<timestamp>/``.
-
-    Files written: ``proposed_SKILL.md`` and ``proposed_CLAUDE.md`` (each only
-    when the matching proposal is not ``None``), ``report.json``,
-    ``report.md`` and ``manifest.json``. The manifest records the live target
-    paths, which proposals exist, and whether the report was accepted.
-
-    Returns:
-        The path of the staging directory that was written.
-    """
-    out_dir = os.path.join(staging_root(project), _ts_dir())
-    os.makedirs(out_dir, exist_ok=True)
-
-    def _write_text(filename: str, text: str) -> None:
-        with open(os.path.join(out_dir, filename), "w", encoding="utf-8") as fh:
-            fh.write(text)
-
-    def _write_json(filename: str, payload) -> None:
-        with open(os.path.join(out_dir, filename), "w", encoding="utf-8") as fh:
-            json.dump(payload, fh, ensure_ascii=False, indent=2)
-
-    skill_staged = proposed_skill is not None
-    memory_staged = proposed_memory is not None
-    manifest = {
-        "live_skill_path": live_skill_path,
-        "live_memory_path": live_memory_path,
-        "has_skill": skill_staged,
-        "has_memory": memory_staged,
-        "accepted": report.accepted,
-    }
-
-    if skill_staged:
-        _write_text("proposed_SKILL.md", proposed_skill)
-    if memory_staged:
-        _write_text("proposed_CLAUDE.md", proposed_memory)
-    _write_json("report.json", report.to_dict())
-    _write_text("report.md", report_md)
-    _write_json("manifest.json", manifest)
-    return out_dir
-
-
-def _backup(path: str, backup_dir: str) -> None:
-    """Copy ``path`` into ``backup_dir`` under its base name, if ``path`` exists."""
-    if not os.path.exists(path):
-        return
-    os.makedirs(backup_dir, exist_ok=True)
-    destination = os.path.join(backup_dir, os.path.basename(path))
-    shutil.copy2(path, destination)
-
-
-def adopt(staging_dir: str) -> List[str]:
-    """Copy staged proposals over the live files, backing up first.
-
-    Args:
-        staging_dir: A staging directory containing ``manifest.json`` and the
-            proposal files the manifest declares.
-
-    Returns:
-        The list of live paths that were updated, skill first, then memory.
-
-    Raises:
-        OSError: If the manifest or a declared proposal file cannot be read,
-            or a live file cannot be written.
-    """
-    # The manifest is written as UTF-8 with ``ensure_ascii=False``, so it must
-    # be read back as UTF-8 rather than with the platform default encoding.
-    with open(os.path.join(staging_dir, "manifest.json"), encoding="utf-8") as f:
-        manifest = json.load(f)
-    backup_dir = os.path.join(staging_dir, "backup")
-    updated: List[str] = []
-
-    # (manifest flag, manifest key of the live path, staged file name)
-    plan = (
-        ("has_skill", "live_skill_path", "proposed_SKILL.md"),
-        ("has_memory", "live_memory_path", "proposed_CLAUDE.md"),
-    )
-    for flag, path_key, staged_name in plan:
-        if not manifest.get(flag):
-            continue
-        live = manifest[path_key]
-        parent = os.path.dirname(live)
-        # A bare file name has an empty dirname, and ``os.makedirs("")`` raises.
-        if parent:
-            os.makedirs(parent, exist_ok=True)
-        _backup(live, backup_dir)
-        shutil.copy2(os.path.join(staging_dir, staged_name), live)
-        updated.append(live)
-    return updated
--- a/skillopt/sleep/state.py
+++ b/skillopt/sleep/state.py
@@ -1,83 +0,0 @@
-"""SkillOpt-Sleep — persistent cross-night state.
-
-state.json lives in ~/.skillopt-sleep and is the "long-term" store that
-turns nightly episodes into durable competence (the Agent-Sleep paper's
-short-term -> long-term transfer). It records:
-
-  - night counter
-  - last harvest timestamp per project (so each night only sees new data)
-  - cross-night "slow/meta" memory (lessons that persisted across nights)
-  - per-night history (scores, accept/reject) for trend reporting
-"""
-from __future__ import annotations
-
-import json
-import os
-from typing import Any, Dict, List, Optional
-
-
-def _now_iso(clock: Optional[float] = None) -> str:
-    """Format ``clock`` (epoch seconds; defaults to now) as local ``YYYY-mm-ddTHH:MM:SS``."""
-    # Imported lazily so that importing this module does not pull in ``time``.
-    import time as _t
-    moment = _t.time() if clock is None else clock
-    return _t.strftime("%Y-%m-%dT%H:%M:%S", _t.localtime(moment))
-
-
-# Template for a brand-new state file. It holds nested mutable containers, so
-# it must be deep-copied (see ``_default_state``) and never handed out directly.
-DEFAULT_STATE: Dict[str, Any] = {
-    "version": 1,
-    "night": 0,
-    "last_harvest": {},     # project -> iso timestamp of last harvested record
-    "slow_memory": "",      # cross-night consolidated lessons (meta-skill analogue)
-    "history": [],          # list of per-night summaries
-}
-
-
-def _default_state() -> Dict[str, Any]:
-    """Return an independent deep copy of ``DEFAULT_STATE``."""
-    # Local import keeps module import light, like ``_now_iso`` does for ``time``.
-    import copy as _copy
-    return _copy.deepcopy(DEFAULT_STATE)
-
-
-class SleepState:
-    """Cross-night state backed by a JSON file.
-
-    ``data`` is a plain dict shaped like ``DEFAULT_STATE``. Every instance owns
-    its own copy of the defaults, so mutating one instance never leaks into
-    another instance or into ``DEFAULT_STATE`` itself.
-    """
-
-    def __init__(self, path: str, data: Optional[Dict[str, Any]] = None) -> None:
-        """Wrap ``data`` (or fresh defaults when ``None``) for the file at ``path``."""
-        self.path = path
-        # A shallow ``dict(DEFAULT_STATE)`` would share the nested
-        # ``last_harvest`` dict and ``history`` list between all instances.
-        self.data = data if data is not None else _default_state()
-
-    # io ---------------------------------------------------------------------
-    @classmethod
-    def load(cls, path: str) -> "SleepState":
-        """Load state from ``path``, falling back to defaults.
-
-        Keys missing from the file are filled from the defaults. A missing,
-        unreadable or malformed file yields a fresh default state (best-effort
-        by design: a corrupt state file must not stop the nightly cycle).
-        """
-        if os.path.exists(path):
-            data: Any = None
-            try:
-                try:
-                    with open(path, encoding="utf-8") as f:
-                        data = json.load(f)
-                except UnicodeDecodeError:
-                    # Legacy file written with the platform default encoding.
-                    with open(path) as f:
-                        data = json.load(f)
-            except (OSError, ValueError):
-                # ValueError covers JSONDecodeError and UnicodeDecodeError.
-                data = None
-            merged = _default_state()
-            if isinstance(data, dict):
-                merged.update(data)
-            return cls(path, merged)
-        return cls(path, _default_state())
-
-    def save(self) -> None:
-        """Atomically write the state to ``self.path`` as UTF-8 JSON."""
-        parent = os.path.dirname(self.path)
-        # A bare file name has an empty dirname, and ``os.makedirs("")`` raises.
-        if parent:
-            os.makedirs(parent, exist_ok=True)
-        tmp = self.path + ".tmp"
-        # ``ensure_ascii=False`` emits raw non-ASCII text, so pin the encoding
-        # instead of relying on the platform default.
-        with open(tmp, "w", encoding="utf-8") as f:
-            json.dump(self.data, f, ensure_ascii=False, indent=2)
-        # Write-then-rename so readers never observe a half-written file.
-        os.replace(tmp, self.path)
-
-    # accessors --------------------------------------------------------------
-    @property
-    def night(self) -> int:
-        """The night counter (0 before the first night)."""
-        return int(self.data.get("night", 0))
-
-    def last_harvest_for(self, project: str) -> Optional[str]:
-        """Return the stored last-harvest timestamp for ``project``, if any."""
-        return self.data.get("last_harvest", {}).get(project)
-
-    def set_last_harvest(self, project: str, iso_ts: str) -> None:
-        """Record ``iso_ts`` as the last-harvest timestamp for ``project``."""
-        self.data.setdefault("last_harvest", {})[project] = iso_ts
-
-    @property
-    def slow_memory(self) -> str:
-        """The cross-night consolidated guidance text (may be empty)."""
-        return str(self.data.get("slow_memory", ""))
-
-    def set_slow_memory(self, content: str) -> None:
-        """Replace the cross-night guidance text with ``content``."""
-        self.data["slow_memory"] = content
-
-    def begin_night(self, clock: Optional[float] = None) -> int:
-        """Increment the night counter and return the new value.
-
-        ``clock`` is accepted for interface compatibility and is not used.
-        """
-        self.data["night"] = self.night + 1
-        return self.night
-
-    def record_night(self, summary: Dict[str, Any]) -> None:
-        """Append one per-night ``summary`` to the history list."""
-        self.data.setdefault("history", []).append(summary)
--- a/skillopt/sleep/types.py
+++ b/skillopt/sleep/types.py
@@ -1,140 +0,0 @@
-"""SkillOpt-Sleep — core data types.
-
-These dataclasses are the interfaces between the sleep-cycle stages
-(harvest -> mine -> replay -> consolidate -> stage). They are intentionally
-plain (no slots, no heavy deps) so the package imports cleanly on any
-Python 3.8+ interpreter and the deterministic experiment runs with zero
-external dependencies.
-"""
-from __future__ import annotations
-
-from dataclasses import dataclass, field, asdict
-from typing import Any, Dict, List, Optional
-
-
-# ── Stage 1: harvest ──────────────────────────────────────────────────────────
-
-@dataclass
-class SessionDigest:
-    """A normalized summary of one Claude Code session transcript.
-
-    Produced by :mod:`skillopt.sleep.harvest` from a ``<sessionId>.jsonl``
-    transcript plus ``history.jsonl`` entries.
-
-    Only ``session_id`` and ``project`` are required; every other field has an
-    empty default.
-    """
-
-    session_id: str
-    project: str
-    git_branch: str = ""
-    # NOTE(review): timestamp format is not fixed here — presumably ISO strings; confirm in harvest.
-    started_at: str = ""
-    ended_at: str = ""
-    user_prompts: List[str] = field(default_factory=list)
-    assistant_finals: List[str] = field(default_factory=list)
-    tools_used: List[str] = field(default_factory=list)
-    files_touched: List[str] = field(default_factory=list)
-    feedback_signals: List[str] = field(default_factory=list)  # "still broken", "perfect", ...
-    n_user_turns: int = 0
-    n_assistant_turns: int = 0
-    # NOTE(review): presumably the path of the source transcript — verify against the harvester.
-    raw_path: str = ""
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Return the digest as a plain dict (via :func:`dataclasses.asdict`)."""
-        return asdict(self)
-
-
-# ── Stage 2: mine ─────────────────────────────────────────────────────────────
-
-@dataclass
-class TaskRecord:
-    """One self-contained recurring task mined from one or more sessions.
-
-    It is the *training unit* of the sleep cycle, playing the role a benchmark
-    item plays in SkillOpt.
-    """
-
-    id: str
-    project: str
-    intent: str                       # the user's goal (the "question")
-    context_excerpt: str = ""         # smallest context needed to attempt the task
-    attempted_solution: str = ""      # what the agent produced previously
-    outcome: str = "unknown"          # success | fail | mixed | unknown
-    reference_kind: str = "none"      # exact | rubric | rule | none
-    reference: str = ""               # the exact answer, or the rubric text
-    judge: Dict[str, Any] = field(default_factory=dict)  # gbrain-style rule judge
-    tags: List[str] = field(default_factory=list)
-    source_sessions: List[str] = field(default_factory=list)
-    # split is one of {train, val, test}.  val and test are drawn ONLY from real
-    # mined tasks and never overlap (val gates updates; test is the final
-    # held-out measure).  train may be dream-augmented (see origin).  Legacy
-    # values replay->train and holdout->val are normalized on load.
-    split: str = "train"
-    # origin is one of {real, dream}.  'real' means mined from the user's actual
-    # sessions; 'dream' means synthetic/augmented for the training pool.  Dream
-    # tasks are NEVER allowed into val/test: that is the anti-overfitting guarantee.
-    origin: str = "real"
-    derived_from: str = ""            # dream tasks only: id of the real task being varied
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Return the record as a plain dict (via :func:`dataclasses.asdict`)."""
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, d: Dict[str, Any]) -> "TaskRecord":
-        """Build a record from ``d``, silently dropping keys that are not fields."""
-        field_names = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
-        kwargs: Dict[str, Any] = {}
-        for key, value in d.items():
-            if key in field_names:
-                kwargs[key] = value
-        return cls(**kwargs)
-
-
-# ── Stage 3: replay ───────────────────────────────────────────────────────────
-
-@dataclass
-class ReplayResult:
-    """Outcome of re-running one TaskRecord offline under a given skill+memory.
-
-    Only ``id`` is required; every other field has a zero/empty default.
-    """
-
-    # NOTE(review): presumably the id of the TaskRecord that was replayed — confirm with the replay stage.
-    id: str
-    hard: float = 0.0                 # 0/1 exact, or continuous reward
-    soft: float = 0.0                 # partial credit / judge score 0..1
-    response: str = ""
-    fail_reason: str = ""
-    task_type: str = "task"
-    judge_rationale: str = ""
-    tools_called: List[str] = field(default_factory=list)
-    tokens: int = 0                   # approx tokens this rollout cost (for token objective)
-    latency_ms: float = 0.0           # wall-clock for this rollout (for latency objective)
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Return the result as a plain dict (via :func:`dataclasses.asdict`)."""
-        return asdict(self)
-
-
-# ── Stage 4/5: consolidation report ───────────────────────────────────────────
-
-@dataclass
-class EditRecord:
-    """One bounded edit proposed/applied to skill or memory.
-
-    ``target`` and ``op`` are required; the remaining fields default to ``""``.
-    """
-
-    target: str                       # "skill" | "memory"
-    op: str                           # add | delete | replace
-    content: str = ""                 # NOTE(review): presumably the new text for add/replace — confirm
-    anchor: str = ""                  # for replace/delete: text being changed
-    rationale: str = ""               # free-text justification for the edit
-
-
-@dataclass
-class SleepReport:
-    """The complete record of one night's cycle, written to staging for review."""
-
-    night: int
-    project: str
-    started_at: str = ""
-    ended_at: str = ""
-    n_sessions: int = 0
-    n_tasks: int = 0
-    n_replayed: int = 0
-    baseline_score: float = 0.0
-    candidate_score: float = 0.0
-    accepted: bool = False
-    gate_action: str = ""
-    edits: List[EditRecord] = field(default_factory=list)
-    rejected_edits: List[EditRecord] = field(default_factory=list)
-    tokens_used: int = 0
-    notes: List[str] = field(default_factory=list)
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Return the report, nested edit records included, as plain dicts."""
-        return asdict(self)