diff --git a/skillopt-sleep-plugin/README.md b/skillopt-sleep-plugin/README.md
index a3b6ddc..6d77559 100644
--- a/skillopt-sleep-plugin/README.md
+++ b/skillopt-sleep-plugin/README.md
@@ -48,7 +48,7 @@ cd SkillOpt
 ```
 
 The plugin's bundled runner (`scripts/sleep.sh`) auto-selects a Python ≥ 3.10
-interpreter and calls the `skillopt.sleep` engine in the repo. No `pip install`
+interpreter and calls the `skillopt_sleep` engine in the repo. No `pip install`
 is required for the default `mock` backend or for `claude`/`codex` backends —
 they shell out to the CLIs you already have.
 
@@ -65,9 +65,9 @@ they shell out to the CLIs you already have.
 Or call the engine directly (Python ≥ 3.10):
 
 ```bash
-python -m skillopt.sleep run --project "$(pwd)" --scope invoked --backend mock
-python -m skillopt.sleep run --project "$(pwd)" --backend claude   # real lift via Claude
-python -m skillopt.sleep run --project "$(pwd)" --backend codex    # real lift via Codex
+python -m skillopt_sleep run --project "$(pwd)" --scope invoked --backend mock
+python -m skillopt_sleep run --project "$(pwd)" --backend claude   # real lift via Claude
+python -m skillopt_sleep run --project "$(pwd)" --backend codex    # real lift via Codex
 ```
 
 Default backend is **`mock`** — deterministic, no API spend — so you can try the
@@ -98,10 +98,10 @@ Reproduce:
 
 ```bash
 git clone https://github.com/garrytan/gbrain-evals /tmp/gbrain-evals
-python -m skillopt.sleep.experiments.run_gbrain --backend claude --model haiku \
+python -m skillopt_sleep.experiments.run_gbrain --backend claude --model haiku \
   --seeds brief-writer --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 \
   --nights 1 --limit-replay 3 --limit-holdout 3
-python -m skillopt.sleep.experiments.run_gbrain --backend codex \
+python -m skillopt_sleep.experiments.run_gbrain --backend codex \
   --seeds brief-writer --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 \
   --nights 1 --limit-replay 3 --limit-holdout 3
 ```
@@ -109,8 +109,8 @@ python -m skillopt.sleep.experiments.run_gbrain --backend codex \
 ## Deterministic proof (no API, no keys)
 
 ```bash
-python -m skillopt.sleep.experiments.run_experiment --persona researcher --assert-improves
-python -m skillopt.sleep.experiments.run_experiment --persona programmer  --assert-improves
+python -m skillopt_sleep.experiments.run_experiment --persona researcher --assert-improves
+python -m skillopt_sleep.experiments.run_experiment --persona programmer  --assert-improves
 ```
 
 Each prints the held-out score rising from baseline toward 1.0 as the gate
diff --git a/skillopt-sleep-plugin/commands/sleep.md b/skillopt-sleep-plugin/commands/sleep.md
index 48b62f7..6ed3ef9 100644
--- a/skillopt-sleep-plugin/commands/sleep.md
+++ b/skillopt-sleep-plugin/commands/sleep.md
@@ -18,7 +18,7 @@ held-out replay score, and nothing live is modified until the user adopts it.
 
 ## How to run it
 
-The engine is the `skillopt.sleep` Python package in this repo. Use the
+The engine is the `skillopt_sleep` Python package in this repo. Use the
 **plugin's bundled runner** so the right interpreter and repo are on the path:
 
 ```bash
diff --git a/skillopt-sleep-plugin/scripts/sleep.sh b/skillopt-sleep-plugin/scripts/sleep.sh
index 49c224a..052b0c5 100755
--- a/skillopt-sleep-plugin/scripts/sleep.sh
+++ b/skillopt-sleep-plugin/scripts/sleep.sh
@@ -1,12 +1,12 @@
 #!/usr/bin/env bash
-# SkillOpt-Sleep runner — invokes the skillopt.sleep engine with a suitable
+# SkillOpt-Sleep runner — invokes the skillopt_sleep engine with a suitable
 # Python interpreter, from the repo that contains this plugin.
 #
 # Usage: sleep.sh <run|dry-run|status|adopt|harvest> [extra args...]
 set -euo pipefail
 
 # Resolve the repo root: the plugin lives at <repo>/skillopt-sleep-plugin,
-# so the engine package is at <repo>/skillopt/sleep. CLAUDE_PLUGIN_ROOT points
+# so the engine package is at <repo>/skillopt_sleep. CLAUDE_PLUGIN_ROOT points
 # at the plugin dir when run by Claude Code; fall back to this script's dir.
 PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
 REPO_ROOT="$(cd "$PLUGIN_ROOT/.." && pwd)"
@@ -27,4 +27,4 @@ fi
 if [ "$#" -eq 0 ]; then set -- status; fi
 
 cd "$REPO_ROOT"
-exec "$PY" -m skillopt.sleep "$@"
+exec "$PY" -m skillopt_sleep "$@"
diff --git a/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md b/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md
index da252f7..bf6b86f 100644
--- a/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md
+++ b/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: skillopt-sleep
-description: "Use when the user wants their Claude agent to self-improve from past usage, asks about a nightly/offline 'sleep' or 'dream' cycle, memory/skill consolidation, or says things like '让 agent 越用越好用', 'review my past sessions', 'learn my preferences', 'consolidate what you learned', 'run the sleep cycle', or wants to schedule offline self-optimization. Drives the skillopt.sleep engine: harvest past sessions → mine recurring tasks → replay offline → consolidate validated CLAUDE.md/SKILL.md behind a held-out gate."
+description: "Use when the user wants their Claude agent to self-improve from past usage, asks about a nightly/offline 'sleep' or 'dream' cycle, memory/skill consolidation, or says things like '让 agent 越用越好用', 'review my past sessions', 'learn my preferences', 'consolidate what you learned', 'run the sleep cycle', or wants to schedule offline self-optimization. Drives the skillopt_sleep engine: harvest past sessions → mine recurring tasks → replay offline → consolidate validated CLAUDE.md/SKILL.md behind a held-out gate."
 ---
 
 # SkillOpt-Sleep: offline self-evolution for a local Claude agent
@@ -62,7 +62,7 @@ Prefer the `/sleep` command. Under the hood it calls the bundled runner:
 - Always show the user the **held-out baseline → candidate** score and the
   exact proposed edits before suggesting adoption. Evidence before adoption.
 - If asked whether it really helps, run
-  `python -m skillopt.sleep.experiments.run_experiment --persona researcher --json`
+  `python -m skillopt_sleep.experiments.run_experiment --persona researcher --json`
   — a deterministic demo that proves held-out lift and that the gate blocks
   harmful edits.
 
@@ -70,8 +70,8 @@ Prefer the `/sleep` command. Under the hood it calls the bundled runner:
 
 ```bash
 # deterministic proof (no API): held-out score rises, gate blocks regressions
-python -m skillopt.sleep.experiments.run_experiment --persona researcher --assert-improves
-python -m skillopt.sleep.experiments.run_experiment --persona programmer  --assert-improves
+python -m skillopt_sleep.experiments.run_experiment --persona researcher --assert-improves
+python -m skillopt_sleep.experiments.run_experiment --persona programmer  --assert-improves
 ```
 
 See `docs/sleep/experiment_results.md` for recorded output and
diff --git a/skillopt/sleep/__init__.py b/skillopt_sleep/__init__.py
similarity index 86%
rename from skillopt/sleep/__init__.py
rename to skillopt_sleep/__init__.py
index 6e35c0b..8660d06 100644
--- a/skillopt/sleep/__init__.py
+++ b/skillopt_sleep/__init__.py
@@ -11,8 +11,8 @@ Synthesizes three ideas:
   * Sleep     — short-term experience -> long-term competence, offline
 
 Public entry points:
-  * skillopt.sleep.cli      — `python -m skillopt.sleep ...`
-  * skillopt.sleep.cycle.run_sleep_cycle(...)
+  * skillopt_sleep.cli      — `python -m skillopt_sleep ...`
+  * skillopt_sleep.cycle.run_sleep_cycle(...)
 """
 from __future__ import annotations
 
diff --git a/skillopt/sleep/__main__.py b/skillopt_sleep/__main__.py
similarity index 90%
rename from skillopt/sleep/__main__.py
rename to skillopt_sleep/__main__.py
index f01612e..f2efa3e 100644
--- a/skillopt/sleep/__main__.py
+++ b/skillopt_sleep/__main__.py
@@ -1,10 +1,10 @@
 """SkillOpt-Sleep — command-line interface.
 
-    python -m skillopt.sleep run        # full cycle: harvest->mine->replay->gate->stage
-    python -m skillopt.sleep dry-run    # same but report only, no staging/adopt
-    python -m skillopt.sleep status     # show state + latest staged proposal
-    python -m skillopt.sleep adopt      # apply the latest staged proposal (with backup)
-    python -m skillopt.sleep harvest    # just print what would be mined (debug)
+    python -m skillopt_sleep run        # full cycle: harvest->mine->replay->gate->stage
+    python -m skillopt_sleep dry-run    # same but report only, no staging/adopt
+    python -m skillopt_sleep status     # show state + latest staged proposal
+    python -m skillopt_sleep adopt      # apply the latest staged proposal (with backup)
+    python -m skillopt_sleep harvest    # just print what would be mined (debug)
 
 Common flags:
     --project PATH      project to evolve (default: cwd)
@@ -23,12 +23,12 @@ import os
 import sys
 from typing import Any, Dict
 
-from skillopt.sleep.config import load_config
-from skillopt.sleep.cycle import run_sleep_cycle
-from skillopt.sleep.harvest import harvest
-from skillopt.sleep.mine import mine
-from skillopt.sleep.state import SleepState
-from skillopt.sleep.staging import latest_staging, adopt as adopt_staging
+from skillopt_sleep.config import load_config
+from skillopt_sleep.cycle import run_sleep_cycle
+from skillopt_sleep.harvest import harvest
+from skillopt_sleep.mine import mine
+from skillopt_sleep.state import SleepState
+from skillopt_sleep.staging import latest_staging, adopt as adopt_staging
 
 
 def _add_common(p: argparse.ArgumentParser) -> None:
@@ -90,7 +90,7 @@ def cmd_run(args, dry: bool = False) -> int:
         if outcome.staging_dir:
             print(f"[sleep] staged: {outcome.staging_dir}")
             if not outcome.adopted:
-                print("[sleep] review it, then: python -m skillopt.sleep adopt")
+                print("[sleep] review it, then: python -m skillopt_sleep adopt")
         if outcome.adopted:
             print(f"[sleep] auto-adopted: {', '.join(outcome.adopted_paths)}")
     return 0
@@ -164,7 +164,7 @@ def cmd_harvest(args) -> int:
 
 
 def main(argv=None) -> int:
-    parser = argparse.ArgumentParser(prog="skillopt.sleep", description="SkillOpt-Sleep nightly self-evolution")
+    parser = argparse.ArgumentParser(prog="skillopt_sleep", description="SkillOpt-Sleep nightly self-evolution")
     sub = parser.add_subparsers(dest="cmd", required=True)
 
     p_run = sub.add_parser("run", help="run a full sleep cycle")
diff --git a/skillopt/sleep/backend.py b/skillopt_sleep/backend.py
similarity index 99%
rename from skillopt/sleep/backend.py
rename to skillopt_sleep/backend.py
index c4b873d..fbc8d26 100644
--- a/skillopt/sleep/backend.py
+++ b/skillopt_sleep/backend.py
@@ -26,7 +26,7 @@ import re
 import subprocess
 from typing import Any, Dict, List, Optional, Tuple
 
-from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord
+from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
 
 
 def skill_hash(content: str) -> str:
@@ -192,7 +192,7 @@ class MockBackend(Backend):
 
     def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
         if task.reference_kind == "rule" and task.judge:
-            from skillopt.sleep.judges import score_rule_judge
+            from skillopt_sleep.judges import score_rule_judge
             return score_rule_judge(task.judge, response)
         if task.reference_kind == "exact" and task.reference:
             hard = exact_score(task.reference, response)
@@ -303,7 +303,7 @@ class CliBackend(Backend):
     def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
         # gbrain-style rule judge: scored locally, no API spend
         if task.reference_kind == "rule" and task.judge:
-            from skillopt.sleep.judges import score_rule_judge
+            from skillopt_sleep.judges import score_rule_judge
             return score_rule_judge(task.judge, response)
         # exact references are scored locally — no API spend
         if task.reference_kind == "exact" and task.reference:
diff --git a/skillopt/sleep/budget.py b/skillopt_sleep/budget.py
similarity index 100%
rename from skillopt/sleep/budget.py
rename to skillopt_sleep/budget.py
diff --git a/skillopt/sleep/config.py b/skillopt_sleep/config.py
similarity index 100%
rename from skillopt/sleep/config.py
rename to skillopt_sleep/config.py
diff --git a/skillopt/sleep/consolidate.py b/skillopt_sleep/consolidate.py
similarity index 88%
rename from skillopt/sleep/consolidate.py
rename to skillopt_sleep/consolidate.py
index da15d96..f01808e 100644
--- a/skillopt/sleep/consolidate.py
+++ b/skillopt_sleep/consolidate.py
@@ -3,11 +3,8 @@
 This is the core that makes nightly evolution *safe*: it proposes bounded
 edits from replayed failures, applies them to a candidate skill/memory, then
 **gates** the candidate on a held-out slice of the user's own tasks. Only a
-candidate that strictly improves the held-out score is accepted — exactly the
-SkillOpt validation gate, reused verbatim from ``skillopt.evaluation.gate``.
-
-Reused from the main SkillOpt package (import-light, no `openai` needed):
-  * skillopt.evaluation.gate.evaluate_gate / select_gate_score
+candidate that strictly improves the held-out score is accepted — the SkillOpt
+validation gate, vendored self-contained in ``skillopt_sleep.gate``.
 """
 from __future__ import annotations
 
@@ -15,26 +12,16 @@ import os
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
 
-from skillopt.sleep.backend import Backend
-from skillopt.sleep.memory import apply_edits
-from skillopt.sleep.replay import aggregate_scores, replay_batch
-from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord
+from skillopt_sleep.backend import Backend
+from skillopt_sleep.memory import apply_edits
+from skillopt_sleep.replay import aggregate_scores, replay_batch
+from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
 
 
-# Reuse the real SkillOpt gate. This module imports cleanly without `openai`.
-try:
-    from skillopt.evaluation.gate import evaluate_gate, select_gate_score
-    _HAVE_REPO_GATE = True
-except Exception:  # pragma: no cover - fallback keeps engine standalone
-    _HAVE_REPO_GATE = False
-
-    def select_gate_score(hard, soft, metric="hard", mixed_weight=0.5):  # type: ignore
-        if metric == "hard":
-            return float(hard)
-        if metric == "soft":
-            return float(soft)
-        w = max(0.0, min(1.0, float(mixed_weight)))
-        return (1 - w) * float(hard) + w * float(soft)
+# Self-contained validation gate (vendored from SkillOpt; zero dependency on the
+# research package, so this open-source tool stays decoupled from the paper code).
+from skillopt_sleep.gate import evaluate_gate, select_gate_score
+_HAVE_REPO_GATE = True  # NOTE(review): constant now that the gate import above is unconditional — presumably kept for code that still reads the flag; confirm
 
 
 @dataclass
@@ -140,7 +127,7 @@ def consolidate(
         if rollouts_k > 1:
             # multi-rollout contrastive reflection: run each train task K times
             # and distill a rule from the good-vs-bad contrast (the "脑补" signal).
-            from skillopt.sleep.rollout import multi_rollout, contrastive_reflect
+            from skillopt_sleep.rollout import multi_rollout, contrastive_reflect
             sets = [multi_rollout(backend, t, cand_skill, cand_memory, k=rollouts_k)
                     for t in train_tasks]
             edits = contrastive_reflect(
diff --git a/skillopt/sleep/cycle.py b/skillopt_sleep/cycle.py
similarity index 93%
rename from skillopt/sleep/cycle.py
rename to skillopt_sleep/cycle.py
index 4efc81b..04baa03 100644
--- a/skillopt/sleep/cycle.py
+++ b/skillopt_sleep/cycle.py
@@ -14,15 +14,15 @@ import time
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 
-from skillopt.sleep.backend import get_backend
-from skillopt.sleep.config import SleepConfig, load_config
-from skillopt.sleep.consolidate import consolidate
-from skillopt.sleep.harvest import harvest
-from skillopt.sleep.memory import ensure_skill_scaffold
-from skillopt.sleep.mine import mine
-from skillopt.sleep.state import SleepState, _now_iso
-from skillopt.sleep.staging import write_staging, adopt as adopt_staging
-from skillopt.sleep.types import SessionDigest, SleepReport, TaskRecord
+from skillopt_sleep.backend import get_backend
+from skillopt_sleep.config import SleepConfig, load_config
+from skillopt_sleep.consolidate import consolidate
+from skillopt_sleep.harvest import harvest
+from skillopt_sleep.memory import ensure_skill_scaffold
+from skillopt_sleep.mine import mine
+from skillopt_sleep.state import SleepState, _now_iso
+from skillopt_sleep.staging import write_staging, adopt as adopt_staging
+from skillopt_sleep.types import SessionDigest, SleepReport, TaskRecord
 
 
 @dataclass
@@ -131,7 +131,7 @@ def run_sleep_cycle(
         llm_miner = None
         if cfg.get("backend", "mock") != "mock" and cfg.get("llm_mine", True):
             try:
-                from skillopt.sleep.llm_miner import make_llm_miner
+                from skillopt_sleep.llm_miner import make_llm_miner
                 llm_miner = make_llm_miner(backend, max_tasks=cfg.get("max_tasks_per_night", 40))
             except Exception:
                 llm_miner = None
diff --git a/skillopt/sleep/experiments/__init__.py b/skillopt_sleep/experiments/__init__.py
similarity index 100%
rename from skillopt/sleep/experiments/__init__.py
rename to skillopt_sleep/experiments/__init__.py
diff --git a/skillopt/sleep/experiments/gbrain_bench.py b/skillopt_sleep/experiments/gbrain_bench.py
similarity index 97%
rename from skillopt/sleep/experiments/gbrain_bench.py
rename to skillopt_sleep/experiments/gbrain_bench.py
index efe0ff6..49261d6 100644
--- a/skillopt/sleep/experiments/gbrain_bench.py
+++ b/skillopt_sleep/experiments/gbrain_bench.py
@@ -17,7 +17,7 @@ We map:
   judge           -> TaskRecord.judge (+ reference_kind="rule")
 
 This lets us reproduce gbrain's headline result with our engine and either the
-claude or codex backend, scoring locally via skillopt.sleep.judges (no judge API).
+claude or codex backend, scoring locally via skillopt_sleep.judges (no judge API).
 """
 from __future__ import annotations
 
@@ -25,7 +25,7 @@ import json
 import os
 from typing import Dict, List, Optional, Tuple
 
-from skillopt.sleep.types import TaskRecord
+from skillopt_sleep.types import TaskRecord
 
 
 SEED_DIRS = {
diff --git a/skillopt/sleep/experiments/personas.py b/skillopt_sleep/experiments/personas.py
similarity index 98%
rename from skillopt/sleep/experiments/personas.py
rename to skillopt_sleep/experiments/personas.py
index 0fdb127..72eb6af 100644
--- a/skillopt/sleep/experiments/personas.py
+++ b/skillopt_sleep/experiments/personas.py
@@ -12,7 +12,7 @@ from __future__ import annotations
 
 from typing import List
 
-from skillopt.sleep.types import TaskRecord
+from skillopt_sleep.types import TaskRecord
 
 
 def _t(i, intent, ref, rule, project="/personas/demo", outcome="fail") -> TaskRecord:
diff --git a/skillopt/sleep/experiments/report.py b/skillopt_sleep/experiments/report.py
similarity index 96%
rename from skillopt/sleep/experiments/report.py
rename to skillopt_sleep/experiments/report.py
index 8672822..767ea65 100644
--- a/skillopt/sleep/experiments/report.py
+++ b/skillopt_sleep/experiments/report.py
@@ -1,7 +1,7 @@
 """SkillOpt-Sleep — turn a sweep JSONL into a presented Markdown scorecard.
 
 Usage:
-  python -m skillopt.sleep.experiments.report --in docs/sleep/sweep.jsonl \
+  python -m skillopt_sleep.experiments.report --in docs/sleep/sweep.jsonl \
       --out docs/sleep/benchmark_report.md
 """
 from __future__ import annotations
@@ -101,9 +101,9 @@ def render(rows: List[Dict[str, Any]]) -> str:
     out.append("")
     out.append("```bash")
     out.append("git clone https://github.com/garrytan/gbrain-evals /tmp/gbrain-evals")
-    out.append("python -m skillopt.sleep.experiments.sweep --plan full \\")
+    out.append("python -m skillopt_sleep.experiments.sweep --plan full \\")
     out.append("    --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 --out docs/sleep/sweep.jsonl")
-    out.append("python -m skillopt.sleep.experiments.report \\")
+    out.append("python -m skillopt_sleep.experiments.report \\")
     out.append("    --in docs/sleep/sweep.jsonl --out docs/sleep/benchmark_report.md")
     out.append("```")
     out.append("")
diff --git a/skillopt/sleep/experiments/run_experiment.py b/skillopt_sleep/experiments/run_experiment.py
similarity index 92%
rename from skillopt/sleep/experiments/run_experiment.py
rename to skillopt_sleep/experiments/run_experiment.py
index 385b0a1..91a9ca9 100644
--- a/skillopt/sleep/experiments/run_experiment.py
+++ b/skillopt_sleep/experiments/run_experiment.py
@@ -14,9 +14,9 @@ What it proves:
      the adopted artifact, re-scored, retains the lift.
 
 Run:
-    python -m skillopt.sleep.experiments.run_experiment
-    python -m skillopt.sleep.experiments.run_experiment --persona programmer --nights 3
-    python -m skillopt.sleep.experiments.run_experiment --backend anthropic   # real lift
+    python -m skillopt_sleep.experiments.run_experiment
+    python -m skillopt_sleep.experiments.run_experiment --persona programmer --nights 3
+    python -m skillopt_sleep.experiments.run_experiment --backend anthropic   # real lift
 """
 from __future__ import annotations
 
@@ -27,21 +27,21 @@ import sys
 import tempfile
 from typing import List
 
-from skillopt.sleep.backend import get_backend
-from skillopt.sleep.consolidate import consolidate
-from skillopt.sleep.experiments.personas import (
+from skillopt_sleep.backend import get_backend
+from skillopt_sleep.consolidate import consolidate
+from skillopt_sleep.experiments.personas import (
     PERSONAS,
     harmful_edit_task,
     researcher_persona,
 )
-from skillopt.sleep.memory import ensure_skill_scaffold
-from skillopt.sleep.replay import aggregate_scores, replay_batch
-from skillopt.sleep.types import TaskRecord
+from skillopt_sleep.memory import ensure_skill_scaffold
+from skillopt_sleep.replay import aggregate_scores, replay_batch
+from skillopt_sleep.types import TaskRecord
 
 
 def _score_holdout(backend, tasks: List[TaskRecord], skill: str, memory: str,
                    metric: str = "mixed", w: float = 0.5) -> float:
-    from skillopt.sleep.consolidate import select_gate_score
+    from skillopt_sleep.consolidate import select_gate_score
     # the persona experiment uses a 2-way split (train/val, no test); score on val
     holdout = [t for t in tasks if t.split in ("val", "holdout")] or tasks
     pairs = replay_batch(backend, holdout, skill, memory)
@@ -52,7 +52,7 @@ def _score_holdout(backend, tasks: List[TaskRecord], skill: str, memory: str,
 def run(persona: str = "researcher", nights: int = 4, backend_name: str = "mock",
         edit_budget: int = 4, seed: int = 42, model: str = "", codex_path: str = "",
         limit_tasks: int = 0) -> dict:
-    from skillopt.sleep.mine import assign_splits
+    from skillopt_sleep.mine import assign_splits
 
     make = PERSONAS.get(persona, researcher_persona)
     items = make()
diff --git a/skillopt/sleep/experiments/run_gbrain.py b/skillopt_sleep/experiments/run_gbrain.py
similarity index 94%
rename from skillopt/sleep/experiments/run_gbrain.py
rename to skillopt_sleep/experiments/run_gbrain.py
index cfa5359..43c7acd 100644
--- a/skillopt/sleep/experiments/run_gbrain.py
+++ b/skillopt_sleep/experiments/run_gbrain.py
@@ -13,9 +13,9 @@ Held-out scoring is done locally by the rule judge (no judge API). Only the
 agent's `attempt` (and the optimizer's `reflect`) spend tokens.
 
 Usage:
-    python -m skillopt.sleep.experiments.run_gbrain --backend mock
-    python -m skillopt.sleep.experiments.run_gbrain --backend claude --seeds brief-writer --nights 2
-    python -m skillopt.sleep.experiments.run_gbrain --backend codex  --data-root /tmp/gbrain-evals/eval/data/skillopt-v1
+    python -m skillopt_sleep.experiments.run_gbrain --backend mock
+    python -m skillopt_sleep.experiments.run_gbrain --backend claude --seeds brief-writer --nights 2
+    python -m skillopt_sleep.experiments.run_gbrain --backend codex  --data-root /tmp/gbrain-evals/eval/data/skillopt-v1
 """
 from __future__ import annotations
 
@@ -24,14 +24,14 @@ import json
 import sys
 from typing import Dict, List, Optional
 
-from skillopt.sleep.backend import build_backend, get_backend
-from skillopt.sleep.consolidate import consolidate, select_gate_score
-from skillopt.sleep.experiments.gbrain_bench import (
+from skillopt_sleep.backend import build_backend, get_backend
+from skillopt_sleep.consolidate import consolidate, select_gate_score
+from skillopt_sleep.experiments.gbrain_bench import (
     available_seeds,
     find_data_root,
     load_seed,
 )
-from skillopt.sleep.replay import aggregate_scores, replay_batch
+from skillopt_sleep.replay import aggregate_scores, replay_batch
 
 
 def _score(backend, tasks, skill, memory, split="test", metric="mixed", w=0.5):
@@ -95,7 +95,7 @@ def run_seed(backend, seed: str, skill: str, tasks: List, *,
     slow_text = None
     if nights >= 2 and slow_update:
         try:
-            from skillopt.sleep.slow_update import run_slow_update, replace_slow_field
+            from skillopt_sleep.slow_update import run_slow_update, replace_slow_field
             val_tasks = [t for t in tasks if t.split == "val"] or tasks
             prev_pairs = replay_batch(backend, val_tasks, first_night_skill, memory)
             curr_pairs = replay_batch(backend, val_tasks, cur, memory)
@@ -170,7 +170,7 @@ def main(argv=None) -> int:
         # budget auto-planning: derive nights x rollouts_k from a token budget
         nights, rollouts_k = args.nights, args.rollouts_k
         if args.budget_tokens:
-            from skillopt.sleep.budget import Budget, plan_depth
+            from skillopt_sleep.budget import Budget, plan_depth
             n_train = len([t for t in tasks if t.split == "train"]) or len(tasks)
             nights, rollouts_k = plan_depth(
                 Budget(max_tokens=args.budget_tokens), n_tasks=n_train,
diff --git a/skillopt/sleep/experiments/run_transfer.py b/skillopt_sleep/experiments/run_transfer.py
similarity index 95%
rename from skillopt/sleep/experiments/run_transfer.py
rename to skillopt_sleep/experiments/run_transfer.py
index 9cdd86d..5b00ec8 100644
--- a/skillopt/sleep/experiments/run_transfer.py
+++ b/skillopt_sleep/experiments/run_transfer.py
@@ -16,7 +16,7 @@ Protocol, per gbrain seed:
 Report baseline / direct / transferred, mirroring SkillOpt Table "transfer".
 
 Usage:
-  python -m skillopt.sleep.experiments.run_transfer \
+  python -m skillopt_sleep.experiments.run_transfer \
      --source-backend claude --source-model haiku \
      --target-backend claude --target-model sonnet \
      --seeds brief-writer --nights 2
@@ -28,12 +28,12 @@ import json
 import sys
 from typing import List, Optional
 
-from skillopt.sleep.backend import get_backend
-from skillopt.sleep.consolidate import consolidate, select_gate_score
-from skillopt.sleep.experiments.gbrain_bench import (
+from skillopt_sleep.backend import get_backend
+from skillopt_sleep.consolidate import consolidate, select_gate_score
+from skillopt_sleep.experiments.gbrain_bench import (
     available_seeds, find_data_root, load_seed,
 )
-from skillopt.sleep.replay import aggregate_scores, replay_batch
+from skillopt_sleep.replay import aggregate_scores, replay_batch
 
 
 def _holdout_hard(backend, tasks, skill, memory="") -> float:
diff --git a/skillopt/sleep/experiments/sweep.py b/skillopt_sleep/experiments/sweep.py
similarity index 94%
rename from skillopt/sleep/experiments/sweep.py
rename to skillopt_sleep/experiments/sweep.py
index 75109e9..ddd337c 100644
--- a/skillopt/sleep/experiments/sweep.py
+++ b/skillopt_sleep/experiments/sweep.py
@@ -8,8 +8,8 @@ survive) and resume (skip configs whose row already exists).
 Then `report.py` turns the JSONL into a presented Markdown scorecard.
 
 Usage:
-  python -m skillopt.sleep.experiments.sweep --plan quick   --out docs/sleep/sweep.jsonl
-  python -m skillopt.sleep.experiments.sweep --plan full    --out docs/sleep/sweep.jsonl
+  python -m skillopt_sleep.experiments.sweep --plan quick   --out docs/sleep/sweep.jsonl
+  python -m skillopt_sleep.experiments.sweep --plan full    --out docs/sleep/sweep.jsonl
 """
 from __future__ import annotations
 
@@ -20,10 +20,10 @@ import sys
 import time
 from typing import Any, Dict, List
 
-from skillopt.sleep.backend import build_backend, get_backend
-from skillopt.sleep.experiments.gbrain_bench import find_data_root, load_seed
-from skillopt.sleep.experiments.run_gbrain import run_seed as bench_seed
-from skillopt.sleep.experiments.run_transfer import run_seed as transfer_seed
+from skillopt_sleep.backend import build_backend, get_backend
+from skillopt_sleep.experiments.gbrain_bench import find_data_root, load_seed
+from skillopt_sleep.experiments.run_gbrain import run_seed as bench_seed
+from skillopt_sleep.experiments.run_transfer import run_seed as transfer_seed
 
 
 # Plans: lists of config dicts. Kept small per-run to bound cost/latency.
diff --git a/skillopt_sleep/gate.py b/skillopt_sleep/gate.py
new file mode 100644
index 0000000..7eca3b4
--- /dev/null
+++ b/skillopt_sleep/gate.py
@@ -0,0 +1,50 @@
+"""SkillOpt-Sleep — vendored validation gate.
+
+This is a self-contained copy of the SkillOpt validation gate so the sleep
+engine has ZERO dependency on the research package (skillopt/*). The research
+repo's ``skillopt.evaluation.gate`` is the reference implementation and the two
+are kept behaviourally identical; vendoring keeps this open-source tool
+decoupled from the paper's experiment code.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class GateResult:  # immutable outcome of one evaluate_gate() decision
+    action: str            # "accept_new_best" | "accept" | "reject"
+    current_skill: str     # candidate skill on accept; the unchanged current skill on reject
+    current_score: float   # score paired with current_skill after the decision
+    best_skill: str        # best skill so far; replaced only on "accept_new_best"
+    best_score: float      # score paired with best_skill
+    best_step: int         # global_step of an "accept_new_best"; otherwise the value passed in
+
+
+def select_gate_score(hard: float, soft: float, metric: str = "hard",
+                      mixed_weight: float = 0.5) -> float:  # mixed_weight is read only when metric == "mixed"
+    """Project (hard, soft) onto a single comparison metric."""
+    if metric == "hard":
+        return float(hard)
+    if metric == "soft":
+        return float(soft)
+    if metric == "mixed":
+        w = max(0.0, min(1.0, float(mixed_weight)))  # clamp the soft weight into [0, 1]
+        return (1.0 - w) * float(hard) + w * float(soft)  # w=0 -> hard only, w=1 -> soft only
+    raise ValueError(f"unknown gate metric {metric!r}; expected hard/soft/mixed")  # never silently defaulted
+
+
+def evaluate_gate(candidate_skill: str, cand_hard: float, current_skill: str,
+                  current_score: float, best_skill: str, best_score: float,
+                  best_step: int, global_step: int, *, cand_soft: float = 0.0,
+                  metric: str = "hard", mixed_weight: float = 0.5) -> GateResult:
+    """Pure gate decision: compare candidate score to current/best."""
+    cand_score = select_gate_score(cand_hard, cand_soft, metric, mixed_weight)  # NOTE(review): current_score/best_score are presumably already in this same metric — confirm at call sites
+    if cand_score > current_score:  # strict: a tie with the current score is rejected
+        if cand_score > best_score:  # also beats the best so far: candidate becomes both current and best
+            return GateResult("accept_new_best", candidate_skill, cand_score,
+                              candidate_skill, cand_score, global_step)
+        return GateResult("accept", candidate_skill, cand_score,  # new current; best_* returned as passed in
+                          best_skill, best_score, best_step)
+    return GateResult("reject", current_skill, current_score,  # current_* and best_* returned as passed in
+                      best_skill, best_score, best_step)
diff --git a/skillopt/sleep/harvest.py b/skillopt_sleep/harvest.py
similarity index 99%
rename from skillopt/sleep/harvest.py
rename to skillopt_sleep/harvest.py
index 013483f..fb48379 100644
--- a/skillopt/sleep/harvest.py
+++ b/skillopt_sleep/harvest.py
@@ -19,7 +19,7 @@ import json
 import os
 from typing import Any, Dict, Iterable, List, Optional
 
-from skillopt.sleep.types import SessionDigest
+from skillopt_sleep.types import SessionDigest
 
 
 # Heuristic phrases that signal the user (dis)approving of prior output.
diff --git a/skillopt/sleep/judges.py b/skillopt_sleep/judges.py
similarity index 100%
rename from skillopt/sleep/judges.py
rename to skillopt_sleep/judges.py
diff --git a/skillopt/sleep/llm_miner.py b/skillopt_sleep/llm_miner.py
similarity index 97%
rename from skillopt/sleep/llm_miner.py
rename to skillopt_sleep/llm_miner.py
index 374b787..dd78c63 100644
--- a/skillopt/sleep/llm_miner.py
+++ b/skillopt_sleep/llm_miner.py
@@ -22,8 +22,8 @@ import json
 import re
 from typing import Any, Callable, Dict, List
 
-from skillopt.sleep.backend import Backend, _extract_json
-from skillopt.sleep.types import SessionDigest, TaskRecord
+from skillopt_sleep.backend import Backend, _extract_json
+from skillopt_sleep.types import SessionDigest, TaskRecord
 
 
 _MINER_PROMPT = """You are mining a user's past AI-assistant sessions to find RECURRING tasks
diff --git a/skillopt/sleep/memory.py b/skillopt_sleep/memory.py
similarity index 98%
rename from skillopt/sleep/memory.py
rename to skillopt_sleep/memory.py
index 2f7ddbb..579d714 100644
--- a/skillopt/sleep/memory.py
+++ b/skillopt_sleep/memory.py
@@ -10,7 +10,7 @@ from __future__ import annotations
 import re
 from typing import List, Tuple
 
-from skillopt.sleep.types import EditRecord
+from skillopt_sleep.types import EditRecord
 
 
 LEARNED_START = "<!-- SKILLOPT-SLEEP:LEARNED START -->"
diff --git a/skillopt/sleep/mine.py b/skillopt_sleep/mine.py
similarity index 99%
rename from skillopt/sleep/mine.py
rename to skillopt_sleep/mine.py
index ec22f18..64d7546 100644
--- a/skillopt/sleep/mine.py
+++ b/skillopt_sleep/mine.py
@@ -18,7 +18,7 @@ import hashlib
 import re
 from typing import Any, Callable, List, Optional
 
-from skillopt.sleep.types import SessionDigest, TaskRecord
+from skillopt_sleep.types import SessionDigest, TaskRecord
 
 
 def _tid(project: str, intent: str) -> str:
diff --git a/skillopt/sleep/replay.py b/skillopt_sleep/replay.py
similarity index 96%
rename from skillopt/sleep/replay.py
rename to skillopt_sleep/replay.py
index 7cdfd7f..dc63f7f 100644
--- a/skillopt/sleep/replay.py
+++ b/skillopt_sleep/replay.py
@@ -11,8 +11,8 @@ from __future__ import annotations
 
 from typing import List, Tuple
 
-from skillopt.sleep.backend import Backend
-from skillopt.sleep.types import ReplayResult, TaskRecord
+from skillopt_sleep.backend import Backend
+from skillopt_sleep.types import ReplayResult, TaskRecord
 
 
 def _required_tools(task: TaskRecord) -> List[str]:
@@ -44,7 +44,7 @@ def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> R
 
     # rule judges may need the detected tool calls; score locally when possible
     if task.reference_kind == "rule" and task.judge:
-        from skillopt.sleep.judges import score_rule_judge
+        from skillopt_sleep.judges import score_rule_judge
         hard, soft, rationale = score_rule_judge(task.judge, response, tools_called)
     else:
         hard, soft, rationale = backend.judge(task, response)
diff --git a/skillopt/sleep/rollout.py b/skillopt_sleep/rollout.py
similarity index 96%
rename from skillopt/sleep/rollout.py
rename to skillopt_sleep/rollout.py
index f96679c..bae20dd 100644
--- a/skillopt/sleep/rollout.py
+++ b/skillopt_sleep/rollout.py
@@ -18,9 +18,9 @@ from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
 
-from skillopt.sleep.backend import Backend, _extract_json
-from skillopt.sleep.replay import replay_one
-from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord
+from skillopt_sleep.backend import Backend, _extract_json
+from skillopt_sleep.replay import replay_one
+from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
 
 
 @dataclass
diff --git a/skillopt/sleep/slow_update.py b/skillopt_sleep/slow_update.py
similarity index 97%
rename from skillopt/sleep/slow_update.py
rename to skillopt_sleep/slow_update.py
index 20a7175..7262785 100644
--- a/skillopt/sleep/slow_update.py
+++ b/skillopt_sleep/slow_update.py
@@ -23,8 +23,8 @@ from __future__ import annotations
 import re
 from typing import List, Optional, Tuple
 
-from skillopt.sleep.backend import Backend, _extract_json
-from skillopt.sleep.types import ReplayResult, TaskRecord
+from skillopt_sleep.backend import Backend, _extract_json
+from skillopt_sleep.types import ReplayResult, TaskRecord
 
 
 SLOW_UPDATE_START = "<!-- SLOW_UPDATE_START -->"
diff --git a/skillopt/sleep/staging.py b/skillopt_sleep/staging.py
similarity index 98%
rename from skillopt/sleep/staging.py
rename to skillopt_sleep/staging.py
index b2e0ab2..2af5be9 100644
--- a/skillopt/sleep/staging.py
+++ b/skillopt_sleep/staging.py
@@ -13,7 +13,7 @@ import shutil
 import time
 from typing import List, Optional
 
-from skillopt.sleep.types import SleepReport
+from skillopt_sleep.types import SleepReport
 
 
 def _ts_dir() -> str:
diff --git a/skillopt/sleep/state.py b/skillopt_sleep/state.py
similarity index 100%
rename from skillopt/sleep/state.py
rename to skillopt_sleep/state.py
diff --git a/skillopt/sleep/types.py b/skillopt_sleep/types.py
similarity index 98%
rename from skillopt/sleep/types.py
rename to skillopt_sleep/types.py
index edfd77d..7208bb9 100644
--- a/skillopt/sleep/types.py
+++ b/skillopt_sleep/types.py
@@ -18,7 +18,7 @@ from typing import Any, Dict, List, Optional
 class SessionDigest:
     """A normalized summary of one Claude Code session transcript.
 
-    Produced by :mod:`skillopt.sleep.harvest` from a ``<sessionId>.jsonl``
+    Produced by :mod:`skillopt_sleep.harvest` from a ``<sessionId>.jsonl``
     transcript plus ``history.jsonl`` entries.
     """
 
diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py
index ba2a890..2a28dce 100644
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -2,7 +2,7 @@
 
 Pure-stdlib (unittest), deterministic, no API key, no third-party deps.
 Run:  python3.12 -m pytest tests/test_sleep_engine.py
-  or: python3.12 -m unittest skillopt.sleep ... (see bottom)
+  or: python3.12 -m unittest skillopt_sleep ... (see bottom)
 """
 from __future__ import annotations
 
@@ -11,16 +11,16 @@ import os
 import tempfile
 import unittest
 
-from skillopt.sleep.backend import MockBackend, exact_score, keyword_soft_score
-from skillopt.sleep.config import load_config
-from skillopt.sleep.consolidate import consolidate
-from skillopt.sleep.cycle import run_sleep_cycle
-from skillopt.sleep.experiments.personas import researcher_persona, programmer_persona
-from skillopt.sleep.harvest import digest_transcript, _detect_feedback, _is_meta_prompt
-from skillopt.sleep.memory import apply_edits, current_learned_lines, extract_learned, set_learned
-from skillopt.sleep.mine import assign_splits, heuristic_mine, dedup_tasks
-from skillopt.sleep.staging import adopt, latest_staging
-from skillopt.sleep.types import EditRecord, SessionDigest, TaskRecord
+from skillopt_sleep.backend import MockBackend, exact_score, keyword_soft_score
+from skillopt_sleep.config import load_config
+from skillopt_sleep.consolidate import consolidate
+from skillopt_sleep.cycle import run_sleep_cycle
+from skillopt_sleep.experiments.personas import researcher_persona, programmer_persona
+from skillopt_sleep.harvest import digest_transcript, _detect_feedback, _is_meta_prompt
+from skillopt_sleep.memory import apply_edits, current_learned_lines, extract_learned, set_learned
+from skillopt_sleep.mine import assign_splits, heuristic_mine, dedup_tasks
+from skillopt_sleep.staging import adopt, latest_staging
+from skillopt_sleep.types import EditRecord, SessionDigest, TaskRecord
 
 
 class TestScoring(unittest.TestCase):
@@ -115,7 +115,7 @@ class TestMine(unittest.TestCase):
 
     def test_dream_never_in_val_or_test(self):
         # the anti-overfitting guarantee: origin='dream' tasks only ever land in train
-        from skillopt.sleep.types import TaskRecord
+        from skillopt_sleep.types import TaskRecord
         real = researcher_persona()
         dream = [TaskRecord(id=f"d{i}", project="/p", intent=f"dream {i}",
                             origin="dream", derived_from="r0") for i in range(5)]
@@ -152,7 +152,7 @@ class TestConsolidateGate(unittest.TestCase):
 
 class TestRuleJudge(unittest.TestCase):
     def test_section_and_regex(self):
-        from skillopt.sleep.judges import score_rule_judge
+        from skillopt_sleep.judges import score_rule_judge
         j = {"kind": "rule", "checks": [
             {"op": "section_present", "arg": "Key Risks"},
             {"op": "regex", "arg": r"[Cc]onfidence\s*[:=]"},
@@ -162,13 +162,13 @@ class TestRuleJudge(unittest.TestCase):
         self.assertEqual(score_rule_judge(j, "just an answer")[0], 0.0)
 
     def test_max_chars(self):
-        from skillopt.sleep.judges import score_rule_judge
+        from skillopt_sleep.judges import score_rule_judge
         j = {"checks": [{"op": "max_chars", "arg": 50}]}
         self.assertEqual(score_rule_judge(j, "x" * 10)[0], 1.0)
         self.assertEqual(score_rule_judge(j, "x" * 100)[0], 0.0)
 
     def test_partial_soft_score(self):
-        from skillopt.sleep.judges import score_rule_judge
+        from skillopt_sleep.judges import score_rule_judge
         j = {"checks": [
             {"op": "contains", "arg": "alpha"},
             {"op": "contains", "arg": "beta"},
@@ -180,7 +180,7 @@ class TestRuleJudge(unittest.TestCase):
 
 class TestGbrainLoader(unittest.TestCase):
     def test_loads_when_present(self):
-        from skillopt.sleep.experiments.gbrain_bench import find_data_root, load_seed
+        from skillopt_sleep.experiments.gbrain_bench import find_data_root, load_seed
         root = find_data_root()
         if not root:
             self.skipTest("gbrain-evals data not present")
@@ -191,7 +191,7 @@ class TestGbrainLoader(unittest.TestCase):
         self.assertTrue(any(t.split == "val" for t in tasks))
         self.assertTrue(all(t.reference_kind == "rule" for t in tasks))
         # the deficient skill must FAIL its own held-out (test) checks (baseline 0)
-        from skillopt.sleep.judges import score_rule_judge
+        from skillopt_sleep.judges import score_rule_judge
         ho = [t for t in tasks if t.split == "test"][0]
         self.assertEqual(score_rule_judge(ho.judge, skill)[0], 0.0)
 
@@ -199,8 +199,8 @@ class TestGbrainLoader(unittest.TestCase):
 class TestLlmMiner(unittest.TestCase):
     def test_miner_emits_checkable_tasks(self):
         # a stub backend whose _call returns canned miner JSON => deterministic
-        from skillopt.sleep.backend import Backend
-        from skillopt.sleep.llm_miner import make_llm_miner
+        from skillopt_sleep.backend import Backend
+        from skillopt_sleep.llm_miner import make_llm_miner
 
         class StubBackend(Backend):
             name = "stub"
@@ -219,8 +219,8 @@ class TestLlmMiner(unittest.TestCase):
         self.assertEqual(tasks[0].judge["checks"][0]["op"], "section_present")
 
     def test_miner_drops_uncheckable(self):
-        from skillopt.sleep.backend import Backend
-        from skillopt.sleep.llm_miner import make_llm_miner
+        from skillopt_sleep.backend import Backend
+        from skillopt_sleep.llm_miner import make_llm_miner
 
         class EmptyBackend(Backend):
             name = "stub"
@@ -234,8 +234,8 @@ class TestLlmMiner(unittest.TestCase):
 
 class TestMultiObjectiveAndPrefs(unittest.TestCase):
     def test_multi_objective_reward(self):
-        from skillopt.sleep.replay import multi_objective_reward
-        from skillopt.sleep.types import ReplayResult, TaskRecord
+        from skillopt_sleep.replay import multi_objective_reward
+        from skillopt_sleep.types import ReplayResult, TaskRecord
         t = TaskRecord(id="t", project="/p", intent="x")
         expensive = [(t, ReplayResult(id="t", hard=1.0, tokens=4000, latency_ms=20000))]
         cheap = [(t, ReplayResult(id="t", hard=1.0, tokens=200, latency_ms=1000))]
@@ -248,8 +248,8 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase):
         self.assertGreater(rc, re)
 
     def test_preferences_injected_into_reflect(self):
-        from skillopt.sleep.backend import CliBackend
-        from skillopt.sleep.types import TaskRecord, ReplayResult
+        from skillopt_sleep.backend import CliBackend
+        from skillopt_sleep.types import TaskRecord, ReplayResult
         captured = {}
 
         class CapBackend(CliBackend):
@@ -267,9 +267,9 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase):
         self.assertIn("British English", captured["prompt"])
 
     def test_replay_records_cost(self):
-        from skillopt.sleep.backend import MockBackend
-        from skillopt.sleep.replay import replay_one
-        from skillopt.sleep.types import TaskRecord
+        from skillopt_sleep.backend import MockBackend
+        from skillopt_sleep.replay import replay_one
+        from skillopt_sleep.types import TaskRecord
         t = TaskRecord(id="t", project="/p", intent="hello world",
                        reference_kind="exact", reference="hi")
         r = replay_one(MockBackend(), t, "some skill text", "")
@@ -279,8 +279,8 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase):
 
 class TestMultiRolloutAndBudget(unittest.TestCase):
     def test_rolloutset_stats(self):
-        from skillopt.sleep.rollout import RolloutSet
-        from skillopt.sleep.types import ReplayResult, TaskRecord
+        from skillopt_sleep.rollout import RolloutSet
+        from skillopt_sleep.types import ReplayResult, TaskRecord
         rs = RolloutSet(task=TaskRecord(id="t", project="/p", intent="x"),
                         attempts=[ReplayResult(id="t", hard=1.0),
                                   ReplayResult(id="t", hard=0.0),
@@ -291,7 +291,7 @@ class TestMultiRolloutAndBudget(unittest.TestCase):
         self.assertAlmostEqual(rs.pass_rate, 2 / 3)
 
     def test_budget_exhaustion_and_plan(self):
-        from skillopt.sleep.budget import Budget, plan_depth
+        from skillopt_sleep.budget import Budget, plan_depth
         clock = [0.0]
         b = Budget(max_tokens=1000)
         b.start(lambda: clock[0], tokens_now=0)
@@ -303,9 +303,9 @@ class TestMultiRolloutAndBudget(unittest.TestCase):
         self.assertGreaterEqual(k, 1)
 
     def test_contrastive_reflect_with_stub(self):
-        from skillopt.sleep.backend import Backend
-        from skillopt.sleep.rollout import RolloutSet, contrastive_reflect
-        from skillopt.sleep.types import ReplayResult, TaskRecord
+        from skillopt_sleep.backend import Backend
+        from skillopt_sleep.rollout import RolloutSet, contrastive_reflect
+        from skillopt_sleep.types import ReplayResult, TaskRecord
 
         class StubBackend(Backend):
             name = "stub"
@@ -322,7 +322,7 @@ class TestMultiRolloutAndBudget(unittest.TestCase):
 
 class TestSlowUpdate(unittest.TestCase):
     def test_protected_field_roundtrip(self):
-        from skillopt.sleep.slow_update import (
+        from skillopt_sleep.slow_update import (
             replace_slow_field, extract_slow_field, has_slow_field,
             SLOW_UPDATE_START, SLOW_UPDATE_END,
         )
@@ -339,9 +339,9 @@ class TestSlowUpdate(unittest.TestCase):
         self.assertIn("keep me", doc2)
 
     def test_run_slow_update_with_stub_backend(self):
-        from skillopt.sleep.backend import Backend
-        from skillopt.sleep.slow_update import run_slow_update
-        from skillopt.sleep.types import TaskRecord, ReplayResult
+        from skillopt_sleep.backend import Backend
+        from skillopt_sleep.slow_update import run_slow_update
+        from skillopt_sleep.types import TaskRecord, ReplayResult
 
         class StubBackend(Backend):
             name = "stub"
@@ -365,10 +365,10 @@ class TestSlowUpdate(unittest.TestCase):
 
 class TestToolLoop(unittest.TestCase):
     def test_tool_called_judge_via_replay(self):
-        from skillopt.sleep.backend import MockBackend
-        from skillopt.sleep.replay import replay_one, _required_tools
-        from skillopt.sleep.memory import set_learned
-        from skillopt.sleep.types import TaskRecord
+        from skillopt_sleep.backend import MockBackend
+        from skillopt_sleep.replay import replay_one, _required_tools
+        from skillopt_sleep.memory import set_learned
+        from skillopt_sleep.types import TaskRecord
 
         task = TaskRecord(
             id="qa1", project="/p", intent="answer the question",