diff --git a/skillopt-sleep-plugin/README.md b/skillopt-sleep-plugin/README.md index a3b6ddc..6d77559 100644 --- a/skillopt-sleep-plugin/README.md +++ b/skillopt-sleep-plugin/README.md @@ -48,7 +48,7 @@ cd SkillOpt ``` The plugin's bundled runner (`scripts/sleep.sh`) auto-selects a Python ≥ 3.10 -interpreter and calls the `skillopt.sleep` engine in the repo. No `pip install` +interpreter and calls the `skillopt_sleep` engine in the repo. No `pip install` is required for the default `mock` backend or for `claude`/`codex` backends — they shell out to the CLIs you already have. @@ -65,9 +65,9 @@ they shell out to the CLIs you already have. Or call the engine directly (Python ≥ 3.10): ```bash -python -m skillopt.sleep run --project "$(pwd)" --scope invoked --backend mock -python -m skillopt.sleep run --project "$(pwd)" --backend claude # real lift via Claude -python -m skillopt.sleep run --project "$(pwd)" --backend codex # real lift via Codex +python -m skillopt_sleep run --project "$(pwd)" --scope invoked --backend mock +python -m skillopt_sleep run --project "$(pwd)" --backend claude # real lift via Claude +python -m skillopt_sleep run --project "$(pwd)" --backend codex # real lift via Codex ``` Default backend is **`mock`** — deterministic, no API spend — so you can try the @@ -98,10 +98,10 @@ Reproduce: ```bash git clone https://github.com/garrytan/gbrain-evals /tmp/gbrain-evals -python -m skillopt.sleep.experiments.run_gbrain --backend claude --model haiku \ +python -m skillopt_sleep.experiments.run_gbrain --backend claude --model haiku \ --seeds brief-writer --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 \ --nights 1 --limit-replay 3 --limit-holdout 3 -python -m skillopt.sleep.experiments.run_gbrain --backend codex \ +python -m skillopt_sleep.experiments.run_gbrain --backend codex \ --seeds brief-writer --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 \ --nights 1 --limit-replay 3 --limit-holdout 3 ``` @@ -109,8 +109,8 @@ python -m skillopt.sleep.experiments.run_gbrain --backend codex \ ## Deterministic proof (no API, no keys) ```bash -python -m skillopt.sleep.experiments.run_experiment --persona researcher --assert-improves -python -m skillopt.sleep.experiments.run_experiment --persona programmer --assert-improves +python -m skillopt_sleep.experiments.run_experiment --persona researcher --assert-improves +python -m skillopt_sleep.experiments.run_experiment --persona programmer --assert-improves ``` Each prints the held-out score rising from baseline toward 1.0 as the gate diff --git a/skillopt-sleep-plugin/commands/sleep.md b/skillopt-sleep-plugin/commands/sleep.md index 48b62f7..6ed3ef9 100644 --- a/skillopt-sleep-plugin/commands/sleep.md +++ b/skillopt-sleep-plugin/commands/sleep.md @@ -18,7 +18,7 @@ held-out replay score, and nothing live is modified until the user adopts it. ## How to run it -The engine is the `skillopt.sleep` Python package in this repo. Use the +The engine is the `skillopt_sleep` Python package in this repo. Use the **plugin's bundled runner** so the right interpreter and repo are on the path: ```bash diff --git a/skillopt-sleep-plugin/scripts/sleep.sh b/skillopt-sleep-plugin/scripts/sleep.sh index 49c224a..052b0c5 100755 --- a/skillopt-sleep-plugin/scripts/sleep.sh +++ b/skillopt-sleep-plugin/scripts/sleep.sh @@ -1,12 +1,12 @@ #!/usr/bin/env bash -# SkillOpt-Sleep runner — invokes the skillopt.sleep engine with a suitable +# SkillOpt-Sleep runner — invokes the skillopt_sleep engine with a suitable # Python interpreter, from the repo that contains this plugin. # # Usage: sleep.sh [extra args...] set -euo pipefail # Resolve the repo root: the plugin lives at /skillopt-sleep-plugin, -# so the engine package is at /skillopt/sleep. CLAUDE_PLUGIN_ROOT points +# so the engine package is at /skillopt_sleep. CLAUDE_PLUGIN_ROOT points # at the plugin dir when run by Claude Code; fall back to this script's dir. PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}" REPO_ROOT="$(cd "$PLUGIN_ROOT/.." && pwd)" @@ -27,4 +27,4 @@ fi if [ "$#" -eq 0 ]; then set -- status; fi cd "$REPO_ROOT" -exec "$PY" -m skillopt.sleep "$@" +exec "$PY" -m skillopt_sleep "$@" diff --git a/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md b/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md index da252f7..bf6b86f 100644 --- a/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md +++ b/skillopt-sleep-plugin/skills/skillopt-sleep/SKILL.md @@ -1,6 +1,6 @@ --- name: skillopt-sleep -description: "Use when the user wants their Claude agent to self-improve from past usage, asks about a nightly/offline 'sleep' or 'dream' cycle, memory/skill consolidation, or says things like '让 agent 越用越好用', 'review my past sessions', 'learn my preferences', 'consolidate what you learned', 'run the sleep cycle', or wants to schedule offline self-optimization. Drives the skillopt.sleep engine: harvest past sessions → mine recurring tasks → replay offline → consolidate validated CLAUDE.md/SKILL.md behind a held-out gate." +description: "Use when the user wants their Claude agent to self-improve from past usage, asks about a nightly/offline 'sleep' or 'dream' cycle, memory/skill consolidation, or says things like '让 agent 越用越好用', 'review my past sessions', 'learn my preferences', 'consolidate what you learned', 'run the sleep cycle', or wants to schedule offline self-optimization. Drives the skillopt_sleep engine: harvest past sessions → mine recurring tasks → replay offline → consolidate validated CLAUDE.md/SKILL.md behind a held-out gate." --- # SkillOpt-Sleep: offline self-evolution for a local Claude agent @@ -62,7 +62,7 @@ Prefer the `/sleep` command. Under the hood it calls the bundled runner: - Always show the user the **held-out baseline → candidate** score and the exact proposed edits before suggesting adoption. Evidence before adoption. - If asked whether it really helps, run - `python -m skillopt.sleep.experiments.run_experiment --persona researcher --json` + `python -m skillopt_sleep.experiments.run_experiment --persona researcher --json` — a deterministic demo that proves held-out lift and that the gate blocks harmful edits. @@ -70,8 +70,8 @@ Prefer the `/sleep` command. Under the hood it calls the bundled runner: ```bash # deterministic proof (no API): held-out score rises, gate blocks regressions -python -m skillopt.sleep.experiments.run_experiment --persona researcher --assert-improves -python -m skillopt.sleep.experiments.run_experiment --persona programmer --assert-improves +python -m skillopt_sleep.experiments.run_experiment --persona researcher --assert-improves +python -m skillopt_sleep.experiments.run_experiment --persona programmer --assert-improves ``` See `docs/sleep/experiment_results.md` for recorded output and diff --git a/skillopt/sleep/__init__.py b/skillopt_sleep/__init__.py similarity index 86% rename from skillopt/sleep/__init__.py rename to skillopt_sleep/__init__.py index 6e35c0b..8660d06 100644 --- a/skillopt/sleep/__init__.py +++ b/skillopt_sleep/__init__.py @@ -11,8 +11,8 @@ Synthesizes three ideas: * Sleep — short-term experience -> long-term competence, offline Public entry points: - * skillopt.sleep.cli — `python -m skillopt.sleep ...` - * skillopt.sleep.cycle.run_sleep_cycle(...) + * skillopt_sleep.cli — `python -m skillopt_sleep ...` + * skillopt_sleep.cycle.run_sleep_cycle(...) """ from __future__ import annotations diff --git a/skillopt/sleep/__main__.py b/skillopt_sleep/__main__.py similarity index 90% rename from skillopt/sleep/__main__.py rename to skillopt_sleep/__main__.py index f01612e..f2efa3e 100644 --- a/skillopt/sleep/__main__.py +++ b/skillopt_sleep/__main__.py @@ -1,10 +1,10 @@ """SkillOpt-Sleep — command-line interface. - python -m skillopt.sleep run # full cycle: harvest->mine->replay->gate->stage - python -m skillopt.sleep dry-run # same but report only, no staging/adopt - python -m skillopt.sleep status # show state + latest staged proposal - python -m skillopt.sleep adopt # apply the latest staged proposal (with backup) - python -m skillopt.sleep harvest # just print what would be mined (debug) + python -m skillopt_sleep run # full cycle: harvest->mine->replay->gate->stage + python -m skillopt_sleep dry-run # same but report only, no staging/adopt + python -m skillopt_sleep status # show state + latest staged proposal + python -m skillopt_sleep adopt # apply the latest staged proposal (with backup) + python -m skillopt_sleep harvest # just print what would be mined (debug) Common flags: --project PATH project to evolve (default: cwd) @@ -23,12 +23,12 @@ import os import sys from typing import Any, Dict -from skillopt.sleep.config import load_config -from skillopt.sleep.cycle import run_sleep_cycle -from skillopt.sleep.harvest import harvest -from skillopt.sleep.mine import mine -from skillopt.sleep.state import SleepState -from skillopt.sleep.staging import latest_staging, adopt as adopt_staging +from skillopt_sleep.config import load_config +from skillopt_sleep.cycle import run_sleep_cycle +from skillopt_sleep.harvest import harvest +from skillopt_sleep.mine import mine +from skillopt_sleep.state import SleepState +from skillopt_sleep.staging import latest_staging, adopt as adopt_staging def _add_common(p: argparse.ArgumentParser) -> None: @@ -90,7 +90,7 @@ def cmd_run(args, dry: bool = False) -> int: if outcome.staging_dir: print(f"[sleep] staged: {outcome.staging_dir}") if not outcome.adopted: - print("[sleep] review it, then: python -m skillopt.sleep adopt") + print("[sleep] review it, then: python -m skillopt_sleep adopt") if outcome.adopted: print(f"[sleep] auto-adopted: {', '.join(outcome.adopted_paths)}") return 0 @@ -164,7 +164,7 @@ def cmd_harvest(args) -> int: def main(argv=None) -> int: - parser = argparse.ArgumentParser(prog="skillopt.sleep", description="SkillOpt-Sleep nightly self-evolution") + parser = argparse.ArgumentParser(prog="skillopt_sleep", description="SkillOpt-Sleep nightly self-evolution") sub = parser.add_subparsers(dest="cmd", required=True) p_run = sub.add_parser("run", help="run a full sleep cycle") diff --git a/skillopt/sleep/backend.py b/skillopt_sleep/backend.py similarity index 99% rename from skillopt/sleep/backend.py rename to skillopt_sleep/backend.py index c4b873d..fbc8d26 100644 --- a/skillopt/sleep/backend.py +++ b/skillopt_sleep/backend.py @@ -26,7 +26,7 @@ import re import subprocess from typing import Any, Dict, List, Optional, Tuple -from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord +from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord def skill_hash(content: str) -> str: @@ -192,7 +192,7 @@ class MockBackend(Backend): def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]: if task.reference_kind == "rule" and task.judge: - from skillopt.sleep.judges import score_rule_judge + from skillopt_sleep.judges import score_rule_judge return score_rule_judge(task.judge, response) if task.reference_kind == "exact" and task.reference: hard = exact_score(task.reference, response) @@ -303,7 +303,7 @@ class CliBackend(Backend): def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]: # gbrain-style rule judge: scored locally, no API spend if task.reference_kind == "rule" and task.judge: - from skillopt.sleep.judges import score_rule_judge + from skillopt_sleep.judges import score_rule_judge return score_rule_judge(task.judge, response) # exact references are scored locally — no API spend if task.reference_kind == "exact" and task.reference: diff --git a/skillopt/sleep/budget.py b/skillopt_sleep/budget.py similarity index 100% rename from skillopt/sleep/budget.py rename to skillopt_sleep/budget.py diff --git a/skillopt/sleep/config.py b/skillopt_sleep/config.py similarity index 100% rename from skillopt/sleep/config.py rename to skillopt_sleep/config.py diff --git a/skillopt/sleep/consolidate.py b/skillopt_sleep/consolidate.py similarity index 88% rename from skillopt/sleep/consolidate.py rename to skillopt_sleep/consolidate.py index da15d96..f01808e 100644 --- a/skillopt/sleep/consolidate.py +++ b/skillopt_sleep/consolidate.py @@ -3,11 +3,8 @@ This is the core that makes nightly evolution *safe*: it proposes bounded edits from replayed failures, applies them to a candidate skill/memory, then **gates** the candidate on a held-out slice of the user's own tasks. Only a -candidate that strictly improves the held-out score is accepted — exactly the -SkillOpt validation gate, reused verbatim from ``skillopt.evaluation.gate``. - -Reused from the main SkillOpt package (import-light, no `openai` needed): - * skillopt.evaluation.gate.evaluate_gate / select_gate_score +candidate that strictly improves the held-out score is accepted — the SkillOpt +validation gate, vendored self-contained in ``skillopt_sleep.gate``. """ from __future__ import annotations @@ -15,26 +12,16 @@ import os from dataclasses import dataclass from typing import List, Optional, Tuple -from skillopt.sleep.backend import Backend -from skillopt.sleep.memory import apply_edits -from skillopt.sleep.replay import aggregate_scores, replay_batch -from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord +from skillopt_sleep.backend import Backend +from skillopt_sleep.memory import apply_edits +from skillopt_sleep.replay import aggregate_scores, replay_batch +from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord -# Reuse the real SkillOpt gate. This module imports cleanly without `openai`. -try: - from skillopt.evaluation.gate import evaluate_gate, select_gate_score - _HAVE_REPO_GATE = True -except Exception: # pragma: no cover - fallback keeps engine standalone - _HAVE_REPO_GATE = False - - def select_gate_score(hard, soft, metric="hard", mixed_weight=0.5): # type: ignore - if metric == "hard": - return float(hard) - if metric == "soft": - return float(soft) - w = max(0.0, min(1.0, float(mixed_weight))) - return (1 - w) * float(hard) + w * float(soft) +# Self-contained validation gate (vendored from SkillOpt; zero dependency on the +# research package, so this open-source tool stays decoupled from the paper code). +from skillopt_sleep.gate import evaluate_gate, select_gate_score +_HAVE_REPO_GATE = True @dataclass @@ -140,7 +127,7 @@ def consolidate( if rollouts_k > 1: # multi-rollout contrastive reflection: run each train task K times # and distill a rule from the good-vs-bad contrast (the "脑补" signal). - from skillopt.sleep.rollout import multi_rollout, contrastive_reflect + from skillopt_sleep.rollout import multi_rollout, contrastive_reflect sets = [multi_rollout(backend, t, cand_skill, cand_memory, k=rollouts_k) for t in train_tasks] edits = contrastive_reflect( diff --git a/skillopt/sleep/cycle.py b/skillopt_sleep/cycle.py similarity index 93% rename from skillopt/sleep/cycle.py rename to skillopt_sleep/cycle.py index 4efc81b..04baa03 100644 --- a/skillopt/sleep/cycle.py +++ b/skillopt_sleep/cycle.py @@ -14,15 +14,15 @@ import time from dataclasses import dataclass from typing import Any, Dict, List, Optional -from skillopt.sleep.backend import get_backend -from skillopt.sleep.config import SleepConfig, load_config -from skillopt.sleep.consolidate import consolidate -from skillopt.sleep.harvest import harvest -from skillopt.sleep.memory import ensure_skill_scaffold -from skillopt.sleep.mine import mine -from skillopt.sleep.state import SleepState, _now_iso -from skillopt.sleep.staging import write_staging, adopt as adopt_staging -from skillopt.sleep.types import SessionDigest, SleepReport, TaskRecord +from skillopt_sleep.backend import get_backend +from skillopt_sleep.config import SleepConfig, load_config +from skillopt_sleep.consolidate import consolidate +from skillopt_sleep.harvest import harvest +from skillopt_sleep.memory import ensure_skill_scaffold +from skillopt_sleep.mine import mine +from skillopt_sleep.state import SleepState, _now_iso +from skillopt_sleep.staging import write_staging, adopt as adopt_staging +from skillopt_sleep.types import SessionDigest, SleepReport, TaskRecord @dataclass @@ -131,7 +131,7 @@ def run_sleep_cycle( llm_miner = None if cfg.get("backend", "mock") != "mock" and cfg.get("llm_mine", True): try: - from skillopt.sleep.llm_miner import make_llm_miner + from skillopt_sleep.llm_miner import make_llm_miner llm_miner = make_llm_miner(backend, max_tasks=cfg.get("max_tasks_per_night", 40)) except Exception: llm_miner = None diff --git a/skillopt/sleep/experiments/__init__.py b/skillopt_sleep/experiments/__init__.py similarity index 100% rename from skillopt/sleep/experiments/__init__.py rename to skillopt_sleep/experiments/__init__.py diff --git a/skillopt/sleep/experiments/gbrain_bench.py b/skillopt_sleep/experiments/gbrain_bench.py similarity index 97% rename from skillopt/sleep/experiments/gbrain_bench.py rename to skillopt_sleep/experiments/gbrain_bench.py index efe0ff6..49261d6 100644 --- a/skillopt/sleep/experiments/gbrain_bench.py +++ b/skillopt_sleep/experiments/gbrain_bench.py @@ -17,7 +17,7 @@ We map: judge -> TaskRecord.judge (+ reference_kind="rule") This lets us reproduce gbrain's headline result with our engine and either the -claude or codex backend, scoring locally via skillopt.sleep.judges (no judge API). +claude or codex backend, scoring locally via skillopt_sleep.judges (no judge API). """ from __future__ import annotations @@ -25,7 +25,7 @@ import json import os from typing import Dict, List, Optional, Tuple -from skillopt.sleep.types import TaskRecord +from skillopt_sleep.types import TaskRecord SEED_DIRS = { diff --git a/skillopt/sleep/experiments/personas.py b/skillopt_sleep/experiments/personas.py similarity index 98% rename from skillopt/sleep/experiments/personas.py rename to skillopt_sleep/experiments/personas.py index 0fdb127..72eb6af 100644 --- a/skillopt/sleep/experiments/personas.py +++ b/skillopt_sleep/experiments/personas.py @@ -12,7 +12,7 @@ from __future__ import annotations from typing import List -from skillopt.sleep.types import TaskRecord +from skillopt_sleep.types import TaskRecord def _t(i, intent, ref, rule, project="/personas/demo", outcome="fail") -> TaskRecord: diff --git a/skillopt/sleep/experiments/report.py b/skillopt_sleep/experiments/report.py similarity index 96% rename from skillopt/sleep/experiments/report.py rename to skillopt_sleep/experiments/report.py index 8672822..767ea65 100644 --- a/skillopt/sleep/experiments/report.py +++ b/skillopt_sleep/experiments/report.py @@ -1,7 +1,7 @@ """SkillOpt-Sleep — turn a sweep JSONL into a presented Markdown scorecard. Usage: - python -m skillopt.sleep.experiments.report --in docs/sleep/sweep.jsonl \ + python -m skillopt_sleep.experiments.report --in docs/sleep/sweep.jsonl \ --out docs/sleep/benchmark_report.md """ from __future__ import annotations @@ -101,9 +101,9 @@ def render(rows: List[Dict[str, Any]]) -> str: out.append("") out.append("```bash") out.append("git clone https://github.com/garrytan/gbrain-evals /tmp/gbrain-evals") - out.append("python -m skillopt.sleep.experiments.sweep --plan full \\") + out.append("python -m skillopt_sleep.experiments.sweep --plan full \\") out.append(" --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 --out docs/sleep/sweep.jsonl") - out.append("python -m skillopt.sleep.experiments.report \\") + out.append("python -m skillopt_sleep.experiments.report \\") out.append(" --in docs/sleep/sweep.jsonl --out docs/sleep/benchmark_report.md") out.append("```") out.append("") diff --git a/skillopt/sleep/experiments/run_experiment.py b/skillopt_sleep/experiments/run_experiment.py similarity index 92% rename from skillopt/sleep/experiments/run_experiment.py rename to skillopt_sleep/experiments/run_experiment.py index 385b0a1..91a9ca9 100644 --- a/skillopt/sleep/experiments/run_experiment.py +++ b/skillopt_sleep/experiments/run_experiment.py @@ -14,9 +14,9 @@ What it proves: the adopted artifact, re-scored, retains the lift. Run: - python -m skillopt.sleep.experiments.run_experiment - python -m skillopt.sleep.experiments.run_experiment --persona programmer --nights 3 - python -m skillopt.sleep.experiments.run_experiment --backend anthropic # real lift + python -m skillopt_sleep.experiments.run_experiment + python -m skillopt_sleep.experiments.run_experiment --persona programmer --nights 3 + python -m skillopt_sleep.experiments.run_experiment --backend anthropic # real lift """ from __future__ import annotations @@ -27,21 +27,21 @@ import sys import tempfile from typing import List -from skillopt.sleep.backend import get_backend -from skillopt.sleep.consolidate import consolidate -from skillopt.sleep.experiments.personas import ( +from skillopt_sleep.backend import get_backend +from skillopt_sleep.consolidate import consolidate +from skillopt_sleep.experiments.personas import ( PERSONAS, harmful_edit_task, researcher_persona, ) -from skillopt.sleep.memory import ensure_skill_scaffold -from skillopt.sleep.replay import aggregate_scores, replay_batch -from skillopt.sleep.types import TaskRecord +from skillopt_sleep.memory import ensure_skill_scaffold +from skillopt_sleep.replay import aggregate_scores, replay_batch +from skillopt_sleep.types import TaskRecord def _score_holdout(backend, tasks: List[TaskRecord], skill: str, memory: str, metric: str = "mixed", w: float = 0.5) -> float: - from skillopt.sleep.consolidate import select_gate_score + from skillopt_sleep.consolidate import select_gate_score # the persona experiment uses a 2-way split (train/val, no test); score on val holdout = [t for t in tasks if t.split in ("val", "holdout")] or tasks pairs = replay_batch(backend, holdout, skill, memory) @@ -52,7 +52,7 @@ def _score_holdout(backend, tasks: List[TaskRecord], skill: str, memory: str, def run(persona: str = "researcher", nights: int = 4, backend_name: str = "mock", edit_budget: int = 4, seed: int = 42, model: str = "", codex_path: str = "", limit_tasks: int = 0) -> dict: - from skillopt.sleep.mine import assign_splits + from skillopt_sleep.mine import assign_splits make = PERSONAS.get(persona, researcher_persona) items = make() diff --git a/skillopt/sleep/experiments/run_gbrain.py b/skillopt_sleep/experiments/run_gbrain.py similarity index 94% rename from skillopt/sleep/experiments/run_gbrain.py rename to skillopt_sleep/experiments/run_gbrain.py index cfa5359..43c7acd 100644 --- a/skillopt/sleep/experiments/run_gbrain.py +++ b/skillopt_sleep/experiments/run_gbrain.py @@ -13,9 +13,9 @@ Held-out scoring is done locally by the rule judge (no judge API). Only the agent's `attempt` (and the optimizer's `reflect`) spend tokens. Usage: - python -m skillopt.sleep.experiments.run_gbrain --backend mock - python -m skillopt.sleep.experiments.run_gbrain --backend claude --seeds brief-writer --nights 2 - python -m skillopt.sleep.experiments.run_gbrain --backend codex --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 + python -m skillopt_sleep.experiments.run_gbrain --backend mock + python -m skillopt_sleep.experiments.run_gbrain --backend claude --seeds brief-writer --nights 2 + python -m skillopt_sleep.experiments.run_gbrain --backend codex --data-root /tmp/gbrain-evals/eval/data/skillopt-v1 """ from __future__ import annotations @@ -24,14 +24,14 @@ import json import sys from typing import Dict, List, Optional -from skillopt.sleep.backend import build_backend, get_backend -from skillopt.sleep.consolidate import consolidate, select_gate_score -from skillopt.sleep.experiments.gbrain_bench import ( +from skillopt_sleep.backend import build_backend, get_backend +from skillopt_sleep.consolidate import consolidate, select_gate_score +from skillopt_sleep.experiments.gbrain_bench import ( available_seeds, find_data_root, load_seed, ) -from skillopt.sleep.replay import aggregate_scores, replay_batch +from skillopt_sleep.replay import aggregate_scores, replay_batch def _score(backend, tasks, skill, memory, split="test", metric="mixed", w=0.5): @@ -95,7 +95,7 @@ def run_seed(backend, seed: str, skill: str, tasks: List, *, slow_text = None if nights >= 2 and slow_update: try: - from skillopt.sleep.slow_update import run_slow_update, replace_slow_field + from skillopt_sleep.slow_update import run_slow_update, replace_slow_field val_tasks = [t for t in tasks if t.split == "val"] or tasks prev_pairs = replay_batch(backend, val_tasks, first_night_skill, memory) curr_pairs = replay_batch(backend, val_tasks, cur, memory) @@ -170,7 +170,7 @@ def main(argv=None) -> int: # budget auto-planning: derive nights x rollouts_k from a token budget nights, rollouts_k = args.nights, args.rollouts_k if args.budget_tokens: - from skillopt.sleep.budget import Budget, plan_depth + from skillopt_sleep.budget import Budget, plan_depth n_train = len([t for t in tasks if t.split == "train"]) or len(tasks) nights, rollouts_k = plan_depth( Budget(max_tokens=args.budget_tokens), n_tasks=n_train, diff --git a/skillopt/sleep/experiments/run_transfer.py b/skillopt_sleep/experiments/run_transfer.py similarity index 95% rename from skillopt/sleep/experiments/run_transfer.py rename to skillopt_sleep/experiments/run_transfer.py index 9cdd86d..5b00ec8 100644 --- a/skillopt/sleep/experiments/run_transfer.py +++ b/skillopt_sleep/experiments/run_transfer.py @@ -16,7 +16,7 @@ Protocol, per gbrain seed: Report baseline / direct / transferred, mirroring SkillOpt Table "transfer". Usage: - python -m skillopt.sleep.experiments.run_transfer \ + python -m skillopt_sleep.experiments.run_transfer \ --source-backend claude --source-model haiku \ --target-backend claude --target-model sonnet \ --seeds brief-writer --nights 2 @@ -28,12 +28,12 @@ import json import sys from typing import List, Optional -from skillopt.sleep.backend import get_backend -from skillopt.sleep.consolidate import consolidate, select_gate_score -from skillopt.sleep.experiments.gbrain_bench import ( +from skillopt_sleep.backend import get_backend +from skillopt_sleep.consolidate import consolidate, select_gate_score +from skillopt_sleep.experiments.gbrain_bench import ( available_seeds, find_data_root, load_seed, ) -from skillopt.sleep.replay import aggregate_scores, replay_batch +from skillopt_sleep.replay import aggregate_scores, replay_batch def _holdout_hard(backend, tasks, skill, memory="") -> float: diff --git a/skillopt/sleep/experiments/sweep.py b/skillopt_sleep/experiments/sweep.py similarity index 94% rename from skillopt/sleep/experiments/sweep.py rename to skillopt_sleep/experiments/sweep.py index 75109e9..ddd337c 100644 --- a/skillopt/sleep/experiments/sweep.py +++ b/skillopt_sleep/experiments/sweep.py @@ -8,8 +8,8 @@ survive) and resume (skip configs whose row already exists). Then `report.py` turns the JSONL into a presented Markdown scorecard. Usage: - python -m skillopt.sleep.experiments.sweep --plan quick --out docs/sleep/sweep.jsonl - python -m skillopt.sleep.experiments.sweep --plan full --out docs/sleep/sweep.jsonl + python -m skillopt_sleep.experiments.sweep --plan quick --out docs/sleep/sweep.jsonl + python -m skillopt_sleep.experiments.sweep --plan full --out docs/sleep/sweep.jsonl """ from __future__ import annotations @@ -20,10 +20,10 @@ import sys import time from typing import Any, Dict, List -from skillopt.sleep.backend import build_backend, get_backend -from skillopt.sleep.experiments.gbrain_bench import find_data_root, load_seed -from skillopt.sleep.experiments.run_gbrain import run_seed as bench_seed -from skillopt.sleep.experiments.run_transfer import run_seed as transfer_seed +from skillopt_sleep.backend import build_backend, get_backend +from skillopt_sleep.experiments.gbrain_bench import find_data_root, load_seed +from skillopt_sleep.experiments.run_gbrain import run_seed as bench_seed +from skillopt_sleep.experiments.run_transfer import run_seed as transfer_seed # Plans: lists of config dicts. Kept small per-run to bound cost/latency. diff --git a/skillopt_sleep/gate.py b/skillopt_sleep/gate.py new file mode 100644 index 0000000..7eca3b4 --- /dev/null +++ b/skillopt_sleep/gate.py @@ -0,0 +1,50 @@ +"""SkillOpt-Sleep — vendored validation gate. + +This is a self-contained copy of the SkillOpt validation gate so the sleep +engine has ZERO dependency on the research package (skillopt/*). The research +repo's ``skillopt.evaluation.gate`` is the reference implementation and the two +are kept behaviourally identical; vendoring keeps this open-source tool +decoupled from the paper's experiment code. +""" +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class GateResult: + action: str # "accept_new_best" | "accept" | "reject" + current_skill: str + current_score: float + best_skill: str + best_score: float + best_step: int + + +def select_gate_score(hard: float, soft: float, metric: str = "hard", + mixed_weight: float = 0.5) -> float: + """Project (hard, soft) onto a single comparison metric.""" + if metric == "hard": + return float(hard) + if metric == "soft": + return float(soft) + if metric == "mixed": + w = max(0.0, min(1.0, float(mixed_weight))) + return (1.0 - w) * float(hard) + w * float(soft) + raise ValueError(f"unknown gate metric {metric!r}; expected hard/soft/mixed") + + +def evaluate_gate(candidate_skill: str, cand_hard: float, current_skill: str, + current_score: float, best_skill: str, best_score: float, + best_step: int, global_step: int, *, cand_soft: float = 0.0, + metric: str = "hard", mixed_weight: float = 0.5) -> GateResult: + """Pure gate decision: compare candidate score to current/best.""" + cand_score = select_gate_score(cand_hard, cand_soft, metric, mixed_weight) + if cand_score > current_score: + if cand_score > best_score: + return GateResult("accept_new_best", candidate_skill, cand_score, + candidate_skill, cand_score, global_step) + return GateResult("accept", candidate_skill, cand_score, + best_skill, best_score, best_step) + return GateResult("reject", current_skill, current_score, + best_skill, best_score, best_step) diff --git a/skillopt/sleep/harvest.py b/skillopt_sleep/harvest.py similarity index 99% rename from skillopt/sleep/harvest.py rename to skillopt_sleep/harvest.py index 013483f..fb48379 100644 --- a/skillopt/sleep/harvest.py +++ b/skillopt_sleep/harvest.py @@ -19,7 +19,7 @@ import json import os from typing import Any, Dict, Iterable, List, Optional -from skillopt.sleep.types import SessionDigest +from skillopt_sleep.types import SessionDigest # Heuristic phrases that signal the user (dis)approving of prior output. diff --git a/skillopt/sleep/judges.py b/skillopt_sleep/judges.py similarity index 100% rename from skillopt/sleep/judges.py rename to skillopt_sleep/judges.py diff --git a/skillopt/sleep/llm_miner.py b/skillopt_sleep/llm_miner.py similarity index 97% rename from skillopt/sleep/llm_miner.py rename to skillopt_sleep/llm_miner.py index 374b787..dd78c63 100644 --- a/skillopt/sleep/llm_miner.py +++ b/skillopt_sleep/llm_miner.py @@ -22,8 +22,8 @@ import json import re from typing import Any, Callable, Dict, List -from skillopt.sleep.backend import Backend, _extract_json -from skillopt.sleep.types import SessionDigest, TaskRecord +from skillopt_sleep.backend import Backend, _extract_json +from skillopt_sleep.types import SessionDigest, TaskRecord _MINER_PROMPT = """You are mining a user's past AI-assistant sessions to find RECURRING tasks diff --git a/skillopt/sleep/memory.py b/skillopt_sleep/memory.py similarity index 98% rename from skillopt/sleep/memory.py rename to skillopt_sleep/memory.py index 2f7ddbb..579d714 100644 --- a/skillopt/sleep/memory.py +++ b/skillopt_sleep/memory.py @@ -10,7 +10,7 @@ from __future__ import annotations import re from typing import List, Tuple -from skillopt.sleep.types import EditRecord +from skillopt_sleep.types import EditRecord LEARNED_START = "" diff --git a/skillopt/sleep/mine.py b/skillopt_sleep/mine.py similarity index 99% rename from skillopt/sleep/mine.py rename to skillopt_sleep/mine.py index ec22f18..64d7546 100644 --- a/skillopt/sleep/mine.py +++ b/skillopt_sleep/mine.py @@ -18,7 +18,7 @@ import hashlib import re from typing import Any, Callable, List, Optional -from skillopt.sleep.types import SessionDigest, TaskRecord +from skillopt_sleep.types import SessionDigest, TaskRecord def _tid(project: str, intent: str) -> str: diff --git a/skillopt/sleep/replay.py b/skillopt_sleep/replay.py similarity index 96% rename from skillopt/sleep/replay.py rename to skillopt_sleep/replay.py index 7cdfd7f..dc63f7f 100644 --- a/skillopt/sleep/replay.py +++ b/skillopt_sleep/replay.py @@ -11,8 +11,8 @@ from __future__ import annotations from typing import List, Tuple -from skillopt.sleep.backend import Backend -from skillopt.sleep.types import ReplayResult, TaskRecord +from skillopt_sleep.backend import Backend +from skillopt_sleep.types import ReplayResult, TaskRecord def _required_tools(task: TaskRecord) -> List[str]: @@ -44,7 +44,7 @@ def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> R # rule judges may need the detected tool calls; score locally when possible if task.reference_kind == "rule" and task.judge: - from skillopt.sleep.judges import score_rule_judge + from skillopt_sleep.judges import score_rule_judge hard, soft, rationale = score_rule_judge(task.judge, response, tools_called) else: hard, soft, rationale = backend.judge(task, response) diff --git a/skillopt/sleep/rollout.py b/skillopt_sleep/rollout.py similarity index 96% rename from skillopt/sleep/rollout.py rename to skillopt_sleep/rollout.py index f96679c..bae20dd 100644 --- a/skillopt/sleep/rollout.py +++ b/skillopt_sleep/rollout.py @@ -18,9 +18,9 @@ from __future__ import annotations from dataclasses import dataclass, field from typing import List, Optional, Tuple -from skillopt.sleep.backend import Backend, _extract_json -from skillopt.sleep.replay import replay_one -from skillopt.sleep.types import EditRecord, ReplayResult, TaskRecord +from skillopt_sleep.backend import Backend, _extract_json +from skillopt_sleep.replay import replay_one +from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord @dataclass diff --git a/skillopt/sleep/slow_update.py b/skillopt_sleep/slow_update.py similarity index 97% rename from skillopt/sleep/slow_update.py rename to skillopt_sleep/slow_update.py index 20a7175..7262785 100644 --- a/skillopt/sleep/slow_update.py +++ b/skillopt_sleep/slow_update.py @@ -23,8 +23,8 @@ from __future__ import annotations import re from typing import List, Optional, Tuple -from skillopt.sleep.backend import Backend, _extract_json -from skillopt.sleep.types import ReplayResult, TaskRecord +from skillopt_sleep.backend import Backend, _extract_json +from skillopt_sleep.types import ReplayResult, TaskRecord SLOW_UPDATE_START = "" diff --git a/skillopt/sleep/staging.py b/skillopt_sleep/staging.py similarity index 98% rename from skillopt/sleep/staging.py rename to skillopt_sleep/staging.py index b2e0ab2..2af5be9 100644 --- a/skillopt/sleep/staging.py +++ b/skillopt_sleep/staging.py @@ -13,7 +13,7 @@ import shutil import time from typing import List, Optional -from skillopt.sleep.types import SleepReport +from skillopt_sleep.types import SleepReport def _ts_dir() -> str: diff --git a/skillopt/sleep/state.py b/skillopt_sleep/state.py similarity index 100% rename from skillopt/sleep/state.py rename to skillopt_sleep/state.py diff --git a/skillopt/sleep/types.py b/skillopt_sleep/types.py similarity index 98% rename from skillopt/sleep/types.py rename to skillopt_sleep/types.py index edfd77d..7208bb9 100644 --- a/skillopt/sleep/types.py +++ b/skillopt_sleep/types.py @@ -18,7 +18,7 @@ from typing import Any, Dict, List, Optional class SessionDigest: """A normalized summary of one Claude Code session transcript. - Produced by :mod:`skillopt.sleep.harvest` from a ``.jsonl`` + Produced by :mod:`skillopt_sleep.harvest` from a ``.jsonl`` transcript plus ``history.jsonl`` entries. """ diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py index ba2a890..2a28dce 100644 --- a/tests/test_sleep_engine.py +++ b/tests/test_sleep_engine.py @@ -2,7 +2,7 @@ Pure-stdlib (unittest), deterministic, no API key, no third-party deps. Run: python3.12 -m pytest tests/test_sleep_engine.py - or: python3.12 -m unittest skillopt.sleep ... (see bottom) + or: python3.12 -m unittest skillopt_sleep ... (see bottom) """ from __future__ import annotations @@ -11,16 +11,16 @@ import os import tempfile import unittest -from skillopt.sleep.backend import MockBackend, exact_score, keyword_soft_score -from skillopt.sleep.config import load_config -from skillopt.sleep.consolidate import consolidate -from skillopt.sleep.cycle import run_sleep_cycle -from skillopt.sleep.experiments.personas import researcher_persona, programmer_persona -from skillopt.sleep.harvest import digest_transcript, _detect_feedback, _is_meta_prompt -from skillopt.sleep.memory import apply_edits, current_learned_lines, extract_learned, set_learned -from skillopt.sleep.mine import assign_splits, heuristic_mine, dedup_tasks -from skillopt.sleep.staging import adopt, latest_staging -from skillopt.sleep.types import EditRecord, SessionDigest, TaskRecord +from skillopt_sleep.backend import MockBackend, exact_score, keyword_soft_score +from skillopt_sleep.config import load_config +from skillopt_sleep.consolidate import consolidate +from skillopt_sleep.cycle import run_sleep_cycle +from skillopt_sleep.experiments.personas import researcher_persona, programmer_persona +from skillopt_sleep.harvest import digest_transcript, _detect_feedback, _is_meta_prompt +from skillopt_sleep.memory import apply_edits, current_learned_lines, extract_learned, set_learned +from skillopt_sleep.mine import assign_splits, heuristic_mine, dedup_tasks +from skillopt_sleep.staging import adopt, latest_staging +from skillopt_sleep.types import EditRecord, SessionDigest, TaskRecord class TestScoring(unittest.TestCase): @@ -115,7 +115,7 @@ class TestMine(unittest.TestCase): def test_dream_never_in_val_or_test(self): # the anti-overfitting guarantee: origin='dream' tasks only ever land in train - from skillopt.sleep.types import TaskRecord + from skillopt_sleep.types import TaskRecord real = researcher_persona() dream = [TaskRecord(id=f"d{i}", project="/p", intent=f"dream {i}", origin="dream", derived_from="r0") for i in range(5)] @@ -152,7 +152,7 @@ class TestConsolidateGate(unittest.TestCase): class TestRuleJudge(unittest.TestCase): def test_section_and_regex(self): - from skillopt.sleep.judges import score_rule_judge + from skillopt_sleep.judges import score_rule_judge j = {"kind": "rule", "checks": [ {"op": "section_present", "arg": "Key Risks"}, {"op": "regex", "arg": r"[Cc]onfidence\s*[:=]"}, @@ -162,13 +162,13 @@ class TestRuleJudge(unittest.TestCase): self.assertEqual(score_rule_judge(j, "just an answer")[0], 0.0) def test_max_chars(self): - from skillopt.sleep.judges import score_rule_judge + from skillopt_sleep.judges import score_rule_judge j = {"checks": [{"op": "max_chars", "arg": 50}]} self.assertEqual(score_rule_judge(j, "x" * 10)[0], 1.0) self.assertEqual(score_rule_judge(j, "x" * 100)[0], 0.0) def test_partial_soft_score(self): - from skillopt.sleep.judges import score_rule_judge + from skillopt_sleep.judges import score_rule_judge j = {"checks": [ {"op": "contains", "arg": "alpha"}, {"op": "contains", "arg": "beta"}, @@ -180,7 +180,7 @@ class TestRuleJudge(unittest.TestCase): class TestGbrainLoader(unittest.TestCase): def test_loads_when_present(self): - from skillopt.sleep.experiments.gbrain_bench import find_data_root, load_seed + from skillopt_sleep.experiments.gbrain_bench import find_data_root, load_seed root = find_data_root() if not root: self.skipTest("gbrain-evals data not present") @@ -191,7 +191,7 @@ class TestGbrainLoader(unittest.TestCase): self.assertTrue(any(t.split == "val" for t in tasks)) self.assertTrue(all(t.reference_kind == "rule" for t in tasks)) # the deficient skill must FAIL its own held-out (test) checks (baseline 0) - from skillopt.sleep.judges import score_rule_judge + from skillopt_sleep.judges import score_rule_judge ho = [t for t in tasks if t.split == "test"][0] self.assertEqual(score_rule_judge(ho.judge, skill)[0], 0.0) @@ -199,8 +199,8 @@ class TestGbrainLoader(unittest.TestCase): class TestLlmMiner(unittest.TestCase): def test_miner_emits_checkable_tasks(self): # a stub backend whose _call returns canned miner JSON => deterministic - from skillopt.sleep.backend import Backend - from skillopt.sleep.llm_miner import make_llm_miner + from skillopt_sleep.backend import Backend + from skillopt_sleep.llm_miner import make_llm_miner class StubBackend(Backend): name = "stub" @@ -219,8 +219,8 @@ class TestLlmMiner(unittest.TestCase): self.assertEqual(tasks[0].judge["checks"][0]["op"], "section_present") def test_miner_drops_uncheckable(self): - from skillopt.sleep.backend import Backend - from skillopt.sleep.llm_miner import make_llm_miner + from skillopt_sleep.backend import Backend + from skillopt_sleep.llm_miner import make_llm_miner class EmptyBackend(Backend): name = "stub" @@ -234,8 +234,8 @@ class TestLlmMiner(unittest.TestCase): class TestMultiObjectiveAndPrefs(unittest.TestCase): def test_multi_objective_reward(self): - from skillopt.sleep.replay import multi_objective_reward - from skillopt.sleep.types import ReplayResult, TaskRecord + from skillopt_sleep.replay import multi_objective_reward + from skillopt_sleep.types import ReplayResult, TaskRecord t = TaskRecord(id="t", project="/p", intent="x") expensive = [(t, ReplayResult(id="t", hard=1.0, tokens=4000, latency_ms=20000))] cheap = [(t, ReplayResult(id="t", hard=1.0, tokens=200, latency_ms=1000))] @@ -248,8 +248,8 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase): self.assertGreater(rc, re) def test_preferences_injected_into_reflect(self): - from skillopt.sleep.backend import CliBackend - from skillopt.sleep.types import TaskRecord, ReplayResult + from skillopt_sleep.backend import CliBackend + from skillopt_sleep.types import TaskRecord, ReplayResult captured = {} class CapBackend(CliBackend): @@ -267,9 +267,9 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase): self.assertIn("British English", captured["prompt"]) def test_replay_records_cost(self): - from skillopt.sleep.backend import MockBackend - from skillopt.sleep.replay import replay_one - from skillopt.sleep.types import TaskRecord + from skillopt_sleep.backend import MockBackend + from skillopt_sleep.replay import replay_one + from skillopt_sleep.types import TaskRecord t = TaskRecord(id="t", project="/p", intent="hello world", reference_kind="exact", reference="hi") r = replay_one(MockBackend(), t, "some skill text", "") @@ -279,8 +279,8 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase): class TestMultiRolloutAndBudget(unittest.TestCase): def test_rolloutset_stats(self): - from skillopt.sleep.rollout import RolloutSet - from skillopt.sleep.types import ReplayResult, TaskRecord + from skillopt_sleep.rollout import RolloutSet + from skillopt_sleep.types import ReplayResult, TaskRecord rs = RolloutSet(task=TaskRecord(id="t", project="/p", intent="x"), attempts=[ReplayResult(id="t", hard=1.0), ReplayResult(id="t", hard=0.0), @@ -291,7 +291,7 @@ class TestMultiRolloutAndBudget(unittest.TestCase): self.assertAlmostEqual(rs.pass_rate, 2 / 3) def test_budget_exhaustion_and_plan(self): - from skillopt.sleep.budget import Budget, plan_depth + from skillopt_sleep.budget import Budget, plan_depth clock = [0.0] b = Budget(max_tokens=1000) b.start(lambda: clock[0], tokens_now=0) @@ -303,9 +303,9 @@ class TestMultiRolloutAndBudget(unittest.TestCase): self.assertGreaterEqual(k, 1) def test_contrastive_reflect_with_stub(self): - from skillopt.sleep.backend import Backend - from skillopt.sleep.rollout import RolloutSet, contrastive_reflect - from skillopt.sleep.types import ReplayResult, TaskRecord + from skillopt_sleep.backend import Backend + from skillopt_sleep.rollout import RolloutSet, contrastive_reflect + from skillopt_sleep.types import ReplayResult, TaskRecord class StubBackend(Backend): name = "stub" @@ -322,7 +322,7 @@ class TestMultiRolloutAndBudget(unittest.TestCase): class TestSlowUpdate(unittest.TestCase): def test_protected_field_roundtrip(self): - from skillopt.sleep.slow_update import ( + from skillopt_sleep.slow_update import ( replace_slow_field, extract_slow_field, has_slow_field, SLOW_UPDATE_START, SLOW_UPDATE_END, ) @@ -339,9 +339,9 @@ class TestSlowUpdate(unittest.TestCase): self.assertIn("keep me", doc2) def test_run_slow_update_with_stub_backend(self): - from skillopt.sleep.backend import Backend - from skillopt.sleep.slow_update import run_slow_update - from skillopt.sleep.types import TaskRecord, ReplayResult + from skillopt_sleep.backend import Backend + from skillopt_sleep.slow_update import run_slow_update + from skillopt_sleep.types import TaskRecord, ReplayResult class StubBackend(Backend): name = "stub" @@ -365,10 +365,10 @@ class TestSlowUpdate(unittest.TestCase): class TestToolLoop(unittest.TestCase): def test_tool_called_judge_via_replay(self): - from skillopt.sleep.backend import MockBackend - from skillopt.sleep.replay import replay_one, _required_tools - from skillopt.sleep.memory import set_learned - from skillopt.sleep.types import TaskRecord + from skillopt_sleep.backend import MockBackend + from skillopt_sleep.replay import replay_one, _required_tools + from skillopt_sleep.memory import set_learned + from skillopt_sleep.types import TaskRecord task = TaskRecord( id="qa1", project="/p", intent="answer the question",