From 0be780052ac94652c6e6999c84a6f9d7c1414c36 Mon Sep 17 00:00:00 2001 From: carpedkm Date: Sat, 20 Jun 2026 11:31:09 +0000 Subject: [PATCH] feat: sync all 4 runtime plugins with full engine surface + fix #52 #58 #62 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug fixes: - #52: bundle run-sleep.sh in Claude Code plugin + 4-level fallback - #58: add skillopt-sleep console script entry point in pyproject.toml - #62: filter headless claude -p replay sessions from harvest Plugin sync (Claude Code / Codex / Copilot / OpenClaw): - Document all 22 CLI flags, 7 actions, 4 backends across all SKILL.md files - Document config keys (preferences, gate_mode, dream_rollouts, etc.) - Document memory consolidation (evolve_memory / evolve_skill) - Add schedule/unschedule to all plugins - Copilot MCP: expand schema from 3 → 16 params + schedule tools - OpenClaw: add schedule/unschedule subcommands via shared scheduler Tests: - Cross-plugin parity test (prevents future feature drift) - MCP schema completeness test Co-Authored-By: Claude Fable 5 --- plugins/claude-code/scripts/run-sleep.sh | 46 ++++++++++ plugins/claude-code/scripts/sleep.sh | 31 +++++-- .../skills/skillopt-sleep/SKILL.md | 47 ++++++++++ plugins/codex/skills/skillopt-sleep/SKILL.md | 39 ++++++++- .../copilot/copilot-instructions.snippet.md | 18 ++++ plugins/copilot/mcp_server.py | 70 +++++++++++++-- plugins/openclaw/SKILL.md | 33 +++++++ plugins/openclaw/slash_sleep.py | 34 ++++++++ pyproject.toml | 1 + skillopt_sleep/harvest.py | 52 +++++++++++ tests/test_mcp_schema.py | 37 ++++++++ tests/test_plugin_sync.py | 87 +++++++++++++++++++ 12 files changed, 479 insertions(+), 16 deletions(-) create mode 100755 plugins/claude-code/scripts/run-sleep.sh create mode 100644 tests/test_mcp_schema.py create mode 100644 tests/test_plugin_sync.py diff --git a/plugins/claude-code/scripts/run-sleep.sh b/plugins/claude-code/scripts/run-sleep.sh new file mode 100755 index 0000000..e46e212 
--- /dev/null +++ b/plugins/claude-code/scripts/run-sleep.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# SkillOpt-Sleep shared runner — used by all platform plugins (Claude Code, +# Codex, Copilot). Resolves the repo root (which contains the skillopt_sleep +# package), picks a Python >= 3.10, and execs the engine CLI. +# +# Usage: run-sleep.sh [ACTION] [args...] (ACTION defaults to "status") +set -euo pipefail + +# Repo-root lookup order: (1) one level up, for the shared copy at +# plugins/run-sleep.sh; (2) three levels up, for this bundled copy at +# plugins/claude-code/scripts/; (3) two levels above CLAUDE_PLUGIN_ROOT; +# (4) SKILLOPT_SLEEP_REPO; (5) an upward search from the current directory. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -d "$SCRIPT_DIR/../skillopt_sleep" ]; then + REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +elif [ -d "$SCRIPT_DIR/../../../skillopt_sleep" ]; then + REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +elif [ -n "${CLAUDE_PLUGIN_ROOT:-}" ] && [ -d "$CLAUDE_PLUGIN_ROOT/../../skillopt_sleep" ]; then + REPO_ROOT="$(cd "$CLAUDE_PLUGIN_ROOT/../.." && pwd)" +elif [ -n "${SKILLOPT_SLEEP_REPO:-}" ] && [ -d "$SKILLOPT_SLEEP_REPO/skillopt_sleep" ]; then + REPO_ROOT="$SKILLOPT_SLEEP_REPO" +else + # last resort: search upward from CWD + d="$PWD" + while [ "$d" != "/" ]; do + [ -d "$d/skillopt_sleep" ] && { REPO_ROOT="$d"; break; } + d="$(dirname "$d")" + done +fi +if [ -z "${REPO_ROOT:-}" ]; then + echo "[sleep] ERROR: could not locate the skillopt_sleep package. Set SKILLOPT_SLEEP_REPO to the repo root." >&2 + exit 1 +fi + +PY="" +for cand in python3.12 python3.11 python3.10 python3; do + if command -v "$cand" >/dev/null 2>&1; then + ver="$("$cand" -c 'import sys; print("%d%d" % sys.version_info[:2])' 2>/dev/null || echo 0)" + if [ "${ver:-0}" -ge 310 ]; then PY="$cand"; break; fi + fi +done +if [ -z "$PY" ]; then + echo "[sleep] ERROR: need Python >= 3.10 (found none)."
>&2 + exit 1 +fi + +if [ "$#" -eq 0 ]; then set -- status; fi +cd "$REPO_ROOT" +exec "$PY" -m skillopt_sleep "$@" diff --git a/plugins/claude-code/scripts/sleep.sh b/plugins/claude-code/scripts/sleep.sh index 3d2210e..20a9f36 100755 --- a/plugins/claude-code/scripts/sleep.sh +++ b/plugins/claude-code/scripts/sleep.sh @@ -1,11 +1,30 @@ #!/usr/bin/env bash -# Claude Code plugin runner — thin wrapper over the shared runner so all three -# platform plugins share one engine launcher. The shared runner lives at -# /plugins/run-sleep.sh and handles repo-root + interpreter resolution. +# Claude Code plugin runner — thin wrapper over the shared runner so all +# platform plugins share one engine launcher. +# +# After marketplace install the plugin is isolated in a cache directory and +# the repo-relative path no longer works. We try four locations: +# 1. Co-located run-sleep.sh (bundled copy — works in marketplace cache) +# 2. Repo-relative ../../run-sleep.sh (dev checkout) +# 3. CLAUDE_PLUGIN_ROOT/../run-sleep.sh (plugin env variable) +# 4. SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh (explicit env) set -euo pipefail -HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # /plugins/claude-code/scripts -SHARED="$(cd "$HERE/../.." && pwd)/run-sleep.sh" # /plugins/run-sleep.sh -if [ ! -f "$SHARED" ] && [ -n "${CLAUDE_PLUGIN_ROOT:-}" ]; then +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +SHARED="" +if [ -f "$HERE/run-sleep.sh" ]; then + SHARED="$HERE/run-sleep.sh" +elif [ -f "$(cd "$HERE/../.." 2>/dev/null && pwd)/run-sleep.sh" ]; then + SHARED="$(cd "$HERE/../.." && pwd)/run-sleep.sh" +elif [ -n "${CLAUDE_PLUGIN_ROOT:-}" ] && [ -f "$(cd "$CLAUDE_PLUGIN_ROOT/.." 2>/dev/null && pwd)/run-sleep.sh" ]; then SHARED="$(cd "$CLAUDE_PLUGIN_ROOT/.." 
&& pwd)/run-sleep.sh" +elif [ -n "${SKILLOPT_SLEEP_REPO:-}" ] && [ -f "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" ]; then + SHARED="$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" +fi + +if [ -z "$SHARED" ]; then + echo "[sleep] ERROR: cannot locate run-sleep.sh." >&2 + echo "[sleep] Set SKILLOPT_SLEEP_REPO to the SkillOpt repo root, or pip install skillopt." >&2 + exit 1 fi exec bash "$SHARED" "$@" diff --git a/plugins/claude-code/skills/skillopt-sleep/SKILL.md b/plugins/claude-code/skills/skillopt-sleep/SKILL.md index 021cd11..b7f4019 100644 --- a/plugins/claude-code/skills/skillopt-sleep/SKILL.md +++ b/plugins/claude-code/skills/skillopt-sleep/SKILL.md @@ -54,6 +54,53 @@ Prefer the `/skillopt-sleep` command. Under the hood it calls the bundled runner - Add `--backend claude` or `--backend codex` to spend the user's real budget for genuine improvement. - Scope defaults to the invoked project; `--scope all` harvests every project. +### Scheduling + +```bash +"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" schedule --project "$(pwd)" --hour 3 --minute 17 +"${CLAUDE_PLUGIN_ROOT}/scripts/sleep.sh" unschedule --project "$(pwd)" +``` + +Installs a nightly cron entry. `unschedule --all` removes every managed entry. 
+ +## All CLI flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--project PATH` | cwd | Project directory to evolve | +| `--scope all\|invoked` | invoked | Harvest scope | +| `--backend mock\|claude\|codex\|copilot` | mock | Replay backend (mock = no API spend) | +| `--model NAME` | backend default | Override the model used for replay | +| `--source claude\|codex\|auto` | claude | Transcript source | +| `--lookback-hours N` | 72 | Harvest window | +| `--max-sessions N` | unlimited | Cap harvested sessions | +| `--max-tasks N` | 40 | Cap mined tasks | +| `--target-skill-path PATH` | auto | Explicit SKILL.md to evolve | +| `--tasks-file PATH` | — | Reviewed TaskRecord JSON (skip harvest) | +| `--progress` | off | Print phase progress to stderr | +| `--auto-adopt` | off | Auto-adopt if gate passes | +| `--edit-budget N` | 4 | Max bounded edits per night | +| `--json` | off | Machine-readable JSON output | + +## Config keys (`~/.skillopt-sleep/config.json`) + +Beyond the CLI flags, advanced behavior is controlled via config: + +- **`preferences`** — free-text house rules injected into the optimizer's reflect step (e.g. "Always use async/await", "Answers in `\boxed{}`"). +- **`gate_mode`** — `on` (default, validation-gated) or `off` (greedy, accept all edits). +- **`gate_metric`** — `hard`, `soft`, or `mixed` (default). Controls how the held-out gate scores. +- **`dream_rollouts`** — >1 enables multi-rollout contrastive reflection per task. +- **`recall_k`** — >0 recalls K similar past tasks into the dream (long-term memory). +- **`evolve_memory`** / **`evolve_skill`** — independently toggle CLAUDE.md vs SKILL.md consolidation. + +## Memory consolidation + +The sleep cycle can consolidate both: +- **SKILL.md** — the managed skill file (bounded edits: add/delete/replace) +- **CLAUDE.md** — the project memory (same bounded edits) + +Both are gated by the same held-out validation score. 
Set `evolve_memory: false` to consolidate only skills, or `evolve_skill: false` for only memory. + ## Hard rules - **Never** hand-edit the user's `CLAUDE.md` / `SKILL.md` as part of this skill. diff --git a/plugins/codex/skills/skillopt-sleep/SKILL.md b/plugins/codex/skills/skillopt-sleep/SKILL.md index 88b4113..44745dd 100644 --- a/plugins/codex/skills/skillopt-sleep/SKILL.md +++ b/plugins/codex/skills/skillopt-sleep/SKILL.md @@ -52,7 +52,7 @@ bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" run --project "$(pwd)" --source bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" adopt --project "$(pwd)" ``` -Actions are `status`, `harvest`, `dry-run`, `run`, and `adopt`. +Actions are `status`, `harvest`, `dry-run`, `run`, `adopt`, `schedule`, and `unschedule`. - Default backend is `mock`, which is deterministic and spends no API budget. - `--backend codex` uses the user's Codex budget for real improvement. @@ -61,6 +61,43 @@ Actions are `status`, `harvest`, `dry-run`, `run`, and `adopt`. - Keep `dry-run --backend mock` as the first smoke check unless the user explicitly asked for a real optimization run. +### Scheduling + +```bash +bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" schedule --project "$(pwd)" --hour 3 --minute 17 +bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" unschedule --project "$(pwd)" +``` + +Installs a nightly cron entry. `unschedule --all` removes every managed entry. 
+ +### All backends + +- `--backend mock` — deterministic, no API spend (default) +- `--backend claude` — uses the Claude CLI +- `--backend codex` — uses the Codex CLI +- `--backend copilot` — uses the GitHub Copilot CLI + +### Additional flags + +| Flag | Description | +|------|-------------| +| `--auto-adopt` | Auto-adopt if the gate passes (default: stage only) | +| `--edit-budget N` | Max bounded edits per night (default: 4) | +| `--lookback-hours N` | Harvest window in hours (default: 72) | +| `--json` | Machine-readable JSON output | + +### Config keys (`~/.skillopt-sleep/config.json`) + +- **`preferences`** — free-text house rules for the optimizer +- **`gate_mode`** — `on` (validation-gated, default) or `off` (greedy) +- **`gate_metric`** — `hard` | `soft` | `mixed` (default) +- **`dream_rollouts`** — >1 for multi-rollout contrastive reflection +- **`recall_k`** — >0 recalls similar past tasks from the archive + +### Memory consolidation + +The sleep cycle consolidates both **memory** (AGENTS.md / CLAUDE.md) and **skills** (SKILL.md) by default. Each is independently toggleable via `evolve_memory` / `evolve_skill` config keys. Both are gated by the same held-out validation score. + ## Steps 1. Run the requested action; capture stdout. 
diff --git a/plugins/copilot/copilot-instructions.snippet.md b/plugins/copilot/copilot-instructions.snippet.md index be19047..298ead9 100644 --- a/plugins/copilot/copilot-instructions.snippet.md +++ b/plugins/copilot/copilot-instructions.snippet.md @@ -19,6 +19,24 @@ my preferences", or "make the agent improve from past usage", use the MCP tools: - `sleep_run` — full cycle, stages a reviewed proposal (nothing live changes) - `sleep_adopt` — apply the staged proposal (backs up first) - `sleep_harvest` — list mined recurring tasks +- `sleep_schedule` — install a nightly cron entry (set `hour`/`minute`) +- `sleep_unschedule` — remove the nightly cron entry + +### Key parameters (pass as MCP tool arguments) + +- `backend` — `mock` (default, free), `claude`, `codex`, or `copilot` +- `source` — `claude`, `codex`, or `auto` (where to read transcripts) +- `target_skill_path` — explicit SKILL.md to evolve +- `tasks_file` — pre-built TaskRecord JSON (skip harvest) +- `max_tasks` / `max_sessions` — cap workload +- `auto_adopt` — auto-adopt if the gate passes +- `json` — machine-readable output for programmatic use + +### Advanced config (`~/.skillopt-sleep/config.json`) + +- `preferences` — free-text house rules for the optimizer +- `gate_mode` — `on` (default) or `off`; `dream_rollouts` — >1 for more signal +- `evolve_memory` / `evolve_skill` — toggle which docs consolidate Always show the user the held-out baseline → candidate score and the proposed edits before suggesting `sleep_adopt`. 
Never hand-edit the user's memory/skill diff --git a/plugins/copilot/mcp_server.py b/plugins/copilot/mcp_server.py index 2c592ae..fe50542 100755 --- a/plugins/copilot/mcp_server.py +++ b/plugins/copilot/mcp_server.py @@ -38,16 +38,48 @@ TOOLS = [ "description": "Apply the latest staged proposal to CLAUDE.md/SKILL.md (backs up first)."}, {"name": "sleep_harvest", "action": "harvest", "description": "Debug: list the recurring tasks mined from recent sessions."}, + {"name": "sleep_schedule", "action": "schedule", + "description": "Install a nightly cron entry to run the sleep cycle automatically."}, + {"name": "sleep_unschedule", "action": "unschedule", + "description": "Remove the nightly cron entry for a project."}, ] _BY_NAME = {t["name"]: t for t in TOOLS} _TOOL_SCHEMA = { "type": "object", "properties": { - "project": {"type": "string", "description": "Project dir to evolve (default: cwd)."}, + "project": {"type": "string", + "description": "Project dir to evolve (default: cwd)."}, "backend": {"type": "string", "enum": ["mock", "claude", "codex", "copilot"], "description": "mock = no API spend (default); claude/codex/copilot = real."}, - "scope": {"type": "string", "enum": ["invoked", "all"]}, + "scope": {"type": "string", "enum": ["invoked", "all"], + "description": "Harvest scope (default: invoked project only)."}, + "source": {"type": "string", "enum": ["claude", "codex", "auto"], + "description": "Transcript source (default: claude)."}, + "model": {"type": "string", + "description": "Backend-specific model override."}, + "tasks_file": {"type": "string", + "description": "Path to reviewed TaskRecord JSON (skips harvest)."}, + "target_skill_path": {"type": "string", + "description": "Explicit SKILL.md path to evolve/stage/adopt."}, + "progress": {"type": "boolean", + "description": "Print phase progress to stderr."}, + "max_sessions": {"type": "integer", + "description": "Cap harvested sessions per run."}, + "max_tasks": {"type": "integer", + "description": 
"Cap mined tasks per run."}, + "lookback_hours": {"type": "integer", + "description": "Harvest window in hours (default: 72)."}, + "auto_adopt": {"type": "boolean", + "description": "Auto-adopt if gate passes (default: false)."}, + "json": {"type": "boolean", + "description": "Return machine-readable JSON output."}, + "edit_budget": {"type": "integer", + "description": "Max bounded edits per night (default: 4)."}, + "hour": {"type": "integer", + "description": "Hour for schedule (0-23, default: 3)."}, + "minute": {"type": "integer", + "description": "Minute for schedule (0-59, default: 17)."}, }, "additionalProperties": False, } @@ -56,15 +88,35 @@ _TOOL_SCHEMA = { def _run_engine(action: str, args: dict) -> str: py = sys.executable or "python3" cmd = [py, "-m", "skillopt_sleep", action] - if args.get("project"): - cmd += ["--project", str(args["project"])] - if args.get("backend"): - cmd += ["--backend", str(args["backend"])] - if args.get("scope"): - cmd += ["--scope", str(args["scope"])] + # String-valued flags + for flag, key in [ + ("--project", "project"), ("--backend", "backend"), + ("--scope", "scope"), ("--source", "source"), + ("--model", "model"), ("--tasks-file", "tasks_file"), + ("--target-skill-path", "target_skill_path"), + ]: + val = args.get(key) + if val: + cmd += [flag, str(val)] + # Integer-valued flags + for flag, key in [ + ("--max-sessions", "max_sessions"), ("--max-tasks", "max_tasks"), + ("--lookback-hours", "lookback_hours"), ("--edit-budget", "edit_budget"), + ("--hour", "hour"), ("--minute", "minute"), + ]: + val = args.get(key) + if val is not None: + cmd += [flag, str(int(val))] + # Boolean flags + for flag, key in [ + ("--progress", "progress"), ("--auto-adopt", "auto_adopt"), + ("--json", "json"), + ]: + if args.get(key): + cmd.append(flag) try: proc = subprocess.run(cmd, cwd=REPO_ROOT, capture_output=True, text=True, timeout=3600) - except Exception as e: # noqa: BLE001 + except Exception as e: return f"[error] failed to run engine: 
{e}" out = (proc.stdout or "").strip() err = (proc.stderr or "").strip() diff --git a/plugins/openclaw/SKILL.md b/plugins/openclaw/SKILL.md index 9982d2b..66b24ac 100644 --- a/plugins/openclaw/SKILL.md +++ b/plugins/openclaw/SKILL.md @@ -52,6 +52,39 @@ python3 run_sleep.py --dry-run python3 run_sleep.py --tasks tests/research-cron-tasks.json ``` +## Scheduling + +```bash +python3 slash_sleep.py schedule --hour 3 --minute 17 +python3 slash_sleep.py unschedule +python3 slash_sleep.py unschedule --all +``` + +Installs a nightly cron entry using the shared SkillOpt-Sleep scheduler. This is an alternative to the external `run_sleep_cron.sh` script. + +## Alternative backends + +While OpenClaw defaults to `openclaw-deepseek` (DeepSeek V4 Pro + Ollama), the shared engine also supports: +- `--backend mock` — deterministic, no API spend (for testing) +- `--backend claude` — uses the Claude CLI +- `--backend codex` — uses the Codex CLI +- `--backend copilot` — uses the GitHub Copilot CLI + +These can be used via the engine directly (`python -m skillopt_sleep`). + +## Shared-engine flags + +When invoking the engine directly, all standard flags are available: +- `--source codex` / `--source auto` — harvest from Codex Desktop sessions +- `--tasks-file PATH` — use a pre-built task set +- `--target-skill-path PATH` — explicit SKILL.md target +- `--max-tasks N` / `--max-sessions N` — cap workload +- `--progress` — print phase progress +- `--json` — machine-readable output +- `--auto-adopt` — auto-adopt if gate passes + +Config keys: `preferences`, `gate_mode`, `gate_metric`, `dream_rollouts`, `recall_k`, `evolve_memory`, `evolve_skill`. 
+ ## Config (config.json) Key knobs: diff --git a/plugins/openclaw/slash_sleep.py b/plugins/openclaw/slash_sleep.py index 1e952e2..09c7486 100755 --- a/plugins/openclaw/slash_sleep.py +++ b/plugins/openclaw/slash_sleep.py @@ -207,6 +207,30 @@ def reject(night: str = None) -> int: return 0 +def schedule_cmd(hour: int, minute: int) -> int: + """Install a nightly cron entry via the shared SkillOpt-Sleep scheduler.""" + try: + from skillopt_sleep.scheduler import schedule + except ImportError: + print("ERROR: skillopt_sleep.scheduler not available — is SkillOpt-Sleep installed?") + return 1 + result = schedule(hour=hour, minute=minute) + print(result) + return 0 + + +def unschedule_cmd(all_projects: bool) -> int: + """Remove cron entry via the shared SkillOpt-Sleep scheduler.""" + try: + from skillopt_sleep.scheduler import unschedule + except ImportError: + print("ERROR: skillopt_sleep.scheduler not available — is SkillOpt-Sleep installed?") + return 1 + result = unschedule(all_projects=all_projects) + print(result) + return 0 + + def cost() -> int: """Estimate per-night cost based on the actual measurement from Phase 2. 
@@ -265,6 +289,12 @@ def main(): p_reject = sub.add_parser("reject", help="discard most recent staging") p_reject.add_argument("night", nargs="?", default=None) sub.add_parser("cost", help="estimate cost") + p_schedule = sub.add_parser("schedule", help="install nightly cron entry") + p_schedule.add_argument("--hour", type=int, default=3, help="hour (0-23)") + p_schedule.add_argument("--minute", type=int, default=17, help="minute (0-59)") # 17 matches the schedule default documented in the Copilot MCP schema + p_unschedule = sub.add_parser("unschedule", help="remove cron entry") + p_unschedule.add_argument("--all", dest="all_projects", action="store_true", + help="remove entries for all projects") args = ap.parse_args() @@ -282,6 +312,10 @@ def main(): return reject(args.night) if args.cmd == "cost": return cost() + if args.cmd == "schedule": + return schedule_cmd(args.hour, args.minute) + if args.cmd == "unschedule": + return unschedule_cmd(args.all_projects) return 1 diff --git a/pyproject.toml b/pyproject.toml index 48da25c..9a0020e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ all = [ [project.scripts] skillopt-train = "scripts.train:main" skillopt-eval = "scripts.eval_only:main" +skillopt-sleep = "skillopt_sleep.__main__:main" [project.urls] Homepage = "https://github.com/microsoft/SkillOpt" diff --git a/skillopt_sleep/harvest.py b/skillopt_sleep/harvest.py index 4e7514f..4086717 100644 --- a/skillopt_sleep/harvest.py +++ b/skillopt_sleep/harvest.py @@ -111,6 +111,56 @@ def _is_meta_prompt(text: str) -> bool: return False +# ── Issue #62: filter headless replay sessions ───────────────────────── + +# Prompt markers generated by the engine's own headless `claude -p` calls +# (judge, reflect, attempt). If the sole user prompt in a single-turn +# session matches any of these, the session is engine-generated, not a +# real user task.
+_REPLAY_PROMPT_MARKERS = ( + "## CURRENT SKILL", + "## FAILED TASKS", + "## SUCCESSFUL TASKS", + "## OUTPUT FORMAT", + "You are a strict grader", + "Score the response 0.0-1.0", + "You are SkillOpt-Sleep", + "## TASK\n", + "## SKILL\n", +) + + +def _is_headless_replay(digest: "SessionDigest") -> bool: + """Detect sessions created by the engine's own headless replay calls. + + Heuristics: + 1. A session with no user turn at all is treated as a replay. + 2. A single-turn session is a replay when its sole prompt contains any + replay marker, OR its start/end timestamps span < 3 seconds. + Multi-turn sessions are always kept (interactive by definition). + """ + if digest.n_user_turns > 1: + return False + if digest.n_user_turns == 0: + return True + prompt = digest.user_prompts[0] if digest.user_prompts else "" + for marker in _REPLAY_PROMPT_MARKERS: + if marker in prompt: + return True + # Sub-3-second single-turn sessions are almost certainly programmatic (timestamps are compared at whole-second precision). + if digest.started_at and digest.ended_at: + try: + from datetime import datetime + fmt = "%Y-%m-%dT%H:%M:%S" + start = datetime.strptime(digest.started_at[:19], fmt) + end = datetime.strptime(digest.ended_at[:19], fmt) + if (end - start).total_seconds() < 3: + return True + except (ValueError, TypeError): + pass # unparseable timestamps: fall through and keep the session + return False + + def digest_transcript(path: str) -> Optional[SessionDigest]: """Build a SessionDigest from one ``.jsonl`` transcript.""" session_id = os.path.splitext(os.path.basename(path))[0] @@ -236,6 +286,8 @@ def harvest( d = digest_transcript(p) if d is None: continue + if _is_headless_replay(d): + continue # Issue #62: skip engine's own headless replay sessions if not _project_matches(d.project or "", scope, invoked_project): continue if since_iso and d.ended_at and d.ended_at < since_iso: diff --git a/tests/test_mcp_schema.py b/tests/test_mcp_schema.py new file mode 100644 index 0000000..f8960b1 --- /dev/null +++ b/tests/test_mcp_schema.py @@ -0,0 +1,37 @@ +"""Tests for the Copilot MCP
server schema completeness.""" +import os +import sys +import unittest + +# Allow importing from the plugin directory +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "plugins", "copilot")) + + +class TestMcpSchema(unittest.TestCase): + def test_schema_includes_all_engine_flags(self): + from mcp_server import _TOOL_SCHEMA + required_params = { + "project", "backend", "scope", "source", "model", + "tasks_file", "target_skill_path", "progress", + "max_sessions", "max_tasks", "lookback_hours", + "auto_adopt", "json", "edit_budget", + } + schema_props = set(_TOOL_SCHEMA["properties"].keys()) + missing = required_params - schema_props + self.assertEqual(missing, set(), f"MCP schema missing: {missing}") + + def test_all_backends_in_enum(self): + from mcp_server import _TOOL_SCHEMA + backends = _TOOL_SCHEMA["properties"]["backend"]["enum"] + for b in ["mock", "claude", "codex", "copilot"]: + self.assertIn(b, backends) + + def test_schedule_tools_exist(self): + from mcp_server import TOOLS + names = {t["name"] for t in TOOLS} + self.assertIn("sleep_schedule", names) + self.assertIn("sleep_unschedule", names) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_plugin_sync.py b/tests/test_plugin_sync.py new file mode 100644 index 0000000..f7850e2 --- /dev/null +++ b/tests/test_plugin_sync.py @@ -0,0 +1,87 @@ +"""Cross-plugin parity tests — ensure all plugins document the same features. 
+ +Run: python3 -m pytest tests/test_plugin_sync.py -v +""" +import os +import unittest + +REPO = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + +PLUGIN_SKILL_MDS = { + "claude-code": os.path.join(REPO, "plugins/claude-code/skills/skillopt-sleep/SKILL.md"), + "codex": os.path.join(REPO, "plugins/codex/skills/skillopt-sleep/SKILL.md"), + "openclaw": os.path.join(REPO, "plugins/openclaw/SKILL.md"), +} + +MCP_SERVER = os.path.join(REPO, "plugins/copilot/mcp_server.py") +COPILOT_INSTRUCTIONS = os.path.join(REPO, "plugins/copilot/copilot-instructions.snippet.md") + +CANONICAL_BACKENDS = {"mock", "claude", "codex", "copilot"} + + +def _read(path): + if not os.path.exists(path): + return "" + with open(path, encoding="utf-8") as f: + return f.read() + + +class TestPluginParity(unittest.TestCase): + def test_all_skill_mds_mention_all_backends(self): + for name, path in PLUGIN_SKILL_MDS.items(): + text = _read(path) + if not text: + self.skipTest(f"{name} SKILL.md not found") + for backend in CANONICAL_BACKENDS: + self.assertIn(backend, text, + f"{name}/SKILL.md missing backend '{backend}'") + + def test_all_skill_mds_mention_schedule(self): + for name, path in PLUGIN_SKILL_MDS.items(): + text = _read(path) + if not text: + self.skipTest(f"{name} SKILL.md not found") # report the gap instead of passing silently + self.assertIn("schedule", text.lower(), + f"{name}/SKILL.md missing 'schedule'") + self.assertIn("unschedule", text.lower(), + f"{name}/SKILL.md missing 'unschedule'") + + def test_copilot_instructions_mention_schedule(self): + text = _read(COPILOT_INSTRUCTIONS) + self.assertIn("sleep_schedule", text) + self.assertIn("sleep_unschedule", text) + + def test_copilot_instructions_mention_all_backends(self): + text = _read(COPILOT_INSTRUCTIONS) + for backend in CANONICAL_BACKENDS: + self.assertIn(backend, text, + f"copilot-instructions missing backend '{backend}'") + + def test_mcp_server_has_schedule_tools(self): + text = _read(MCP_SERVER) + self.assertIn("sleep_schedule", text) + self.assertIn("sleep_unschedule", text) + + def
test_mcp_schema_has_key_params(self): + text = _read(MCP_SERVER) + for param in ["source", "tasks_file", "target_skill_path", + "max_sessions", "max_tasks", "auto_adopt", "json"]: + self.assertIn(f'"{param}"', text, + f"MCP schema missing param '{param}'") + + def test_all_skill_mds_mention_memory_consolidation(self): + for name, path in PLUGIN_SKILL_MDS.items(): + text = _read(path).lower() + if not text: + self.skipTest(f"{name} SKILL.md not found") # report the gap instead of passing silently + has_mention = ( + "memory consolidation" in text + or "evolve_memory" in text + or ("consolidate" in text and "memory" in text) + ) + self.assertTrue(has_mention, + f"{name}/SKILL.md missing memory consolidation docs") + + +if __name__ == "__main__": + unittest.main()