From 21f93c16c736639a3638bc70a82c010eabb1b3cd Mon Sep 17 00:00:00 2001 From: DB Lee Date: Fri, 12 Jun 2026 08:21:57 -0700 Subject: [PATCH] Add GitHub Copilot backend to SkillOpt-Sleep Add CopilotCliBackend that drives the GitHub Copilot CLI in non-interactive mode (copilot -p ... --output-format json) and parses the JSONL event stream for assistant.message content. Registered as the 'copilot' backend (with aliases) and wired through the CLI, config, experiment harness, and the Copilot MCP server's backend enum. - Force UTF-8 decoding of CLI output (fixes cp1252 UnicodeDecodeError on Windows when responses contain non-cp1252 bytes). - Minimise per-call startup: isolated COPILOT_HOME with built-in MCPs and custom instructions disabled, so user MCP servers are not spawned per call (~5x faster: 36s -> 7.4s). Override via SKILLOPT_SLEEP_COPILOT_HOME / SKILLOPT_SLEEP_COPILOT_MODEL / SKILLOPT_SLEEP_COPILOT_FULL_ENV. Validated end-to-end on real held-out tasks (researcher persona: 0.42 -> 1.00 lift; gate correctly rejects non-improving edits). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- plugins/copilot/README.md | 13 ++- plugins/copilot/mcp_server.py | 4 +- skillopt_sleep/__main__.py | 4 +- skillopt_sleep/backend.py | 111 +++++++++++++++++++ skillopt_sleep/config.py | 2 +- skillopt_sleep/experiments/run_experiment.py | 2 +- 6 files changed, 128 insertions(+), 8 deletions(-) diff --git a/plugins/copilot/README.md b/plugins/copilot/README.md index 769eac5..6171381 100644 --- a/plugins/copilot/README.md +++ b/plugins/copilot/README.md @@ -45,8 +45,17 @@ Ask Copilot things like *"run the sleep cycle"*, *"what did the last sleep propose?"*, *"adopt the staged sleep proposal"*. Copilot calls the MCP tools: `sleep_status`, `sleep_dry_run`, `sleep_run`, `sleep_adopt`, `sleep_harvest`. -Each tool takes optional `project`, `backend` (`mock`/`claude`/`codex`), and -`scope` arguments. Default backend is `mock` (no API spend). 
+Each tool takes optional `project`, `backend` (`mock`/`claude`/`codex`/`copilot`), and +`scope` arguments. Default backend is `mock` (no API spend). The `copilot` +backend drives the GitHub Copilot CLI (`copilot -p ... --output-format json`) +and requires the `copilot` CLI to be installed and authenticated. + +For speed, the `copilot` backend runs each call against an isolated +`COPILOT_HOME` with built-in MCP servers and custom instructions disabled, so +your user MCP servers (including this project's own) are not spawned per call +(~5x faster). Override with `SKILLOPT_SLEEP_COPILOT_HOME=<dir>`, pick a model +with `SKILLOPT_SLEEP_COPILOT_MODEL`, or set `SKILLOPT_SLEEP_COPILOT_FULL_ENV=1` +to use your real Copilot environment instead. ## Verify the server directly (no Copilot needed) diff --git a/plugins/copilot/mcp_server.py b/plugins/copilot/mcp_server.py index d03a95b..2c592ae 100755 --- a/plugins/copilot/mcp_server.py +++ b/plugins/copilot/mcp_server.py @@ -45,8 +45,8 @@ _TOOL_SCHEMA = { "type": "object", "properties": { "project": {"type": "string", "description": "Project dir to evolve (default: cwd)."}, - "backend": {"type": "string", "enum": ["mock", "claude", "codex"], - "description": "mock = no API spend (default); claude/codex = real."}, + "backend": {"type": "string", "enum": ["mock", "claude", "codex", "copilot"], + "description": "mock = no API spend (default); claude/codex/copilot = real."}, "scope": {"type": "string", "enum": ["invoked", "all"]}, }, "additionalProperties": False, diff --git a/skillopt_sleep/__main__.py b/skillopt_sleep/__main__.py index 2666ee6..d947491 100644 --- a/skillopt_sleep/__main__.py +++ b/skillopt_sleep/__main__.py @@ -9,7 +9,7 @@ Common flags: --project PATH project to evolve (default: cwd) --scope all|invoked harvest scope (default: invoked) - --backend mock|claude|codex + --backend mock|claude|codex|copilot --source claude|codex|auto --model NAME --lookback-hours N @@ -36,7 +36,7 @@ from skillopt_sleep.state import
SleepState
 def _add_common(p: argparse.ArgumentParser) -> None:
     p.add_argument("--project", default="")
     p.add_argument("--scope", default="", choices=["", "all", "invoked"])
-    p.add_argument("--backend", default="", choices=["", "mock", "claude", "codex"])
+    p.add_argument("--backend", default="", choices=["", "mock", "claude", "codex", "copilot"])
     p.add_argument("--model", default="")
     p.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
     p.add_argument("--claude-home", default="", help="override ~/.claude (also isolates state)")
diff --git a/skillopt_sleep/backend.py b/skillopt_sleep/backend.py
index 2ec5cdd..8977e0d 100644
--- a/skillopt_sleep/backend.py
+++ b/skillopt_sleep/backend.py
@@ -24,6 +24,7 @@ import json
 import os
 import re
 import subprocess
+import tempfile
 from typing import Any, Dict, List, Optional, Tuple
 
 from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
@@ -788,6 +789,134 @@ class CodexCliBackend(CliBackend):
         except Exception:
             pass
 
+def resolve_copilot_path(explicit: str = "") -> str:
+    """Find the GitHub Copilot CLI (`copilot`) binary.
+
+    Precedence: ``explicit``, then ``SKILLOPT_SLEEP_COPILOT_PATH``, then a
+    ``PATH`` lookup; falls back to the bare name ``copilot`` if none is found.
+    """
+    import shutil
+    env = os.environ.get("SKILLOPT_SLEEP_COPILOT_PATH")
+    return explicit or env or shutil.which("copilot") or "copilot"
+
+
+class CopilotCliBackend(CliBackend):
+    """Drives the GitHub Copilot CLI in non-interactive mode.
+
+    Uses ``copilot -p ... --output-format json`` and parses the emitted
+    JSONL event stream, returning the concatenated ``assistant.message``
+    content. The plain-text / ``--silent`` modes do not reliably stream the
+    response to stdout on all platforms, so JSONL is used for robust capture.
+
+    The call runs in a clean temp cwd with streaming disabled and tools allowed
+    (so non-interactive mode never blocks on a permission prompt); the prompts
+    ask for final-answer text only, so no tool use is expected.
+
+    Startup overhead is minimised: each invocation points ``COPILOT_HOME`` at a
+    dedicated, isolated config dir (no user ``mcp-config.json``, so the user's
+    MCP servers — including this project's own — are NOT spawned, avoiding a
+    slow recursive launch), and built-in MCP servers / custom instructions are
+    disabled. Auth is read from the OS credential store / token env vars, which
+    live outside ``COPILOT_HOME``, so isolation does not break authentication.
+    Set ``SKILLOPT_SLEEP_COPILOT_HOME`` to override the isolated home, or set it
+    empty / ``SKILLOPT_SLEEP_COPILOT_FULL_ENV=1`` to use the user's real
+    environment instead.
+    """
+
+    name = "copilot"
+
+    def __init__(self, model: str = "", copilot_path: str = "", timeout: int = 240) -> None:
+        super().__init__(model=model or os.environ.get("SKILLOPT_SLEEP_COPILOT_MODEL", ""),
+                         timeout=timeout)
+        self.copilot_path = resolve_copilot_path(copilot_path)
+        self.full_env = os.environ.get("SKILLOPT_SLEEP_COPILOT_FULL_ENV", "") == "1"
+        self.copilot_home = "" if self.full_env else self._prepare_home()
+
+    @staticmethod
+    def _prepare_home() -> str:
+        """Create and return the isolated ``COPILOT_HOME``; empty means use the real one."""
+        home = os.environ.get("SKILLOPT_SLEEP_COPILOT_HOME")
+        if home == "":
+            return ""  # set-but-empty is the documented opt-out of isolation
+        try:
+            if home:
+                os.makedirs(home, exist_ok=True)
+                return home
+            # Stable default so first-run setup is cached across calls. The temp
+            # dir can be shared between users, so never adopt a directory (or
+            # symlink) someone else pre-created: its config would be loaded.
+            home = os.path.join(tempfile.gettempdir(), "skillopt_sleep_copilot_home")
+            os.makedirs(home, mode=0o700, exist_ok=True)
+            if hasattr(os, "getuid") and os.lstat(home).st_uid != os.getuid():
+                return ""
+            return home
+        except (OSError, ValueError):
+            return ""
+
+    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
+        """Run one prompt through the CLI and return the reply text (empty on failure).
+
+        ``max_tokens`` is accepted for interface compatibility; it is not forwarded.
+        """
+        import shutil
+        try:
+            clean_cwd = tempfile.mkdtemp(prefix="skillopt_sleep_copilot_")
+        except OSError:
+            return ""
+        cmd = [
+            self.copilot_path, "-p", prompt,
+            "--output-format", "json",
+            "--stream", "off",
+            "--no-color",
+            "--log-level", "none",
+            "--allow-all-tools",
+            "-C", clean_cwd,
+        ]
+        if not self.full_env:
+            # Drop unneeded startup work: no built-in (github) MCP server and no
+            # AGENTS.md / custom-instruction loading. With an isolated home that
+            # has no mcp-config.json, no user MCP servers spawn either.
+            cmd += ["--disable-builtin-mcps", "--no-custom-instructions"]
+        if self.model:
+            cmd += ["--model", self.model]
+        env = os.environ.copy()
+        if self.copilot_home:
+            env["COPILOT_HOME"] = self.copilot_home
+        try:
+            proc = subprocess.run(
+                cmd, capture_output=True, text=True, timeout=self.timeout, cwd=clean_cwd,
+                encoding="utf-8", errors="replace", env=env,
+            )
+        except (OSError, ValueError, subprocess.SubprocessError):
+            # Missing binary, timeout, or argv the OS rejects: best-effort empty reply.
+            return ""
+        finally:
+            shutil.rmtree(clean_cwd, ignore_errors=True)
+        return self._parse_jsonl_response(proc.stdout or "")
+
+    @staticmethod
+    def _parse_jsonl_response(raw: str) -> str:
+        """Join the ``assistant.message`` contents found in a JSONL event stream.
+
+        Non-JSON lines and events of any other shape are skipped.
+        """
+        parts: List[str] = []
+        for line in raw.splitlines():
+            line = line.strip()
+            if not line.startswith("{"):
+                continue
+            try:
+                obj = json.loads(line)
+            except (ValueError, RecursionError):
+                continue
+            # The payload comes from an external process: tolerate a non-object "data".
+            data = obj.get("data") if obj.get("type") == "assistant.message" else None
+            content = data.get("content") if isinstance(data, dict) else None
+            if isinstance(content, str) and content:
+                parts.append(content)
+        return "\n".join(parts).strip()
+
+
 
 class DualBackend(Backend):
     """Route operations to two backends, à la SkillOpt's target vs optimizer.
@@ -1036,6 +1145,8 @@ def get_backend( if n in {"azure-responses", "azure_responses", "aoai-responses", "responses"}: eps = [e.strip() for e in azure_endpoint.split(",") if e.strip()] or None return AzureResponsesBackend(deployment=model, endpoints=eps) + if n in {"copilot", "github_copilot", "copilot_cli", "gh_copilot"}: + return CopilotCliBackend(model=model) return MockBackend() diff --git a/skillopt_sleep/config.py b/skillopt_sleep/config.py index 0bfb5a2..0e7cb04 100644 --- a/skillopt_sleep/config.py +++ b/skillopt_sleep/config.py @@ -36,7 +36,7 @@ DEFAULTS: Dict[str, Any] = { "val_fraction": 0.34, # real tasks reserved to gate updates "test_fraction": 0.0, # real tasks reserved as the final held-out measure # ── optimizer ────────────────────────────────────────────────────────── - "backend": "mock", # "mock" | "claude" | "codex" + "backend": "mock", # "mock" | "claude" | "codex" | "copilot" "model": "", # backend-specific; "" => backend default "gate_mode": "on", # "on" (validation-gated) | "off" (greedy, no hard filter) "codex_path": "", # "" => auto-detect the real @openai/codex binary diff --git a/skillopt_sleep/experiments/run_experiment.py b/skillopt_sleep/experiments/run_experiment.py index 91a9ca9..1110f26 100644 --- a/skillopt_sleep/experiments/run_experiment.py +++ b/skillopt_sleep/experiments/run_experiment.py @@ -134,7 +134,7 @@ def main(argv=None) -> int: ap = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment") ap.add_argument("--persona", default="researcher", choices=list(PERSONAS.keys())) ap.add_argument("--nights", type=int, default=4) - ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex"]) + ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex", "copilot"]) ap.add_argument("--model", default="", help="backend model override") ap.add_argument("--codex-path", default="", help="path to the real @openai/codex binary") ap.add_argument("--edit-budget", type=int, 
default=4)