Add GitHub Copilot backend to SkillOpt-Sleep

Add CopilotCliBackend that drives the GitHub Copilot CLI in
non-interactive mode (copilot -p ... --output-format json) and parses the
JSONL event stream for assistant.message content. Registered as the
'copilot' backend (with aliases) and wired through the CLI, config,
experiment harness, and the Copilot MCP server's backend enum.

- Force UTF-8 decoding of CLI output (fixes cp1252 UnicodeDecodeError on
  Windows when responses contain non-cp1252 bytes).
- Minimise per-call startup: isolated COPILOT_HOME with built-in MCPs and
  custom instructions disabled, so user MCP servers are not spawned per
  call (~5x faster: 36s -> 7.4s). Override via SKILLOPT_SLEEP_COPILOT_HOME
  / SKILLOPT_SLEEP_COPILOT_MODEL / SKILLOPT_SLEEP_COPILOT_FULL_ENV.

Validated end-to-end on real held-out tasks (researcher persona:
0.42 -> 1.00 lift; gate correctly rejects non-improving edits).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
DB Lee
2026-06-12 08:21:57 -07:00
parent 5dc894715f
commit 21f93c16c7
6 changed files with 128 additions and 8 deletions

View File

@@ -45,8 +45,17 @@ Ask Copilot things like *"run the sleep cycle"*, *"what did the last sleep
propose?"*, *"adopt the staged sleep proposal"*. Copilot calls the MCP tools:
`sleep_status`, `sleep_dry_run`, `sleep_run`, `sleep_adopt`, `sleep_harvest`.
Each tool takes optional `project`, `backend` (`mock`/`claude`/`codex`), and
`scope` arguments. Default backend is `mock` (no API spend).
Each tool takes optional `project`, `backend` (`mock`/`claude`/`codex`/`copilot`), and
`scope` arguments. Default backend is `mock` (no API spend). The `copilot`
backend drives the GitHub Copilot CLI (`copilot -p ... --output-format json`)
and requires the `copilot` CLI to be installed and authenticated.
For speed, the `copilot` backend runs each call against an isolated
`COPILOT_HOME` with built-in MCP servers and custom instructions disabled, so
your user MCP servers (including this project's own) are not spawned per call
(~5x faster). Point the isolated home at a different directory with
`SKILLOPT_SLEEP_COPILOT_HOME=<dir>`, pick a model with
`SKILLOPT_SLEEP_COPILOT_MODEL`, or set `SKILLOPT_SLEEP_COPILOT_FULL_ENV=1`
to skip the isolation and use your real Copilot environment instead.
## Verify the server directly (no Copilot needed)

View File

@@ -45,8 +45,8 @@ _TOOL_SCHEMA = {
"type": "object",
"properties": {
"project": {"type": "string", "description": "Project dir to evolve (default: cwd)."},
"backend": {"type": "string", "enum": ["mock", "claude", "codex"],
"description": "mock = no API spend (default); claude/codex = real."},
"backend": {"type": "string", "enum": ["mock", "claude", "codex", "copilot"],
"description": "mock = no API spend (default); claude/codex/copilot = real."},
"scope": {"type": "string", "enum": ["invoked", "all"]},
},
"additionalProperties": False,

View File

@@ -9,7 +9,7 @@
Common flags:
--project PATH project to evolve (default: cwd)
--scope all|invoked harvest scope (default: invoked)
--backend mock|claude|codex
--backend mock|claude|codex|copilot
--source claude|codex|auto
--model NAME
--lookback-hours N
@@ -36,7 +36,7 @@ from skillopt_sleep.state import SleepState
def _add_common(p: argparse.ArgumentParser) -> None:
p.add_argument("--project", default="")
p.add_argument("--scope", default="", choices=["", "all", "invoked"])
p.add_argument("--backend", default="", choices=["", "mock", "claude", "codex"])
p.add_argument("--backend", default="", choices=["", "mock", "claude", "codex", "copilot"])
p.add_argument("--model", default="")
p.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
p.add_argument("--claude-home", default="", help="override ~/.claude (also isolates state)")

View File

@@ -24,6 +24,7 @@ import json
import os
import re
import subprocess
import tempfile
from typing import Any, Dict, List, Optional, Tuple
from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
@@ -788,6 +789,114 @@ class CodexCliBackend(CliBackend):
except Exception:
pass
def resolve_copilot_path(explicit: str = "") -> str:
    """Return the path (or bare command name) of the GitHub Copilot CLI.

    Lookup order, first non-empty value wins:

    1. the ``explicit`` argument,
    2. the ``SKILLOPT_SLEEP_COPILOT_PATH`` environment variable,
    3. whatever ``shutil.which("copilot")`` finds on ``PATH``,
    4. the literal string ``"copilot"`` as a last resort.
    """
    configured = explicit or os.environ.get("SKILLOPT_SLEEP_COPILOT_PATH")
    if configured:
        return configured

    # Only needed for the PATH lookup, so it is imported lazily.
    from shutil import which

    return which("copilot") or "copilot"
class CopilotCliBackend(CliBackend):
    """Drives the GitHub Copilot CLI in non-interactive mode.

    Uses ``copilot -p <prompt> --output-format json`` and parses the emitted
    JSONL event stream, returning the concatenated ``assistant.message``
    content. The plain-text / ``--silent`` modes do not reliably stream the
    response to stdout on all platforms, so JSONL is used for robust capture.

    The call runs in a clean temp cwd with streaming disabled and tools allowed
    (so non-interactive mode never blocks on a permission prompt); the prompts
    ask for final-answer text only, so no tool use is expected.

    Startup overhead is minimised: each invocation points ``COPILOT_HOME`` at a
    dedicated, isolated config dir (no user ``mcp-config.json``, so the user's
    MCP servers — including this project's own — are NOT spawned, avoiding a
    slow recursive launch), and built-in MCP servers / custom instructions are
    disabled. Auth is read from the OS credential store / token env vars, which
    live outside ``COPILOT_HOME``, so isolation does not break authentication.

    Environment variables read by this backend:

    * ``SKILLOPT_SLEEP_COPILOT_MODEL`` — model used when ``model`` is empty.
    * ``SKILLOPT_SLEEP_COPILOT_HOME`` — directory to use as the isolated home.
      An unset *or empty* value falls back to the default isolated directory
      under the system temp dir; it does NOT select the real environment.
    * ``SKILLOPT_SLEEP_COPILOT_FULL_ENV`` — set to exactly ``1`` to skip the
      isolation (no ``COPILOT_HOME`` override, no startup-trimming flags) and
      use the user's real Copilot environment instead.
    """

    name = "copilot"

    def __init__(self, model: str = "", copilot_path: str = "", timeout: int = 240) -> None:
        """Resolve the CLI binary, the model and the (optional) isolated home.

        Args:
            model: Model name passed to ``--model``; when empty, the
                ``SKILLOPT_SLEEP_COPILOT_MODEL`` environment variable is used.
            copilot_path: Explicit path to the ``copilot`` binary; when empty
                it is resolved via ``resolve_copilot_path``.
            timeout: Per-call subprocess timeout in seconds.
        """
        super().__init__(
            model=model or os.environ.get("SKILLOPT_SLEEP_COPILOT_MODEL", ""),
            timeout=timeout,
        )
        self.copilot_path = resolve_copilot_path(copilot_path)
        self.full_env = os.environ.get("SKILLOPT_SLEEP_COPILOT_FULL_ENV", "") == "1"
        if self.full_env:
            # Real environment requested: never override COPILOT_HOME.
            self.copilot_home = ""
            return
        # Stable isolated home so first-run setup is cached across calls.
        self.copilot_home = os.environ.get("SKILLOPT_SLEEP_COPILOT_HOME") or os.path.join(
            tempfile.gettempdir(), "skillopt_sleep_copilot_home"
        )
        try:
            os.makedirs(self.copilot_home, exist_ok=True)
        except (OSError, ValueError):
            # Could not create the isolated home: leave COPILOT_HOME untouched
            # rather than pointing the CLI at a directory that does not exist.
            self.copilot_home = ""

    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
        """Run one non-interactive Copilot CLI call and return the reply text.

        Args:
            prompt: Prompt text passed to ``copilot -p``.
            max_tokens: Accepted for interface compatibility; not forwarded to
                the CLI by this backend.

        Returns:
            The concatenated ``assistant.message`` content, or ``""`` when the
            call fails for any reason (binary missing, timeout, temp-dir
            creation failure, ...).
        """
        import shutil

        clean_cwd = ""
        try:
            # Created inside the try so that a temp-dir failure is reported the
            # same way as any other call failure: an empty response.
            clean_cwd = tempfile.mkdtemp(prefix="skillopt_sleep_copilot_")
            cmd = [
                self.copilot_path, "-p", prompt,
                "--output-format", "json",
                "--stream", "off",
                "--no-color",
                "--log-level", "none",
                # Tools are allowed so non-interactive mode never blocks on a
                # permission prompt; the call is confined to a throwaway cwd.
                "--allow-all-tools",
                "-C", clean_cwd,
            ]
            if not self.full_env:
                # Drop unneeded startup work: no built-in (github) MCP server and no
                # AGENTS.md / custom-instruction loading. With an isolated home that
                # has no mcp-config.json, no user MCP servers spawn either.
                cmd += ["--disable-builtin-mcps", "--no-custom-instructions"]
            if self.model:
                cmd += ["--model", self.model]
            env = os.environ.copy()
            if self.copilot_home:
                env["COPILOT_HOME"] = self.copilot_home
            proc = subprocess.run(
                cmd, capture_output=True, text=True, timeout=self.timeout, cwd=clean_cwd,
                # Force UTF-8 so decoding does not depend on the platform
                # default code page (e.g. cp1252 on Windows).
                encoding="utf-8", errors="replace", env=env,
            )
        except Exception:
            # Deliberate best-effort boundary: any failure yields an empty reply.
            return ""
        finally:
            if clean_cwd:
                shutil.rmtree(clean_cwd, ignore_errors=True)
        return self._parse_jsonl_response(proc.stdout or "")

    @staticmethod
    def _parse_jsonl_response(raw: str) -> str:
        """Extract assistant text from the CLI's JSONL event stream.

        Each line of ``raw`` is treated as one JSON event. Lines that are not
        JSON objects, fail to parse, or are not ``assistant.message`` events
        with a string ``data.content`` are skipped.

        Returns:
            All non-empty ``data.content`` strings joined with newlines and
            stripped, or ``""`` when there is none.
        """
        parts: List[str] = []
        for line in raw.splitlines():
            line = line.strip()
            if not line.startswith("{"):
                continue
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError):
                continue
            if not isinstance(obj, dict) or obj.get("type") != "assistant.message":
                continue
            data = obj.get("data")
            # A malformed event may carry a non-object payload; skip it instead
            # of failing on ``.get``.
            if not isinstance(data, dict):
                continue
            content = data.get("content")
            if isinstance(content, str) and content:
                parts.append(content)
        return "\n".join(parts).strip()
class DualBackend(Backend):
"""Route operations to two backends, à la SkillOpt's target vs optimizer.
@@ -1036,6 +1145,8 @@ def get_backend(
if n in {"azure-responses", "azure_responses", "aoai-responses", "responses"}:
eps = [e.strip() for e in azure_endpoint.split(",") if e.strip()] or None
return AzureResponsesBackend(deployment=model, endpoints=eps)
if n in {"copilot", "github_copilot", "copilot_cli", "gh_copilot"}:
return CopilotCliBackend(model=model)
return MockBackend()

View File

@@ -36,7 +36,7 @@ DEFAULTS: Dict[str, Any] = {
"val_fraction": 0.34, # real tasks reserved to gate updates
"test_fraction": 0.0, # real tasks reserved as the final held-out measure
# ── optimizer ──────────────────────────────────────────────────────────
"backend": "mock", # "mock" | "claude" | "codex"
"backend": "mock", # "mock" | "claude" | "codex" | "copilot"
"model": "", # backend-specific; "" => backend default
"gate_mode": "on", # "on" (validation-gated) | "off" (greedy, no hard filter)
"codex_path": "", # "" => auto-detect the real @openai/codex binary

View File

@@ -134,7 +134,7 @@ def main(argv=None) -> int:
ap = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment")
ap.add_argument("--persona", default="researcher", choices=list(PERSONAS.keys()))
ap.add_argument("--nights", type=int, default=4)
ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex"])
ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex", "copilot"])
ap.add_argument("--model", default="", help="backend model override")
ap.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
ap.add_argument("--edit-budget", type=int, default=4)