From 5dc894715f57ae89b59ff99a588e967eda0f940e Mon Sep 17 00:00:00 2001 From: DB Lee Date: Fri, 12 Jun 2026 08:21:47 -0700 Subject: [PATCH] Add SkillOpt research-engine MCP server plugin for Copilot Exposes scripts/train.py and scripts/eval_only.py as Copilot MCP tools (skillopt_list_configs, skillopt_train, skillopt_eval) via a stdlib-only stdio server, mirroring the existing SkillOpt-Sleep plugin layout. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- plugins/copilot/skillopt/README.md | 98 ++++++++ .../skillopt/copilot-instructions.snippet.md | 33 +++ .../copilot/skillopt/mcp-config.example.json | 11 + plugins/copilot/skillopt/mcp_server.py | 229 ++++++++++++++++++ 4 files changed, 371 insertions(+) create mode 100644 plugins/copilot/skillopt/README.md create mode 100644 plugins/copilot/skillopt/copilot-instructions.snippet.md create mode 100644 plugins/copilot/skillopt/mcp-config.example.json create mode 100644 plugins/copilot/skillopt/mcp_server.py diff --git a/plugins/copilot/skillopt/README.md b/plugins/copilot/skillopt/README.md new file mode 100644 index 0000000..c4910a2 --- /dev/null +++ b/plugins/copilot/skillopt/README.md @@ -0,0 +1,98 @@ +# SkillOpt — GitHub Copilot integration + +Give **Copilot** (CLI or VS Code) direct access to the **SkillOpt** research +engine via a tiny **MCP server**. MCP is GitHub's supported way to extend +Copilot, so this works across Copilot CLI, VS Code, and other MCP clients with +the same server. + +SkillOpt is **validation-gated, text-space skill optimization**: it reflects on +rollouts, makes bounded edits to a skill, and keeps a change only if it improves +a held-out validation set. This plugin exposes the repo's training and eval +entry points (`scripts/train.py`, `scripts/eval_only.py`) as Copilot tools. + +> This is the companion to the **SkillOpt-Sleep** plugin (`../mcp_server.py`, +> `sleep_*` tools). 
Sleep evolves a *local coding agent* from your past +> sessions; this server drives the *research* training/eval loops on the +> benchmark configs in [`../../../configs`](../../../configs). + +## What's here + +| File | Purpose | +|---|---| +| `mcp_server.py` | stdlib-only MCP (stdio) server exposing `skillopt_*` tools | +| `mcp-config.example.json` | drop-in MCP server config | +| `copilot-instructions.snippet.md` | paste into `.github/copilot-instructions.md` | + +## Install + +Requires Python ≥ 3.10. The MCP server itself is pure stdlib, but the tools it +launches need SkillOpt's runtime deps — install the package first: + +```bash +pip install -e . # or: pip install -r requirements.txt +``` + +1. **Register the MCP server.** Add the server to your Copilot MCP config + (Copilot CLI: `~/.copilot/mcp-config.json`; VS Code: your MCP settings). + Use `mcp-config.example.json` as a template — set `SKILLOPT_REPO` to this + repo's path: + + ```json + { + "mcpServers": { + "skillopt": { + "command": "python3", + "args": ["/abs/path/SkillOpt/plugins/copilot/skillopt/mcp_server.py"], + "env": { "SKILLOPT_REPO": "/abs/path/SkillOpt" } + } + } + } + ``` + +2. **(Optional) Tell Copilot about it.** Append + `copilot-instructions.snippet.md` to your repo's + `.github/copilot-instructions.md` so Copilot reaches for the tools when the + user asks to "optimize a skill" or "train on a benchmark". + +## Use + +Ask Copilot things like *"what configs can I run?"*, *"optimize the searchqa +skill"*, or *"evaluate this skill on the dataset"*. Copilot calls the MCP tools: +`skillopt_list_configs`, `skillopt_train`, `skillopt_eval`. + +| Tool | Required args | Notes | +|---|---|---| +| `skillopt_list_configs` | — | Lists `configs/**/*.yaml` you can pass as `config`. | +| `skillopt_train` | `config` | Runs a reflective optimization loop. Long-running; spends budget. | +| `skillopt_eval` | `config`, `skill` | Evaluates one skill markdown file; no training. 
| + +Common optional args (both train and eval): `env`, `backend`, +`optimizer_model`, `target_model`, `out_root`, `cfg_options` (space-separated +`KEY=VALUE` YAML overrides), and `extra_args` (raw passthrough flags for the +underlying script). `skillopt_train` also accepts `num_epochs`, `batch_size`, +`seed`, and `use_gate`; `skillopt_eval` also accepts `split`. + +Runs can be very long. The server's subprocess timeout defaults to 6 hours; +override it with the `SKILLOPT_RUN_TIMEOUT` environment variable (seconds). + +## Verify the server directly (no Copilot needed) + +```bash +printf '%s\n' \ + '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' \ + '{"jsonrpc":"2.0","id":2,"method":"tools/list"}' \ + '{"jsonrpc":"2.0","id":3,"method":"tools/call","params":{"name":"skillopt_list_configs","arguments":{}}}' \ + | SKILLOPT_REPO="$(pwd)" python3 plugins/copilot/skillopt/mcp_server.py +``` + +You should see the server info, the three `skillopt_*` tools, and the list of +benchmark configs. + +## Notes / status + +- MCP is the stable, official Copilot extension surface, so this is portable + across Copilot CLI and IDE from one server. +- `skillopt_list_configs` is filesystem-only and safe to call anytime; + `skillopt_train` / `skillopt_eval` shell out to the repo scripts and require + the SkillOpt runtime deps (and, for real backends, model credentials — see + [`../../../.env.example`](../../../.env.example)). diff --git a/plugins/copilot/skillopt/copilot-instructions.snippet.md b/plugins/copilot/skillopt/copilot-instructions.snippet.md new file mode 100644 index 0000000..b53c4a5 --- /dev/null +++ b/plugins/copilot/skillopt/copilot-instructions.snippet.md @@ -0,0 +1,33 @@ + + +## SkillOpt (research skill-optimization engine) + +This repo exposes the core **SkillOpt** training/eval engine via an MCP server +(`skillopt`).
SkillOpt is validation-gated, text-space skill optimization: it +reflects on rollouts, makes bounded edits to a skill, and keeps a change only +if it improves a held-out validation set. + +When the user asks to "optimize a skill", "train on <benchmark>", "run +SkillOpt", "evaluate this skill", or "what configs can I run", use the MCP +tools: + +- `skillopt_list_configs` — list the benchmark YAML configs you can pass as `config` +- `skillopt_train` — run a reflective skill-optimization loop on a config (long-running; spends API/compute budget) +- `skillopt_eval` — evaluate a single skill markdown file on a dataset (no training) + +Guidance: +- Always run `skillopt_list_configs` first if you don't already know a valid `config` path. +- `skillopt_train` and `skillopt_eval` are long-running and consume the user's + model backend/budget — confirm the `config`, `backend`, and model choices + with the user before launching, and surface the held-out gate result when the + run finishes. +- For one-off YAML overrides use `cfg_options` (e.g. `seed=123 batch_size=40`); + for any other underlying flag use `extra_args`. + +This is distinct from the **SkillOpt-Sleep** MCP server (`skillopt-sleep`, +`sleep_*` tools), which evolves a local coding agent from past sessions rather +than running the research benchmarks.
#!/usr/bin/env python3
"""SkillOpt (research engine) — minimal MCP server (stdio, stdlib-only).

Exposes the core SkillOpt skill-optimization engine as MCP tools so any
MCP-capable client (GitHub Copilot CLI / VS Code, Claude Desktop, etc.) can
drive it. No third-party deps: speaks JSON-RPC 2.0 over stdio with just the
handful of MCP methods clients need.

This is the companion to the SkillOpt-Sleep MCP server (``../mcp_server.py``).
Where Sleep evolves a *local agent* from past sessions, this server drives the
*research* training/eval loops from this repo (``scripts/train.py`` /
``scripts/eval_only.py``) against the benchmark configs in ``configs/``.

Tools exposed:
  - skillopt_list_configs : discover the benchmark YAML configs you can use
  - skillopt_train        : run a reflective skill-optimization (training) loop
  - skillopt_eval         : evaluate a single skill on a dataset (no training)

``skillopt_train`` and ``skillopt_eval`` shell out to the repo's entry-point
scripts and return their captured stdout/stderr once the run has finished
(output is not streamed while the run is in progress).

Environment variables:
  - SKILLOPT_REPO        : repo root to run in (default: three levels up from
                           this file; also used when the value is not an
                           existing directory, e.g. an unexpanded placeholder)
  - SKILLOPT_RUN_TIMEOUT : subprocess timeout in seconds (default: 21600 = 6h)

Configure your client to launch:
    python plugins/copilot/skillopt/mcp_server.py

Example client entry (shipped alongside as ``mcp-config.example.json``)::

    {
      "mcpServers": {
        "skillopt": {
          "command": "python3",
          "args": ["plugins/copilot/skillopt/mcp_server.py"],
          "env": {
            "SKILLOPT_REPO": "${workspaceFolder}"
          }
        }
      }
    }
"""
from __future__ import annotations

import glob
import json
import os
import subprocess
import sys

_DEFAULT_RUN_TIMEOUT_SECONDS = 21600  # 6h


def _warn(message: str) -> None:
    """Write a diagnostic to stderr (stdout is reserved for JSON-RPC frames)."""
    print(f"[skillopt-mcp] {message}", file=sys.stderr, flush=True)


def _resolve_repo_root() -> str:
    """Return the repo root: ``SKILLOPT_REPO`` if usable, else the path from this file.

    An override that is not an existing directory (for example a
    ``${workspaceFolder}`` placeholder the client did not expand) is ignored
    with a warning instead of breaking every tool call.
    """
    # Repo root: three levels up from plugins/copilot/skillopt/mcp_server.py
    derived = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "..", "..")
    )
    override = os.environ.get("SKILLOPT_REPO")
    if not override:
        return derived
    if os.path.isdir(override):
        return override
    _warn(f"SKILLOPT_REPO={override!r} is not a directory; using {derived}")
    return derived


def _env_positive_int(name: str, default: int) -> int:
    """Read a positive integer from the environment, falling back to ``default``.

    Unset, blank, non-numeric and non-positive values all yield ``default`` so
    a typo in the client config cannot stop the server from starting.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw)
    except ValueError:
        _warn(f"{name}={raw!r} is not an integer; using {default}")
        return default
    if value <= 0:
        _warn(f"{name}={raw!r} must be positive; using {default}")
        return default
    return value


REPO_ROOT = _resolve_repo_root()
PROTOCOL_VERSION = "2024-11-05"

# Training/eval runs are long; give the engine plenty of headroom.
RUN_TIMEOUT_SECONDS = _env_positive_int(
    "SKILLOPT_RUN_TIMEOUT", _DEFAULT_RUN_TIMEOUT_SECONDS
)

# Ordered flags that the train/eval scripts accept directly.
_FLAG_ARGS = (
    "config", "skill", "split", "env", "backend",
    "optimizer_model", "target_model", "out_root",
    "num_epochs", "batch_size", "seed", "use_gate",
)


def _list_configs() -> str:
    """List the benchmark configs available under configs/ (filesystem only).

    Returns a human-readable listing of repo-relative, forward-slash paths, or
    a bracketed notice when no ``*.yaml`` file is found.
    """
    configs_dir = os.path.join(REPO_ROOT, "configs")
    found = sorted(glob.glob(os.path.join(configs_dir, "**", "*.yaml"), recursive=True))
    if not found:
        return f"[no configs found under {configs_dir}]"
    lines = ["Available SkillOpt configs (pass as `config`):", ""]
    for path in found:
        rel = os.path.relpath(path, REPO_ROOT).replace(os.sep, "/")
        lines.append(f"  - {rel}")
    return "\n".join(lines)


def _flag_value(val) -> str:
    """Render one argument value as a CLI token.

    JSON booleans become ``true``/``false`` (matching the ``use_gate`` enum in
    the tool schema) rather than Python's ``True``/``False``.
    """
    if isinstance(val, bool):
        return "true" if val else "false"
    return str(val)


def _split_words(value) -> list[str]:
    """Turn a ``cfg_options`` / ``extra_args`` value into a list of CLI tokens.

    Strings are split on whitespace, lists/tuples are stringified item by
    item, and any other truthy scalar becomes a single token.
    """
    if not value:
        return []
    if isinstance(value, str):
        return value.split()
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return [str(value)]


def _run_script(script_rel: str, args: dict, *, required: tuple[str, ...] = ()) -> str:
    """Shell out to a repo entry-point script, mapping args -> --flags.

    Args:
        script_rel: script file name under ``scripts/`` (e.g. ``train.py``).
        args: tool arguments from the MCP ``tools/call`` request.
        required: argument names that must be present and non-empty.

    Returns:
        The script's stripped stdout, followed by a ``[stderr]`` section when
        stderr is non-empty. Any failure (missing argument, timeout, launch
        error, non-zero exit status) yields text starting with ``[error]``.
    """
    for key in required:
        if not args.get(key):
            return f"[error] missing required argument: {key}"

    cmd = [sys.executable or "python3", os.path.join("scripts", script_rel)]
    for key in _FLAG_ARGS:
        val = args.get(key)
        if val is None or val == "":
            continue
        cmd += [f"--{key}", _flag_value(val)]

    # cfg-options: arbitrary KEY=VALUE YAML overrides (nargs="+").
    cfg_options = _split_words(args.get("cfg_options"))
    if cfg_options:
        cmd += ["--cfg-options", *cfg_options]

    # extra_args: raw passthrough for any other train/eval flag.
    cmd += _split_words(args.get("extra_args"))

    try:
        proc = subprocess.run(
            cmd,
            cwd=REPO_ROOT,
            # The server's own stdin is the JSON-RPC stream; never let the
            # child inherit it, or it could swallow protocol frames.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            errors="replace",  # undecodable bytes must not discard the output
            timeout=RUN_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired:
        return f"[error] run exceeded {RUN_TIMEOUT_SECONDS}s timeout: {' '.join(cmd)}"
    except Exception as e:  # noqa: BLE001 - report any launch failure to the client
        return f"[error] failed to run script: {e}"

    out = (proc.stdout or "").strip()
    err = (proc.stderr or "").strip()
    sections = []
    if out:
        sections.append(out)
    if err:
        sections.append("[stderr]\n" + err)
    body = "\n".join(sections)

    if proc.returncode != 0:
        # Surface the failure even when the script printed output.
        head = f"[error] exit code {proc.returncode}"
        return f"{head}\n{body}" if body else f"{head}, no output"
    return body or f"[done] exit code {proc.returncode}, no output"


TOOLS = [
    {
        "name": "skillopt_list_configs",
        "description": "List the benchmark YAML configs under configs/ that can be passed as `config` to train/eval.",
    },
    {
        "name": "skillopt_train",
        "description": "Run a SkillOpt reflective skill-optimization (training) loop on a benchmark config. Long-running; uses your model backend/budget.",
    },
    {
        "name": "skillopt_eval",
        "description": "Evaluate a single skill markdown file on a dataset without training (scripts/eval_only.py).",
    },
]
_BY_NAME = {t["name"]: t for t in TOOLS}

_NO_ARGS_SCHEMA = {"type": "object", "properties": {}, "additionalProperties": False}

_COMMON_PROPS = {
    "config": {"type": "string",
               "description": "Path to a benchmark YAML config (e.g. configs/searchqa/default.yaml). See skillopt_list_configs."},
    "env": {"type": "string", "description": "Override the environment/adapter name (e.g. searchqa, alfworld)."},
    "backend": {"type": "string", "description": "Model backend (e.g. openai, claude, codex, qwen)."},
    "optimizer_model": {"type": "string", "description": "Model used for reflection/skill rewriting (the optimizer)."},
    "target_model": {"type": "string", "description": "Model used to execute tasks (the target)."},
    "out_root": {"type": "string", "description": "Output directory root for run artifacts."},
    "cfg_options": {"type": "string", "description": "Space-separated YAML overrides, e.g. 'seed=123 batch_size=40'."},
    "extra_args": {"type": "string", "description": "Raw passthrough flags for the underlying script, e.g. '--workers 8 --max_turns 30'."},
}

_TRAIN_SCHEMA = {
    "type": "object",
    "properties": {
        **_COMMON_PROPS,
        "num_epochs": {"type": "integer", "description": "Number of optimization epochs."},
        "batch_size": {"type": "integer", "description": "Tasks per optimization step."},
        "seed": {"type": "integer", "description": "Random seed."},
        "use_gate": {"type": "string", "enum": ["true", "false"],
                     "description": "Whether to keep the held-out validation gate on (default on)."},
    },
    "required": ["config"],
    "additionalProperties": False,
}

_EVAL_SCHEMA = {
    "type": "object",
    "properties": {
        **_COMMON_PROPS,
        "skill": {"type": "string", "description": "Path to the skill markdown file to evaluate."},
        "split": {"type": "string", "description": "Dataset split to evaluate (default: all)."},
    },
    "required": ["config", "skill"],
    "additionalProperties": False,
}

_SCHEMA_BY_NAME = {
    "skillopt_list_configs": _NO_ARGS_SCHEMA,
    "skillopt_train": _TRAIN_SCHEMA,
    "skillopt_eval": _EVAL_SCHEMA,
}


def _result(id_, result):
    """Build a JSON-RPC 2.0 success response."""
    return {"jsonrpc": "2.0", "id": id_, "result": result}


def _error(id_, code, message):
    """Build a JSON-RPC 2.0 error response."""
    return {"jsonrpc": "2.0", "id": id_, "error": {"code": code, "message": message}}


def _dispatch(name: str, args: dict) -> str:
    """Run the named tool and return its text output."""
    if name == "skillopt_list_configs":
        return _list_configs()
    if name == "skillopt_train":
        return _run_script("train.py", args, required=("config",))
    if name == "skillopt_eval":
        return _run_script("eval_only.py", args, required=("config", "skill"))
    return f"[error] unknown tool: {name}"


def handle(req: dict):
    """Handle one decoded JSON-RPC message.

    Returns the response dict, or ``None`` when no response must be sent
    (JSON-RPC notifications, i.e. messages without an ``id`` member).
    """
    if not isinstance(req, dict):
        return _error(None, -32600, "invalid request: expected a JSON object")
    if "id" not in req:
        # Notification (e.g. notifications/initialized, notifications/cancelled):
        # JSON-RPC forbids replying, even with an error.
        return None

    id_ = req["id"]
    method = req.get("method")

    if method == "initialize":
        return _result(id_, {
            "protocolVersion": PROTOCOL_VERSION,
            "capabilities": {"tools": {}},
            "serverInfo": {"name": "skillopt", "version": "0.1.0"},
        })
    if method == "tools/list":
        return _result(id_, {"tools": [
            {"name": t["name"], "description": t["description"],
             "inputSchema": _SCHEMA_BY_NAME[t["name"]]}
            for t in TOOLS
        ]})
    if method == "tools/call":
        params = req.get("params") or {}
        if not isinstance(params, dict):
            return _error(id_, -32602, "invalid params: expected an object")
        name = params.get("name")
        if not isinstance(name, str) or name not in _BY_NAME:
            return _error(id_, -32602, f"unknown tool: {name}")
        arguments = params.get("arguments") or {}
        if not isinstance(arguments, dict):
            return _error(id_, -32602, "invalid params: `arguments` must be an object")
        text = _dispatch(name, arguments)
        return _result(id_, {
            "content": [{"type": "text", "text": text}],
            # Tool-level failures are reported in-band and flagged for the client.
            "isError": text.startswith("[error]"),
        })
    if method == "ping":
        return _result(id_, {})
    return _error(id_, -32601, f"method not found: {method}")


def main() -> int:
    """Serve newline-delimited JSON-RPC over stdin/stdout until stdin closes."""
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            req = json.loads(line)
        except (ValueError, RecursionError):
            resp = _error(None, -32700, "parse error: invalid JSON")
        else:
            try:
                resp = handle(req)
            except Exception as e:  # noqa: BLE001 - one bad request must not kill the server
                _warn(f"internal error while handling request: {e!r}")
                req_id = req.get("id") if isinstance(req, dict) else None
                resp = _error(req_id, -32603, f"internal error: {e}")
        if resp is not None:
            sys.stdout.write(json.dumps(resp) + "\n")
            sys.stdout.flush()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())