From 5dc894715f57ae89b59ff99a588e967eda0f940e Mon Sep 17 00:00:00 2001
From: DB Lee <donlee@microsoft.com>
Date: Fri, 12 Jun 2026 08:21:47 -0700
Subject: [PATCH] Add SkillOpt research-engine MCP server plugin for Copilot

Exposes scripts/train.py and scripts/eval_only.py as Copilot MCP tools
(skillopt_list_configs, skillopt_train, skillopt_eval) via a stdlib-only
stdio server, mirroring the existing SkillOpt-Sleep plugin layout.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 plugins/copilot/skillopt/README.md            |  98 ++++++++
 .../skillopt/copilot-instructions.snippet.md  |  33 +++
 .../copilot/skillopt/mcp-config.example.json  |  11 +
 plugins/copilot/skillopt/mcp_server.py        | 229 ++++++++++++++++++
 4 files changed, 371 insertions(+)
 create mode 100644 plugins/copilot/skillopt/README.md
 create mode 100644 plugins/copilot/skillopt/copilot-instructions.snippet.md
 create mode 100644 plugins/copilot/skillopt/mcp-config.example.json
 create mode 100644 plugins/copilot/skillopt/mcp_server.py

diff --git a/plugins/copilot/skillopt/README.md b/plugins/copilot/skillopt/README.md
new file mode 100644
index 0000000..c4910a2
--- /dev/null
+++ b/plugins/copilot/skillopt/README.md
@@ -0,0 +1,98 @@
+# SkillOpt — GitHub Copilot integration
+
+Give **Copilot** (CLI or VS Code) direct access to the **SkillOpt** research
+engine via a tiny **MCP server**. MCP is GitHub's supported way to extend
+Copilot, so this works across Copilot CLI, VS Code, and other MCP clients with
+the same server.
+
+SkillOpt is **validation-gated, text-space skill optimization**: it reflects on
+rollouts, makes bounded edits to a skill, and keeps a change only if it improves
+a held-out validation set. This plugin exposes the repo's training and eval
+entry points (`scripts/train.py`, `scripts/eval_only.py`) as Copilot tools.
+
+> This is the companion to the **SkillOpt-Sleep** plugin (`../mcp_server.py`,
+> `sleep_*` tools). Sleep evolves a *local coding agent* from your past
+> sessions; this server drives the *research* training/eval loops on the
+> benchmark configs in [`../../../configs`](../../../configs).
+
+## What's here
+
+| File | Purpose |
+|---|---|
+| `mcp_server.py` | stdlib-only MCP (stdio) server exposing `skillopt_*` tools |
+| `mcp-config.example.json` | drop-in MCP server config |
+| `copilot-instructions.snippet.md` | paste into `.github/copilot-instructions.md` |
+
+## Install
+
+Requires Python ≥ 3.10. The MCP server itself is pure stdlib, but the tools it
+launches need SkillOpt's runtime deps — install the package first:
+
+```bash
+pip install -e .   # or: pip install -r requirements.txt
+```
+
+1. **Register the MCP server.** Add the server to your Copilot MCP config
+   (Copilot CLI: `~/.copilot/mcp-config.json`; VS Code: your MCP settings).
+   Use `mcp-config.example.json` as a template — point the script path in
+   `args` and `SKILLOPT_REPO` at this repo (absolute paths are safest):
+
+   ```json
+   {
+     "mcpServers": {
+       "skillopt": {
+         "command": "python3",
+         "args": ["/abs/path/SkillOpt/plugins/copilot/skillopt/mcp_server.py"],
+         "env": { "SKILLOPT_REPO": "/abs/path/SkillOpt" }
+       }
+     }
+   }
+   ```
+
+2. **(Optional) Tell Copilot about it.** Append
+   `copilot-instructions.snippet.md` to your repo's
+   `.github/copilot-instructions.md` so Copilot reaches for the tools when the
+   user asks to "optimize a skill" or "train on a benchmark".
+
+## Use
+
+Ask Copilot things like *"what configs can I run?"*, *"optimize the searchqa
+skill"*, or *"evaluate this skill on the dataset"*. Copilot calls the MCP tools:
+`skillopt_list_configs`, `skillopt_train`, `skillopt_eval`.
+
+| Tool | Required args | Notes |
+|---|---|---|
+| `skillopt_list_configs` | — | Lists `configs/**/*.yaml` you can pass as `config`. |
+| `skillopt_train` | `config` | Runs a reflective optimization loop. Long-running; spends budget. |
+| `skillopt_eval` | `config`, `skill` | Evaluates one skill markdown file; no training. |
+
+Common optional args (both train and eval): `env`, `backend`,
+`optimizer_model`, `target_model`, `out_root`, `cfg_options` (space-separated
+`KEY=VALUE` YAML overrides), and `extra_args` (raw passthrough flags for the
+underlying script). `skillopt_train` also accepts `num_epochs`, `batch_size`,
+`seed`, and `use_gate`; `skillopt_eval` also accepts `split`.
+
+Runs can be very long. The server's subprocess timeout defaults to 6 hours;
+override it with the `SKILLOPT_RUN_TIMEOUT` environment variable (seconds).
+
+## Verify the server directly (no Copilot needed)
+
+```bash
+printf '%s\n' \
+  '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' \
+  '{"jsonrpc":"2.0","id":2,"method":"tools/list"}' \
+  '{"jsonrpc":"2.0","id":3,"method":"tools/call","params":{"name":"skillopt_list_configs","arguments":{}}}' \
+  | SKILLOPT_REPO="$(pwd)" python3 plugins/copilot/skillopt/mcp_server.py
+```
+
+You should see the server info, the three `skillopt_*` tools, and the list of
+benchmark configs.
+
+## Notes / status
+
+- MCP is the stable, official Copilot extension surface, so this is portable
+  across Copilot CLI and IDE from one server.
+- `skillopt_list_configs` is filesystem-only and safe to call anytime;
+  `skillopt_train` / `skillopt_eval` shell out to the repo scripts and require
+  the SkillOpt runtime deps (and, for real backends, model credentials — see
+  [`../../../.env.example`](../../../.env.example)).
diff --git a/plugins/copilot/skillopt/copilot-instructions.snippet.md b/plugins/copilot/skillopt/copilot-instructions.snippet.md
new file mode 100644
index 0000000..b53c4a5
--- /dev/null
+++ b/plugins/copilot/skillopt/copilot-instructions.snippet.md
@@ -0,0 +1,33 @@
+<!--
+Copy this block into your repo's .github/copilot-instructions.md so Copilot
+knows the SkillOpt research-engine tools exist. (Copilot reads
+copilot-instructions.md automatically as ambient guidance.)
+-->
+
+## SkillOpt (research skill-optimization engine)
+
+This repo exposes the core **SkillOpt** training/eval engine via an MCP server
+(`skillopt`). SkillOpt is validation-gated, text-space skill optimization: it
+reflects on rollouts, makes bounded edits to a skill, and keeps a change only
+if it improves a held-out validation set.
+
+When the user asks to "optimize a skill", "train on <benchmark>", "run
+SkillOpt", "evaluate this skill", or "what configs can I run", use the MCP
+tools:
+
+- `skillopt_list_configs` — list the benchmark YAML configs you can pass as `config`
+- `skillopt_train` — run a reflective skill-optimization loop on a config (long-running; spends API/compute budget)
+- `skillopt_eval` — evaluate a single skill markdown file on a dataset (no training)
+
+Guidance:
+- Always run `skillopt_list_configs` first if you don't already know a valid `config` path.
+- `skillopt_train` and `skillopt_eval` are long-running and consume the user's
+  model backend/budget — confirm the `config`, `backend`, and model choices
+  with the user before launching, and surface the held-out gate result when the
+  run finishes.
+- For one-off YAML overrides use `cfg_options` (e.g. `seed=123 batch_size=40`);
+  for any other underlying flag use `extra_args`.
+
+This is distinct from the **SkillOpt-Sleep** MCP server (`skillopt-sleep`,
+`sleep_*` tools), which evolves a local coding agent from past sessions rather
+than running the research benchmarks.
diff --git a/plugins/copilot/skillopt/mcp-config.example.json b/plugins/copilot/skillopt/mcp-config.example.json
new file mode 100644
index 0000000..eb2aba5
--- /dev/null
+++ b/plugins/copilot/skillopt/mcp-config.example.json
@@ -0,0 +1,11 @@
+{
+  "mcpServers": {
+    "skillopt": {
+      "command": "python3",
+      "args": ["plugins/copilot/skillopt/mcp_server.py"],
+      "env": {
+        "SKILLOPT_REPO": "${workspaceFolder}"
+      }
+    }
+  }
+}
diff --git a/plugins/copilot/skillopt/mcp_server.py b/plugins/copilot/skillopt/mcp_server.py
new file mode 100644
index 0000000..22be68e
--- /dev/null
+++ b/plugins/copilot/skillopt/mcp_server.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+"""SkillOpt (research engine) — minimal MCP server (stdio, stdlib-only).
+
+Exposes the core SkillOpt skill-optimization engine as MCP tools so any
+MCP-capable client (GitHub Copilot CLI / VS Code, Claude Desktop, etc.) can
+drive it. No third-party deps: speaks JSON-RPC 2.0 over stdio with just the
+handful of MCP methods clients need.
+
+This is the companion to the SkillOpt-Sleep MCP server (``../mcp_server.py``).
+Where Sleep evolves a *local agent* from past sessions, this server drives the
+*research* training/eval loops from this repo (``scripts/train.py`` /
+``scripts/eval_only.py``) against the benchmark configs in ``configs/``.
+
+Tools exposed:
+  - skillopt_list_configs : discover the benchmark YAML configs you can use
+  - skillopt_train        : run a reflective skill-optimization (training) loop
+  - skillopt_eval         : evaluate a single skill on a dataset (no training)
+
+``skillopt_train`` and ``skillopt_eval`` shell out to the repo's entry-point
+scripts and return their captured stdout/stderr. Configure your client to launch:
+  python plugins/copilot/skillopt/mcp_server.py
+"""
+from __future__ import annotations
+
+import glob
+import json
+import os
+import subprocess
+import sys
+
+# Repo root: $SKILLOPT_REPO if set (non-empty), else three levels up from this file.
+REPO_ROOT = os.environ.get("SKILLOPT_REPO") or os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "..", "..")
+)
+PROTOCOL_VERSION = "2024-11-05"  # sent as protocolVersion in the initialize result
+
+# Per-run subprocess timeout in seconds; SKILLOPT_RUN_TIMEOUT overrides it (int() rejects non-integers).
+RUN_TIMEOUT_SECONDS = int(os.environ.get("SKILLOPT_RUN_TIMEOUT", "21600"))  # 6h
+
+
+def _list_configs() -> str:
+    """Return a text list of configs/**/*.yaml as repo-relative, "/"-separated paths."""
+    root = os.path.join(REPO_ROOT, "configs")
+    # Escape the root so glob metacharacters in the repo path are matched literally.
+    found = sorted(glob.glob(os.path.join(glob.escape(root), "**", "*.yaml"), recursive=True))
+    if not found:
+        return f"[no configs found under {root}]"
+    header = ["Available SkillOpt configs (pass as `config`):", ""]
+    rels = (os.path.relpath(p, REPO_ROOT).replace(os.sep, "/") for p in found)
+    return "\n".join(header + [f"  - {r}" for r in rels])
+
+
+def _run_script(script_rel: str, args: dict, *, required: tuple[str, ...] = ()) -> str:
+    """Run ``scripts/<script_rel>`` from the repo root and return its captured output.
+
+    ``args`` is the tool-call argument dict: known keys become ``--key value``
+    flags, ``cfg_options`` feeds ``--cfg-options`` and ``extra_args`` is appended
+    as-is. ``required`` lists keys that must be present and non-empty. Failures
+    (missing key, timeout, launch error, non-zero exit) return ``[error] ...`` text.
+    """
+    for key in required:
+        if not args.get(key):
+            return f"[error] missing required argument: {key}"
+
+    cmd = [sys.executable or "python3", os.path.join("scripts", script_rel)]
+    # Ordered flags that the train/eval scripts accept directly.
+    flag_args = (
+        "config", "skill", "split", "env", "backend",
+        "optimizer_model", "target_model", "out_root",
+        "num_epochs", "batch_size", "seed", "use_gate",
+    )
+    for key in flag_args:
+        val = args.get(key)
+        if val is None or val == "":
+            continue
+        cmd += [f"--{key}", str(val)]
+
+    # cfg_options (KEY=VALUE overrides, nargs="+") and extra_args (raw passthrough)
+    # may each arrive as one whitespace-separated string or as a list of tokens.
+    for lead, key in ((["--cfg-options"], "cfg_options"), ([], "extra_args")):
+        val = args.get(key)
+        if val:
+            cmd += lead + [str(x) for x in (val.split() if isinstance(val, str) else val)]
+
+    try:
+        # stdin=DEVNULL keeps the child from reading the JSON-RPC stream on our stdin.
+        proc = subprocess.run(
+            cmd, cwd=REPO_ROOT, capture_output=True, text=True, errors="replace",
+            stdin=subprocess.DEVNULL, timeout=RUN_TIMEOUT_SECONDS,
+        )
+    except subprocess.TimeoutExpired:
+        return f"[error] run exceeded {RUN_TIMEOUT_SECONDS}s timeout: {' '.join(cmd)}"
+    except Exception as e:  # noqa: BLE001
+        return f"[error] failed to run script: {e}"
+    out = (proc.stdout or "").strip()
+    err = (proc.stderr or "").strip()
+    body = out + (("\n[stderr]\n" + err) if err else "")
+    if proc.returncode != 0:  # surface the failure; output alone can look like success
+        return f"[error] exit code {proc.returncode}\n{body}".rstrip()
+    return body or f"[done] exit code {proc.returncode}, no output"
+
+
+TOOLS = [  # name + description per tool, in the order tools/list reports them
+    {
+        "name": "skillopt_list_configs",
+        "description": "List the benchmark YAML configs under configs/ that can be passed as `config` to train/eval.",
+    },
+    {
+        "name": "skillopt_train",
+        "description": "Run a SkillOpt reflective skill-optimization (training) loop on a benchmark config. Long-running; uses your model backend/budget.",
+    },
+    {
+        "name": "skillopt_eval",
+        "description": "Evaluate a single skill markdown file on a dataset without training (scripts/eval_only.py).",
+    },
+]
+_BY_NAME = {t["name"]: t for t in TOOLS}  # tools/call rejects names missing from this map
+
+_NO_ARGS_SCHEMA = {"type": "object", "properties": {}, "additionalProperties": False}  # inputSchema for skillopt_list_configs
+
+_COMMON_PROPS = {  # properties spread into both the train and eval schemas
+    "config": {"type": "string",
+               "description": "Path to a benchmark YAML config (e.g. configs/searchqa/default.yaml). See skillopt_list_configs."},
+    "env": {"type": "string", "description": "Override the environment/adapter name (e.g. searchqa, alfworld)."},
+    "backend": {"type": "string", "description": "Model backend (e.g. openai, claude, codex, qwen)."},
+    "optimizer_model": {"type": "string", "description": "Model used for reflection/skill rewriting (the optimizer)."},
+    "target_model": {"type": "string", "description": "Model used to execute tasks (the target)."},
+    "out_root": {"type": "string", "description": "Output directory root for run artifacts."},
+    "cfg_options": {"type": "string", "description": "Space-separated YAML overrides, e.g. 'seed=123 batch_size=40'."},
+    "extra_args": {"type": "string", "description": "Raw passthrough flags for the underlying script, e.g. '--workers 8 --max_turns 30'."},
+}
+
+_TRAIN_SCHEMA = {  # inputSchema for skillopt_train
+    "type": "object",
+    "properties": {
+        **_COMMON_PROPS,
+        "num_epochs": {"type": "integer", "description": "Number of optimization epochs."},
+        "batch_size": {"type": "integer", "description": "Tasks per optimization step."},
+        "seed": {"type": "integer", "description": "Random seed."},
+        "use_gate": {"type": "string", "enum": ["true", "false"],
+                     "description": "Whether to keep the held-out validation gate on (default on)."},
+    },
+    "required": ["config"],
+    "additionalProperties": False,
+}
+
+_EVAL_SCHEMA = {  # inputSchema for skillopt_eval
+    "type": "object",
+    "properties": {
+        **_COMMON_PROPS,
+        "skill": {"type": "string", "description": "Path to the skill markdown file to evaluate."},
+        "split": {"type": "string", "description": "Dataset split to evaluate (default: all)."},
+    },
+    "required": ["config", "skill"],
+    "additionalProperties": False,
+}
+# NOTE: schemas are only advertised via tools/list; tools/call does not validate against them.
+_SCHEMA_BY_NAME = {  # tool name -> inputSchema, looked up by tools/list
+    "skillopt_list_configs": _NO_ARGS_SCHEMA,
+    "skillopt_train": _TRAIN_SCHEMA,
+    "skillopt_eval": _EVAL_SCHEMA,
+}
+
+
+def _result(id_, result):  # JSON-RPC 2.0 success envelope for request ``id_``
+    return dict(jsonrpc="2.0", id=id_, result=result)
+
+
+def _error(id_, code, message):  # JSON-RPC 2.0 error envelope for request ``id_``
+    return dict(jsonrpc="2.0", id=id_, error=dict(code=code, message=message))
+
+
+def _dispatch(name: str, args: dict) -> str:
+    """Route a tool call by name; returns the tool's text (or an "[error] ..." line)."""
+    # Script-backed tools: name -> (script under scripts/, required argument keys).
+    scripts = {"skillopt_train": ("train.py", ("config",)),
+               "skillopt_eval": ("eval_only.py", ("config", "skill"))}
+    if name in scripts:
+        return _run_script(scripts[name][0], args, required=scripts[name][1])
+    return _list_configs() if name == "skillopt_list_configs" else f"[error] unknown tool: {name}"
+
+
+def handle(req: dict):
+    """Answer one JSON-RPC request dict; return None when no response is due."""
+    method, id_ = req.get("method"), req.get("id")
+    if "id" not in req or method in ("notifications/initialized", "initialized"):
+        return None  # JSON-RPC 2.0: notifications (no "id") never get a response
+    if method == "initialize":
+        return _result(id_, {
+            "protocolVersion": PROTOCOL_VERSION,
+            "capabilities": {"tools": {}},
+            "serverInfo": {"name": "skillopt", "version": "0.1.0"},
+        })
+    if method == "tools/list":
+        return _result(id_, {"tools": [
+            {"name": t["name"], "description": t["description"],
+             "inputSchema": _SCHEMA_BY_NAME[t["name"]]}
+            for t in TOOLS
+        ]})
+    if method == "tools/call":
+        params = req.get("params") or {}
+        name = params.get("name")
+        if name not in _BY_NAME:
+            return _error(id_, -32602, f"unknown tool: {name}")
+        text = _dispatch(name, params.get("arguments") or {})  # failures start with "[error]"
+        return _result(id_, {"content": [{"type": "text", "text": text}], "isError": text.startswith("[error]")})
+    if method == "ping":
+        return _result(id_, {})
+    return _error(id_, -32601, f"method not found: {method}")
+
+
+def main() -> int:
+    """Serve newline-delimited JSON-RPC on stdin/stdout until EOF; always returns 0."""
+    for line in filter(str.strip, sys.stdin):  # skip blank lines
+        req = None
+        try:
+            req = json.loads(line)
+            resp = handle(req) if isinstance(req, dict) else _error(None, -32600, "invalid request")
+        except Exception as e:  # noqa: BLE001 - one bad message must not kill the server
+            resp = (_error(req.get("id"), -32603, f"internal error: {e}") if isinstance(req, dict)
+                    else _error(None, -32700, "parse error"))
+        if resp is not None:
+            sys.stdout.write(json.dumps(resp) + "\n")
+            sys.stdout.flush()
+    return 0
+
+
+if __name__ == "__main__":  # script entry point: exit with main()'s return code
+    sys.exit(main())