Files
DB Lee 21f93c16c7 Add GitHub Copilot backend to SkillOpt-Sleep
Add CopilotCliBackend that drives the GitHub Copilot CLI in
non-interactive mode (copilot -p ... --output-format json) and parses the
JSONL event stream for assistant.message content. Registered as the
'copilot' backend (with aliases) and wired through the CLI, config,
experiment harness, and the Copilot MCP server's backend enum.

- Force UTF-8 decoding of CLI output (fixes cp1252 UnicodeDecodeError on
  Windows when responses contain non-cp1252 bytes).
- Minimise per-call startup: isolated COPILOT_HOME with built-in MCPs and
  custom instructions disabled, so user MCP servers are not spawned per
  call (~5x faster: 36s -> 7.4s). Override via SKILLOPT_SLEEP_COPILOT_HOME
  / SKILLOPT_SLEEP_COPILOT_MODEL / SKILLOPT_SLEEP_COPILOT_FULL_ENV.

Validated end-to-end on real held-out tasks (researcher persona:
0.42 -> 1.00 lift; gate correctly rejects non-improving edits).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-17 17:25:50 -07:00

179 lines
7.6 KiB
Python

"""SkillOpt-Sleep — validation experiment.
Answers the question the user posed: *does nightly offline self-evolution
actually improve the agent?* Runs deterministically with the MockBackend
(no API key, reproducible) and is the acceptance test for the whole idea.
What it proves:
1. MONOTONIC LIFT — over N sleep nights, the held-out score rises from a
baseline (empty skill/memory) toward 1.0 as the gate accepts the
general rules the persona's tasks require.
2. GATE SAFETY — an injected harmful edit is REJECTED (held-out score does
not improve), so a bad nightly proposal can never be adopted.
3. PLUMBING — harvest->mine->replay->consolidate->stage->adopt all run and
the adopted artifact, re-scored, retains the lift.
Run:
python -m skillopt_sleep.experiments.run_experiment
python -m skillopt_sleep.experiments.run_experiment --persona programmer --nights 3
python -m skillopt_sleep.experiments.run_experiment --backend claude # real lift
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import tempfile
from typing import List
from skillopt_sleep.backend import get_backend
from skillopt_sleep.consolidate import consolidate
from skillopt_sleep.experiments.personas import (
PERSONAS,
harmful_edit_task,
researcher_persona,
)
from skillopt_sleep.memory import ensure_skill_scaffold
from skillopt_sleep.replay import aggregate_scores, replay_batch
from skillopt_sleep.types import TaskRecord
def _score_holdout(backend, tasks: List[TaskRecord], skill: str, memory: str,
                   metric: str = "mixed", w: float = 0.5) -> float:
    """Replay the held-out tasks and reduce the outcome to one gate score.

    Tasks whose ``split`` is ``"val"`` or ``"holdout"`` are replayed with the
    given ``skill`` and ``memory``; when no task carries such a split, every
    task is replayed instead.  ``metric`` and ``w`` are passed straight
    through to ``select_gate_score``.
    """
    from skillopt_sleep.consolidate import select_gate_score

    # The persona experiment only splits two ways (train/val, no test), so
    # "val" is accepted as the held-out set alongside "holdout".
    held_out_splits = ("val", "holdout")
    selected = [task for task in tasks if task.split in held_out_splits]
    if not selected:
        selected = tasks

    replayed = replay_batch(backend, selected, skill, memory)
    score_h, score_s = aggregate_scores(replayed)
    return select_gate_score(score_h, score_s, metric, w)
def run(persona: str = "researcher", nights: int = 4, backend_name: str = "mock",
        edit_budget: int = 4, seed: int = 42, model: str = "", codex_path: str = "",
        limit_tasks: int = 0) -> dict:
    """Run the multi-night consolidation experiment and return a summary.

    Starting from an empty skill scaffold and empty memory, the held-out
    score is measured, ``consolidate`` is invoked once per night (its output
    is adopted only when it reports ``accepted``), and the held-out score is
    measured again.  When the backend's name is ``"mock"`` one extra
    consolidation is run on a task set containing ``harmful_edit_task()`` to
    check that the mock's harmful rule text does not end up in the skill.

    Args:
        persona: Key into ``PERSONAS``; an unknown key falls back to
            ``researcher_persona``.
        nights: Maximum number of consolidation rounds.  The loop stops
            early once a candidate scores >= 0.999.
        backend_name: Name handed to ``get_backend``.
        edit_budget: Forwarded to ``consolidate`` as ``edit_budget``.
        seed: Forwarded to ``assign_splits``.
        model: Backend model override, forwarded to ``get_backend``.
        codex_path: Forwarded to ``get_backend``.
        limit_tasks: When positive, keep only the first ``limit_tasks``
            persona tasks.  Zero or a negative value means "no cap".

    Returns:
        A dict with the keys ``persona``, ``backend``, ``model``,
        ``n_tasks``, ``nights_run``, ``baseline_holdout``, ``after_holdout``,
        ``lift``, ``improved``, ``gate_blocks_harmful`` (``None`` unless the
        backend is the mock), ``tokens_used``, ``final_skill_excerpt`` and
        ``trace`` (one row per night, night 0 being the baseline).
    """
    from skillopt_sleep.mine import assign_splits

    make = PERSONAS.get(persona, researcher_persona)
    items = make()
    # Only a positive cap truncates; a negative value used to reach the slice
    # below and silently drop tasks from the END of the list.
    if 0 < limit_tasks < len(items):
        items = items[:limit_tasks]
    tasks = assign_splits(items, holdout_fraction=0.34, seed=seed)
    backend = get_backend(backend_name, model=model, codex_path=codex_path)
    is_mock = (backend.name == "mock")

    # start from an empty managed skill + empty memory
    skill = ensure_skill_scaffold("", name="skillopt-sleep-learned",
                                  description="Learned preferences.")
    memory = ""
    baseline = _score_holdout(backend, tasks, skill, memory)
    trace = [{"night": 0, "holdout_score": round(baseline, 4), "action": "baseline",
              "n_edits": 0}]

    for night in range(1, nights + 1):
        res = consolidate(
            backend, tasks, skill, memory,
            edit_budget=edit_budget, gate_metric="mixed", gate_mixed_weight=0.5,
            evolve_skill=True, evolve_memory=True, night=night,
        )
        # Only an accepted proposal replaces the current skill/memory.
        if res.accepted:
            skill, memory = res.new_skill, res.new_memory
        # Every night is logged, accepted or not.
        trace.append({
            "night": night,
            "holdout_score": round(res.candidate_score, 4),
            "action": res.gate_action,
            "accepted": res.accepted,
            "n_edits": len(res.applied_edits),
            "edits": [e.content for e in res.applied_edits],
            "n_rejected": len(res.rejected_edits),
        })
        # converged: stop early if perfect
        if res.candidate_score >= 0.999:
            break

    after = _score_holdout(backend, tasks, skill, memory)

    # ── gate-safety probe (mock only; it relies on the mock's known bad rule) ──
    harmful_rejected = None
    if is_mock:
        harmful_tasks = assign_splits([harmful_edit_task()] + make()[:3],
                                      holdout_fraction=0.5, seed=seed)
        # NOTE(review): this score is discarded; presumably kept as a smoke
        # check of the probe task set — confirm before removing.
        _ = _score_holdout(backend, harmful_tasks, skill, memory)
        res_h = consolidate(backend, harmful_tasks, skill, memory,
                            edit_budget=edit_budget, gate_metric="mixed",
                            evolve_skill=True, evolve_memory=False, night=nights + 1)
        harmful_rule_text = get_backend("mock").RULE_TEXT["__harmful__"]  # type: ignore[attr-defined]
        # The probe passes when the harmful rule text is absent from the
        # skill returned by consolidation.
        harmful_rejected = (harmful_rule_text not in res_h.new_skill)

    result = {
        "persona": persona,
        "backend": backend.name,
        "model": model or "(default)",
        "n_tasks": len(tasks),
        "nights_run": len(trace) - 1,  # trace[0] is the baseline row
        "baseline_holdout": round(baseline, 4),
        "after_holdout": round(after, 4),
        "lift": round(after - baseline, 4),
        "improved": after > baseline,
        "gate_blocks_harmful": harmful_rejected,  # None for real backends
        "tokens_used": backend.tokens_used(),
        "final_skill_excerpt": skill[-500:],
        "trace": trace,
    }
    return result
def _assert(cond: bool, msg: str) -> None:
    """Exit with status 1 when *cond* is falsy, otherwise do nothing.

    On failure ``FAIL: <msg>`` is printed to stdout before ``SystemExit(1)``
    is raised.
    """
    if cond:
        return
    print(f"FAIL: {msg}")
    raise SystemExit(1)
def main(argv=None) -> int:
    """Command-line entry point for the validation experiment.

    Parses the options, calls ``run`` once, and prints either the raw result
    as JSON (``--json``) or a short human-readable report with the per-night
    trace.  With ``--assert-improves`` the process exits with status 1
    (through ``_assert``) unless the held-out score improved and, when the
    gate-safety probe ran, the harmful edit was blocked.

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``;
            ``None`` makes argparse read ``sys.argv``.

    Returns:
        ``0`` when every requested check passed.
    """
    ap = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment")
    ap.add_argument("--persona", default="researcher", choices=list(PERSONAS))
    ap.add_argument("--nights", type=int, default=4)
    ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex", "copilot"])
    ap.add_argument("--model", default="", help="backend model override")
    ap.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
    ap.add_argument("--edit-budget", type=int, default=4)
    ap.add_argument("--limit-tasks", type=int, default=0, help="cap #tasks (control API cost)")
    # run() already takes a seed; exposing it lets callers vary the split
    # without editing code.  The default matches run()'s own default.
    ap.add_argument("--seed", type=int, default=42,
                    help="seed forwarded to the train/holdout split")
    ap.add_argument("--json", action="store_true")
    ap.add_argument("--assert-improves", action="store_true",
                    help="exit nonzero unless lift>0 (and, for mock, gate blocks harmful edit)")
    args = ap.parse_args(argv)

    res = run(args.persona, nights=args.nights, backend_name=args.backend,
              edit_budget=args.edit_budget, seed=args.seed, model=args.model,
              codex_path=args.codex_path, limit_tasks=args.limit_tasks)

    if args.json:
        print(json.dumps(res, ensure_ascii=False, indent=2))
    else:
        print(f"=== SkillOpt-Sleep experiment: persona={res['persona']} "
              f"backend={res['backend']} model={res['model']} ===")
        print(f"tasks: {res['n_tasks']} tokens(approx): {res['tokens_used']}")
        print(f"baseline held-out : {res['baseline_holdout']}")
        print(f"after held-out : {res['after_holdout']} (lift {res['lift']:+.4f})")
        # The probe result is None when the gate-safety probe did not run.
        if res["gate_blocks_harmful"] is not None:
            print(f"gate blocks harmful edit: {res['gate_blocks_harmful']}")
        print("trace:")
        for row in res["trace"]:
            # The baseline row has no "edits" key; long edit lists are
            # truncated to 80 characters to keep one row per line.
            edits = "; ".join(row.get("edits", []))[:80]
            print(f" night {row['night']}: holdout={row['holdout_score']} "
                  f"{row['action']} (+{row['n_edits']} edits) {edits}")

    if args.assert_improves:
        _assert(res["improved"], "held-out score did not improve")
        if res["gate_blocks_harmful"] is not None:
            _assert(res["gate_blocks_harmful"], "gate failed to block harmful edit")
            print("\nPASS: nightly consolidation improves held-out score AND gate blocks regressions.")
        else:
            print("\nPASS: nightly consolidation improves held-out score (real backend).")
    return 0
if __name__ == "__main__":
    # Script entry: hand main()'s return value to the interpreter as the
    # process exit status.
    exit_code = main()
    sys.exit(exit_code)