mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
Add CopilotCliBackend that drives the GitHub Copilot CLI in non-interactive mode (copilot -p ... --output-format json) and parses the JSONL event stream for assistant.message content. Registered as the 'copilot' backend (with aliases) and wired through the CLI, config, experiment harness, and the Copilot MCP server's backend enum. - Force UTF-8 decoding of CLI output (fixes cp1252 UnicodeDecodeError on Windows when responses contain non-cp1252 bytes). - Minimise per-call startup: isolated COPILOT_HOME with built-in MCPs and custom instructions disabled, so user MCP servers are not spawned per call (~5x faster: 36s -> 7.4s). Override via SKILLOPT_SLEEP_COPILOT_HOME / SKILLOPT_SLEEP_COPILOT_MODEL / SKILLOPT_SLEEP_COPILOT_FULL_ENV. Validated end-to-end on real held-out tasks (researcher persona: 0.42 -> 1.00 lift; gate correctly rejects non-improving edits). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
179 lines
7.6 KiB
Python
179 lines
7.6 KiB
Python
"""SkillOpt-Sleep — validation experiment.
|
|
|
|
Answers the question the user posed: *does nightly offline self-evolution
|
|
actually improve the agent?* Runs deterministically with the MockBackend
|
|
(no API key, reproducible) and is the acceptance test for the whole idea.
|
|
|
|
What it proves:
|
|
1. MONOTONIC LIFT — over N sleep nights, the held-out score rises from a
|
|
baseline (empty skill/memory) toward 1.0 as the gate accepts the
|
|
general rules the persona's tasks require.
|
|
2. GATE SAFETY — an injected harmful edit is REJECTED (held-out score does
|
|
not improve), so a bad nightly proposal can never be adopted.
|
|
3. PLUMBING — harvest->mine->replay->consolidate->stage->adopt all run and
|
|
the adopted artifact, re-scored, retains the lift.
|
|
|
|
Run:
|
|
python -m skillopt_sleep.experiments.run_experiment
|
|
python -m skillopt_sleep.experiments.run_experiment --persona programmer --nights 3
|
|
python -m skillopt_sleep.experiments.run_experiment --backend claude  # real lift
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
from typing import List
|
|
|
|
from skillopt_sleep.backend import get_backend
|
|
from skillopt_sleep.consolidate import consolidate
|
|
from skillopt_sleep.experiments.personas import (
|
|
PERSONAS,
|
|
harmful_edit_task,
|
|
researcher_persona,
|
|
)
|
|
from skillopt_sleep.memory import ensure_skill_scaffold
|
|
from skillopt_sleep.replay import aggregate_scores, replay_batch
|
|
from skillopt_sleep.types import TaskRecord
|
|
|
|
|
|
def _score_holdout(backend, tasks: List[TaskRecord], skill: str, memory: str,
                   metric: str = "mixed", w: float = 0.5) -> float:
    """Replay the held-out tasks and reduce the outcome to one gate score.

    Tasks whose ``split`` is ``"val"`` or ``"holdout"`` are replayed with the
    given skill and memory. When no task carries either split, every task is
    replayed instead.

    Args:
        backend: backend object handed straight to ``replay_batch``.
        tasks: candidate tasks; only the held-out subset is scored.
        skill: skill text passed to the replay.
        memory: memory text passed to the replay.
        metric: metric name forwarded to ``select_gate_score``.
        w: weight forwarded to ``select_gate_score``.

    Returns:
        Whatever ``select_gate_score`` yields for the aggregated scores.
    """
    from skillopt_sleep.consolidate import select_gate_score

    # The persona experiment uses a 2-way split (train/val, no test), so the
    # validation tasks act as the held-out set.
    held_out = [task for task in tasks if task.split in ("val", "holdout")]
    if not held_out:
        held_out = tasks

    replayed = replay_batch(backend, held_out, skill, memory)
    score_h, score_s = aggregate_scores(replayed)
    return select_gate_score(score_h, score_s, metric, w)
|
|
|
|
|
|
def run(persona: str = "researcher", nights: int = 4, backend_name: str = "mock",
        edit_budget: int = 4, seed: int = 42, model: str = "", codex_path: str = "",
        limit_tasks: int = 0) -> dict:
    """Run the sleep-night experiment for one persona and summarise it.

    Steps, in order: build the persona's tasks and split them, score an empty
    skill/memory as the baseline, call ``consolidate`` once per night (keeping
    the new skill/memory only when the result is accepted), re-score the final
    skill/memory, and - for the mock backend only - probe whether a harmful
    rule ends up in the skill.

    Args:
        persona: key into ``PERSONAS``; unknown keys use ``researcher_persona``.
        nights: maximum number of consolidation nights.
        backend_name: name handed to ``get_backend``.
        edit_budget: ``edit_budget`` forwarded to ``consolidate``.
        seed: seed forwarded to ``assign_splits``.
        model: model override forwarded to ``get_backend``.
        codex_path: codex binary path forwarded to ``get_backend``.
        limit_tasks: when non-zero and smaller than the task count, keep only
            the first ``limit_tasks`` tasks.

    Returns:
        A dict with the keys ``persona``, ``backend``, ``model``, ``n_tasks``,
        ``nights_run``, ``baseline_holdout``, ``after_holdout``, ``lift``,
        ``improved``, ``gate_blocks_harmful`` (``None`` unless the backend is
        the mock), ``tokens_used``, ``final_skill_excerpt`` and ``trace``.
    """
    from skillopt_sleep.mine import assign_splits

    build_persona = PERSONAS.get(persona, researcher_persona)
    records = build_persona()
    if limit_tasks and len(records) > limit_tasks:
        records = records[:limit_tasks]
    tasks = assign_splits(records, holdout_fraction=0.34, seed=seed)

    backend = get_backend(backend_name, model=model, codex_path=codex_path)
    is_mock = backend.name == "mock"

    # Start from an empty managed skill and an empty memory.
    skill = ensure_skill_scaffold(
        "", name="skillopt-sleep-learned", description="Learned preferences.")
    memory = ""

    baseline = _score_holdout(backend, tasks, skill, memory)
    trace = [{
        "night": 0,
        "holdout_score": round(baseline, 4),
        "action": "baseline",
        "n_edits": 0,
    }]

    for night_no in range(1, nights + 1):
        outcome = consolidate(
            backend, tasks, skill, memory,
            edit_budget=edit_budget, gate_metric="mixed", gate_mixed_weight=0.5,
            evolve_skill=True, evolve_memory=True, night=night_no,
        )
        if outcome.accepted:
            skill, memory = outcome.new_skill, outcome.new_memory

        night_row = {
            "night": night_no,
            "holdout_score": round(outcome.candidate_score, 4),
            "action": outcome.gate_action,
            "accepted": outcome.accepted,
            "n_edits": len(outcome.applied_edits),
            "edits": [edit.content for edit in outcome.applied_edits],
            "n_rejected": len(outcome.rejected_edits),
        }
        trace.append(night_row)

        # Converged: a (near-)perfect candidate score ends the run early.
        if outcome.candidate_score >= 0.999:
            break

    after = _score_holdout(backend, tasks, skill, memory)

    # Gate-safety probe (mock only; it relies on the mock's known bad rule).
    harmful_rejected = None
    if is_mock:
        probe_items = [harmful_edit_task()] + build_persona()[:3]
        harmful_tasks = assign_splits(probe_items, holdout_fraction=0.5, seed=seed)
        # The score itself is discarded here.
        # NOTE(review): confirm whether this extra replay is still needed.
        _score_holdout(backend, harmful_tasks, skill, memory)
        probe = consolidate(
            backend, harmful_tasks, skill, memory,
            edit_budget=edit_budget, gate_metric="mixed",
            evolve_skill=True, evolve_memory=False, night=nights + 1,
        )
        harmful_rule_text = get_backend("mock").RULE_TEXT["__harmful__"]  # type: ignore[attr-defined]
        harmful_rejected = harmful_rule_text not in probe.new_skill

    return {
        "persona": persona,
        "backend": backend.name,
        "model": model or "(default)",
        "n_tasks": len(tasks),
        "nights_run": len(trace) - 1,  # the night-0 baseline row is not a night
        "baseline_holdout": round(baseline, 4),
        "after_holdout": round(after, 4),
        "lift": round(after - baseline, 4),
        "improved": after > baseline,
        "gate_blocks_harmful": harmful_rejected,  # None for real backends
        "tokens_used": backend.tokens_used(),
        "final_skill_excerpt": skill[-500:],
        "trace": trace,
    }
|
|
|
|
|
|
def _assert(cond: bool, msg: str) -> None:
|
|
if not cond:
|
|
print(f"FAIL: {msg}")
|
|
raise SystemExit(1)
|
|
|
|
|
|
def main(argv=None) -> int:
    """Command-line entry point for the validation experiment.

    Parses the options, runs the experiment once via ``run``, prints the
    result (JSON with ``--json``, otherwise a short report plus the per-night
    trace) and, with ``--assert-improves``, checks the outcome.

    Args:
        argv: argument list for argparse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        ``0`` when the run (and any requested assertions) completed.

    Raises:
        SystemExit: with code 1 when ``--assert-improves`` is set and a check
            fails; argparse also raises it for invalid arguments.
    """
    ap = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment")
    ap.add_argument("--persona", default="researcher", choices=list(PERSONAS.keys()))
    ap.add_argument("--nights", type=int, default=4)
    ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex", "copilot"])
    ap.add_argument("--model", default="", help="backend model override")
    ap.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
    ap.add_argument("--edit-budget", type=int, default=4)
    ap.add_argument("--limit-tasks", type=int, default=0, help="cap #tasks (control API cost)")
    # run() already accepts a seed; expose it so a run can be reproduced or
    # varied from the command line. The default matches run()'s own default.
    ap.add_argument("--seed", type=int, default=42,
                    help="seed forwarded to the experiment (default: 42)")
    ap.add_argument("--json", action="store_true")
    ap.add_argument("--assert-improves", action="store_true",
                    help="exit nonzero unless lift>0 (and, for mock, gate blocks harmful edit)")
    args = ap.parse_args(argv)

    res = run(args.persona, nights=args.nights, backend_name=args.backend,
              edit_budget=args.edit_budget, seed=args.seed, model=args.model,
              codex_path=args.codex_path, limit_tasks=args.limit_tasks)

    if args.json:
        print(json.dumps(res, ensure_ascii=False, indent=2))
    else:
        print(f"=== SkillOpt-Sleep experiment: persona={res['persona']} "
              f"backend={res['backend']} model={res['model']} ===")
        print(f"tasks: {res['n_tasks']} tokens(approx): {res['tokens_used']}")
        print(f"baseline held-out : {res['baseline_holdout']}")
        print(f"after held-out : {res['after_holdout']} (lift {res['lift']:+.4f})")
        # The probe result is None for non-mock backends; nothing to report then.
        if res["gate_blocks_harmful"] is not None:
            print(f"gate blocks harmful edit: {res['gate_blocks_harmful']}")
        print("trace:")
        for row in res["trace"]:
            # The baseline row has no "edits" key; the summary is capped at 80 chars.
            edits = "; ".join(row.get("edits", []))[:80]
            print(f" night {row['night']}: holdout={row['holdout_score']} "
                  f"{row['action']} (+{row['n_edits']} edits) {edits}")

    if args.assert_improves:
        _assert(res["improved"], "held-out score did not improve")
        if res["gate_blocks_harmful"] is not None:
            _assert(res["gate_blocks_harmful"], "gate failed to block harmful edit")
            print("\nPASS: nightly consolidation improves held-out score AND gate blocks regressions.")
        else:
            print("\nPASS: nightly consolidation improves held-out score (real backend).")
    return 0
|
|
|
|
|
|
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|