mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-04 06:29:56 +08:00
- Rename teacher -> optimizer, student -> target across all code, configs, docs, prompts
- CLI: --teacher_model -> --optimizer_model, --student_model -> --target_model
- Remove best_skill files, keep only initial skills
- Fix slow update gate (force write into skill)
- Fix SLOW_UPDATE marker stripping
- Remove deep_reflect and meta_reflect mechanisms
- Update .env.example with export prefix and azure_cli docs
- Add endpoint empty validation in azure_openai.py

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
452 lines
20 KiB
Python
452 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""SkillOpt eval-only: run a single skill on a dataset without training.

Usage
-----
    python scripts/eval_only.py \
        --config configs/spreadsheetbench/default.yaml \
        --skill skillopt/envs/spreadsheetbench/skills/initial.md \
        --split_dir /path/to/split \
        --out_root outputs/eval_skill0

All YAML keys can be overridden from the CLI, same as train.py.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import datetime
|
|
import json
|
|
import os
|
|
import sys
|
|
|
|
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
_PROJECT_ROOT = os.path.dirname(_SCRIPT_DIR)
|
|
if _PROJECT_ROOT not in sys.path:
|
|
sys.path.insert(0, _PROJECT_ROOT)
|
|
|
|
from skillopt.model import (
|
|
configure_azure_openai,
|
|
configure_claude_code_exec,
|
|
configure_codex_exec,
|
|
set_reasoning_effort,
|
|
set_target_backend,
|
|
set_target_deployment,
|
|
set_optimizer_backend,
|
|
set_optimizer_deployment,
|
|
)
|
|
from skillopt.model.common import default_model_for_backend, normalize_backend_name
|
|
|
|
_OPENAI_DEFAULT_MODEL_SENTINELS = {"gpt-5.4", "gpt-5.5"}
|
|
from skillopt.utils import compute_score
|
|
|
|
|
|
# ── Reuse registry from train.py ───────────────────────────────────────────
|
|
|
|
# Maps an environment name (the ``env`` config value) to its adapter class.
_ENV_REGISTRY: dict[str, type] = {}

# One row per built-in environment: (registry key, adapter module, class name).
# Adding an environment is a one-line change here.
_BUILTIN_ADAPTERS: tuple[tuple[str, str, str], ...] = (
    ("alfworld", "skillopt.envs.alfworld.adapter", "ALFWorldAdapter"),
    ("searchqa", "skillopt.envs.searchqa.adapter", "SearchQAAdapter"),
    (
        "livemathematicianbench",
        "skillopt.envs.livemathematicianbench.adapter",
        "LiveMathematicianBenchAdapter",
    ),
    ("babyvision", "skillopt.envs.babyvision.adapter", "BabyVisionAdapter"),
    (
        "spreadsheetbench",
        "skillopt.envs.spreadsheetbench.adapter",
        "SpreadsheetBenchAdapter",
    ),
    ("mmrb", "skillopt.envs.mmrb.adapter", "MMRBAdapter"),
    ("docvqa", "skillopt.envs.docvqa.adapter", "DocVQAAdapter"),
    ("mathverse", "skillopt.envs.mathverse.adapter", "MathVerseAdapter"),
    ("officeqa", "skillopt.envs.officeqa.adapter", "OfficeQAAdapter"),
    ("sealqa", "skillopt.envs.sealqa.adapter", "SealQAAdapter"),
    ("swebench", "skillopt.envs.swebench.adapter", "SWEBenchAdapter"),
)


def _register_builtins() -> None:
    """Populate ``_ENV_REGISTRY`` with every built-in adapter that can be imported.

    Registration is best-effort: an environment whose adapter module fails to
    import (``ImportError``), or whose module does not expose the expected
    class, is skipped silently so the remaining environments stay usable.
    Calling this more than once simply re-assigns the same entries.
    """
    import importlib

    for env_name, module_path, class_name in _BUILTIN_ADAPTERS:
        try:
            module = importlib.import_module(module_path)
        except ImportError:
            # Adapter (or one of its dependencies) is unavailable: skip it.
            continue
        # ``from module import Name`` would raise ImportError for a missing
        # name; skipping here keeps that same "ignore and move on" outcome.
        adapter_cls = getattr(module, class_name, None)
        if adapter_cls is not None:
            _ENV_REGISTRY[env_name] = adapter_cls
|
|
|
|
|
|
def get_adapter(cfg: dict):
    """Build the environment adapter selected by ``cfg["env"]``.

    The built-in adapters are registered first, then the adapter class is
    looked up by name (``"alfworld"`` when ``env`` is absent). Only the config
    entries whose keys match a parameter name of the adapter's ``__init__``
    are forwarded as keyword arguments.

    Raises:
        ValueError: if the requested environment is not in the registry.
    """
    import inspect

    _register_builtins()

    requested = cfg.get("env", "alfworld")
    registry = _ENV_REGISTRY
    if requested not in registry:
        raise ValueError(
            f"Unknown environment '{requested}'. "
            f"Available: {list(registry.keys())}"
        )

    adapter_cls = registry[requested]
    init_params = inspect.signature(adapter_cls.__init__).parameters

    # Forward only the config values the constructor actually names.
    init_kwargs = {}
    for param_name in init_params:
        if param_name != "self" and param_name in cfg:
            init_kwargs[param_name] = cfg[param_name]
    return adapter_cls(**init_kwargs)
|
|
|
|
|
|
# ── CLI ────────────────────────────────────────────────────────────────────
|
|
|
|
def _BOOL(x):  # noqa: N802 - name kept for the existing ``type=_BOOL`` uses
    """Return True when ``x`` spells a truthy flag: "true", "1" or "yes" (any case)."""
    return str(x).lower() in {"true", "1", "yes"}
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the eval-only command line from ``sys.argv``.

    ``--config`` and ``--skill`` are required. Every other option defaults to
    ``None`` (except ``--split`` -> ``"all"`` and ``--cfg-options`` -> ``[]``),
    so the caller can tell which flags were actually supplied.
    """
    parser = argparse.ArgumentParser(description="SkillOpt eval-only")

    def add(*names, **options):
        # Register each name as ``--name`` with identical settings.
        for name in names:
            parser.add_argument(f"--{name}", **options)

    add("config", type=str, required=True)
    add("skill", type=str, required=True,
        help="Path to skill .md file to evaluate")
    add("split", type=str, default="all",
        help="Which split to eval: train/valid_seen/valid_unseen/all (default: all)")
    add("cfg-options", nargs="+", default=[],
        help="Override config: section.key=value")

    # Legacy flat overrides
    add("env", type=str)
    add("backend", type=str,
        choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec"])
    add("optimizer_model", "target_model", "optimizer_backend", "target_backend", type=str)
    add("reasoning_effort", type=str,
        choices=["", "low", "medium", "high", "xhigh", "max"])
    add("azure_endpoint", "azure_api_version", "azure_api_key", type=str)

    # Shared, optimizer-specific and target-specific Azure OpenAI settings.
    azure_fields = (
        "endpoint",
        "api_version",
        "api_key",
        "auth_mode",
        "ad_scope",
        "managed_identity_client_id",
    )
    for scope in ("", "optimizer_", "target_"):
        for field in azure_fields:
            add(f"{scope}azure_openai_{field}", type=str)

    add("codex_exec_path", "codex_exec_sandbox", "codex_exec_profile", type=str)
    add("codex_exec_full_auto", type=_BOOL)
    add("codex_exec_reasoning_effort", "codex_exec_use_sdk", type=str)
    add("codex_exec_network_access", "codex_exec_web_search", type=_BOOL)
    add("codex_exec_approval_policy", type=str)

    add(
        "claude_code_exec_path",
        "claude_code_exec_profile",
        "claude_code_exec_use_sdk",
        "claude_code_exec_effort",
        type=str,
    )
    add("claude_code_exec_max_thinking_tokens", type=int)

    add("out_root", "data_path", type=str)
    add("split_mode", type=str, choices=["ratio", "split_dir"])
    add("split_ratio", type=str)
    add("split_seed", type=int)
    add("split_dir", "split_output_dir", "data_root", type=str)
    add("max_turns", "workers", "max_api_workers", "seed", "test_env_num", type=int)
    add("mode", type=str,
        help="SpreadsheetBench: single/multi/react (default comes from config)")

    return parser.parse_args()
|
|
|
|
|
|
def main() -> None:
    """Evaluate one skill file on a dataset split and write ``eval_summary.json``.

    Steps, in order:

    1. Parse the CLI and load the YAML config; ``--cfg-options`` entries are
       handed to the config loader.
    2. Layer the legacy flat ``--key value`` flags on top. Structured configs
       receive them as dotted ``section.key=value`` overrides and are then
       flattened; flat configs are updated in place.
    3. Resolve the optimizer/target backends. When a Claude backend is
       selected while the model is still one of the OpenAI default names and
       the user did not set the model explicitly, switch to the Claude
       default model.
    4. Configure the model clients, build the environment adapter, roll out
       the skill over the selected items and score the results.

    Side effects: creates the output directory, prints progress to stdout and
    writes ``eval_summary.json`` inside the output directory.
    """
    args = parse_args()

    from skillopt.config import load_config as _load, flatten_config, is_structured

    cfg = _load(args.config, overrides=args.cfg_options)
    structured = is_structured(cfg)

    # Apply legacy --key value overrides (only flags the user actually passed).
    cli = {k: v for k, v in vars(args).items()
           if v is not None and k not in ("config", "skill", "split", "cfg_options")}
    if cli:
        if structured:
            from skillopt.config import apply_overrides

            # Flags that live under ``model.`` with an unchanged key name.
            azure_fields = (
                "endpoint",
                "api_version",
                "api_key",
                "auth_mode",
                "ad_scope",
                "managed_identity_client_id",
            )
            model_keys = [
                "backend",
                "optimizer_backend",
                "target_backend",
                "reasoning_effort",
                "azure_endpoint",
                "azure_api_version",
                "azure_api_key",
                *(
                    f"{scope}azure_openai_{field}"
                    for scope in ("", "optimizer_", "target_")
                    for field in azure_fields
                ),
                "codex_exec_path",
                "codex_exec_sandbox",
                "codex_exec_profile",
                "codex_exec_full_auto",
                "codex_exec_reasoning_effort",
                "codex_exec_use_sdk",
                "codex_exec_network_access",
                "codex_exec_web_search",
                "codex_exec_approval_policy",
                "claude_code_exec_path",
                "claude_code_exec_profile",
                "claude_code_exec_use_sdk",
                "claude_code_exec_effort",
                "claude_code_exec_max_thinking_tokens",
            ]
            _MAP = {key: f"model.{key}" for key in model_keys}
            # Flags whose dotted name differs from the flat flag name.
            _MAP.update({
                "optimizer_model": "model.optimizer",
                "target_model": "model.target",
                "seed": "train.seed",
                "test_env_num": "evaluation.test_env_num",
                "env": "env.name",
                "out_root": "env.out_root",
            })

            mapped = []
            for k, v in cli.items():
                # Anything not listed above is treated as an ``env.`` setting.
                dotted = _MAP.get(k, f"env.{k}")
                mapped.append(f"{dotted}={v}")
            apply_overrides(cfg, mapped)
        else:
            cfg.update(cli)

    cfg = flatten_config(cfg) if structured else cfg

    # Back-fill the ``azure_openai_*`` keys from the older ``azure_*`` names.
    for new_key, old_key in (
        ("azure_openai_endpoint", "azure_endpoint"),
        ("azure_openai_api_version", "azure_api_version"),
        ("azure_openai_api_key", "azure_api_key"),
    ):
        if cfg.get(new_key) in (None, "") and cfg.get(old_key) not in (None, ""):
            cfg[new_key] = cfg[old_key]

    # A backend counts as "explicit" only when given on the command line,
    # either as --backend or as a ``model.backend=...`` entry in --cfg-options.
    explicit_backend = getattr(args, "backend", None)
    if explicit_backend is None:
        for option in args.cfg_options or []:
            key, sep, value = str(option).partition("=")
            # ``sep`` is empty for an entry without "="; such an entry carries
            # no value, so it is ignored instead of raising IndexError.
            if sep and key.strip() == "model.backend":
                explicit_backend = value.strip()
                break

    backend = normalize_backend_name(cfg.get("model_backend") or cfg.get("target_backend") or "azure_openai")

    def _has_model_override(dotted_key: str, legacy_key: str) -> bool:
        """Return True if the user set this model via a flag or --cfg-options."""
        if getattr(args, legacy_key, None) is not None:
            return True
        return any(
            str(option).partition("=")[0].strip() == dotted_key
            for option in args.cfg_options or []
        )

    if explicit_backend is not None:
        backend = normalize_backend_name(explicit_backend)
        cfg["model_backend"] = backend
        # ``setdefault`` keeps any optimizer/target backend already in cfg.
        if backend in {"claude", "claude_chat"}:
            cfg.setdefault("optimizer_backend", "claude_chat")
            cfg.setdefault("target_backend", "claude_chat")
        elif backend in {"codex", "codex_exec"}:
            cfg.setdefault("optimizer_backend", "openai_chat")
            cfg.setdefault("target_backend", "codex_exec")
        elif backend == "claude_code_exec":
            cfg.setdefault("optimizer_backend", "openai_chat")
            cfg.setdefault("target_backend", "claude_code_exec")
        else:
            cfg.setdefault("optimizer_backend", "openai_chat")
            cfg.setdefault("target_backend", "openai_chat")
    else:
        cfg.setdefault("optimizer_backend", "openai_chat")
        cfg.setdefault("target_backend", "openai_chat")

    # A Claude backend paired with an untouched OpenAI default model name
    # gets the Claude default model instead.
    if cfg.get("optimizer_backend") == "claude_chat":
        if (
            str(cfg.get("optimizer_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
            and not _has_model_override("model.optimizer", "optimizer_model")
        ):
            cfg["optimizer_model"] = default_model_for_backend("claude_chat")
    if cfg.get("target_backend") in {"claude_chat", "claude_code_exec"}:
        if (
            str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
            and not _has_model_override("model.target", "target_model")
        ):
            cfg["target_model"] = default_model_for_backend("claude_chat")

    if not cfg.get("out_root"):
        env = cfg.get("env", "unknown")
        # ``or "unknown"`` also covers a key that is present but None/empty,
        # which previously crashed on ``None.replace``.
        model = str(cfg.get("target_model") or "unknown").replace("/", "-")
        ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        cfg["out_root"] = os.path.join("outputs", f"eval_{env}_{model}_{ts}")

    cfg["out_root"] = os.path.abspath(cfg["out_root"])

    out_root = cfg["out_root"]
    os.makedirs(out_root, exist_ok=True)

    # Load skill. Read as UTF-8 explicitly so the result does not depend on
    # the platform's locale encoding.
    skill_path = os.path.abspath(args.skill)
    with open(skill_path, encoding="utf-8") as f:
        skill_content = f.read()
    print(f" [skill] {skill_path} ({len(skill_content)} chars)")

    # Configure models
    configure_azure_openai(
        endpoint=(cfg.get("azure_openai_endpoint") or cfg.get("azure_endpoint") or None),
        api_version=(cfg.get("azure_openai_api_version") or cfg.get("azure_api_version") or None),
        api_key=(cfg.get("azure_openai_api_key") or cfg.get("azure_api_key") or None),
        auth_mode=cfg.get("azure_openai_auth_mode") or None,
        ad_scope=cfg.get("azure_openai_ad_scope") or None,
        managed_identity_client_id=cfg.get("azure_openai_managed_identity_client_id") or None,
        optimizer_endpoint=cfg.get("optimizer_azure_openai_endpoint") or None,
        optimizer_api_version=cfg.get("optimizer_azure_openai_api_version") or None,
        optimizer_api_key=cfg.get("optimizer_azure_openai_api_key") or None,
        optimizer_auth_mode=cfg.get("optimizer_azure_openai_auth_mode") or None,
        optimizer_ad_scope=cfg.get("optimizer_azure_openai_ad_scope") or None,
        optimizer_managed_identity_client_id=(
            cfg.get("optimizer_azure_openai_managed_identity_client_id") or None
        ),
        target_endpoint=cfg.get("target_azure_openai_endpoint") or None,
        target_api_version=cfg.get("target_azure_openai_api_version") or None,
        target_api_key=cfg.get("target_azure_openai_api_key") or None,
        target_auth_mode=cfg.get("target_azure_openai_auth_mode") or None,
        target_ad_scope=cfg.get("target_azure_openai_ad_scope") or None,
        target_managed_identity_client_id=(
            cfg.get("target_azure_openai_managed_identity_client_id") or None
        ),
    )
    set_optimizer_backend(cfg.get("optimizer_backend", "openai_chat"))
    set_target_backend(cfg.get("target_backend", "openai_chat"))
    set_optimizer_deployment(cfg.get("optimizer_model", default_model_for_backend(backend)))
    set_target_deployment(cfg.get("target_model", default_model_for_backend(backend)))
    configure_codex_exec(
        path=cfg.get("codex_exec_path", "codex"),
        sandbox=cfg.get("codex_exec_sandbox", "workspace-write"),
        profile=cfg.get("codex_exec_profile", ""),
        full_auto=cfg.get("codex_exec_full_auto", False),
        reasoning_effort=cfg.get("codex_exec_reasoning_effort", "none"),
        use_sdk=cfg.get("codex_exec_use_sdk", None),
        network_access=cfg.get("codex_exec_network_access", False),
        web_search=cfg.get("codex_exec_web_search", False),
        approval_policy=cfg.get("codex_exec_approval_policy", "never"),
    )
    configure_claude_code_exec(
        path=cfg.get("claude_code_exec_path", "claude"),
        profile=cfg.get("claude_code_exec_profile", ""),
        use_sdk=cfg.get("claude_code_exec_use_sdk", None),
        effort=cfg.get("claude_code_exec_effort", cfg.get("reasoning_effort", "medium")),
        max_thinking_tokens=cfg.get("claude_code_exec_max_thinking_tokens", 16384),
    )
    set_reasoning_effort(cfg.get("reasoning_effort", "") or None)

    # Build adapter
    adapter = get_adapter(cfg)
    adapter.setup(cfg)

    seed = cfg.get("seed", 42)
    split = args.split or "all"

    if split == "all":
        # "all" concatenates the three named splits, each built with env_num 0.
        items = (
            adapter.build_eval_env(0, "train", seed)
            + adapter.build_eval_env(0, "valid_seen", seed)
            + adapter.build_eval_env(0, "valid_unseen", seed)
        )
    else:
        env_num = cfg.get("test_env_num", 0)
        items = adapter.build_eval_env(env_num, split, seed)

    print(f"\n [eval] split={split} items={len(items)}")
    print(f" [eval] out_root={out_root}")
    print(f"{'='*60}")

    # Run rollout
    results = adapter.rollout(items, skill_content, out_root)

    # Score
    hard, soft = compute_score(results)
    print(f"\n{'='*60}")
    print(f" Results: hard={hard:.4f} soft={soft:.4f} (n={len(results)})")
    print(f"{'='*60}")

    # Save summary
    summary = {
        "skill": skill_path,
        "split": split,
        "n_items": len(results),
        "hard": hard,
        "soft": soft,
    }
    # ``ensure_ascii=False`` can emit non-ASCII characters (e.g. from the
    # skill path), so the file must be opened as UTF-8 explicitly.
    with open(os.path.join(out_root, "eval_summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print(f" Saved to: {out_root}")


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|