mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
The shipped env_template.py and loader_template.py described the same fictional async execute / evaluate / build_prompt API documented in docs/reference/api.md. As a result TemplateBenchmarkEnv(cfg) raised 'TypeError: Can't instantiate abstract class' for every copy-and-paste user who followed the in-tree scaffold. Rewrite the template so it's a working starting point: - env_template.py: TemplateBenchmarkEnv(EnvAdapter) now implements all five real abstract methods (build_train_env, build_eval_env, rollout, reflect, get_task_types) with no-op defaults documented as TODO. Instantiable today; pytest 60/60 still passes. - loader_template.py: TemplateBenchmarkLoader(SplitDataLoader) implements load_split_items for .json / .jsonl input and explains the optional load_raw_items override for split_mode="ratio". - README.md: usage steps now point at scripts/train.py's _ENV_REGISTRY (the real registry) instead of a non-existent BENCHMARK_REGISTRY in skillopt/envs/__init__.py, and link to the rewritten new-benchmark guide. - config_template.yaml: _base_ is a string path (not a list, which the loader rejects); skill_init is commented out with a note so the template config doesn't reference a file the user hasn't created. Verified locally: 'from skillopt.envs._template.env_template import TemplateBenchmarkEnv; TemplateBenchmarkEnv()' succeeds. Refs microsoft/SkillOpt#30. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
197 lines
7.3 KiB
Python
197 lines
7.3 KiB
Python
"""
|
|
Benchmark Environment Template
|
|
===============================
|
|
Copy this file and implement the TODO sections to add a new benchmark.
|
|
|
|
The EnvAdapter is responsible for:
|
|
1. Building per-batch environment managers (train and eval splits).
|
|
2. Running rollouts under the current skill document.
|
|
3. Reflecting on those rollouts into raw patch dicts.
|
|
4. Reporting the distinct task types in your data (for stratified
|
|
sampling).
|
|
|
|
For a fully worked example see ``skillopt/envs/officeqa/``.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
|
|
from skillopt.datasets.base import BatchSpec
|
|
from skillopt.envs.base import EnvAdapter
|
|
from skillopt.envs._template.loader_template import TemplateBenchmarkLoader
|
|
# When you wire in real reflection, also import:
|
|
# from skillopt.gradient.reflect import run_minibatch_reflect
|
|
|
|
|
|
class TemplateBenchmarkEnv(EnvAdapter):
    """
    Environment adapter for <Your Benchmark Name>.

    Rename this class. Each abstract method below is required by
    :class:`skillopt.envs.base.EnvAdapter`. The template implementations
    are minimal so this file is importable and instantiable; replace the
    TODOs with real logic.
    """

    def __init__(
        self,
        split_dir: str = "",
        data_path: str = "",
        split_mode: str = "split_dir",
        split_ratio: str = "2:1:7",
        split_seed: int = 42,
        split_output_dir: str = "",
        workers: int = 4,
        analyst_workers: int = 4,
        failure_only: bool = False,
        minibatch_size: int = 8,
        edit_budget: int = 4,
        seed: int = 42,
        limit: int = 0,
        max_completion_tokens: int = 4096,
    ) -> None:
        """
        Store the rollout/reflect settings and build the data loader.

        ``split_dir``, ``data_path``, ``split_mode``, ``split_ratio``,
        ``split_seed``, ``split_output_dir``, ``seed`` and ``limit`` are
        forwarded unchanged to :class:`TemplateBenchmarkLoader`; the
        remaining arguments are kept as attributes on the instance.
        """
        self.workers = workers
        self.analyst_workers = analyst_workers
        self.failure_only = failure_only
        self.minibatch_size = minibatch_size
        self.edit_budget = edit_budget
        # Coerced so a string value (e.g. from a config file) still works.
        self.max_completion_tokens = int(max_completion_tokens)
        self.dataloader = TemplateBenchmarkLoader(
            split_dir=split_dir,
            data_path=data_path,
            split_mode=split_mode,
            split_ratio=split_ratio,
            split_seed=split_seed,
            split_output_dir=split_output_dir,
            seed=seed,
            limit=limit,
        )

    # ── Lifecycle hooks ────────────────────────────────────────────────

    def setup(self, cfg: dict) -> None:
        """Run the base-class setup, then set up the data loader with ``cfg``."""
        super().setup(cfg)
        self.dataloader.setup(cfg)

    def get_dataloader(self):
        """Return the loader instance created in ``__init__``."""
        return self.dataloader

    # ── Batch → env manager ────────────────────────────────────────────

    def build_env_from_batch(self, batch: BatchSpec, **kwargs):
        """
        Turn a batch into the "env manager" handed to :meth:`rollout`.

        In this template the env manager is simply a fresh list of the
        batch payload items (an empty list when the payload is falsy).
        """
        # Dataset-backed envs typically just pass items straight through.
        return list(batch.payload or [])

    def build_train_env(self, batch_size: int, seed: int, **kwargs):
        """Build a training batch via the loader and wrap it as an env manager."""
        batch = self.dataloader.build_train_batch(
            batch_size=batch_size, seed=seed, **kwargs
        )
        return self.build_env_from_batch(batch, **kwargs)

    def build_eval_env(self, env_num: int, split: str, seed: int, **kwargs):
        """Build an evaluation batch via the loader and wrap it as an env manager."""
        batch = self.dataloader.build_eval_batch(
            env_num=env_num, split=split, seed=seed, **kwargs
        )
        return self.build_env_from_batch(batch, **kwargs)

    # ── Rollout: run episodes under current skill ──────────────────────

    def rollout(
        self,
        env_manager,
        skill_content: str,
        out_dir: str,
        **kwargs,
    ) -> list[dict]:
        """
        Run a batch of episodes under the current skill.

        TODO: replace this loop with your real rollout. For each item:
          1. Build the prompt using `skill_content` as the system message.
          2. Call your target model.
          3. Score the prediction.
          4. Return a dict with at minimum: ``id`` (str), ``hard`` (0|1),
             ``soft`` (float in [0, 1]). Add any env-specific extras you
             need for reflect() — they will be preserved on
             ``RolloutResult.extras``.

        The placeholder below ignores ``skill_content`` and ``out_dir`` and
        reports every item as a failure.
        """
        items: list[dict] = env_manager
        results: list[dict] = []
        for item in items:
            # ── REPLACE THIS BLOCK WITH YOUR REAL ROLLOUT ──
            results.append(
                {
                    "id": str(item.get("id", "")),
                    "hard": 0,
                    "soft": 0.0,
                    "predicted_answer": "",
                    "question": item.get("question", ""),
                    "fail_reason": "template rollout — not implemented",
                }
            )
        return results

    # ── Reflect: turn rollout results into patch dicts ─────────────────

    def reflect(
        self,
        results: list[dict],
        skill_content: str,
        out_dir: str,
        **kwargs,
    ) -> list[dict | None]:
        """
        Turn rollouts into a list of raw patch dicts (or None to drop).

        Each non-None dict MUST have:
          - "patch": {"edits": [...]}   a Patch.to_dict() payload
          - "source_type": "failure" | "success"

        Most benchmarks delegate to
        :func:`skillopt.gradient.reflect.run_minibatch_reflect` which
        will call the optimizer model with the
        ``analyst_error_*`` / ``analyst_success_*`` prompts. To enable it,
        uncomment the import above and call:

            from skillopt.gradient.reflect import run_minibatch_reflect
            return run_minibatch_reflect(
                results=results,
                skill_content=skill_content,
                prediction_dir=kwargs.get(
                    "prediction_dir", os.path.join(out_dir, "predictions")
                ),
                patches_dir=kwargs.get(
                    "patches_dir", os.path.join(out_dir, "patches")
                ),
                workers=self.analyst_workers,
                failure_only=self.failure_only,
                minibatch_size=self.minibatch_size,
                edit_budget=self.edit_budget,
                random_seed=kwargs.get("random_seed"),
                error_system=self.get_error_minibatch_prompt(),
                success_system=self.get_success_minibatch_prompt(),
                step_buffer_context=kwargs.get("step_buffer_context", ""),
                update_mode=getattr(self, "_cfg", {}).get(
                    "skill_update_mode", "patch"
                ),
            )
        """
        # Template default: produce no patches (no-op trainer step).
        return [None for _ in results]

    # ── Stratification hint ────────────────────────────────────────────

    def get_task_types(self) -> list[str]:
        """
        Distinct task-type strings used for stratified sampling.

        Items are scanned in train, val, test order and each type is
        reported once, in first-seen order. An item whose ``task_type`` is
        missing or falsy counts as ``"template"``. A split that is ``None``
        or empty contributes nothing, and ``["template"]`` is returned when
        no item yields a type at all.
        """
        splits = (
            self.dataloader.train_items,
            self.dataloader.val_items,
            self.dataloader.test_items,
        )
        # dict keys keep insertion order, giving an ordered de-duplication
        # in linear time without concatenating (copying) the three splits.
        ordered = dict.fromkeys(
            str(item.get("task_type") or "template")
            for split in splits
            for item in split or ()
        )
        return list(ordered) or ["template"]
|