mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
feat(slow-update): add config-controlled gated / force-injected modes
Add optimizer.slow_update_gate_with_selection to control how epoch-boundary slow-update guidance is applied:

- false (default): force-injected — the guidance is injected into both the current and best skills unconditionally (unchanged behavior).
- true: gated — the slow-update candidate is evaluated on the selection set and accepted or rejected via the same validation gate used for step-level updates (logic follows the SkillReflection ablation).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -66,6 +66,7 @@ optimizer:
|
||||
skill_update_mode: patch # patch / rewrite_from_suggestions / full_rewrite_minibatch
|
||||
use_slow_update: true
|
||||
slow_update_samples: 20
|
||||
slow_update_gate_with_selection: false
|
||||
longitudinal_pair_policy: mixed # mixed / changed / unchanged
|
||||
use_meta_skill: true
|
||||
|
||||
|
||||
@@ -98,6 +98,7 @@ _FLATTEN_MAP: dict[str, str] = {
|
||||
"optimizer.meta_learning_rate": "meta_edit_budget",
|
||||
"optimizer.use_slow_update": "use_slow_update",
|
||||
"optimizer.slow_update_samples": "slow_update_samples",
|
||||
"optimizer.slow_update_gate_with_selection": "slow_update_gate_with_selection",
|
||||
"optimizer.longitudinal_pair_policy": "longitudinal_pair_policy",
|
||||
"optimizer.use_meta_skill": "use_meta_skill",
|
||||
"evaluation.use_gate": "use_gate",
|
||||
|
||||
@@ -865,6 +865,15 @@ class ReflACTTrainer:
|
||||
else ""
|
||||
)
|
||||
)
|
||||
slow_gate_with_selection = bool(
|
||||
cfg.get("slow_update_gate_with_selection", False)
|
||||
)
|
||||
print(
|
||||
" [slow update] acceptance="
|
||||
+ ("gated (selection-set validation)"
|
||||
if slow_gate_with_selection
|
||||
else "force-accept (unconditional)")
|
||||
)
|
||||
if current_score < 0:
|
||||
print(f"\n{'='*60}")
|
||||
print(" BASELINE — evaluate initial skill on Selection set (valid_seen)")
|
||||
@@ -1468,17 +1477,27 @@ class ReflACTTrainer:
|
||||
epoch_comparison_pairs = None
|
||||
if (
|
||||
slow_saved.get("slow_update_content")
|
||||
and slow_saved.get("action") in {
|
||||
"accept", "accept_new_best", "force_accept",
|
||||
}
|
||||
and epoch >= 2
|
||||
):
|
||||
current_skill = replace_slow_update_field(
|
||||
current_skill, slow_saved["slow_update_content"],
|
||||
)
|
||||
best_skill = replace_slow_update_field(
|
||||
best_skill, slow_saved["slow_update_content"],
|
||||
)
|
||||
action = slow_saved.get("action")
|
||||
if slow_gate_with_selection:
|
||||
# Gated mode (follow SkillReflection): re-apply the
|
||||
# guidance to current_skill only when it was accepted.
|
||||
if action in {"accept", "accept_new_best"}:
|
||||
current_skill = replace_slow_update_field(
|
||||
current_skill,
|
||||
slow_saved["slow_update_content"],
|
||||
)
|
||||
elif action in {
|
||||
"accept", "accept_new_best", "force_accept",
|
||||
}:
|
||||
# Force-accept mode: re-apply to both current & best.
|
||||
current_skill = replace_slow_update_field(
|
||||
current_skill, slow_saved["slow_update_content"],
|
||||
)
|
||||
best_skill = replace_slow_update_field(
|
||||
best_skill, slow_saved["slow_update_content"],
|
||||
)
|
||||
elif epoch == 1:
|
||||
# Epoch 1: inject empty placeholder
|
||||
os.makedirs(slow_dir, exist_ok=True)
|
||||
@@ -1618,31 +1637,119 @@ class ReflACTTrainer:
|
||||
"observed across adjacent epochs."
|
||||
)
|
||||
|
||||
# Slow update field is force-updated into both
|
||||
# current_skill and best_skill unconditionally.
|
||||
# The epoch-level longitudinal guidance should always
|
||||
# persist — it must not be gated by step-level
|
||||
# selection scores.
|
||||
slow_content = slow_result["slow_update_content"]
|
||||
current_skill = replace_slow_update_field(
|
||||
current_skill, slow_content,
|
||||
)
|
||||
best_skill = replace_slow_update_field(
|
||||
best_skill, slow_content,
|
||||
)
|
||||
# Update caches so downstream steps use the
|
||||
# slow-update-injected skill for hashing.
|
||||
slow_candidate_hash = skill_hash(current_skill)
|
||||
sel_cache[slow_candidate_hash] = (current_score, 0.0)
|
||||
# Slow update acceptance — two modes selected via
|
||||
# `optimizer.slow_update_gate_with_selection`.
|
||||
if slow_gate_with_selection:
|
||||
# ── Gated mode (follow SkillReflection) ──────────
|
||||
# Evaluate the slow-update candidate on the
|
||||
# selection set and accept/reject via the same
|
||||
# validation gate used for step-level updates.
|
||||
if slow_candidate_hash in sel_cache:
|
||||
slow_sel_hard, slow_sel_soft = sel_cache[
|
||||
slow_candidate_hash
|
||||
]
|
||||
print(
|
||||
f" [slow gate] cache hit: "
|
||||
f"hard={slow_sel_hard:.4f}"
|
||||
)
|
||||
else:
|
||||
sel_env, sel_n = _build_eval_env(
|
||||
split="valid_seen",
|
||||
env_num=cfg["sel_env_num"],
|
||||
seed=seed,
|
||||
)
|
||||
print(f" [slow gate] selection items={sel_n}")
|
||||
slow_eval_dir = os.path.join(
|
||||
slow_dir, "selection_eval",
|
||||
)
|
||||
slow_eval_results = adapter.rollout(
|
||||
sel_env, slow_candidate, slow_eval_dir,
|
||||
)
|
||||
slow_sel_hard, slow_sel_soft = compute_score(
|
||||
slow_eval_results
|
||||
)
|
||||
sel_cache[slow_candidate_hash] = (
|
||||
slow_sel_hard, slow_sel_soft,
|
||||
)
|
||||
|
||||
slow_result["action"] = "force_accept"
|
||||
current_origin = f"slow_update_epoch_{epoch:02d}"
|
||||
slow_gate = evaluate_gate(
|
||||
candidate_skill=slow_candidate,
|
||||
cand_hard=slow_sel_hard,
|
||||
current_skill=current_skill,
|
||||
current_score=current_score,
|
||||
best_skill=best_skill,
|
||||
best_score=best_score,
|
||||
best_step=best_step,
|
||||
global_step=global_step,
|
||||
cand_soft=slow_sel_soft,
|
||||
metric=gate_metric,
|
||||
mixed_weight=gate_mixed_weight,
|
||||
)
|
||||
slow_result["selection_hard"] = slow_sel_hard
|
||||
slow_result["selection_soft"] = slow_sel_soft
|
||||
slow_result["action"] = slow_gate.action
|
||||
prev_current = current_score
|
||||
prev_best = best_score
|
||||
current_skill = slow_gate.current_skill
|
||||
current_score = slow_gate.current_score
|
||||
best_skill = slow_gate.best_skill
|
||||
best_score = slow_gate.best_score
|
||||
best_step = slow_gate.best_step
|
||||
if slow_gate.action in {"accept", "accept_new_best"}:
|
||||
current_origin = (
|
||||
f"slow_update_epoch_{epoch:02d}"
|
||||
)
|
||||
if slow_gate.action == "accept_new_best":
|
||||
best_origin = current_origin
|
||||
print(
|
||||
f" [slow gate] ACCEPT (new best) "
|
||||
f"hard={slow_sel_hard:.4f} > "
|
||||
f"prev best {prev_best:.4f}"
|
||||
)
|
||||
elif slow_gate.action == "accept":
|
||||
print(
|
||||
f" [slow gate] ACCEPT "
|
||||
f"hard={slow_sel_hard:.4f} > "
|
||||
f"current={prev_current:.4f}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f" [slow gate] REJECT "
|
||||
f"hard={slow_sel_hard:.4f} <= "
|
||||
f"current={current_score:.4f}"
|
||||
)
|
||||
print(
|
||||
f" [slow update] guidance written "
|
||||
f"({len(slow_result['slow_update_content'])} "
|
||||
f"chars), {slow_time}s"
|
||||
)
|
||||
else:
|
||||
# ── Force-accept mode (default) ──────────────────
|
||||
# The epoch-level longitudinal guidance is injected
|
||||
# into both current_skill and best_skill
|
||||
# unconditionally — it must not be gated by
|
||||
# step-level selection scores.
|
||||
slow_content = slow_result["slow_update_content"]
|
||||
current_skill = replace_slow_update_field(
|
||||
current_skill, slow_content,
|
||||
)
|
||||
best_skill = replace_slow_update_field(
|
||||
best_skill, slow_content,
|
||||
)
|
||||
# Update caches so downstream steps use the
|
||||
# slow-update-injected skill for hashing.
|
||||
slow_candidate_hash = skill_hash(current_skill)
|
||||
sel_cache[slow_candidate_hash] = (current_score, 0.0)
|
||||
|
||||
print(
|
||||
f" [slow update] force-injected into current & best "
|
||||
f"({len(slow_content)} chars), "
|
||||
f"{slow_time}s"
|
||||
)
|
||||
slow_result["action"] = "force_accept"
|
||||
current_origin = f"slow_update_epoch_{epoch:02d}"
|
||||
|
||||
print(
|
||||
f" [slow update] force-injected into "
|
||||
f"current & best "
|
||||
f"({len(slow_content)} chars), "
|
||||
f"{slow_time}s"
|
||||
)
|
||||
else:
|
||||
slow_result = slow_result or {}
|
||||
slow_result["action"] = "no_content"
|
||||
|
||||
@@ -41,14 +41,6 @@ def run_meta_skill(
|
||||
"""Produce updated optimizer-side meta skill from adjacent epochs."""
|
||||
actual_system = system_prompt if system_prompt is not None else load_prompt("meta_skill")
|
||||
|
||||
prev_skill_display = prev_skill
|
||||
if len(prev_skill_display) > 6000:
|
||||
prev_skill_display = prev_skill_display[:6000] + "\n...[truncated]..."
|
||||
|
||||
curr_skill_display = curr_skill
|
||||
if len(curr_skill_display) > 6000:
|
||||
curr_skill_display = curr_skill_display[:6000] + "\n...[truncated]..."
|
||||
|
||||
prev_meta_section = (
|
||||
prev_meta_skill_content.strip()
|
||||
if prev_meta_skill_content and prev_meta_skill_content.strip()
|
||||
@@ -57,8 +49,8 @@ def run_meta_skill(
|
||||
|
||||
comparison_text = format_comparison_text(comparison_pairs)
|
||||
user = (
|
||||
f"## Previous Epoch Last-Step Skill\n{prev_skill_display}\n\n"
|
||||
f"## Current Epoch Last-Step Skill\n{curr_skill_display}\n\n"
|
||||
f"## Previous Epoch Last-Step Skill\n{prev_skill}\n\n"
|
||||
f"## Current Epoch Last-Step Skill\n{curr_skill}\n\n"
|
||||
f"## Previous Optimizer Meta Skill\n"
|
||||
f"The following optimizer memory was available during the current epoch. "
|
||||
f"Reflect on whether it improved or harmed the quality of edits.\n\n"
|
||||
@@ -71,7 +63,7 @@ def run_meta_skill(
|
||||
response, _ = chat_optimizer(
|
||||
system=actual_system,
|
||||
user=user,
|
||||
max_completion_tokens=3072,
|
||||
max_completion_tokens=16384,
|
||||
retries=3,
|
||||
stage="meta_skill",
|
||||
)
|
||||
|
||||
@@ -91,6 +91,11 @@ def replace_slow_update_field(skill: str, new_content: str) -> str:
|
||||
# ── Comparison text builder ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
# NOTE: The character limits below (whole-trajectory cap + the per-field caps in
|
||||
# _read_trajectory and the comparison metadata) only trim the comparison samples
|
||||
# fed to the slow-update optimizer. They exist to cut token usage and speed up the
|
||||
# call; they do NOT affect what gets written into the skill. If you need richer
|
||||
# context for the longitudinal comparison, feel free to raise them.
|
||||
_MAX_TRAJ_CHARS = 3000
|
||||
|
||||
|
||||
@@ -117,6 +122,8 @@ def _read_trajectory(rollout_dir: str, task_id: str) -> str:
|
||||
for entry in conversation:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
# Per-field caps (cmd/obs/reasoning/etc.) keep each trajectory compact to
|
||||
# save tokens / time; raise them if you want fuller step detail.
|
||||
if entry.get("type") == "tool_call":
|
||||
cmd = _clip_text(entry.get("cmd"), 500)
|
||||
obs = _clip_text(entry.get("obs"), 800)
|
||||
@@ -352,10 +359,6 @@ def run_slow_update(
|
||||
)
|
||||
comparison_text = format_comparison_text(pairs)
|
||||
|
||||
prev_skill_display = prev_skill
|
||||
if len(prev_skill_display) > 6000:
|
||||
prev_skill_display = prev_skill_display[:6000] + "\n...[truncated]..."
|
||||
|
||||
prev_guidance_section = (
|
||||
prev_slow_update_content.strip()
|
||||
if prev_slow_update_content and prev_slow_update_content.strip()
|
||||
@@ -363,7 +366,7 @@ def run_slow_update(
|
||||
)
|
||||
|
||||
user = (
|
||||
f"## Previous Epoch's Skill\n{prev_skill_display}\n\n"
|
||||
f"## Previous Epoch's Skill\n{prev_skill}\n\n"
|
||||
f"## Current Epoch's Skill\n{skill_content}\n\n"
|
||||
f"## Previous Slow Update Guidance\n"
|
||||
f"The following guidance was active during the current epoch. "
|
||||
@@ -377,7 +380,7 @@ def run_slow_update(
|
||||
response, _ = chat_optimizer(
|
||||
system=actual_system,
|
||||
user=user,
|
||||
max_completion_tokens=4096,
|
||||
max_completion_tokens=16384,
|
||||
retries=3,
|
||||
stage="slow_update",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user