feat(slow-update): add config-controlled gated / force-injected modes

Add optimizer.slow_update_gate_with_selection to control how epoch-boundary
slow-update guidance is applied:
- false (default): force-injected mode. The guidance is injected into both
  the current and best skills unconditionally (existing behavior, unchanged).
- true: gated mode. The slow-update candidate is evaluated on the selection
  set and accepted or rejected by the same validation gate used for
  step-level updates (logic follows the SkillReflection ablation).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Cuzyoung
2026-05-31 01:52:02 +00:00
parent 42e555d28e
commit 00602df9e9
5 changed files with 153 additions and 49 deletions

View File

@@ -66,6 +66,7 @@ optimizer:
skill_update_mode: patch # patch / rewrite_from_suggestions / full_rewrite_minibatch
use_slow_update: true
slow_update_samples: 20
slow_update_gate_with_selection: false
longitudinal_pair_policy: mixed # mixed / changed / unchanged
use_meta_skill: true

View File

@@ -98,6 +98,7 @@ _FLATTEN_MAP: dict[str, str] = {
"optimizer.meta_learning_rate": "meta_edit_budget",
"optimizer.use_slow_update": "use_slow_update",
"optimizer.slow_update_samples": "slow_update_samples",
"optimizer.slow_update_gate_with_selection": "slow_update_gate_with_selection",
"optimizer.longitudinal_pair_policy": "longitudinal_pair_policy",
"optimizer.use_meta_skill": "use_meta_skill",
"evaluation.use_gate": "use_gate",

View File

@@ -865,6 +865,15 @@ class ReflACTTrainer:
else ""
)
)
slow_gate_with_selection = bool(
cfg.get("slow_update_gate_with_selection", False)
)
print(
" [slow update] acceptance="
+ ("gated (selection-set validation)"
if slow_gate_with_selection
else "force-accept (unconditional)")
)
if current_score < 0:
print(f"\n{'='*60}")
print(" BASELINE — evaluate initial skill on Selection set (valid_seen)")
@@ -1468,17 +1477,27 @@ class ReflACTTrainer:
epoch_comparison_pairs = None
if (
slow_saved.get("slow_update_content")
and slow_saved.get("action") in {
"accept", "accept_new_best", "force_accept",
}
and epoch >= 2
):
current_skill = replace_slow_update_field(
current_skill, slow_saved["slow_update_content"],
)
best_skill = replace_slow_update_field(
best_skill, slow_saved["slow_update_content"],
)
action = slow_saved.get("action")
if slow_gate_with_selection:
# Gated mode (follows SkillReflection): re-apply the guidance
# to current_skill only if the gate accepted it.
if action in {"accept", "accept_new_best"}:
current_skill = replace_slow_update_field(
current_skill,
slow_saved["slow_update_content"],
)
elif action in {
"accept", "accept_new_best", "force_accept",
}:
# Force-accept mode: re-apply to both current & best.
current_skill = replace_slow_update_field(
current_skill, slow_saved["slow_update_content"],
)
best_skill = replace_slow_update_field(
best_skill, slow_saved["slow_update_content"],
)
elif epoch == 1:
# Epoch 1: inject empty placeholder
os.makedirs(slow_dir, exist_ok=True)
@@ -1618,31 +1637,119 @@ class ReflACTTrainer:
"observed across adjacent epochs."
)
# Slow update field is force-updated into both
# current_skill and best_skill unconditionally.
# The epoch-level longitudinal guidance should always
# persist — it must not be gated by step-level
# selection scores.
slow_content = slow_result["slow_update_content"]
current_skill = replace_slow_update_field(
current_skill, slow_content,
)
best_skill = replace_slow_update_field(
best_skill, slow_content,
)
# Update caches so downstream steps use the
# slow-update-injected skill for hashing.
slow_candidate_hash = skill_hash(current_skill)
sel_cache[slow_candidate_hash] = (current_score, 0.0)
# Slow update acceptance — two modes selected via
# `optimizer.slow_update_gate_with_selection`.
if slow_gate_with_selection:
# ── Gated mode (follow SkillReflection) ──────────
# Evaluate the slow-update candidate on the
# selection set and accept/reject via the same
# validation gate used for step-level updates.
if slow_candidate_hash in sel_cache:
slow_sel_hard, slow_sel_soft = sel_cache[
slow_candidate_hash
]
print(
f" [slow gate] cache hit: "
f"hard={slow_sel_hard:.4f}"
)
else:
sel_env, sel_n = _build_eval_env(
split="valid_seen",
env_num=cfg["sel_env_num"],
seed=seed,
)
print(f" [slow gate] selection items={sel_n}")
slow_eval_dir = os.path.join(
slow_dir, "selection_eval",
)
slow_eval_results = adapter.rollout(
sel_env, slow_candidate, slow_eval_dir,
)
slow_sel_hard, slow_sel_soft = compute_score(
slow_eval_results
)
sel_cache[slow_candidate_hash] = (
slow_sel_hard, slow_sel_soft,
)
slow_result["action"] = "force_accept"
current_origin = f"slow_update_epoch_{epoch:02d}"
slow_gate = evaluate_gate(
candidate_skill=slow_candidate,
cand_hard=slow_sel_hard,
current_skill=current_skill,
current_score=current_score,
best_skill=best_skill,
best_score=best_score,
best_step=best_step,
global_step=global_step,
cand_soft=slow_sel_soft,
metric=gate_metric,
mixed_weight=gate_mixed_weight,
)
slow_result["selection_hard"] = slow_sel_hard
slow_result["selection_soft"] = slow_sel_soft
slow_result["action"] = slow_gate.action
prev_current = current_score
prev_best = best_score
current_skill = slow_gate.current_skill
current_score = slow_gate.current_score
best_skill = slow_gate.best_skill
best_score = slow_gate.best_score
best_step = slow_gate.best_step
if slow_gate.action in {"accept", "accept_new_best"}:
current_origin = (
f"slow_update_epoch_{epoch:02d}"
)
if slow_gate.action == "accept_new_best":
best_origin = current_origin
print(
f" [slow gate] ACCEPT (new best) "
f"hard={slow_sel_hard:.4f} > "
f"prev best {prev_best:.4f}"
)
elif slow_gate.action == "accept":
print(
f" [slow gate] ACCEPT "
f"hard={slow_sel_hard:.4f} > "
f"current={prev_current:.4f}"
)
else:
print(
f" [slow gate] REJECT "
f"hard={slow_sel_hard:.4f} <= "
f"current={current_score:.4f}"
)
print(
f" [slow update] guidance written "
f"({len(slow_result['slow_update_content'])} "
f"chars), {slow_time}s"
)
else:
# ── Force-accept mode (default) ──────────────────
# The epoch-level longitudinal guidance is injected
# into both current_skill and best_skill
# unconditionally — it must not be gated by
# step-level selection scores.
slow_content = slow_result["slow_update_content"]
current_skill = replace_slow_update_field(
current_skill, slow_content,
)
best_skill = replace_slow_update_field(
best_skill, slow_content,
)
# Update caches so downstream steps use the
# slow-update-injected skill for hashing.
slow_candidate_hash = skill_hash(current_skill)
sel_cache[slow_candidate_hash] = (current_score, 0.0)
print(
f" [slow update] force-injected into current & best "
f"({len(slow_content)} chars), "
f"{slow_time}s"
)
slow_result["action"] = "force_accept"
current_origin = f"slow_update_epoch_{epoch:02d}"
print(
f" [slow update] force-injected into "
f"current & best "
f"({len(slow_content)} chars), "
f"{slow_time}s"
)
else:
slow_result = slow_result or {}
slow_result["action"] = "no_content"

View File

@@ -41,14 +41,6 @@ def run_meta_skill(
"""Produce updated optimizer-side meta skill from adjacent epochs."""
actual_system = system_prompt if system_prompt is not None else load_prompt("meta_skill")
prev_skill_display = prev_skill
if len(prev_skill_display) > 6000:
prev_skill_display = prev_skill_display[:6000] + "\n...[truncated]..."
curr_skill_display = curr_skill
if len(curr_skill_display) > 6000:
curr_skill_display = curr_skill_display[:6000] + "\n...[truncated]..."
prev_meta_section = (
prev_meta_skill_content.strip()
if prev_meta_skill_content and prev_meta_skill_content.strip()
@@ -57,8 +49,8 @@ def run_meta_skill(
comparison_text = format_comparison_text(comparison_pairs)
user = (
f"## Previous Epoch Last-Step Skill\n{prev_skill_display}\n\n"
f"## Current Epoch Last-Step Skill\n{curr_skill_display}\n\n"
f"## Previous Epoch Last-Step Skill\n{prev_skill}\n\n"
f"## Current Epoch Last-Step Skill\n{curr_skill}\n\n"
f"## Previous Optimizer Meta Skill\n"
f"The following optimizer memory was available during the current epoch. "
f"Reflect on whether it improved or harmed the quality of edits.\n\n"
@@ -71,7 +63,7 @@ def run_meta_skill(
response, _ = chat_optimizer(
system=actual_system,
user=user,
max_completion_tokens=3072,
max_completion_tokens=16384,
retries=3,
stage="meta_skill",
)

View File

@@ -91,6 +91,11 @@ def replace_slow_update_field(skill: str, new_content: str) -> str:
# ── Comparison text builder ─────────────────────────────────────────────────
# NOTE: The character limits below (whole-trajectory cap + the per-field caps in
# _read_trajectory and the comparison metadata) only trim the comparison samples
# fed to the slow-update optimizer. They exist to cut token usage and speed up the
# call; they do NOT affect what gets written into the skill. If you need richer
# context for the longitudinal comparison, feel free to raise them.
_MAX_TRAJ_CHARS = 3000
@@ -117,6 +122,8 @@ def _read_trajectory(rollout_dir: str, task_id: str) -> str:
for entry in conversation:
if not isinstance(entry, dict):
continue
# Per-field caps (cmd/obs/reasoning/etc.) keep each trajectory compact to
# save tokens / time; raise them if you want fuller step detail.
if entry.get("type") == "tool_call":
cmd = _clip_text(entry.get("cmd"), 500)
obs = _clip_text(entry.get("obs"), 800)
@@ -352,10 +359,6 @@ def run_slow_update(
)
comparison_text = format_comparison_text(pairs)
prev_skill_display = prev_skill
if len(prev_skill_display) > 6000:
prev_skill_display = prev_skill_display[:6000] + "\n...[truncated]..."
prev_guidance_section = (
prev_slow_update_content.strip()
if prev_slow_update_content and prev_slow_update_content.strip()
@@ -363,7 +366,7 @@ def run_slow_update(
)
user = (
f"## Previous Epoch's Skill\n{prev_skill_display}\n\n"
f"## Previous Epoch's Skill\n{prev_skill}\n\n"
f"## Current Epoch's Skill\n{skill_content}\n\n"
f"## Previous Slow Update Guidance\n"
f"The following guidance was active during the current epoch. "
@@ -377,7 +380,7 @@ def run_slow_update(
response, _ = chat_optimizer(
system=actual_system,
user=user,
max_completion_tokens=4096,
max_completion_tokens=16384,
retries=3,
stage="slow_update",
)