feat(slow-update): add config-controlled gated / force-injected modes

Add optimizer.slow_update_gate_with_selection to control how epoch-boundary slow-update guidance is applied:

- false (default): force-injected - inject guidance into current & best unconditionally (unchanged behavior).
- true: gated - evaluate the slow-update candidate on the selection set and accept/reject via the same validation gate as step-level updates (logic follows the SkillReflection ablation).

Also remove the 6000-character truncation of skill text in the meta-skill and slow-update optimizer prompts, and raise max_completion_tokens to 16384 for both calls.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-07-03 14:02:58 +08:00 · 2026-05-31 01:52:02 +00:00
parent 42e555d28e
commit 00602df9e9
5 changed files with 153 additions and 49 deletions
--- a/configs/_base_/default.yaml
+++ b/configs/_base_/default.yaml
@@ -66,6 +66,7 @@ optimizer:
  skill_update_mode: patch  # patch / rewrite_from_suggestions / full_rewrite_minibatch
  use_slow_update: true
  slow_update_samples: 20
+  slow_update_gate_with_selection: false
  longitudinal_pair_policy: mixed  # mixed / changed / unchanged
  use_meta_skill: true

--- a/skillopt/config.py
+++ b/skillopt/config.py
@@ -98,6 +98,7 @@ _FLATTEN_MAP: dict[str, str] = {
    "optimizer.meta_learning_rate": "meta_edit_budget",
    "optimizer.use_slow_update": "use_slow_update",
    "optimizer.slow_update_samples": "slow_update_samples",
+    "optimizer.slow_update_gate_with_selection": "slow_update_gate_with_selection",
    "optimizer.longitudinal_pair_policy": "longitudinal_pair_policy",
    "optimizer.use_meta_skill": "use_meta_skill",
    "evaluation.use_gate": "use_gate",
--- a/skillopt/engine/trainer.py
+++ b/skillopt/engine/trainer.py
@@ -865,6 +865,15 @@ class ReflACTTrainer:
                else ""
            )
        )
+        slow_gate_with_selection = bool(
+            cfg.get("slow_update_gate_with_selection", False)
+        )
+        print(
+            "  [slow update] acceptance="
+            + ("gated (selection-set validation)"
+               if slow_gate_with_selection
+               else "force-accept (unconditional)")
+        )
        if current_score < 0:
            print(f"\n{'='*60}")
            print("  BASELINE — evaluate initial skill on Selection set (valid_seen)")
@@ -1468,17 +1477,27 @@ class ReflACTTrainer:
                            epoch_comparison_pairs = None
                    if (
                        slow_saved.get("slow_update_content")
-                        and slow_saved.get("action") in {
-                            "accept", "accept_new_best", "force_accept",
-                        }
                        and epoch >= 2
                    ):
-                        current_skill = replace_slow_update_field(
-                            current_skill, slow_saved["slow_update_content"],
-                        )
-                        best_skill = replace_slow_update_field(
-                            best_skill, slow_saved["slow_update_content"],
-                        )
+                        action = slow_saved.get("action")
+                        if slow_gate_with_selection:
+                            # Gated mode (follow SkillReflection): re-apply the
+                            # guidance to current_skill only when it was accepted.
+                            if action in {"accept", "accept_new_best"}:
+                                current_skill = replace_slow_update_field(
+                                    current_skill,
+                                    slow_saved["slow_update_content"],
+                                )
+                        elif action in {
+                            "accept", "accept_new_best", "force_accept",
+                        }:
+                            # Force-accept mode: re-apply to both current & best.
+                            current_skill = replace_slow_update_field(
+                                current_skill, slow_saved["slow_update_content"],
+                            )
+                            best_skill = replace_slow_update_field(
+                                best_skill, slow_saved["slow_update_content"],
+                            )
                elif epoch == 1:
                    # Epoch 1: inject empty placeholder
                    os.makedirs(slow_dir, exist_ok=True)
@@ -1618,31 +1637,119 @@ class ReflACTTrainer:
                            "observed across adjacent epochs."
                        )

-                        # Slow update field is force-updated into both
-                        # current_skill and best_skill unconditionally.
-                        # The epoch-level longitudinal guidance should always
-                        # persist — it must not be gated by step-level
-                        # selection scores.
-                        slow_content = slow_result["slow_update_content"]
-                        current_skill = replace_slow_update_field(
-                            current_skill, slow_content,
-                        )
-                        best_skill = replace_slow_update_field(
-                            best_skill, slow_content,
-                        )
-                        # Update caches so downstream steps use the
-                        # slow-update-injected skill for hashing.
-                        slow_candidate_hash = skill_hash(current_skill)
-                        sel_cache[slow_candidate_hash] = (current_score, 0.0)
+                        # Slow update acceptance — two modes selected via
+                        # `optimizer.slow_update_gate_with_selection`.
+                        if slow_gate_with_selection:
+                            # ── Gated mode (follow SkillReflection) ──────────
+                            # Evaluate the slow-update candidate on the
+                            # selection set and accept/reject via the same
+                            # validation gate used for step-level updates.
+                            if slow_candidate_hash in sel_cache:
+                                slow_sel_hard, slow_sel_soft = sel_cache[
+                                    slow_candidate_hash
+                                ]
+                                print(
+                                    f"    [slow gate] cache hit: "
+                                    f"hard={slow_sel_hard:.4f}"
+                                )
+                            else:
+                                sel_env, sel_n = _build_eval_env(
+                                    split="valid_seen",
+                                    env_num=cfg["sel_env_num"],
+                                    seed=seed,
+                                )
+                                print(f"    [slow gate] selection items={sel_n}")
+                                slow_eval_dir = os.path.join(
+                                    slow_dir, "selection_eval",
+                                )
+                                slow_eval_results = adapter.rollout(
+                                    sel_env, slow_candidate, slow_eval_dir,
+                                )
+                                slow_sel_hard, slow_sel_soft = compute_score(
+                                    slow_eval_results
+                                )
+                                sel_cache[slow_candidate_hash] = (
+                                    slow_sel_hard, slow_sel_soft,
+                                )

-                        slow_result["action"] = "force_accept"
-                        current_origin = f"slow_update_epoch_{epoch:02d}"
+                            slow_gate = evaluate_gate(
+                                candidate_skill=slow_candidate,
+                                cand_hard=slow_sel_hard,
+                                current_skill=current_skill,
+                                current_score=current_score,
+                                best_skill=best_skill,
+                                best_score=best_score,
+                                best_step=best_step,
+                                global_step=global_step,
+                                cand_soft=slow_sel_soft,
+                                metric=gate_metric,
+                                mixed_weight=gate_mixed_weight,
+                            )
+                            slow_result["selection_hard"] = slow_sel_hard
+                            slow_result["selection_soft"] = slow_sel_soft
+                            slow_result["action"] = slow_gate.action
+                            prev_current = current_score
+                            prev_best = best_score
+                            current_skill = slow_gate.current_skill
+                            current_score = slow_gate.current_score
+                            best_skill = slow_gate.best_skill
+                            best_score = slow_gate.best_score
+                            best_step = slow_gate.best_step
+                            if slow_gate.action in {"accept", "accept_new_best"}:
+                                current_origin = (
+                                    f"slow_update_epoch_{epoch:02d}"
+                                )
+                            if slow_gate.action == "accept_new_best":
+                                best_origin = current_origin
+                                print(
+                                    f"    [slow gate] ACCEPT (new best) "
+                                    f"hard={slow_sel_hard:.4f} > "
+                                    f"prev best {prev_best:.4f}"
+                                )
+                            elif slow_gate.action == "accept":
+                                print(
+                                    f"    [slow gate] ACCEPT "
+                                    f"hard={slow_sel_hard:.4f} > "
+                                    f"current={prev_current:.4f}"
+                                )
+                            else:
+                                print(
+                                    f"    [slow gate] REJECT "
+                                    f"hard={slow_sel_hard:.4f} <= "
+                                    f"current={current_score:.4f}"
+                                )
+                            print(
+                                f"    [slow update] guidance written "
+                                f"({len(slow_result['slow_update_content'])} "
+                                f"chars), {slow_time}s"
+                            )
+                        else:
+                            # ── Force-accept mode (default) ──────────────────
+                            # The epoch-level longitudinal guidance is injected
+                            # into both current_skill and best_skill
+                            # unconditionally — it must not be gated by
+                            # step-level selection scores.
+                            slow_content = slow_result["slow_update_content"]
+                            current_skill = replace_slow_update_field(
+                                current_skill, slow_content,
+                            )
+                            best_skill = replace_slow_update_field(
+                                best_skill, slow_content,
+                            )
+                            # Update caches so downstream steps use the
+                            # slow-update-injected skill for hashing.
+                            slow_candidate_hash = skill_hash(current_skill)
+                            sel_cache[slow_candidate_hash] = (current_score, 0.0)

-                        print(
-                            f"    [slow update] force-injected into current & best "
-                            f"({len(slow_content)} chars), "
-                            f"{slow_time}s"
-                        )
+                            slow_result["action"] = "force_accept"
+                            current_origin = f"slow_update_epoch_{epoch:02d}"
+
+                            print(
+                                f"    [slow update] force-injected into "
+                                f"current & best "
+                                f"({len(slow_content)} chars), "
+                                f"{slow_time}s"
+                            )
                    else:
                        slow_result = slow_result or {}
                        slow_result["action"] = "no_content"
--- a/skillopt/optimizer/meta_skill.py
+++ b/skillopt/optimizer/meta_skill.py
@@ -41,14 +41,6 @@ def run_meta_skill(
    """Produce updated optimizer-side meta skill from adjacent epochs."""
    actual_system = system_prompt if system_prompt is not None else load_prompt("meta_skill")

-    prev_skill_display = prev_skill
-    if len(prev_skill_display) > 6000:
-        prev_skill_display = prev_skill_display[:6000] + "\n...[truncated]..."
-
-    curr_skill_display = curr_skill
-    if len(curr_skill_display) > 6000:
-        curr_skill_display = curr_skill_display[:6000] + "\n...[truncated]..."
-
    prev_meta_section = (
        prev_meta_skill_content.strip()
        if prev_meta_skill_content and prev_meta_skill_content.strip()
@@ -57,8 +49,8 @@ def run_meta_skill(

    comparison_text = format_comparison_text(comparison_pairs)
    user = (
-        f"## Previous Epoch Last-Step Skill\n{prev_skill_display}\n\n"
-        f"## Current Epoch Last-Step Skill\n{curr_skill_display}\n\n"
+        f"## Previous Epoch Last-Step Skill\n{prev_skill}\n\n"
+        f"## Current Epoch Last-Step Skill\n{curr_skill}\n\n"
        f"## Previous Optimizer Meta Skill\n"
        f"The following optimizer memory was available during the current epoch. "
        f"Reflect on whether it improved or harmed the quality of edits.\n\n"
@@ -71,7 +63,7 @@ def run_meta_skill(
        response, _ = chat_optimizer(
            system=actual_system,
            user=user,
-            max_completion_tokens=3072,
+            max_completion_tokens=16384,
            retries=3,
            stage="meta_skill",
        )
--- a/skillopt/optimizer/slow_update.py
+++ b/skillopt/optimizer/slow_update.py
@@ -91,6 +91,11 @@ def replace_slow_update_field(skill: str, new_content: str) -> str:
 # ── Comparison text builder ─────────────────────────────────────────────────


+# NOTE: The character limits below (whole-trajectory cap + the per-field caps in
+# _read_trajectory and the comparison metadata) only trim the comparison samples
+# fed to the slow-update optimizer. They exist to cut token usage and speed up the
+# call; they do NOT affect what gets written into the skill. If you need richer
+# context for the longitudinal comparison, feel free to raise them.
 _MAX_TRAJ_CHARS = 3000


@@ -117,6 +122,8 @@ def _read_trajectory(rollout_dir: str, task_id: str) -> str:
    for entry in conversation:
        if not isinstance(entry, dict):
            continue
+        # Per-field caps (cmd/obs/reasoning/etc.) keep each trajectory compact to
+        # save tokens / time; raise them if you want fuller step detail.
        if entry.get("type") == "tool_call":
            cmd = _clip_text(entry.get("cmd"), 500)
            obs = _clip_text(entry.get("obs"), 800)
@@ -352,10 +359,6 @@ def run_slow_update(
        )
    comparison_text = format_comparison_text(pairs)

-    prev_skill_display = prev_skill
-    if len(prev_skill_display) > 6000:
-        prev_skill_display = prev_skill_display[:6000] + "\n...[truncated]..."
-
    prev_guidance_section = (
        prev_slow_update_content.strip()
        if prev_slow_update_content and prev_slow_update_content.strip()
@@ -363,7 +366,7 @@ def run_slow_update(
    )

    user = (
-        f"## Previous Epoch's Skill\n{prev_skill_display}\n\n"
+        f"## Previous Epoch's Skill\n{prev_skill}\n\n"
        f"## Current Epoch's Skill\n{skill_content}\n\n"
        f"## Previous Slow Update Guidance\n"
        f"The following guidance was active during the current epoch. "
@@ -377,7 +380,7 @@ def run_slow_update(
        response, _ = chat_optimizer(
            system=actual_system,
            user=user,
-            max_completion_tokens=4096,
+            max_completion_tokens=16384,
            retries=3,
            stage="slow_update",
        )