diff --git a/configs/_base_/default.yaml b/configs/_base_/default.yaml index ed8f61d..7800b4f 100644 --- a/configs/_base_/default.yaml +++ b/configs/_base_/default.yaml @@ -66,6 +66,7 @@ optimizer: skill_update_mode: patch # patch / rewrite_from_suggestions / full_rewrite_minibatch use_slow_update: true slow_update_samples: 20 + slow_update_gate_with_selection: false longitudinal_pair_policy: mixed # mixed / changed / unchanged use_meta_skill: true diff --git a/skillopt/config.py b/skillopt/config.py index bf56bda..c8328ac 100644 --- a/skillopt/config.py +++ b/skillopt/config.py @@ -98,6 +98,7 @@ _FLATTEN_MAP: dict[str, str] = { "optimizer.meta_learning_rate": "meta_edit_budget", "optimizer.use_slow_update": "use_slow_update", "optimizer.slow_update_samples": "slow_update_samples", + "optimizer.slow_update_gate_with_selection": "slow_update_gate_with_selection", "optimizer.longitudinal_pair_policy": "longitudinal_pair_policy", "optimizer.use_meta_skill": "use_meta_skill", "evaluation.use_gate": "use_gate", diff --git a/skillopt/engine/trainer.py b/skillopt/engine/trainer.py index 09afb1f..25058c0 100644 --- a/skillopt/engine/trainer.py +++ b/skillopt/engine/trainer.py @@ -865,6 +865,15 @@ class ReflACTTrainer: else "" ) ) + slow_gate_with_selection = bool( + cfg.get("slow_update_gate_with_selection", False) + ) + print( + " [slow update] acceptance=" + + ("gated (selection-set validation)" + if slow_gate_with_selection + else "force-accept (unconditional)") + ) if current_score < 0: print(f"\n{'='*60}") print(" BASELINE — evaluate initial skill on Selection set (valid_seen)") @@ -1468,17 +1477,27 @@ class ReflACTTrainer: epoch_comparison_pairs = None if ( slow_saved.get("slow_update_content") - and slow_saved.get("action") in { - "accept", "accept_new_best", "force_accept", - } and epoch >= 2 ): - current_skill = replace_slow_update_field( - current_skill, slow_saved["slow_update_content"], - ) - best_skill = replace_slow_update_field( - best_skill, slow_saved["slow_update_content"], - ) + action = slow_saved.get("action") + if slow_gate_with_selection: + # Gated mode (follow SkillReflection): re-apply the + # guidance to current_skill only when it was accepted. + if action in {"accept", "accept_new_best"}: + current_skill = replace_slow_update_field( + current_skill, + slow_saved["slow_update_content"], + ) + elif action in { + "accept", "accept_new_best", "force_accept", + }: + # Force-accept mode: re-apply to both current & best. + current_skill = replace_slow_update_field( + current_skill, slow_saved["slow_update_content"], + ) + best_skill = replace_slow_update_field( + best_skill, slow_saved["slow_update_content"], + ) elif epoch == 1: # Epoch 1: inject empty placeholder os.makedirs(slow_dir, exist_ok=True) @@ -1618,31 +1637,119 @@ class ReflACTTrainer: "observed across adjacent epochs." ) - # Slow update field is force-updated into both - # current_skill and best_skill unconditionally. - # The epoch-level longitudinal guidance should always - # persist — it must not be gated by step-level - # selection scores. - slow_content = slow_result["slow_update_content"] - current_skill = replace_slow_update_field( - current_skill, slow_content, - ) - best_skill = replace_slow_update_field( - best_skill, slow_content, - ) - # Update caches so downstream steps use the - # slow-update-injected skill for hashing. - slow_candidate_hash = skill_hash(current_skill) - sel_cache[slow_candidate_hash] = (current_score, 0.0) + # Slow update acceptance — two modes selected via + # `optimizer.slow_update_gate_with_selection`. + if slow_gate_with_selection: + # ── Gated mode (follow SkillReflection) ────────── + # Evaluate the slow-update candidate on the + # selection set and accept/reject via the same + # validation gate used for step-level updates. + if slow_candidate_hash in sel_cache: + slow_sel_hard, slow_sel_soft = sel_cache[ + slow_candidate_hash + ] + print( + f" [slow gate] cache hit: " + f"hard={slow_sel_hard:.4f}" + ) + else: + sel_env, sel_n = _build_eval_env( + split="valid_seen", + env_num=cfg["sel_env_num"], + seed=seed, + ) + print(f" [slow gate] selection items={sel_n}") + slow_eval_dir = os.path.join( + slow_dir, "selection_eval", + ) + slow_eval_results = adapter.rollout( + sel_env, slow_candidate, slow_eval_dir, + ) + slow_sel_hard, slow_sel_soft = compute_score( + slow_eval_results + ) + sel_cache[slow_candidate_hash] = ( + slow_sel_hard, slow_sel_soft, + ) - slow_result["action"] = "force_accept" - current_origin = f"slow_update_epoch_{epoch:02d}" + slow_gate = evaluate_gate( + candidate_skill=slow_candidate, + cand_hard=slow_sel_hard, + current_skill=current_skill, + current_score=current_score, + best_skill=best_skill, + best_score=best_score, + best_step=best_step, + global_step=global_step, + cand_soft=slow_sel_soft, + metric=gate_metric, + mixed_weight=gate_mixed_weight, + ) + slow_result["selection_hard"] = slow_sel_hard + slow_result["selection_soft"] = slow_sel_soft + slow_result["action"] = slow_gate.action + prev_current = current_score + prev_best = best_score + current_skill = slow_gate.current_skill + current_score = slow_gate.current_score + best_skill = slow_gate.best_skill + best_score = slow_gate.best_score + best_step = slow_gate.best_step + if slow_gate.action in {"accept", "accept_new_best"}: + current_origin = ( + f"slow_update_epoch_{epoch:02d}" + ) + if slow_gate.action == "accept_new_best": + best_origin = current_origin + print( + f" [slow gate] ACCEPT (new best) " + f"hard={slow_sel_hard:.4f} > " + f"prev best {prev_best:.4f}" + ) + elif slow_gate.action == "accept": + print( + f" [slow gate] ACCEPT " + f"hard={slow_sel_hard:.4f} > " + f"current={prev_current:.4f}" + ) + else: + print( + f" [slow gate] REJECT " + f"hard={slow_sel_hard:.4f} <= " + f"current={current_score:.4f}" + ) + print( + f" [slow update] guidance written " + f"({len(slow_result['slow_update_content'])} " + f"chars), {slow_time}s" + ) + else: + # ── Force-accept mode (default) ────────────────── + # The epoch-level longitudinal guidance is injected + # into both current_skill and best_skill + # unconditionally — it must not be gated by + # step-level selection scores. + slow_content = slow_result["slow_update_content"] + current_skill = replace_slow_update_field( + current_skill, slow_content, + ) + best_skill = replace_slow_update_field( + best_skill, slow_content, + ) + # Update caches so downstream steps use the + # slow-update-injected skill for hashing. + slow_candidate_hash = skill_hash(current_skill) + sel_cache[slow_candidate_hash] = (current_score, 0.0) - print( - f" [slow update] force-injected into current & best " - f"({len(slow_content)} chars), " - f"{slow_time}s" - ) + slow_result["action"] = "force_accept" + current_origin = f"slow_update_epoch_{epoch:02d}" + + print( + f" [slow update] force-injected into " + f"current & best " + f"({len(slow_content)} chars), " + f"{slow_time}s" + ) else: slow_result = slow_result or {} slow_result["action"] = "no_content" diff --git a/skillopt/optimizer/meta_skill.py b/skillopt/optimizer/meta_skill.py index 3342454..6e34ff1 100644 --- a/skillopt/optimizer/meta_skill.py +++ b/skillopt/optimizer/meta_skill.py @@ -41,14 +41,6 @@ def run_meta_skill( """Produce updated optimizer-side meta skill from adjacent epochs.""" actual_system = system_prompt if system_prompt is not None else load_prompt("meta_skill") - prev_skill_display = prev_skill - if len(prev_skill_display) > 6000: - prev_skill_display = prev_skill_display[:6000] + "\n...[truncated]..." - - curr_skill_display = curr_skill - if len(curr_skill_display) > 6000: - curr_skill_display = curr_skill_display[:6000] + "\n...[truncated]..." - prev_meta_section = ( prev_meta_skill_content.strip() if prev_meta_skill_content and prev_meta_skill_content.strip() @@ -57,8 +49,8 @@ def run_meta_skill( comparison_text = format_comparison_text(comparison_pairs) user = ( - f"## Previous Epoch Last-Step Skill\n{prev_skill_display}\n\n" - f"## Current Epoch Last-Step Skill\n{curr_skill_display}\n\n" + f"## Previous Epoch Last-Step Skill\n{prev_skill}\n\n" + f"## Current Epoch Last-Step Skill\n{curr_skill}\n\n" f"## Previous Optimizer Meta Skill\n" f"The following optimizer memory was available during the current epoch. " f"Reflect on whether it improved or harmed the quality of edits.\n\n" @@ -71,7 +63,7 @@ def run_meta_skill( response, _ = chat_optimizer( system=actual_system, user=user, - max_completion_tokens=3072, + max_completion_tokens=16384, retries=3, stage="meta_skill", ) diff --git a/skillopt/optimizer/slow_update.py b/skillopt/optimizer/slow_update.py index 16a0f08..3d34954 100644 --- a/skillopt/optimizer/slow_update.py +++ b/skillopt/optimizer/slow_update.py @@ -91,6 +91,11 @@ def replace_slow_update_field(skill: str, new_content: str) -> str: # ── Comparison text builder ───────────────────────────────────────────────── +# NOTE: The character limits below (whole-trajectory cap + the per-field caps in +# _read_trajectory and the comparison metadata) only trim the comparison samples +# fed to the slow-update optimizer. They exist to cut token usage and speed up the +# call; they do NOT affect what gets written into the skill. If you need richer +# context for the longitudinal comparison, feel free to raise them. _MAX_TRAJ_CHARS = 3000 @@ -117,6 +122,8 @@ def _read_trajectory(rollout_dir: str, task_id: str) -> str: for entry in conversation: if not isinstance(entry, dict): continue + # Per-field caps (cmd/obs/reasoning/etc.) keep each trajectory compact to + # save tokens / time; raise them if you want fuller step detail. if entry.get("type") == "tool_call": cmd = _clip_text(entry.get("cmd"), 500) obs = _clip_text(entry.get("obs"), 800) @@ -352,10 +359,6 @@ def run_slow_update( ) comparison_text = format_comparison_text(pairs) - prev_skill_display = prev_skill - if len(prev_skill_display) > 6000: - prev_skill_display = prev_skill_display[:6000] + "\n...[truncated]..." - prev_guidance_section = ( prev_slow_update_content.strip() if prev_slow_update_content and prev_slow_update_content.strip() @@ -363,7 +366,7 @@ def run_slow_update( ) user = ( - f"## Previous Epoch's Skill\n{prev_skill_display}\n\n" + f"## Previous Epoch's Skill\n{prev_skill}\n\n" f"## Current Epoch's Skill\n{skill_content}\n\n" f"## Previous Slow Update Guidance\n" f"The following guidance was active during the current epoch. " @@ -377,7 +380,7 @@ def run_slow_update( response, _ = chat_optimizer( system=actual_system, user=user, - max_completion_tokens=4096, + max_completion_tokens=16384, retries=3, stage="slow_update", )