mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
Split failure reflections into SKILL_DEFECT (body edit) vs EXECUTION_LAPSE (protected appendix note that re-emphasizes an existing rule, never edited by step-level analysts). Toggle: optimizer.use_skill_aware_reflection (default false; baseline byte-identical when off). - optimizer/appendix.py: protected APPENDIX region (inject/extract/append with dedup), mirrors the slow_update protected-field pattern - optimizer/skill_aware.py: analyst prompt augmentation, appendix_notes parsing, threshold-gated LLM consolidation, and a process-wide runtime switch (configure_skill_aware_reflection) set once by the trainer - gradient/reflect.py: augment error/success analyst prompts at runtime; None-sentinel kwargs resolve from the global switch, so env adapters need no per-benchmark wiring (works for all envs, present and future) - optimizer/skill.py: generalize the protected-region check to (slow_update, appendix); edits inside any protected region are skipped - engine/trainer.py: inject appendix at init, flush per-step EXECUTION_LAPSE notes after the gate settles, optional consolidation - tests: regression suite incl. toggle-off byte-identical guarantee and env-independent global-switch resolution (6/6 passing + live smoke) Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
275 lines
12 KiB
Python
275 lines
12 KiB
Python
"""Standalone regression + function tests for skill-aware reflection.
|
|
|
|
Run directly (no pytest needed):
|
|
python tests/test_skill_aware_reflection.py
|
|
|
|
Covers:
|
|
1. Toggle-OFF byte-identical guarantee for skill.py edit application
|
|
(slow-update-only behavior must be unchanged).
|
|
2. Appendix module: inject / append / dedup / extract / accumulate.
|
|
3. Appendix-region protection from step-level edits.
|
|
4. Coexistence of appendix + slow_update regions.
|
|
5. reflect.py prompt augmentation + appendix_notes parsing (no LLM call).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import sys
|
|
|
|
# Ensure THIS repo's skillopt is imported (not an installed copy) when the
|
|
# file is run directly: script mode puts tests/ on sys.path, not the repo root.
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
|
|
def _reference_old_apply(skill: str, edit: dict) -> str:
|
|
"""Reproduce the ORIGINAL slow-update-only edit behavior inline."""
|
|
SU_START = "<!-- SLOW_UPDATE_START -->"
|
|
SU_END = "<!-- SLOW_UPDATE_END -->"
|
|
op = edit.get("op", "")
|
|
content = edit.get("content", "").strip().replace(SU_START, "").replace(SU_END, "")
|
|
target = edit.get("target", "")
|
|
si = skill.find(SU_START)
|
|
ei = skill.find(SU_END)
|
|
|
|
def in_su(t: str) -> bool:
|
|
if si == -1 or ei == -1:
|
|
return False
|
|
ti = skill.find(t)
|
|
if ti == -1:
|
|
return False
|
|
return si <= ti < ei + len(SU_END)
|
|
|
|
if target and in_su(target):
|
|
return skill
|
|
if op == "append":
|
|
s = skill.find(SU_START)
|
|
if s != -1:
|
|
return skill[:s].rstrip() + "\n\n" + content + "\n\n" + skill[s:]
|
|
return skill.rstrip() + "\n\n" + content + "\n"
|
|
if op == "insert_after":
|
|
if not target or target not in skill:
|
|
s = skill.find(SU_START)
|
|
if s != -1:
|
|
return skill[:s].rstrip() + "\n\n" + content + "\n\n" + skill[s:]
|
|
return skill.rstrip() + "\n\n" + content + "\n"
|
|
idx = skill.index(target) + len(target)
|
|
nl = skill.find("\n", idx)
|
|
at = nl + 1 if nl != -1 else len(skill)
|
|
return skill[:at] + "\n" + content + "\n" + skill[at:]
|
|
if op == "replace":
|
|
if not target or target not in skill:
|
|
return skill
|
|
return skill.replace(target, content, 1)
|
|
if op == "delete":
|
|
if not target or target not in skill:
|
|
return skill
|
|
return skill.replace(target, "", 1)
|
|
return skill
|
|
|
|
|
|
def test_toggle_off_byte_identical() -> None:
|
|
from skillopt.optimizer.skill import _apply_edit_with_report
|
|
|
|
SU_START = "<!-- SLOW_UPDATE_START -->"
|
|
SU_END = "<!-- SLOW_UPDATE_END -->"
|
|
skill = (
|
|
"# QA Skill\n\n## Rules\n- Prefer shortest answer span.\n"
|
|
"- Use clue wording to constrain answer type.\n\n"
|
|
f"{SU_START}\nSome slow update guidance here.\n{SU_END}\n"
|
|
)
|
|
edits = [
|
|
{"op": "append", "content": "- New rule appended."},
|
|
{"op": "insert_after", "target": "## Rules", "content": "- Inserted rule."},
|
|
{"op": "insert_after", "target": "NONEXISTENT", "content": "- Fallback rule."},
|
|
{"op": "replace", "target": "Prefer shortest answer span.", "content": "Prefer the exact minimal span."},
|
|
{"op": "delete", "target": "- Use clue wording to constrain answer type."},
|
|
{"op": "replace", "target": "Some slow update guidance here.", "content": "HACKED"},
|
|
{"op": "delete", "target": "Some slow update guidance here."},
|
|
]
|
|
for e in edits:
|
|
new_skill, _ = _apply_edit_with_report(skill, e)
|
|
old_skill = _reference_old_apply(skill, e)
|
|
assert new_skill == old_skill, f"byte mismatch for {e['op']}"
|
|
print("PASS test_toggle_off_byte_identical")
|
|
|
|
|
|
def test_appendix_module() -> None:
|
|
from skillopt.optimizer.appendix import (
|
|
has_appendix_field, inject_empty_appendix_field,
|
|
extract_appendix_notes, append_to_appendix_field, APPENDIX_START,
|
|
)
|
|
skill = "# QA Skill\n\n- Prefer shortest answer span."
|
|
s1 = inject_empty_appendix_field(skill)
|
|
assert has_appendix_field(s1) and extract_appendix_notes(s1) == []
|
|
assert inject_empty_appendix_field(s1) == s1 # idempotent
|
|
s2 = append_to_appendix_field(s1, ["Go to fridge for ice water.", "No stop token."])
|
|
assert extract_appendix_notes(s2) == ["Go to fridge for ice water.", "No stop token."]
|
|
s3 = append_to_appendix_field(s2, ["go to fridge for ice water", "Check sheet range."])
|
|
assert extract_appendix_notes(s3) == [
|
|
"Go to fridge for ice water.", "No stop token.", "Check sheet range.",
|
|
], "near-duplicate must be dropped"
|
|
assert s3.count(APPENDIX_START) == 1, "exactly one region after accumulation"
|
|
assert "# QA Skill" in s3 and "Prefer shortest answer span" in s3
|
|
assert extract_appendix_notes(append_to_appendix_field(s1, [" ", "", "real"])) == ["real"]
|
|
print("PASS test_appendix_module")
|
|
|
|
|
|
def test_appendix_protection() -> None:
|
|
from skillopt.optimizer.skill import _apply_edit_with_report
|
|
from skillopt.optimizer.appendix import append_to_appendix_field, inject_empty_appendix_field
|
|
|
|
skill = inject_empty_appendix_field("# QA Skill\n\n- Rule one.")
|
|
skill = append_to_appendix_field(skill, ["Follow rule one before acting."])
|
|
for e in (
|
|
{"op": "delete", "target": "Follow rule one before acting."},
|
|
{"op": "replace", "target": "Follow rule one before acting.", "content": "HACK"},
|
|
):
|
|
new, rep = _apply_edit_with_report(skill, e)
|
|
assert new == skill, f"appendix must be protected from {e['op']}"
|
|
assert rep["status"] == "skipped_protected_region"
|
|
new, rep = _apply_edit_with_report(skill, {"op": "replace", "target": "Rule one.", "content": "Rule 1."})
|
|
assert "Rule 1." in new and "Follow rule one before acting." in new
|
|
print("PASS test_appendix_protection")
|
|
|
|
|
|
def test_coexistence_with_slow_update() -> None:
|
|
from skillopt.optimizer.skill import _apply_edit_with_report
|
|
from skillopt.optimizer.appendix import (
|
|
inject_empty_appendix_field, append_to_appendix_field, extract_appendix_notes,
|
|
)
|
|
from skillopt.optimizer.slow_update import (
|
|
inject_empty_slow_update_field, replace_slow_update_field, extract_slow_update_field,
|
|
)
|
|
skill = inject_empty_appendix_field("# QA Skill\n\n- Rule one.")
|
|
skill = append_to_appendix_field(skill, ["Follow rule one."])
|
|
skill = inject_empty_slow_update_field(skill)
|
|
skill = replace_slow_update_field(skill, "Slow guidance v2.")
|
|
assert extract_appendix_notes(skill) == ["Follow rule one."]
|
|
assert extract_slow_update_field(skill) == "Slow guidance v2."
|
|
# both regions protected
|
|
n1, r1 = _apply_edit_with_report(skill, {"op": "delete", "target": "Follow rule one."})
|
|
n2, r2 = _apply_edit_with_report(skill, {"op": "replace", "target": "Slow guidance v2.", "content": "X"})
|
|
assert n1 == skill and n2 == skill
|
|
# append lands before both regions (body stays at top)
|
|
n3, _ = _apply_edit_with_report(skill, {"op": "append", "content": "- Rule two."})
|
|
assert n3.find("- Rule two.") < n3.find("<!-- APPENDIX_START -->")
|
|
assert n3.find("- Rule two.") < n3.find("<!-- SLOW_UPDATE_START -->")
|
|
print("PASS test_coexistence_with_slow_update")
|
|
|
|
|
|
def test_reflect_parsing_and_augment() -> None:
|
|
import inspect
|
|
import skillopt.gradient.reflect as R
|
|
from skillopt.optimizer.skill_aware import extract_appendix_notes, augment_error_prompt
|
|
|
|
for fn in ("run_error_analyst_minibatch", "run_success_analyst_minibatch"):
|
|
sig = inspect.signature(getattr(R, fn))
|
|
assert "skill_aware_reflection" in sig.parameters
|
|
assert sig.parameters["skill_aware_reflection"].default is False, f"{fn} default must be False"
|
|
# run_minibatch_reflect uses a None sentinel: explicit kwarg wins, else the
|
|
# process-wide config switch (configure_skill_aware_reflection) decides.
|
|
sig = inspect.signature(R.run_minibatch_reflect)
|
|
assert sig.parameters["skill_aware_reflection"].default is None
|
|
assert sig.parameters["skill_aware_appendix_source"].default is None
|
|
assert extract_appendix_notes({"appendix_notes": ["a", "b"]}) == ["a", "b"]
|
|
assert extract_appendix_notes({"appendix_notes": "x"}) == ["x"]
|
|
assert extract_appendix_notes({"appendix_notes": [{"note": "n"}, {"content": "c"}, {}]}) == ["n", "c"]
|
|
assert extract_appendix_notes({}) == [] and extract_appendix_notes(None) == []
|
|
aug = augment_error_prompt("ORIG")
|
|
assert aug.startswith("ORIG") and "SKILL_DEFECT" in aug and "EXECUTION_LAPSE" in aug
|
|
print("PASS test_reflect_parsing_and_augment")
|
|
|
|
|
|
def test_global_switch_env_independent() -> None:
|
|
"""The config switch alone must drive SAR for ANY env adapter (no kwargs)."""
|
|
from unittest import mock
|
|
import skillopt.gradient.reflect as R
|
|
from skillopt.optimizer.skill_aware import (
|
|
configure_skill_aware_reflection,
|
|
get_skill_aware_appendix_source,
|
|
is_skill_aware_enabled,
|
|
)
|
|
|
|
# configure() round-trip.
|
|
configure_skill_aware_reflection(True, "failure_only")
|
|
assert is_skill_aware_enabled() and get_skill_aware_appendix_source() == "failure_only"
|
|
configure_skill_aware_reflection(False)
|
|
assert not is_skill_aware_enabled() and get_skill_aware_appendix_source() == "both"
|
|
|
|
# run_minibatch_reflect with NO skill-aware kwargs (adapter-style call):
|
|
# capture what it forwards to the analyst workers under each switch state.
|
|
import tempfile
|
|
captured: dict = {}
|
|
|
|
def fake_error_analyst(*args, **kwargs):
|
|
captured["skill_aware_reflection"] = kwargs.get("skill_aware_reflection")
|
|
return None
|
|
|
|
def run_once() -> None:
|
|
captured.clear()
|
|
with mock.patch.object(R, "run_error_analyst_minibatch", fake_error_analyst), \
|
|
tempfile.TemporaryDirectory() as tmp:
|
|
R.run_minibatch_reflect(
|
|
results=[{"id": "t1", "hard": 0, "soft": 0.0}],
|
|
skill_content="# Skill",
|
|
prediction_dir=tmp,
|
|
patches_dir=tmp,
|
|
workers=1,
|
|
failure_only=True,
|
|
minibatch_size=8,
|
|
)
|
|
|
|
try:
|
|
configure_skill_aware_reflection(True, "both")
|
|
run_once()
|
|
assert captured.get("skill_aware_reflection") is True, \
|
|
"switch ON must reach the analyst without adapter wiring"
|
|
|
|
configure_skill_aware_reflection(False)
|
|
run_once()
|
|
assert captured.get("skill_aware_reflection") is False, \
|
|
"switch OFF must keep the analyst at baseline"
|
|
|
|
# Explicit kwarg still overrides the global switch (backward compat).
|
|
captured.clear()
|
|
with mock.patch.object(R, "run_error_analyst_minibatch", fake_error_analyst), \
|
|
tempfile.TemporaryDirectory() as tmp:
|
|
R.run_minibatch_reflect(
|
|
results=[{"id": "t1", "hard": 0, "soft": 0.0}],
|
|
skill_content="# Skill",
|
|
prediction_dir=tmp,
|
|
patches_dir=tmp,
|
|
workers=1,
|
|
failure_only=True,
|
|
minibatch_size=8,
|
|
skill_aware_reflection=True,
|
|
)
|
|
assert captured.get("skill_aware_reflection") is True
|
|
finally:
|
|
configure_skill_aware_reflection(False)
|
|
print("PASS test_global_switch_env_independent")
|
|
|
|
|
|
def main() -> int:
|
|
tests = [
|
|
test_toggle_off_byte_identical,
|
|
test_appendix_module,
|
|
test_appendix_protection,
|
|
test_coexistence_with_slow_update,
|
|
test_reflect_parsing_and_augment,
|
|
test_global_switch_env_independent,
|
|
]
|
|
failed = 0
|
|
for t in tests:
|
|
try:
|
|
t()
|
|
except AssertionError as exc:
|
|
failed += 1
|
|
print(f"FAIL {t.__name__}: {exc}")
|
|
print(f"\n{len(tests) - failed}/{len(tests)} passed")
|
|
return 1 if failed else 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|