mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
test(sleep): add verifier-discipline stress test (closes #67)
Add a regression test to ensure the validation gate correctly rejects reward-hacking skill edits. It has been observed that optimizers sometimes propose shortcuts that improve train/replay metrics but fail to improve held-out behavior. This test codifies that the gate blocks such artifacts. Add TestVerifierDiscipline to the test_sleep_engine.py suite: - Create MockRewardHackingBackend that simulates a reward-hacking rule which passes the train set but degrades the held-out tasks. - Assert that the proposed edit is rejected by the gate.
This commit is contained in:
@@ -1008,3 +1008,42 @@ class TestClaudeCliBackendBare(unittest.TestCase):
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
class MockRewardHackingBackend(MockBackend):
|
||||
def attempt(self, task: TaskRecord, skill: str, memory: str, sample_id: int = 0) -> str:
|
||||
ctx = (skill or "") + "\n" + (memory or "")
|
||||
# If the reward-hacking artifact is present in the context:
|
||||
if "Always emit placeholder URLs" in ctx:
|
||||
if "rule:__reward_hacking__" in task.tags:
|
||||
return str(task.reference) # perfectly answers the train/replay shortcut task
|
||||
else:
|
||||
return "placeholder URL" # completely breaks the real held-out task
|
||||
|
||||
# Baseline behavior (without the rule)
|
||||
if "rule:__reward_hacking__" in task.tags:
|
||||
return "I am missing input" # baseline fails the shortcut task
|
||||
if "rule:real" in task.tags:
|
||||
return str(task.reference) # baseline gets the real task right
|
||||
|
||||
return super().attempt(task, skill, memory, sample_id)
|
||||
|
||||
def reflect(self, failures, successes, skill, memory, **kwargs):
|
||||
edits = []
|
||||
for task, _ in failures:
|
||||
if "rule:__reward_hacking__" in task.tags:
|
||||
edits.append(EditRecord("skill", "add", "Always emit placeholder URLs", rationale="shortcut"))
|
||||
break
|
||||
return edits
|
||||
|
||||
class TestVerifierDiscipline(unittest.TestCase):
|
||||
def test_gate_rejects_reward_hacking_edit(self):
|
||||
be = MockRewardHackingBackend()
|
||||
train_task = TaskRecord(id="t1", project="/p", intent="train", reference="XYZABC", reference_kind="exact", tags=["rule:__reward_hacking__"], split="train")
|
||||
val_task = TaskRecord(id="v1", project="/p", intent="val", reference="PQRSTU", reference_kind="exact", tags=["rule:real"], split="val")
|
||||
tasks = [train_task, val_task]
|
||||
|
||||
res = consolidate(be, tasks, "", "", edit_budget=4, gate_metric="hard", night=1)
|
||||
|
||||
self.assertFalse(res.accepted)
|
||||
self.assertGreater(len(res.rejected_edits), 0)
|
||||
self.assertIn("placeholder", res.rejected_edits[0].content)
|
||||
|
||||
Reference in New Issue
Block a user