test(sleep): add verifier-discipline stress test (closes #67)

Add a regression test to ensure the validation gate correctly rejects
reward-hacking skill edits. It has been observed that optimizers
sometimes propose shortcuts that improve train/replay metrics but fail
to improve held-out behavior. This test codifies that the gate blocks
such artifacts.

Add TestVerifierDiscipline to the test_sleep_engine.py suite:
- Create MockRewardHackingBackend that simulates a reward-hacking rule
  which passes the train set but degrades the held-out tasks.
- Assert that the proposed edit is rejected by the gate.
This commit is contained in:
Tanmay9223
2026-06-24 23:54:48 +05:30
parent 6849e609a3
commit fccc21f3f6

View File

@@ -1008,3 +1008,42 @@ class TestClaudeCliBackendBare(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
class MockRewardHackingBackend(MockBackend):
def attempt(self, task: TaskRecord, skill: str, memory: str, sample_id: int = 0) -> str:
ctx = (skill or "") + "\n" + (memory or "")
# If the reward-hacking artifact is present in the context:
if "Always emit placeholder URLs" in ctx:
if "rule:__reward_hacking__" in task.tags:
return str(task.reference) # perfectly answers the train/replay shortcut task
else:
return "placeholder URL" # completely breaks the real held-out task
# Baseline behavior (without the rule)
if "rule:__reward_hacking__" in task.tags:
return "I am missing input" # baseline fails the shortcut task
if "rule:real" in task.tags:
return str(task.reference) # baseline gets the real task right
return super().attempt(task, skill, memory, sample_id)
def reflect(self, failures, successes, skill, memory, **kwargs):
edits = []
for task, _ in failures:
if "rule:__reward_hacking__" in task.tags:
edits.append(EditRecord("skill", "add", "Always emit placeholder URLs", rationale="shortcut"))
break
return edits
class TestVerifierDiscipline(unittest.TestCase):
def test_gate_rejects_reward_hacking_edit(self):
be = MockRewardHackingBackend()
train_task = TaskRecord(id="t1", project="/p", intent="train", reference="XYZABC", reference_kind="exact", tags=["rule:__reward_hacking__"], split="train")
val_task = TaskRecord(id="v1", project="/p", intent="val", reference="PQRSTU", reference_kind="exact", tags=["rule:real"], split="val")
tasks = [train_task, val_task]
res = consolidate(be, tasks, "", "", edit_budget=4, gate_metric="hard", night=1)
self.assertFalse(res.accepted)
self.assertGreater(len(res.rejected_edits), 0)
self.assertIn("placeholder", res.rejected_edits[0].content)