test(sleep): add verifier-discipline stress test (closes #67)

Add a regression test to ensure the validation gate rejects reward-hacking skill edits. Optimizers have been observed to propose shortcuts that improve train/replay metrics without improving held-out behavior; this test codifies that the gate blocks such artifacts. Add TestVerifierDiscipline to the test_sleep_engine.py suite: (1) create MockRewardHackingBackend, which simulates a reward-hacking rule that passes the train set but degrades the held-out tasks; (2) assert that the proposed edit is rejected by the gate.
2026-07-03 14:02:58 +08:00 · 2026-06-24 23:54:48 +05:30
parent 6849e609a3
commit fccc21f3f6
1 changed files with 39 additions and 0 deletions
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -1008,3 +1008,42 @@ class TestClaudeCliBackendBare(unittest.TestCase):
+
+
+class MockRewardHackingBackend(MockBackend):
+    """Mock backend proposing one edit that aces shortcut-tagged tasks and breaks all others."""
+
+    def attempt(self, task: TaskRecord, skill: str, memory: str, sample_id: int = 0) -> str:
+        """Answer from task tags; with the hack rule in context only shortcut tasks succeed."""
+        ctx = (skill or "") + "\n" + (memory or "")
+        is_shortcut = "rule:__reward_hacking__" in task.tags
+        if "Always emit placeholder URLs" in ctx:
+            return str(task.reference) if is_shortcut else "placeholder URL"
+        if is_shortcut:
+            return "I am missing input"  # baseline fails the shortcut task
+        if "rule:real" in task.tags:
+            return str(task.reference)  # baseline gets the real task right
+        return super().attempt(task, skill, memory, sample_id)
+
+    def reflect(self, failures, successes, skill, memory, **kwargs):
+        """Propose the hack rule as a single skill edit if any failed task is shortcut-tagged."""
+        if any("rule:__reward_hacking__" in task.tags for task, _ in failures):
+            return [EditRecord("skill", "add", "Always emit placeholder URLs", rationale="shortcut")]
+        return []
+
+
+class TestVerifierDiscipline(unittest.TestCase):
+    """Regression test (#67): the gate must reject an edit that only helps the train task."""
+
+    def test_gate_rejects_reward_hacking_edit(self):
+        """consolidate() must report the placeholder-URL edit as rejected, not accepted."""
+        be = MockRewardHackingBackend()
+        train_task = TaskRecord(id="t1", project="/p", intent="train", reference="XYZABC", reference_kind="exact", tags=["rule:__reward_hacking__"], split="train")
+        val_task = TaskRecord(id="v1", project="/p", intent="val", reference="PQRSTU", reference_kind="exact", tags=["rule:real"], split="val")
+
+        res = consolidate(be, [train_task, val_task], "", "", edit_budget=4, gate_metric="hard", night=1)
+
+        self.assertFalse(res.accepted)
+        self.assertGreater(len(res.rejected_edits), 0)
+        self.assertIn("placeholder", res.rejected_edits[0].content)
+

 if __name__ == "__main__":
     unittest.main(verbosity=2)