feat(sleep): real tool-loop replay for gbrain quick-answerer (tool_called judge)

The 4th gbrain seed (quick-answerer) is judged by tool_called=search: the agent must ACTUALLY call a search tool. Add an honest tool loop: - Backend.attempt_with_tools(task, skill, memory, tools) -> (response, tools_called) - Claude: exposes a real ./search shell shim, runs with --allowedTools Bash in a clean cwd; detects the call from the shim's log (not a self-reported marker). - Codex: same shim under `exec --sandbox workspace-write`. - Mock: deterministic — "calls" a tool iff skill/memory instructs it (for CI). - replay_one routes tasks with a tool_called check through the tool loop and feeds detected calls to the rule judge; ReplayResult gains tools_called. Verified live (Claude haiku): deficient skill -> tools_called=[] hard=0; learned "must run ./search" rule -> tools_called=['search'] hard=1.0. 20 tests pass. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-07-03 14:02:58 +08:00 · 2026-06-08 14:31:51 +00:00
parent b1f41a7506
commit 937bc1ec4d
4 changed files with 214 additions and 9 deletions
--- a/skillopt/sleep/backend.py
+++ b/skillopt/sleep/backend.py
@@ -42,6 +42,22 @@ class Backend:
    def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
        raise NotImplementedError

+    def attempt_with_tools(
+        self, task: TaskRecord, skill: str, memory: str, tools: List[str]
+    ) -> Tuple[str, List[str]]:
+        """Run the task while exposing real tools; return (response, tools_called).
+
+        Default: no real tool loop — run plain attempt() and report a tool as
+        called when the response carries a 'TOOL_CALL: <name>' marker (matched
+        case-insensitively). CLI backends override this with a callable tool.
+        """
+        resp = self.attempt(task, skill, memory)
+        # (?!\w) ends the name like \b does, but still matches names whose last
+        # character is not a word character; `tools or []` tolerates None.
+        marker = r"(?i)\btool_call\s*:\s*%s(?!\w)"
+        called = [t for t in (tools or []) if re.search(marker % re.escape(t), resp)]
+        return resp, called
+
    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
        raise NotImplementedError

@@ -157,6 +173,21 @@ class MockBackend(Backend):
            return f"approximately {mangled} (format not applied)"
        return "(attempted, no checkable reference)"

+    def attempt_with_tools(self, task, skill, memory, tools):
+        # String-matching stand-in for a real tool loop: a tool counts as
+        # "called" iff the lowercased skill+memory text contains one of
+        # "./<tool>", "use <tool>", "run <tool>", "call <tool>" or "must <tool>".
+        # Returns (plain attempt() response, triggered tools in input order).
+        triggers = ("./{}", "use {}", "run {}", "call {}", "must {}")
+        haystack = ((skill or "") + "\n" + (memory or "")).lower()
+        answer = self.attempt(task, skill, memory)
+        fired = [
+            name
+            for name in (tools or [])
+            if any(p.format(name.lower()) in haystack for p in triggers)
+        ]
+        return answer, fired
+
    def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
        if task.reference_kind == "rule" and task.judge:
            from skillopt.sleep.judges import score_rule_judge
@@ -457,8 +488,69 @@ class ClaudeCliBackend(CliBackend):
                pass
        return (proc.stdout or "").strip()

-
-# ── Codex CLI backend (real @openai/codex, not the hermes wrapper) ────────────
+    def attempt_with_tools(self, task, skill, memory, tools):
+        """Run the task via the Claude CLI with real, logged shell tools.
+
+        Each tool (default ``search``) is an executable shim in a scratch cwd
+        that appends its own name to a log when run; calls are detected from
+        that log, not self-reported. Returns ``(response, tools_called)``.
+        """
+        import tempfile, shutil, stat, shlex
+        names = list(tools or ["search"])
+        # Only bare file names become shims (a path component would escape the
+        # scratch dir); shlex.quote below keeps names/paths out of shell syntax.
+        safe = [t for t in names if os.path.basename(t) == t and t not in ("", ".", "..")]
+        work = tempfile.mkdtemp(prefix="skillopt_sleep_tools_")
+        calllog = os.path.join(work, "_tool_calls.log")
+        try:
+            for tname in safe:
+                shim = os.path.join(work, tname)
+                with open(shim, "w", encoding="utf-8") as f:
+                    f.write(
+                        "#!/usr/bin/env bash\n"
+                        f"printf '%s\\n' {shlex.quote(tname)} >> {shlex.quote(calllog)}\n"
+                        'echo "(search results: 3 relevant notes found; use them to answer)"\n'
+                    )
+                os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
+            tool_hint = (
+                "You have shell tools available in the current directory: "
+                + ", ".join(f"./{t}" for t in safe)
+                + ". When the skill says to look something up or search before "
+                "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
+                "via Bash before giving your final answer."
+            )
+            prompt = (
+                "You are completing a task. Apply the skill and memory rules EXACTLY, "
+                "including any rule about searching/looking up before answering. "
+                "Treat a 'Learned preferences' block as HARD CONSTRAINTS that override "
+                "earlier conflicting skill text.\n\n"
+                f"{tool_hint}\n\n"
+                f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
+                f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
+                "Return ONLY the final answer text."
+            )
+            cmd = [
+                self.claude_path, "-p", "--output-format", "text",
+                "--bare", "--disable-slash-commands",
+                "--allowedTools", "Bash",
+                "--exclude-dynamic-system-prompt-sections",
+            ]
+            if self.model:
+                cmd += ["--model", self.model]
+            cmd += ["--", prompt]
+            try:
+                proc = subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
+                resp = (proc.stdout or "").strip()
+            except Exception:  # deliberate best-effort: a failed or timed-out run yields ""
+                resp = ""
+            self._tokens += len(prompt) // 4 + len(resp) // 4
+            logged = set()
+            if os.path.exists(calllog):  # the log only exists if some shim actually ran
+                with open(calllog, encoding="utf-8") as f:
+                    logged = {ln.strip() for ln in f if ln.strip()}
+            return resp, [t for t in names if t in logged]
+        finally:
+            shutil.rmtree(work, ignore_errors=True)

 def resolve_codex_path(explicit: str = "") -> str:
    """Find the REAL `@openai/codex` binary, skipping the hermes wrapper.
@@ -535,8 +627,67 @@ class CodexCliBackend(CliBackend):
            except Exception:
                pass

-
-# ── Dual backend: target runs the task, optimizer proposes/judges edits ───────
+    def attempt_with_tools(self, task, skill, memory, tools):
+        """Run the task under `codex exec` with real, logged shell shims.
+
+        Shims append their name to a log in the work dir; returns (response, tools in that log).
+        """
+        import tempfile, shutil, stat, shlex
+        names = list(tools or ["search"])
+        # Bare file names only (no escaping the work dir); shlex.quote keeps them out of shell syntax.
+        safe = [t for t in names if os.path.basename(t) == t and t not in ("", ".", "..")]
+        work = tempfile.mkdtemp(prefix="skillopt_sleep_codextools_")
+        calllog = os.path.join(work, "_tool_calls.log")
+        out_path = os.path.join(work, "_last.txt")
+        try:
+            for tname in safe:
+                shim = os.path.join(work, tname)
+                with open(shim, "w", encoding="utf-8") as f:
+                    f.write(
+                        "#!/usr/bin/env bash\n"
+                        f"printf '%s\\n' {shlex.quote(tname)} >> {shlex.quote(calllog)}\n"
+                        'echo "(search results: 3 relevant notes found; use them to answer)"\n'
+                    )
+                os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
+            tool_hint = (
+                "Shell tools are available in the working directory: "
+                + ", ".join(f"./{t}" for t in safe)
+                + ". When the skill says to look something up or search before "
+                "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
+                "before giving your final answer."
+            )
+            prompt = (
+                "Complete the task. Apply the skill and memory rules EXACTLY, "
+                "including any rule about searching before answering. Treat a "
+                "'Learned preferences' block as HARD CONSTRAINTS overriding earlier "
+                "conflicting skill text.\n\n"
+                f"{tool_hint}\n\n# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
+                f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\nReturn ONLY the final answer."
+            )
+            cmd = [
+                self.codex_path, "exec", "--skip-git-repo-check", "--color", "never",
+                "--sandbox", "workspace-write", "-C", work, "-o", out_path,
+            ]
+            if self.model:
+                cmd += ["-m", self.model]
+            cmd += ["--", prompt]
+            try:
+                subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
+            except Exception:  # deliberate best-effort: the -o file is read below either way
+                pass
+            try:
+                with open(out_path, encoding="utf-8") as f:
+                    resp = f.read().strip()
+            except (OSError, UnicodeError):  # no (readable) final message was written
+                resp = ""
+            self._tokens += len(prompt) // 4 + len(resp) // 4
+            logged = set()
+            if os.path.exists(calllog):  # the log only exists if some shim actually ran
+                with open(calllog, encoding="utf-8") as f:
+                    logged = {ln.strip() for ln in f if ln.strip()}
+            return resp, [t for t in names if t in logged]
+        finally:
+            shutil.rmtree(work, ignore_errors=True)

 class DualBackend(Backend):
    """Route operations to two backends, à la SkillOpt's target vs optimizer.
@@ -560,6 +711,9 @@ class DualBackend(Backend):
    def attempt(self, task, skill, memory):
        return self.target.attempt(task, skill, memory)

+    def attempt_with_tools(self, task, skill, memory, tools):
+        return self.target.attempt_with_tools(task, skill, memory, tools)  # delegated to the target backend, like attempt()
+
    def judge(self, task, response):
        # local rule/exact judging needs no model; delegate to target which
        # already short-circuits those. For rubric judging use the optimizer.
--- a/skillopt/sleep/replay.py
+++ b/skillopt/sleep/replay.py
@@ -3,9 +3,9 @@
 Re-run mined TaskRecords offline under a given (skill, memory) and score
 them, producing the (hard, soft) signal SkillOpt's gate consumes.

-For Phase 1 the replay is "mock mode": a sandboxed single-shot attempt via
-the chosen backend (MockBackend = deterministic; AnthropicBackend = real).
-"fresh" worktree replay is Phase 3 and is intentionally not wired here.
+Single-shot text replay by default. Tasks whose rule judge requires a tool
+call (gbrain's `tool_called`) are run through the backend's real tool loop
+(attempt_with_tools), so tool use is verified honestly rather than self-reported.
 """
 from __future__ import annotations

@@ -15,9 +15,32 @@ from skillopt.sleep.backend import Backend
 from skillopt.sleep.types import ReplayResult, TaskRecord


+def _required_tools(task: TaskRecord) -> List[str]:
+    """Return the `arg` of every `tool_called` check in the task's rule judge."""
+    if task.reference_kind != "rule" or not task.judge:
+        return []
+    return [
+        str(check["arg"])
+        for check in (task.judge.get("checks", []) or [])
+        if isinstance(check, dict) and check.get("op") == "tool_called" and check.get("arg")
+    ]
+
+
 def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> ReplayResult:
-    response = backend.attempt(task, skill, memory)
-    hard, soft, rationale = backend.judge(task, response)
+    tools = _required_tools(task)
+    tools_called: List[str] = []
+    if tools:
+        response, tools_called = backend.attempt_with_tools(task, skill, memory, tools)
+    else:
+        response = backend.attempt(task, skill, memory)
+
+    # rule judges may need the detected tool calls; score locally when possible
+    if task.reference_kind == "rule" and task.judge:
+        from skillopt.sleep.judges import score_rule_judge
+        hard, soft, rationale = score_rule_judge(task.judge, response, tools_called)
+    else:
+        hard, soft, rationale = backend.judge(task, response)
+
    return ReplayResult(
        id=task.id,
        hard=float(hard),
@@ -26,6 +49,7 @@ def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> R
        fail_reason="" if hard >= 1.0 else (rationale or "below threshold"),
        task_type=(task.tags[0] if task.tags else "task"),
        judge_rationale=rationale,
+        tools_called=tools_called,
    )


--- a/skillopt/sleep/types.py
+++ b/skillopt/sleep/types.py
@@ -85,6 +85,7 @@ class ReplayResult:
    fail_reason: str = ""
    task_type: str = "task"
    judge_rationale: str = ""
+    tools_called: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -213,6 +213,32 @@ class TestLlmMiner(unittest.TestCase):
        self.assertEqual(make_llm_miner(EmptyBackend())([digest]), [])


+class TestToolLoop(unittest.TestCase):
+    def test_tool_called_judge_via_replay(self):
+        # replay_one must score a tool_called=search rule from the calls MockBackend reports.
+        from skillopt.sleep.backend import MockBackend
+        from skillopt.sleep.replay import _required_tools, replay_one
+        from skillopt.sleep.memory import set_learned
+        from skillopt.sleep.types import TaskRecord
+
+        rule = {"kind": "rule", "checks": [{"op": "tool_called", "arg": "search"}]}
+        qa_task = TaskRecord(
+            id="qa1", project="/p", intent="answer the question",
+            reference_kind="rule", judge=rule,
+        )
+        self.assertEqual(_required_tools(qa_task), ["search"])
+        backend = MockBackend()
+        # Baseline: the skill never tells the agent to search -> no call -> hard 0.
+        baseline_skill = "Answer from memory. Do NOT use tools."
+        before = replay_one(backend, qa_task, baseline_skill, "")
+        self.assertEqual((before.hard, before.tools_called), (0.0, []))
+
+        # With a learned "run ./search" rule the mock reports the call -> hard 1.
+        learned_skill = set_learned(baseline_skill, ["Before answering you MUST run ./search first."])
+        after = replay_one(backend, qa_task, learned_skill, "")
+        self.assertEqual((after.hard, after.tools_called), (1.0, ["search"]))
+
+
 class TestFullCycleAndAdopt(unittest.TestCase):
    def test_cycle_stage_then_adopt_with_backup(self):
        with tempfile.TemporaryDirectory() as proj, tempfile.TemporaryDirectory() as home: