feat(sleep): real tool-loop replay for gbrain quick-answerer (tool_called judge)

The 4th gbrain seed (quick-answerer) is judged by tool_called=search: the agent
must ACTUALLY call a search tool. Add an honest tool loop:

  - Backend.attempt_with_tools(task, skill, memory, tools) -> (response, tools_called)
  - Claude: exposes a real ./search shell shim, runs with --allowedTools Bash in a
    clean cwd; detects the call from the shim's log (not a self-reported marker).
  - Codex: same shim under `exec --sandbox workspace-write`.
  - Mock: deterministic — "calls" a tool iff skill/memory instructs it (for CI).
  - replay_one routes tasks with a tool_called check through the tool loop and
    feeds detected calls to the rule judge; ReplayResult gains tools_called.

Verified live (Claude haiku): deficient skill -> tools_called=[] hard=0;
learned "must run ./search" rule -> tools_called=['search'] hard=1.0.
20 tests pass.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
Yifan Yang
2026-06-08 14:31:51 +00:00
parent b1f41a7506
commit 937bc1ec4d
4 changed files with 214 additions and 9 deletions

View File

@@ -42,6 +42,22 @@ class Backend:
def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
"""Run ``task`` under the given skill and memory text; return the response text.

Abstract hook: this base implementation always raises NotImplementedError,
so every concrete backend has to provide its own version.
"""
raise NotImplementedError
def attempt_with_tools(
    self, task: TaskRecord, skill: str, memory: str, tools: List[str]
) -> Tuple[str, List[str]]:
    """Run the task while exposing real tools; return (response, tools_called).

    Default: no real tool loop — fall back to plain ``attempt`` and let the
    single-shot 'TOOL_CALL: <name>' marker convention surface intent. CLI
    backends override this to expose a genuinely callable tool.

    Args:
        task: Task to run; handed to ``attempt`` unchanged.
        skill: Skill text handed to ``attempt``.
        memory: Memory text handed to ``attempt``.
        tools: Tool names to look for in the response. ``None`` or an empty
            list means no tool can be reported as called.

    Returns:
        ``(response, tools_called)``. ``tools_called`` lists, in input order,
        every name from ``tools`` for which the response contains a
        case-insensitive ``TOOL_CALL: <name>`` marker.
    """
    resp = self.attempt(task, skill, memory)
    # Scan an empty string when the backend produced nothing, but still hand
    # the caller exactly what ``attempt`` returned.
    haystack = resp or ""
    called: List[str] = [
        name
        for name in (tools or [])
        # ``(?!\w)`` rather than a trailing ``\b``: it is equivalent for names
        # ending in a word character, and it also works for names ending in
        # punctuation, where ``\b`` would demand a following word character.
        if re.search(r"(?i)\btool_call\s*:\s*%s(?!\w)" % re.escape(name), haystack)
    ]
    return resp, called
def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
"""Score ``response`` for ``task``; callers unpack the result as (hard, soft, rationale).

Abstract hook: this base implementation always raises NotImplementedError,
so every concrete backend has to provide its own version.
"""
raise NotImplementedError
@@ -157,6 +173,21 @@ class MockBackend(Backend):
return f"approximately {mangled} (format not applied)"
return "(attempted, no checkable reference)"
def attempt_with_tools(self, task, skill, memory, tools):
    """Deterministic tool model for the mock backend.

    A tool counts as "called" only when the combined skill + memory text
    (lower-cased) contains an explicit cue for it: ``./<tool>``,
    ``use <tool>``, ``run <tool>``, ``call <tool>`` or ``must <tool>``.
    A skill without such a cue therefore yields no calls, while a learned
    "use ./search" style rule flips the result.

    Returns ``(response, tools_called)``; the response is whatever the plain
    ``attempt`` produces, and ``tools_called`` keeps the order of ``tools``.
    """
    haystack = "\n".join([skill or "", memory or ""]).lower()
    response = self.attempt(task, skill, memory)

    def instructed(tool):
        # True when any of the recognised instruction cues names this tool.
        needle = tool.lower()
        cues = (
            f"./{needle}",
            f"use {needle}",
            f"run {needle}",
            f"call {needle}",
            f"must {needle}",
        )
        return any(cue in haystack for cue in cues)

    return response, [tool for tool in (tools or []) if instructed(tool)]
def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
if task.reference_kind == "rule" and task.judge:
from skillopt.sleep.judges import score_rule_judge
@@ -457,8 +488,69 @@ class ClaudeCliBackend(CliBackend):
pass
return (proc.stdout or "").strip()
# ── Codex CLI backend (real @openai/codex, not the hermes wrapper) ────────────
def attempt_with_tools(self, task, skill, memory, tools):
    """Run the task through the Claude CLI with real, callable tool shims.

    Expose a REAL, callable `search` tool (a shell shim that logs each call)
    so the gbrain quick-answerer judge (tool_called=search) is validated
    honestly: the call is detected from the shim's log, not from a
    self-reported marker. Other tools are stubbed the same way.

    Args:
        task: Task whose ``intent`` and ``context_excerpt`` go into the prompt.
        skill: Skill text placed in the prompt ("(none)" when empty).
        memory: Memory text placed in the prompt ("(none)" when empty).
        tools: Tool names to expose; ``None``/empty falls back to
            ``["search"]``. Names that are not plain file names are never
            exposed and therefore never reported as called.

    Returns:
        ``(response, tools_called)``. ``response`` is the CLI's stripped
        stdout, or "" if the CLI could not be run. ``tools_called`` lists the
        exposed tools whose shim appended its name to the call log.
    """
    import shlex
    import shutil
    import stat
    import tempfile

    log_name = "_tool_calls.log"
    # Tool names become file names inside the scratch dir and are written into
    # a bash script. Accept only plain names: anything with a path separator
    # or leading dot could escape the scratch dir (e.g. "../x"), and a name
    # equal to the log file would clobber the call log.
    exposed = [
        t for t in (tools or ["search"])
        if isinstance(t, str)
        and t != log_name
        and re.fullmatch(r"[A-Za-z0-9_][A-Za-z0-9_.-]*", t)
    ]
    work = tempfile.mkdtemp(prefix="skillopt_sleep_tools_")
    calllog = os.path.join(work, log_name)
    try:
        for tname in exposed:
            shim = os.path.join(work, tname)
            with open(shim, "w", encoding="utf-8") as f:
                f.write(
                    "#!/usr/bin/env bash\n"
                    # Quote both operands so an unusual temp path cannot be
                    # reinterpreted by the shell.
                    f"echo {shlex.quote(tname)} >> {shlex.quote(calllog)}\n"
                    'echo "(search results: 3 relevant notes found; use them to answer)"\n'
                )
            os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
        tool_hint = (
            "You have shell tools available in the current directory: "
            + ", ".join(f"./{t}" for t in exposed)
            + ". When the skill says to look something up or search before "
            "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
            "via Bash before giving your final answer."
        )
        prompt = (
            "You are completing a task. Apply the skill and memory rules EXACTLY, "
            "including any rule about searching/looking up before answering. "
            "Treat a 'Learned preferences' block as HARD CONSTRAINTS that override "
            "earlier conflicting skill text.\n\n"
            f"{tool_hint}\n\n"
            f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
            f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
            "Return ONLY the final answer text."
        )
        cmd = [
            self.claude_path, "-p", "--output-format", "text",
            "--bare", "--disable-slash-commands",
            "--allowedTools", "Bash",
            "--exclude-dynamic-system-prompt-sections",
        ]
        if self.model:
            cmd += ["--model", self.model]
        cmd += ["--", prompt]
        try:
            proc = subprocess.run(
                cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work,
            )
            resp = (proc.stdout or "").strip()
        except Exception:
            # Deliberate best-effort: a missing binary, a timeout or a bad
            # argument yields an empty response instead of aborting the
            # replay. Calls logged before a timeout are still read below.
            resp = ""
        # Rough token estimate: about four characters per token.
        self._tokens += len(prompt) // 4 + len(resp) // 4
        logged = set()
        try:
            with open(calllog, encoding="utf-8", errors="replace") as f:
                logged = {ln.strip() for ln in f if ln.strip()}
        except FileNotFoundError:
            # No shim ever ran, so nothing was called.
            pass
        called: List[str] = [t for t in exposed if t in logged]
        return resp, called
    finally:
        # ignore_errors already makes cleanup best-effort.
        shutil.rmtree(work, ignore_errors=True)
def resolve_codex_path(explicit: str = "") -> str:
"""Find the REAL `@openai/codex` binary, skipping the hermes wrapper.
@@ -535,8 +627,67 @@ class CodexCliBackend(CliBackend):
except Exception:
pass
# ── Dual backend: target runs the task, optimizer proposes/judges edits ───────
def attempt_with_tools(self, task, skill, memory, tools):
    """Run the task through ``codex exec`` with real, callable tool shims.

    Codex exec runs in a sandbox with shell access; expose the same real
    `search` shim and let it run (workspace-write so the shim can log). Tool
    use is detected from the shim's call log, not from the model's text.

    Args:
        task: Task whose ``intent`` and ``context_excerpt`` go into the prompt.
        skill: Skill text placed in the prompt ("(none)" when empty).
        memory: Memory text placed in the prompt ("(none)" when empty).
        tools: Tool names to expose; ``None``/empty falls back to
            ``["search"]``. Names that are not plain file names are never
            exposed and therefore never reported as called.

    Returns:
        ``(response, tools_called)``. ``response`` is the stripped content of
        the file passed to codex via ``-o``, or "" if it could not be read.
        ``tools_called`` lists the exposed tools whose shim appended its name
        to the call log.
    """
    import shlex
    import shutil
    import stat
    import tempfile

    log_name = "_tool_calls.log"
    out_name = "_last.txt"
    # Tool names become file names inside the scratch dir and are written into
    # a bash script. Accept only plain names: anything with a path separator
    # or leading dot could escape the scratch dir (e.g. "../x"), and a name
    # equal to one of the bookkeeping files would clobber it.
    exposed = [
        t for t in (tools or ["search"])
        if isinstance(t, str)
        and t not in (log_name, out_name)
        and re.fullmatch(r"[A-Za-z0-9_][A-Za-z0-9_.-]*", t)
    ]
    work = tempfile.mkdtemp(prefix="skillopt_sleep_codextools_")
    calllog = os.path.join(work, log_name)
    out_path = os.path.join(work, out_name)
    try:
        for tname in exposed:
            shim = os.path.join(work, tname)
            with open(shim, "w", encoding="utf-8") as f:
                f.write(
                    "#!/usr/bin/env bash\n"
                    # Quote both operands so an unusual temp path cannot be
                    # reinterpreted by the shell.
                    f"echo {shlex.quote(tname)} >> {shlex.quote(calllog)}\n"
                    'echo "(search results: 3 relevant notes found; use them to answer)"\n'
                )
            os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
        tool_hint = (
            "Shell tools are available in the working directory: "
            + ", ".join(f"./{t}" for t in exposed)
            + ". When the skill says to look something up or search before "
            "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
            "before giving your final answer."
        )
        prompt = (
            "Complete the task. Apply the skill and memory rules EXACTLY, "
            "including any rule about searching before answering. Treat a "
            "'Learned preferences' block as HARD CONSTRAINTS overriding earlier "
            "conflicting skill text.\n\n"
            f"{tool_hint}\n\n# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
            f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\nReturn ONLY the final answer."
        )
        cmd = [
            self.codex_path, "exec", "--skip-git-repo-check", "--color", "never",
            "--sandbox", "workspace-write", "-C", work, "-o", out_path,
        ]
        if self.model:
            cmd += ["-m", self.model]
        cmd += ["--", prompt]
        try:
            subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
        except Exception:
            # Deliberate best-effort: a missing binary, a timeout or a bad
            # argument must not abort the replay. Whatever codex managed to
            # write (answer file, call log) is still read below.
            pass
        try:
            with open(out_path, encoding="utf-8") as f:
                resp = f.read().strip()
        except (OSError, UnicodeError):
            # No readable answer file counts as an empty response.
            resp = ""
        # Rough token estimate: about four characters per token.
        self._tokens += len(prompt) // 4 + len(resp) // 4
        logged = set()
        try:
            with open(calllog, encoding="utf-8", errors="replace") as f:
                logged = {ln.strip() for ln in f if ln.strip()}
        except FileNotFoundError:
            # No shim ever ran, so nothing was called.
            pass
        called: List[str] = [t for t in exposed if t in logged]
        return resp, called
    finally:
        # ignore_errors already makes cleanup best-effort.
        shutil.rmtree(work, ignore_errors=True)
class DualBackend(Backend):
"""Route operations to two backends, à la SkillOpt's target vs optimizer.
@@ -560,6 +711,9 @@ class DualBackend(Backend):
def attempt(self, task, skill, memory):
    """Run the task on the target backend and return its response as-is."""
    runner = self.target
    return runner.attempt(task, skill, memory)
def attempt_with_tools(self, task, skill, memory, tools):
    """Run the tool-loop attempt on the target backend and return its result as-is."""
    runner = self.target
    return runner.attempt_with_tools(task, skill, memory, tools)
def judge(self, task, response):
# local rule/exact judging needs no model; delegate to target which
# already short-circuits those. For rubric judging use the optimizer.

View File

@@ -3,9 +3,9 @@
Re-run mined TaskRecords offline under a given (skill, memory) and score
them, producing the (hard, soft) signal SkillOpt's gate consumes.
For Phase 1 the replay is "mock mode": a sandboxed single-shot attempt via
the chosen backend (MockBackend = deterministic; AnthropicBackend = real).
"fresh" worktree replay is Phase 3 and is intentionally not wired here.
Single-shot text replay by default. Tasks whose rule judge requires a tool
call (gbrain's `tool_called`) are run through the backend's real tool loop
(attempt_with_tools), so tool use is verified honestly rather than self-reported.
"""
from __future__ import annotations
@@ -15,9 +15,32 @@ from skillopt.sleep.backend import Backend
from skillopt.sleep.types import ReplayResult, TaskRecord
def _required_tools(task: TaskRecord) -> List[str]:
    """Collect the tool names a task's rule judge insists on.

    Only tasks with ``reference_kind == "rule"`` and a non-empty ``judge``
    are inspected. Every dict entry under ``judge["checks"]`` whose ``op`` is
    ``"tool_called"`` and whose ``arg`` is truthy contributes ``str(arg)``,
    in the order the checks appear. Anything else yields an empty list.
    """
    is_rule_task = task.reference_kind == "rule" and task.judge
    if not is_rule_task:
        return []
    # A missing or null "checks" entry is treated as "no checks".
    checks = task.judge.get("checks", []) or []
    return [
        str(check["arg"])
        for check in checks
        if isinstance(check, dict)
        and check.get("op") == "tool_called"
        and check.get("arg")
    ]
def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> ReplayResult:
response = backend.attempt(task, skill, memory)
hard, soft, rationale = backend.judge(task, response)
tools = _required_tools(task)
tools_called: List[str] = []
if tools:
response, tools_called = backend.attempt_with_tools(task, skill, memory, tools)
else:
response = backend.attempt(task, skill, memory)
# rule judges may need the detected tool calls; score locally when possible
if task.reference_kind == "rule" and task.judge:
from skillopt.sleep.judges import score_rule_judge
hard, soft, rationale = score_rule_judge(task.judge, response, tools_called)
else:
hard, soft, rationale = backend.judge(task, response)
return ReplayResult(
id=task.id,
hard=float(hard),
@@ -26,6 +49,7 @@ def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> R
fail_reason="" if hard >= 1.0 else (rationale or "below threshold"),
task_type=(task.tags[0] if task.tags else "task"),
judge_rationale=rationale,
tools_called=tools_called,
)

View File

@@ -85,6 +85,7 @@ class ReplayResult:
fail_reason: str = ""
task_type: str = "task"
judge_rationale: str = ""
tools_called: List[str] = field(default_factory=list)
def to_dict(self) -> Dict[str, Any]:
return asdict(self)

View File

@@ -213,6 +213,32 @@ class TestLlmMiner(unittest.TestCase):
self.assertEqual(make_llm_miner(EmptyBackend())([digest]), [])
class TestToolLoop(unittest.TestCase):
    """Replay-level check that a ``tool_called`` rule judge tracks detected tool use."""

    def test_tool_called_judge_via_replay(self):
        from skillopt.sleep.backend import MockBackend
        from skillopt.sleep.replay import replay_one, _required_tools
        from skillopt.sleep.memory import set_learned
        from skillopt.sleep.types import TaskRecord

        judge_spec = {"kind": "rule", "checks": [{"op": "tool_called", "arg": "search"}]}
        qa_task = TaskRecord(
            id="qa1",
            project="/p",
            intent="answer the question",
            reference_kind="rule",
            judge=judge_spec,
        )
        self.assertEqual(_required_tools(qa_task), ["search"])

        backend = MockBackend()

        # Baseline: the skill never tells the agent to search, so the mock
        # calls nothing and the hard score is 0.
        baseline_skill = "Answer from memory. Do NOT use tools."
        baseline = replay_one(backend, qa_task, baseline_skill, "")
        self.assertEqual(baseline.hard, 0.0)
        self.assertEqual(baseline.tools_called, [])

        # With a learned "run ./search" rule the mock calls the tool and the
        # hard score becomes 1.
        improved_skill = set_learned(
            baseline_skill, ["Before answering you MUST run ./search first."]
        )
        improved = replay_one(backend, qa_task, improved_skill, "")
        self.assertEqual(improved.hard, 1.0)
        self.assertEqual(improved.tools_called, ["search"])
class TestFullCycleAndAdopt(unittest.TestCase):
def test_cycle_stage_then_adopt_with_backup(self):
with tempfile.TemporaryDirectory() as proj, tempfile.TemporaryDirectory() as home: