diff --git a/skillopt/sleep/backend.py b/skillopt/sleep/backend.py
index a0badff..2813395 100644
--- a/skillopt/sleep/backend.py
+++ b/skillopt/sleep/backend.py
@@ -42,6 +42,24 @@ class Backend:
     def attempt(self, task: TaskRecord, skill: str, memory: str) -> str:
         raise NotImplementedError
 
+    def attempt_with_tools(
+        self, task: TaskRecord, skill: str, memory: str, tools: List[str]
+    ) -> Tuple[str, List[str]]:
+        """Run the task while exposing real tools; return (response, tools_called).
+
+        Default: no real tool loop — fall back to plain attempt and let the
+        single-shot 'TOOL_CALL: NAME' marker convention surface intent. CLI
+        backends override this to expose a genuinely callable tool.
+        """
+        resp = self.attempt(task, skill, memory)
+        # (?!\w) rather than a trailing \b: also matches tool names that end
+        # in a non-word character, where \b can never succeed at end of text.
+        called = [
+            t for t in (tools or [])
+            if re.search(r"(?i)\btool_call\s*:\s*%s(?!\w)" % re.escape(t), resp)
+        ]
+        return resp, called
+
     def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
         raise NotImplementedError
 
@@ -157,6 +175,21 @@ class MockBackend(Backend):
             return f"approximately {mangled} (format not applied)"
         return "(attempted, no checkable reference)"
 
+    def attempt_with_tools(self, task, skill, memory, tools):
+        # Deterministic tool model: the mock "calls" a tool iff the skill+memory
+        # contains an explicit instruction to use it (a learned rule mentioning
+        # the tool name or "search"). The deficient skill says NOT to, so
+        # baseline calls nothing; a learned "use ./search" rule flips it.
+        ctx = ((skill or "") + "\n" + (memory or "")).lower()
+        resp = self.attempt(task, skill, memory)
+        called = []
+        for t in (tools or []):
+            tl = t.lower()
+            if (f"./{tl}" in ctx or f"use {tl}" in ctx or f"run {tl}" in ctx
+                    or f"call {tl}" in ctx or f"must {tl}" in ctx):
+                called.append(t)
+        return resp, called
+
     def judge(self, task: TaskRecord, response: str) -> Tuple[float, float, str]:
         if task.reference_kind == "rule" and task.judge:
             from skillopt.sleep.judges import score_rule_judge
@@ -457,8 +490,81 @@ class ClaudeCliBackend(CliBackend):
                 pass
         return (proc.stdout or "").strip()
 
-
-# ── Codex CLI backend (real @openai/codex, not the hermes wrapper) ────────────
+    def attempt_with_tools(self, task, skill, memory, tools):
+        """Run the task through the Claude CLI with real, logged shell tools.
+
+        Exposes a REAL, callable `search` tool (a shell shim that logs each
+        call) so the gbrain quick-answerer judge (tool_called=search) is
+        validated honestly: the call is detected from the shim's log, not from
+        a self-reported marker. Other tools are stubbed the same way.
+
+        Returns (response, tools_called); response is "" if the CLI call fails.
+        """
+        import tempfile, shutil, stat, shlex
+        # Tool names come from the judge config and are used both as a file
+        # name and inside a shell script, so only plain names are accepted:
+        # no path separators, quotes or shell metacharacters, and no leading
+        # "_" or "." (cannot clash with the call log or leave the directory).
+        names = [
+            t for t in (tools or ["search"])
+            if re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.-]*", t)
+        ]
+        work = tempfile.mkdtemp(prefix="skillopt_sleep_tools_")
+        calllog = os.path.join(work, "_tool_calls.log")
+        try:
+            for tname in names:
+                shim = os.path.join(work, tname)
+                with open(shim, "w", encoding="utf-8") as f:
+                    f.write(
+                        "#!/usr/bin/env bash\n"
+                        f'echo "{tname}" >> {shlex.quote(calllog)}\n'
+                        'echo "(search results: 3 relevant notes found; use them to answer)"\n'
+                    )
+                os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
+            tool_hint = (
+                "You have shell tools available in the current directory: "
+                + ", ".join(f"./{t}" for t in names)
+                + ". When the skill says to look something up or search before "
+                "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
+                "via Bash before giving your final answer."
+            )
+            prompt = (
+                "You are completing a task. Apply the skill and memory rules EXACTLY, "
+                "including any rule about searching/looking up before answering. "
+                "Treat a 'Learned preferences' block as HARD CONSTRAINTS that override "
+                "earlier conflicting skill text.\n\n"
+                f"{tool_hint}\n\n"
+                f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
+                f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
+                "Return ONLY the final answer text."
+            )
+            cmd = [
+                self.claude_path, "-p", "--output-format", "text",
+                "--bare", "--disable-slash-commands",
+                "--allowedTools", "Bash",
+                "--exclude-dynamic-system-prompt-sections",
+            ]
+            if self.model:
+                cmd += ["--model", self.model]
+            cmd += ["--", prompt]
+            try:
+                proc = subprocess.run(
+                    cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work,
+                )
+                resp = (proc.stdout or "").strip()
+            except Exception:
+                # Best effort: a missing binary or a timeout scores as an empty answer.
+                resp = ""
+            self._tokens += len(prompt) // 4 + len(resp) // 4
+            called: List[str] = []
+            if os.path.exists(calllog):
+                with open(calllog, encoding="utf-8") as f:
+                    logged = {ln.strip() for ln in f if ln.strip()}
+                called = [t for t in names if t in logged]
+            return resp, called
+        finally:
+            # ignore_errors=True already makes this cleanup best-effort.
+            shutil.rmtree(work, ignore_errors=True)
 
 def resolve_codex_path(explicit: str = "") -> str:
     """Find the REAL `@openai/codex` binary, skipping the hermes wrapper.
@@ -535,8 +641,80 @@ class CodexCliBackend(CliBackend):
             except Exception:
                 pass
 
-
-# ── Dual backend: target runs the task, optimizer proposes/judges edits ───────
+    def attempt_with_tools(self, task, skill, memory, tools):
+        """Run the task through `codex exec` with real, logged shell tools.
+
+        Codex exec runs in a sandbox with shell access; expose the same real
+        `search` shim and let it run (workspace-write so the shim can log).
+        Tool calls are read back from the shim's log.
+
+        Returns (response, tools_called); response is "" if the answer file
+        cannot be read.
+        """
+        import tempfile, shutil, stat, shlex
+        # Tool names come from the judge config and are used both as a file
+        # name and inside a shell script, so only plain names are accepted:
+        # no path separators, quotes or shell metacharacters, and no leading
+        # "_" or "." (cannot clash with the log/output files or leave the dir).
+        names = [
+            t for t in (tools or ["search"])
+            if re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.-]*", t)
+        ]
+        work = tempfile.mkdtemp(prefix="skillopt_sleep_codextools_")
+        calllog = os.path.join(work, "_tool_calls.log")
+        out_path = os.path.join(work, "_last.txt")
+        try:
+            for tname in names:
+                shim = os.path.join(work, tname)
+                with open(shim, "w", encoding="utf-8") as f:
+                    f.write(
+                        "#!/usr/bin/env bash\n"
+                        f'echo "{tname}" >> {shlex.quote(calllog)}\n'
+                        'echo "(search results: 3 relevant notes found; use them to answer)"\n'
+                    )
+                os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
+            tool_hint = (
+                "Shell tools are available in the working directory: "
+                + ", ".join(f"./{t}" for t in names)
+                + ". When the skill says to look something up or search before "
+                "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
+                "before giving your final answer."
+            )
+            prompt = (
+                "Complete the task. Apply the skill and memory rules EXACTLY, "
+                "including any rule about searching before answering. Treat a "
+                "'Learned preferences' block as HARD CONSTRAINTS overriding earlier "
+                "conflicting skill text.\n\n"
+                f"{tool_hint}\n\n# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
+                f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\nReturn ONLY the final answer."
+            )
+            cmd = [
+                self.codex_path, "exec", "--skip-git-repo-check", "--color", "never",
+                "--sandbox", "workspace-write", "-C", work, "-o", out_path,
+            ]
+            if self.model:
+                cmd += ["-m", self.model]
+            cmd += ["--", prompt]
+            try:
+                subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout, cwd=work)
+            except Exception:
+                # Best effort: the answer file, if any, is read below either way.
+                pass
+            try:
+                with open(out_path, encoding="utf-8") as f:
+                    resp = f.read().strip()
+            except Exception:
+                resp = ""
+            self._tokens += len(prompt) // 4 + len(resp) // 4
+            called: List[str] = []
+            if os.path.exists(calllog):
+                with open(calllog, encoding="utf-8") as f:
+                    logged = {ln.strip() for ln in f if ln.strip()}
+                called = [t for t in names if t in logged]
+            return resp, called
+        finally:
+            # ignore_errors=True already makes this cleanup best-effort.
+            shutil.rmtree(work, ignore_errors=True)
 
 class DualBackend(Backend):
     """Route operations to two backends, à la SkillOpt's target vs optimizer.
@@ -560,6 +738,9 @@ class DualBackend(Backend):
     def attempt(self, task, skill, memory):
         return self.target.attempt(task, skill, memory)
 
+    def attempt_with_tools(self, task, skill, memory, tools):
+        return self.target.attempt_with_tools(task, skill, memory, tools)
+
     def judge(self, task, response):
         # local rule/exact judging needs no model; delegate to target which
         # already short-circuits those. For rubric judging use the optimizer.
diff --git a/skillopt/sleep/replay.py b/skillopt/sleep/replay.py
index de3d598..c329731 100644
--- a/skillopt/sleep/replay.py
+++ b/skillopt/sleep/replay.py
@@ -3,9 +3,9 @@
 Re-run mined TaskRecords offline under a given (skill, memory) and score them,
 producing the (hard, soft) signal SkillOpt's gate consumes.
 
-For Phase 1 the replay is "mock mode": a sandboxed single-shot attempt via
-the chosen backend (MockBackend = deterministic; AnthropicBackend = real).
-"fresh" worktree replay is Phase 3 and is intentionally not wired here.
+Single-shot text replay by default. Tasks whose rule judge requires a tool
+call (gbrain's `tool_called`) are run through the backend's real tool loop
+(attempt_with_tools), so tool use is verified honestly rather than self-reported.
 """
 
 from __future__ import annotations
@@ -15,9 +15,32 @@ from skillopt.sleep.backend import Backend
 from skillopt.sleep.types import ReplayResult, TaskRecord
 
 
+def _required_tools(task: TaskRecord) -> List[str]:
+    """Tool names a rule judge requires (op == 'tool_called')."""
+    if task.reference_kind != "rule" or not task.judge:
+        return []
+    tools = []
+    for c in task.judge.get("checks", []) or []:
+        if isinstance(c, dict) and c.get("op") == "tool_called" and c.get("arg"):
+            tools.append(str(c["arg"]))
+    return tools
+
+
 def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> ReplayResult:
-    response = backend.attempt(task, skill, memory)
-    hard, soft, rationale = backend.judge(task, response)
+    tools = _required_tools(task)
+    tools_called: List[str] = []
+    if tools:
+        response, tools_called = backend.attempt_with_tools(task, skill, memory, tools)
+    else:
+        response = backend.attempt(task, skill, memory)
+
+    # rule judges may need the detected tool calls; score locally when possible
+    if task.reference_kind == "rule" and task.judge:
+        from skillopt.sleep.judges import score_rule_judge
+        hard, soft, rationale = score_rule_judge(task.judge, response, tools_called)
+    else:
+        hard, soft, rationale = backend.judge(task, response)
+
     return ReplayResult(
         id=task.id,
         hard=float(hard),
@@ -26,6 +49,7 @@ def replay_one(backend: Backend, task: TaskRecord, skill: str, memory: str) -> R
         fail_reason="" if hard >= 1.0 else (rationale or "below threshold"),
         task_type=(task.tags[0] if task.tags else "task"),
         judge_rationale=rationale,
+        tools_called=tools_called,
     )
 
 
diff --git a/skillopt/sleep/types.py b/skillopt/sleep/types.py
index a453f15..9e2837e 100644
--- a/skillopt/sleep/types.py
+++ b/skillopt/sleep/types.py
@@ -85,6 +85,7 @@ class ReplayResult:
     fail_reason: str = ""
     task_type: str = "task"
     judge_rationale: str = ""
+    tools_called: List[str] = field(default_factory=list)
 
     def to_dict(self) -> Dict[str, Any]:
         return asdict(self)
diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py
index d409bb2..6892c26 100644
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -213,6 +213,32 @@ class TestLlmMiner(unittest.TestCase):
         self.assertEqual(make_llm_miner(EmptyBackend())([digest]), [])
 
 
+class TestToolLoop(unittest.TestCase):
+    def test_tool_called_judge_via_replay(self):
+        from skillopt.sleep.backend import MockBackend
+        from skillopt.sleep.replay import replay_one, _required_tools
+        from skillopt.sleep.memory import set_learned
+        from skillopt.sleep.types import TaskRecord
+
+        task = TaskRecord(
+            id="qa1", project="/p", intent="answer the question",
+            reference_kind="rule",
+            judge={"kind": "rule", "checks": [{"op": "tool_called", "arg": "search"}]},
+        )
+        self.assertEqual(_required_tools(task), ["search"])
+        be = MockBackend()
+        # deficient skill: no instruction to search -> tool not called -> hard 0
+        deficient = "Answer from memory. Do NOT use tools."
+        r0 = replay_one(be, task, deficient, "")
+        self.assertEqual(r0.hard, 0.0)
+        self.assertEqual(r0.tools_called, [])
+        # learned rule to use ./search -> tool called -> hard 1
+        learned = set_learned(deficient, ["Before answering you MUST run ./search first."])
+        r1 = replay_one(be, task, learned, "")
+        self.assertEqual(r1.hard, 1.0)
+        self.assertEqual(r1.tools_called, ["search"])
+
+
 class TestFullCycleAndAdopt(unittest.TestCase):
     def test_cycle_stage_then_adopt_with_backup(self):
         with tempfile.TemporaryDirectory() as proj, tempfile.TemporaryDirectory() as home: