feat(copilot): implement attempt_with_tools with cross-platform tool shims

Adds honest tool-call detection for CopilotCliBackend, mirroring the
Claude/Codex backends. Writes per-tool executable shims into the work dir
and detects real invocations from a calllog (not self-reported markers).
The Copilot backend is Windows-validated, so shims are cross-platform:
a .cmd batch shim on Windows and a chmod'd bash shim on POSIX, with an
OS-specific tool hint. Mirrors _call's flags/env (isolated COPILOT_HOME,
--allow-all-tools, MCP/instruction disabling) and the UTF-8 subprocess fix.

Adds test_attempt_with_tools_honest_detection: a CI-friendly, OS-aware
stub stands in for the CLI and runs the shim; the test then asserts both
JSONL parsing and log-based detection. Validated live on Windows (real
Copilot call) and on Linux/WSL (POSIX path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
DB Lee
2026-06-12 09:05:13 -07:00
parent 013a7cd83a
commit 5799695951
2 changed files with 153 additions and 2 deletions

View File

@@ -810,8 +810,10 @@ class CopilotCliBackend(CliBackend):
response to stdout on all platforms, so JSONL is used for robust capture.
The call runs in a clean temp cwd with streaming disabled and tools allowed
(so non-interactive mode never blocks on a permission prompt); the prompts
ask for final-answer text only, so no tool use is expected.
(so non-interactive mode never blocks on a permission prompt); ``_call``'s
prompts ask for final-answer text only, so no tool use is expected there,
while ``attempt_with_tools`` exposes real, cross-platform callable shims in
the working directory for honest tool-call detection.
Startup overhead is minimised: each invocation points ``COPILOT_HOME`` at a
dedicated, isolated config dir (no user ``mcp-config.json``, so the user's
@@ -896,6 +898,108 @@ class CopilotCliBackend(CliBackend):
parts.append(content)
return "\n".join(parts).strip()
def attempt_with_tools(self, task, skill, memory, tools):
    """Run ``task`` through the Copilot CLI with real, callable tool shims.

    One executable shim per tool name is written into a fresh temporary
    working directory. Each shim appends its own name to a call log and
    prints a canned search result, so a tool call is detected from the
    log the shim wrote rather than from a self-reported marker in the
    model's answer.

    Args:
        task: Object exposing ``intent`` and ``context_excerpt``.
        skill: Skill text placed in the prompt (``'(none)'`` when falsy).
        memory: Memory text placed in the prompt (``'(none)'`` when falsy).
        tools: Tool names to expose; defaults to ``["search"]`` when falsy.

    Returns:
        A ``(response, called)`` tuple. ``response`` is whatever
        ``self._parse_jsonl_response`` extracts from the CLI's stdout, or
        ``""`` if the invocation fails for any reason. ``called`` lists
        the requested tool names found in the call log, in request order.
    """
    # The Copilot CLI is the Windows-validated backend, so the shims must
    # be cross-platform: a bash `#!/usr/bin/env bash` + chmod shim does NOT
    # execute via `./tool` under PowerShell/cmd, so on Windows we emit a
    # `.cmd` batch shim instead.
    import shlex
    import shutil
    import stat

    tool_names = tools or ["search"]
    is_windows = os.name == "nt"
    canned_output = "(search results: 3 relevant notes found; use them to answer)"
    work = tempfile.mkdtemp(prefix="skillopt_sleep_copilottools_")
    calllog = os.path.join(work, "_tool_calls.log")
    try:
        for tname in tool_names:
            if is_windows:
                shim = os.path.join(work, f"{tname}.cmd")
                with open(shim, "w") as f:
                    # `%~n0` is the script's own base name (the tool name).
                    # The redirection is placed FIRST: with
                    # `echo name1>>file`, cmd parses a trailing digit as a
                    # handle number (`1>>file`) and would log a truncated
                    # name for tools whose name ends in a digit.
                    f.write(
                        "@echo off\n"
                        f'>>"{calllog}" echo %~n0\n'
                        f"echo {canned_output}\n"
                    )
            else:
                shim = os.path.join(work, tname)
                with open(shim, "w") as f:
                    # Quote the name and log path so shell metacharacters
                    # in either cannot change what the shim executes.
                    f.write(
                        "#!/usr/bin/env bash\n"
                        f"printf '%s\\n' {shlex.quote(tname)} >> {shlex.quote(calllog)}\n"
                        f'echo "{canned_output}"\n'
                    )
                os.chmod(
                    shim,
                    os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH,
                )

        # The usage example always names a tool that actually exists in the
        # work dir (the first requested one) on both platforms.
        example = tool_names[0]
        if is_windows:
            tool_hint = (
                "You have shell tools available in the current directory: "
                + ", ".join(f"{t}.cmd" for t in tool_names)
                + " (each callable as `" + example + "` or `.\\"
                + example + "`). When the skill says to look something "
                "up or search before answering, you MUST actually run the "
                "tool (e.g. `" + example + " \"query\"`) before giving "
                "your final answer."
            )
        else:
            tool_hint = (
                "You have shell tools available in the current directory: "
                + ", ".join(f"./{t}" for t in tool_names)
                + ". When the skill says to look something up or search before "
                "answering, you MUST actually run the tool (e.g. `./"
                + example + " \"query\"`) "
                "before giving your final answer."
            )

        prompt = (
            "You are completing a task. Apply the skill and memory rules EXACTLY, "
            "including any rule about searching/looking up before answering. "
            "Treat a 'Learned preferences' block as HARD CONSTRAINTS that override "
            "earlier conflicting skill text.\n\n"
            f"{tool_hint}\n\n"
            f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
            f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
            "Return ONLY the final answer text."
        )

        cmd = [
            self.copilot_path, "-p", prompt,
            "--output-format", "json",
            "--stream", "off",
            "--no-color",
            "--log-level", "none",
            "--allow-all-tools",
            "-C", work,
        ]
        if not self.full_env:
            cmd += ["--disable-builtin-mcps", "--no-custom-instructions"]
        if self.model:
            cmd += ["--model", self.model]

        env = os.environ.copy()
        if self.copilot_home:
            env["COPILOT_HOME"] = self.copilot_home

        try:
            proc = subprocess.run(
                cmd, capture_output=True, text=True, encoding="utf-8",
                errors="replace", timeout=self.timeout, cwd=work, env=env,
            )
            resp = self._parse_jsonl_response(proc.stdout or "")
        except Exception:
            # Deliberate best-effort: a missing binary, a timeout or a
            # parse failure all degrade to an empty response, not a crash.
            resp = ""

        # Rough token estimate: about four characters per token.
        self._tokens += len(prompt) // 4 + len(resp) // 4

        logged = set()
        try:
            # errors="replace": stray bytes in the log must not raise here.
            with open(calllog, errors="replace") as f:
                logged = {ln.strip() for ln in f if ln.strip()}
        except FileNotFoundError:
            pass  # no shim ran, so no tool was called

        if is_windows:
            # Windows file names are case-insensitive and `%~n0` may echo
            # the spelling the caller typed, so compare case-insensitively.
            logged = {name.lower() for name in logged}
            called = [t for t in tool_names if t.lower() in logged]
        else:
            called = [t for t in tool_names if t in logged]
        return resp, called
    finally:
        # ignore_errors=True already makes the cleanup best-effort.
        shutil.rmtree(work, ignore_errors=True)
class DualBackend(Backend):
"""Route operations to two backends, à la SkillOpt's target vs optimizer.

View File

@@ -580,6 +580,53 @@ class TestCopilotBackend(unittest.TestCase):
else:
os.environ["SKILLOPT_SLEEP_COPILOT_HOME"] = prev
def test_attempt_with_tools_honest_detection(self):
    """Check JSONL parsing and log-based tool detection without a real CLI.

    A small per-OS stub script stands in for `copilot`. It runs the local
    `search` shim that the backend writes into its work dir (which
    populates the calllog) and then prints one JSONL assistant.message.
    The assertions therefore cover both the JSONL parse and the fact that
    the tool call is detected from the shim's log rather than from a
    self-reported marker.
    """
    import shutil
    import stat
    from skillopt_sleep.backend import CopilotCliBackend

    on_windows = os.name == "nt"
    if on_windows:
        # The backend writes `search.cmd`; the explicit `.\` lets cmd's
        # `call` resolve it from the cwd reliably. None of `{ } " :` need
        # escaping in batch echo (no > < | & ^ %).
        stub_name = "copilot.cmd"
        stub_script = (
            "@echo off\n"
            'call .\\search.cmd "q" >nul 2>&1\n'
            'echo {"type":"assistant.message","data":{"content":"Paris"}}\n'
        )
    else:
        stub_name = "copilot"
        stub_script = (
            "#!/usr/bin/env bash\n"
            './search "q" >/dev/null 2>&1\n'
            "echo '{\"type\":\"assistant.message\",\"data\":{\"content\":\"Paris\"}}'\n"
        )

    stub_dir = tempfile.mkdtemp(prefix="skillopt_sleep_stub_")
    try:
        stub = os.path.join(stub_dir, stub_name)
        with open(stub, "w") as handle:
            handle.write(stub_script)
        if not on_windows:
            # POSIX needs the execute bits set before the stub can run.
            exec_bits = stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH
            os.chmod(stub, os.stat(stub).st_mode | exec_bits)

        backend = CopilotCliBackend(copilot_path=stub, timeout=60)
        question = TaskRecord(
            id="t1", project="p", intent="What is the capital of France?"
        )
        answer, tools_used = backend.attempt_with_tools(
            question, skill="", memory="", tools=["search"]
        )
        # JSONL parsed via _parse_jsonl_response
        self.assertEqual(answer, "Paris")
        # shim ran; detected from calllog
        self.assertEqual(tools_used, ["search"])
    finally:
        shutil.rmtree(stub_dir, ignore_errors=True)
# Run the test suite with verbose per-test output when this file is
# executed directly as a script.
if __name__ == "__main__":
    unittest.main(verbosity=2)