mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
feat(copilot): implement attempt_with_tools with cross-platform tool shims
Adds honest tool-call detection for CopilotCliBackend, mirroring the Claude/Codex backends. Writes per-tool executable shims into the work dir and detects real invocations from a calllog (not self-reported markers). The Copilot backend is Windows-validated, so shims are cross-platform: a .cmd batch shim on Windows and a chmod'd bash shim on POSIX, with an OS-specific tool hint. Mirrors _call's flags/env (isolated COPILOT_HOME, --allow-all-tools, MCP/instruction disabling) and the UTF-8 subprocess fix. Adds test_attempt_with_tools_honest_detection: a CI-friendly, OS-aware stub stands in for the CLI, runs the shim, and asserts both JSONL parsing and log-based detection. Validated live on Windows (real Copilot call) and on Linux/WSL (POSIX path). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -810,8 +810,10 @@ class CopilotCliBackend(CliBackend):
|
||||
response to stdout on all platforms, so JSONL is used for robust capture.
|
||||
|
||||
The call runs in a clean temp cwd with streaming disabled and tools allowed
|
||||
(so non-interactive mode never blocks on a permission prompt); the prompts
|
||||
ask for final-answer text only, so no tool use is expected.
|
||||
(so non-interactive mode never blocks on a permission prompt); ``_call``'s
|
||||
prompts ask for final-answer text only, so no tool use is expected there,
|
||||
while ``attempt_with_tools`` exposes real, cross-platform callable shims in
|
||||
the working directory for honest tool-call detection.
|
||||
|
||||
Startup overhead is minimised: each invocation points ``COPILOT_HOME`` at a
|
||||
dedicated, isolated config dir (no user ``mcp-config.json``, so the user's
|
||||
@@ -896,6 +898,108 @@ class CopilotCliBackend(CliBackend):
|
||||
parts.append(content)
|
||||
return "\n".join(parts).strip()
|
||||
|
||||
def attempt_with_tools(self, task, skill, memory, tools):
|
||||
# Expose REAL, callable tool shims in the working directory so the
|
||||
# gbrain quick-answerer judge (tool_called=search) is validated
|
||||
# honestly: we detect each call from the shim's log, not from a
|
||||
# self-reported marker. The Copilot CLI is the Windows-validated
|
||||
# backend, so the shims must be cross-platform — a bash `#!/usr/bin/env
|
||||
# bash` + chmod shim does NOT execute via `./tool` under PowerShell/cmd,
|
||||
# so on Windows we emit a `.cmd` batch shim instead.
|
||||
import shutil
|
||||
import stat
|
||||
work = tempfile.mkdtemp(prefix="skillopt_sleep_copilottools_")
|
||||
calllog = os.path.join(work, "_tool_calls.log")
|
||||
tool_names = tools or ["search"]
|
||||
is_windows = os.name == "nt"
|
||||
try:
|
||||
for tname in tool_names:
|
||||
if is_windows:
|
||||
shim = os.path.join(work, f"{tname}.cmd")
|
||||
with open(shim, "w") as f:
|
||||
# `%~n0` is the script's own base name (the tool name);
|
||||
# writing it keeps the calllog line == tool name so the
|
||||
# honest-detection match below works unchanged.
|
||||
f.write(
|
||||
"@echo off\n"
|
||||
f'echo %~n0>>"{calllog}"\n'
|
||||
"echo (search results: 3 relevant notes found; use them to answer)\n"
|
||||
)
|
||||
else:
|
||||
shim = os.path.join(work, tname)
|
||||
with open(shim, "w") as f:
|
||||
f.write(
|
||||
"#!/usr/bin/env bash\n"
|
||||
f'echo "{tname}" >> "{calllog}"\n'
|
||||
'echo "(search results: 3 relevant notes found; use them to answer)"\n'
|
||||
)
|
||||
os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
|
||||
if is_windows:
|
||||
tool_hint = (
|
||||
"You have shell tools available in the current directory: "
|
||||
+ ", ".join(f"{t}.cmd" for t in tool_names)
|
||||
+ " (each callable as `" + tool_names[0] + "` or `.\\"
|
||||
+ tool_names[0] + "`). When the skill says to look something "
|
||||
"up or search before answering, you MUST actually run the "
|
||||
"tool (e.g. `" + tool_names[0] + " \"query\"`) before giving "
|
||||
"your final answer."
|
||||
)
|
||||
else:
|
||||
tool_hint = (
|
||||
"You have shell tools available in the current directory: "
|
||||
+ ", ".join(f"./{t}" for t in tool_names)
|
||||
+ ". When the skill says to look something up or search before "
|
||||
"answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
|
||||
"before giving your final answer."
|
||||
)
|
||||
prompt = (
|
||||
"You are completing a task. Apply the skill and memory rules EXACTLY, "
|
||||
"including any rule about searching/looking up before answering. "
|
||||
"Treat a 'Learned preferences' block as HARD CONSTRAINTS that override "
|
||||
"earlier conflicting skill text.\n\n"
|
||||
f"{tool_hint}\n\n"
|
||||
f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
|
||||
f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
|
||||
"Return ONLY the final answer text."
|
||||
)
|
||||
cmd = [
|
||||
self.copilot_path, "-p", prompt,
|
||||
"--output-format", "json",
|
||||
"--stream", "off",
|
||||
"--no-color",
|
||||
"--log-level", "none",
|
||||
"--allow-all-tools",
|
||||
"-C", work,
|
||||
]
|
||||
if not self.full_env:
|
||||
cmd += ["--disable-builtin-mcps", "--no-custom-instructions"]
|
||||
if self.model:
|
||||
cmd += ["--model", self.model]
|
||||
env = os.environ.copy()
|
||||
if self.copilot_home:
|
||||
env["COPILOT_HOME"] = self.copilot_home
|
||||
resp = ""
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
cmd, capture_output=True, text=True, encoding="utf-8",
|
||||
errors="replace", timeout=self.timeout, cwd=work, env=env,
|
||||
)
|
||||
resp = self._parse_jsonl_response(proc.stdout or "")
|
||||
except Exception:
|
||||
resp = ""
|
||||
self._tokens += len(prompt) // 4 + len(resp) // 4
|
||||
called: List[str] = []
|
||||
if os.path.exists(calllog):
|
||||
with open(calllog) as f:
|
||||
logged = {ln.strip() for ln in f if ln.strip()}
|
||||
called = [t for t in tool_names if t in logged]
|
||||
return resp, called
|
||||
finally:
|
||||
try:
|
||||
shutil.rmtree(work, ignore_errors=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
class DualBackend(Backend):
|
||||
"""Route operations to two backends, à la SkillOpt's target vs optimizer.
|
||||
|
||||
@@ -580,6 +580,53 @@ class TestCopilotBackend(unittest.TestCase):
|
||||
else:
|
||||
os.environ["SKILLOPT_SLEEP_COPILOT_HOME"] = prev
|
||||
|
||||
def test_attempt_with_tools_honest_detection(self):
|
||||
# End-to-end (no real CLI): a tiny per-OS stub stands in for `copilot`.
|
||||
# It runs the local `search` shim the backend writes into its work dir
|
||||
# (so the calllog is written — honest detection) then prints one JSONL
|
||||
# assistant.message. Proves both the JSONL parse and that the tool call
|
||||
# is detected from the shim's log, not from a self-reported marker.
|
||||
import shutil
|
||||
import stat
|
||||
|
||||
from skillopt_sleep.backend import CopilotCliBackend
|
||||
|
||||
stub_dir = tempfile.mkdtemp(prefix="skillopt_sleep_stub_")
|
||||
try:
|
||||
if os.name == "nt":
|
||||
stub = os.path.join(stub_dir, "copilot.cmd")
|
||||
with open(stub, "w") as f:
|
||||
# The backend writes `search.cmd`; run it (explicit `.\` so
|
||||
# cmd's `call` resolves it from the cwd reliably) so the
|
||||
# calllog is populated, then emit the JSONL line. None of
|
||||
# `{ } " :` need escaping in batch echo (no > < | & ^ %).
|
||||
f.write(
|
||||
"@echo off\n"
|
||||
'call .\\search.cmd "q" >nul 2>&1\n'
|
||||
'echo {"type":"assistant.message","data":{"content":"Paris"}}\n'
|
||||
)
|
||||
else:
|
||||
stub = os.path.join(stub_dir, "copilot")
|
||||
with open(stub, "w") as f:
|
||||
f.write(
|
||||
"#!/usr/bin/env bash\n"
|
||||
'./search "q" >/dev/null 2>&1\n'
|
||||
"echo '{\"type\":\"assistant.message\",\"data\":{\"content\":\"Paris\"}}'\n"
|
||||
)
|
||||
os.chmod(
|
||||
stub,
|
||||
os.stat(stub).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH,
|
||||
)
|
||||
|
||||
be = CopilotCliBackend(copilot_path=stub, timeout=60)
|
||||
task = TaskRecord(id="t1", project="p", intent="What is the capital of France?")
|
||||
resp, called = be.attempt_with_tools(task, skill="", memory="", tools=["search"])
|
||||
|
||||
self.assertEqual(resp, "Paris") # JSONL parsed via _parse_jsonl_response
|
||||
self.assertEqual(called, ["search"]) # shim ran; detected from calllog
|
||||
finally:
|
||||
shutil.rmtree(stub_dir, ignore_errors=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
Reference in New Issue
Block a user