mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
* Robustness for the claude/codex backends on Windows: argv overflow, subprocess encoding, tolerant JSON, test-eval dirs
Fixes surfaced running SkillOpt end-to-end on the bundled `claude` backend
(local Claude CLI) on Windows. None changes the OpenAI/GPT happy path.
1. skillopt/engine/trainer.py — the final test-eval directory
(test_eval_final/) is written to before being created; add
os.makedirs(..., exist_ok=True), matching the two sibling test-eval dirs.
Without it, summary.json raises FileNotFoundError when a rollout yields
zero predictions.
2. skillopt/model/claude_backend.py
a. Pass the prompt via stdin (not argv): on Windows the whole command line
is capped at ~32 KB and a large optimizer prompt (the success-analyst
minibatch carrying several report trajectories) overflows it with
[WinError 206], killing the run after retries.
b. Pass the system prompt via --append-system-prompt-file (a temp file),
not argv. The system prompt here is the skill being optimized, which
SkillOpt grows over training; since the ~32 KB cap applies to the SUM of
all argv, a grown skill would re-hit [WinError 206] even with the prompt
on stdin.
c. Pin the subprocess encoding to utf-8 (errors="replace"). With text=True
and no encoding=, stdin is encoded with the system codepage; on a zh-CN
box (cp936/GBK) a prompt containing an emoji or some Latin-1 characters
raises UnicodeEncodeError before the CLI even starts, failing every retry.
3. skillopt/model/codex_backend.py — the same utf-8 encoding pin on its
subprocess.run(input=...) call (identical unpinned-encoding pattern).
4. skillopt/utils/json_utils.py — extract_json() returned None for valid-
looking JSON that strict json.loads rejects (unescaped ASCII quotes inside
CJK string values, trailing commas), silently dropping the analyst's edits
on non-schema backends (Claude/Qwen): reflect produces N edits, 0 applied.
Add a json_repair fallback, but only on a single unambiguous object — a
balanced-brace extractor plus a refuse-on-multiple-objects guard — so a
chain-of-thought "scratch + final" response can't make repair silently
return the wrong (discarded) object, which would be worse than None (None is
detectable and retryable; a wrong-but-valid edit is applied blind). Declare
json_repair in requirements.txt and the claude/qwen optional extras so the
fallback is actually present (it otherwise no-ops, dropping edits silently).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
(cherry picked from commit dca74a683e)
* fix(json_utils): harden tolerant JSON fallback from PR #77
Follow-up fixes on top of the cherry-picked Windows-robustness change:
1. Make _top_level_brace_objects() fully string-aware in its OUTER scan, not
just inside an object. A '{' inside quoted prose (e.g. '"set it to {x}"')
no longer starts a candidate object, so extract_json() returns None for
prose pseudo-JSON instead of repairing it into a bogus dict — which would
be strictly worse than dropping the edit, since extract_json feeds the
optimizer's skill edits.
2. Pick the repair candidate BEFORE importing json_repair, so the missing-
dependency RuntimeWarning only fires when there is genuinely a single
malformed object that could have been repaired. Ordinary no-JSON / prose
replies (the common case) now return None silently instead of warning on
every call.
3. Resolve dependency-metadata inconsistency: json_repair is optional, so add
it to the `all` extra (it was already in `claude`/`qwen`) and demote it
from a hard requirement to an optional/commented entry in requirements.txt,
matching the project's convention for backend-specific deps.
Adds regression tests for prose-with-braces (-> None), no-warning-on-plain-
text, single-object repair, and multi-object ambiguity. Existing 22 json
tests still pass with and without json_repair installed.
Co-Authored-By: Claude <noreply@anthropic.com>
---------
Co-authored-by: samuelgoofus-boop <260247789+samuelgoofus-boop@users.noreply.github.com>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
163 lines
6.4 KiB
Python
163 lines
6.4 KiB
Python
"""Tests for skillopt.utils.json_utils."""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from skillopt.utils.json_utils import (
|
|
_top_level_brace_objects,
|
|
extract_json,
|
|
extract_json_array,
|
|
)
|
|
|
|
|
|
class TestExtractJson:
|
|
"""extract_json — extract a JSON object from LLM response text."""
|
|
|
|
def test_code_fence_json(self) -> None:
|
|
text = 'Some text\n```json\n{"key": "value", "num": 42}\n```\nmore text'
|
|
assert extract_json(text) == {"key": "value", "num": 42}
|
|
|
|
def test_bare_json_object(self) -> None:
|
|
text = 'The result is {"answer": "yes", "score": 0.95}.'
|
|
assert extract_json(text) == {"answer": "yes", "score": 0.95}
|
|
|
|
def test_code_fence_takes_precedence(self) -> None:
|
|
"""If fence content parses successfully it should be preferred over bare."""
|
|
text = (
|
|
'```json\n{"source": "fence"}\n```\n'
|
|
'Then also {"source": "bare"}'
|
|
)
|
|
assert extract_json(text) == {"source": "fence"}
|
|
|
|
def test_broken_fence_falls_back_to_bare(self) -> None:
|
|
"""When fence content is invalid JSON, fall back to bare {...} match."""
|
|
# Use invalid fence content that has no braces so the greedy bare
|
|
# regex doesn't swallow the valid object.
|
|
text = (
|
|
'```json\nnot json at all\n```\n'
|
|
'Answer: {"fallback": "yes"}'
|
|
)
|
|
assert extract_json(text) == {"fallback": "yes"}
|
|
|
|
def test_nested_json(self) -> None:
|
|
text = '```json\n{"outer": {"inner": [1, 2, 3]}}\n```'
|
|
assert extract_json(text) == {"outer": {"inner": [1, 2, 3]}}
|
|
|
|
def test_no_json_returns_none(self) -> None:
|
|
assert extract_json("Just plain text without JSON.") is None
|
|
|
|
def test_empty_string_returns_none(self) -> None:
|
|
assert extract_json("") is None
|
|
|
|
def test_malformed_json_returns_none(self) -> None:
|
|
assert extract_json("{broken") is None
|
|
|
|
def test_empty_json_object(self) -> None:
|
|
assert extract_json('{"empty": {}}') == {"empty": {}}
|
|
|
|
def test_json_with_escaped_chars(self) -> None:
|
|
text = '{"message": "hello\\nworld"}'
|
|
assert extract_json(text) == {"message": "hello\nworld"}
|
|
|
|
def test_only_fence_with_no_json_syntax(self) -> None:
|
|
"""Code fences without valid JSON content should not match."""
|
|
text = "```\nplain code block\n```"
|
|
assert extract_json(text) is None
|
|
|
|
|
|
class TestTopLevelBraceObjects:
|
|
"""_top_level_brace_objects — string/escape-aware top-level object scan."""
|
|
|
|
def test_single_clean_object(self) -> None:
|
|
assert _top_level_brace_objects('{"a": 1}') == ['{"a": 1}']
|
|
|
|
def test_two_top_level_objects(self) -> None:
|
|
assert _top_level_brace_objects('{"a":1}\n{"b":2}') == ['{"a":1}', '{"b":2}']
|
|
|
|
def test_brace_inside_quoted_prose_is_ignored(self) -> None:
|
|
"""A '{' inside a quoted string must NOT start an object (the bug)."""
|
|
# Brace-shaped content inside a string, with no real object → no spans.
|
|
assert _top_level_brace_objects('label is "set it to {x: 1}" done') == []
|
|
|
|
def test_real_object_after_quoted_brace(self) -> None:
|
|
"""Quoted-prose braces are skipped; a later real object is still found."""
|
|
text = 'note "{wrong: 1}" then actual {"edit": "right"}'
|
|
assert _top_level_brace_objects(text) == ['{"edit": "right"}']
|
|
|
|
|
|
class TestExtractJsonTolerantFallback:
|
|
"""extract_json — json_repair fallback for malformed non-OpenAI output."""
|
|
|
|
def test_prose_pseudo_json_returns_none(self) -> None:
|
|
"""Regression: brace-shaped prose inside quotes must not be 'repaired'
|
|
into a bogus dict. It returned {'op': 'delete'} before the fix."""
|
|
text = 'The literal string "{op: delete}" appears in prose, not as JSON.'
|
|
assert extract_json(text) is None
|
|
|
|
def test_no_warning_on_plain_text(self, recwarn: pytest.WarningsRecorder) -> None:
|
|
"""No json_repair warning for ordinary no-JSON replies (no candidate)."""
|
|
assert extract_json("Just plain text without JSON.") is None
|
|
assert extract_json("") is None
|
|
assert [w for w in recwarn.list if issubclass(w.category, RuntimeWarning)] == []
|
|
|
|
def test_trailing_comma_repaired_when_available(self) -> None:
|
|
"""With json_repair installed, a single malformed object is repaired."""
|
|
pytest.importorskip("json_repair")
|
|
assert extract_json('{"edit": "add", "text": "x",}') == {"edit": "add", "text": "x"}
|
|
|
|
def test_two_malformed_objects_too_ambiguous(self) -> None:
|
|
"""Multiple top-level objects are ambiguous → None, never guess."""
|
|
pytest.importorskip("json_repair")
|
|
assert extract_json('{"first": true,} noise {"second": true,}') is None
|
|
|
|
|
|
class TestExtractJsonArray:
|
|
"""extract_json_array — extract a JSON array from LLM response text."""
|
|
|
|
def test_code_fence_array(self) -> None:
|
|
text = '```json\n["a", "b", "c"]\n```'
|
|
assert extract_json_array(text) == ["a", "b", "c"]
|
|
|
|
def test_bare_array(self) -> None:
|
|
text = "The items are [1, 2, 3]."
|
|
assert extract_json_array(text) == [1, 2, 3]
|
|
|
|
def test_code_fence_takes_precedence(self) -> None:
|
|
text = (
|
|
'```json\n["from_fence"]\n```\n'
|
|
'also ["from_bare"]'
|
|
)
|
|
assert extract_json_array(text) == ["from_fence"]
|
|
|
|
def test_broken_fence_falls_back_to_bare(self) -> None:
|
|
text = (
|
|
'```json\nnot json at all\n```\n'
|
|
'values: [42]'
|
|
)
|
|
assert extract_json_array(text) == [42]
|
|
|
|
def test_nested_array(self) -> None:
|
|
text = '```json\n[[1, 2], [3, 4]]\n```'
|
|
assert extract_json_array(text) == [[1, 2], [3, 4]]
|
|
|
|
def test_no_array_returns_none(self) -> None:
|
|
assert extract_json_array("no brackets here") is None
|
|
|
|
def test_empty_string_returns_none(self) -> None:
|
|
assert extract_json_array("") is None
|
|
|
|
def test_malformed_array_returns_none(self) -> None:
|
|
assert extract_json_array("[1, 2, ") is None
|
|
|
|
def test_empty_json_array(self) -> None:
|
|
assert extract_json_array("[]") == []
|
|
|
|
def test_array_of_objects(self) -> None:
|
|
text = '[{"x": 1}, {"x": 2}]'
|
|
assert extract_json_array(text) == [{"x": 1}, {"x": 2}]
|
|
|
|
def test_object_not_confused_with_array(self) -> None:
|
|
"""extract_json_array should not match a bare JSON object."""
|
|
text = '{"this is an object": true}'
|
|
assert extract_json_array(text) is None
|