mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
fix(json_utils): reject prose pseudo-JSON in single quotes/backticks (#82)
Follow-up to the string-aware brace scan: that change only skipped
double-quoted prose, so brace-shaped text in single quotes, backticks, or
bare prose (e.g. `{op: delete}`, '{x: 1}') still reached json_repair and was
fabricated into a bogus dict — strictly worse than None, since extract_json
feeds the optimizer's skill edits.
Add a _looks_json_like() guard before repair: a genuine JSON object's first
non-space char after `{` is `"` (a key) or `}` (empty). Prose pseudo-objects
start with a bare word and are rejected, while legitimate repair targets
(trailing commas, unescaped quotes inside string values) all begin with `"`
and pass — including objects whose string VALUES contain single quotes or
backticks, which must not be rejected.
Found by an independent GPT-5.5 re-review of the merged #79 code. Adds
regression tests for single-quoted / backticked / bare prose (-> None) and
for legitimate objects with quote/backtick string values (still repaired).
Tests: 30 pass (+3 skip) without json_repair, 33 pass with it, both clean
under -W error::RuntimeWarning.
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -71,6 +71,25 @@ def _top_level_brace_objects(text: str) -> list[str]:
|
||||
return spans
|
||||
|
||||
|
||||
def _looks_json_like(span: str) -> bool:
|
||||
"""Heuristic: does ``span`` look like an intended JSON object (vs. prose)?
|
||||
|
||||
A genuine JSON object's first non-space character after ``{`` is either ``"``
|
||||
(a string key) or ``}`` (an empty object). Prose pseudo-objects that the
|
||||
repair pass would otherwise fabricate into bogus dicts — ``{op: delete}``,
|
||||
``{x: 1}`` quoted in single quotes or backticks, etc. — start with a bare
|
||||
word and are rejected. This complements the string-aware scan, which only
|
||||
skips *double*-quoted prose; single-quoted / backticked / unquoted prose
|
||||
braces are caught here instead. Legitimate repair targets (trailing commas,
|
||||
unescaped quotes inside string values) all begin with ``"`` and pass.
|
||||
"""
|
||||
inner = span.strip()
|
||||
if not (inner.startswith("{") and inner.endswith("}")):
|
||||
return False
|
||||
after_brace = inner[1:].lstrip()
|
||||
return after_brace[:1] in ('"', '}')
|
||||
|
||||
|
||||
def extract_json(text: str) -> dict | None:
|
||||
"""Extract a JSON object from LLM response text.
|
||||
|
||||
@@ -111,6 +130,12 @@ def extract_json(text: str) -> dict | None:
|
||||
# 0 or >1 top-level objects → too ambiguous to repair safely → None
|
||||
if not candidate:
|
||||
return None
|
||||
# Final guard: only repair spans that actually look like an intended JSON
|
||||
# object. Prose pseudo-objects in single quotes / backticks / bare text
|
||||
# (e.g. `{op: delete}`) reach here because the scan only skips double-quoted
|
||||
# prose; repairing them would fabricate a wrong dict (worse than None).
|
||||
if not _looks_json_like(candidate):
|
||||
return None
|
||||
try:
|
||||
from json_repair import repair_json
|
||||
except ModuleNotFoundError:
|
||||
|
||||
@@ -94,6 +94,32 @@ class TestExtractJsonTolerantFallback:
|
||||
text = 'The literal string "{op: delete}" appears in prose, not as JSON.'
|
||||
assert extract_json(text) is None
|
||||
|
||||
def test_single_quoted_and_backticked_prose_returns_none(self) -> None:
|
||||
"""Regression: pseudo-JSON in single quotes / backticks / bare prose must
|
||||
not be repaired into a bogus dict (the string-aware scan only skips
|
||||
double-quoted prose; the JSON-like guard catches the rest)."""
|
||||
for text in (
|
||||
"The literal string '{op: delete}' appears in prose, not JSON.",
|
||||
"The inline code `{op: delete}` appears in prose, not JSON.",
|
||||
"The literal string 'set it to {x: 1}' appears in prose.",
|
||||
"A bare mapping {op: delete} written in prose.",
|
||||
):
|
||||
assert extract_json(text) is None, text
|
||||
|
||||
def test_json_string_values_with_quotes_still_repair(self) -> None:
|
||||
"""The JSON-like guard must NOT reject legitimate objects whose string
|
||||
values contain single quotes or backticks."""
|
||||
pytest.importorskip("json_repair")
|
||||
assert extract_json('{"msg": "it\'s a test",}') == {"msg": "it's a test"}
|
||||
assert extract_json('{"code": "use `backtick` here",}') == {"code": "use `backtick` here"}
|
||||
|
||||
def test_no_warning_on_quoted_prose(self, recwarn: pytest.WarningsRecorder) -> None:
|
||||
"""Prose pseudo-JSON (no real candidate) must not warn even without
|
||||
json_repair installed — the JSON-like guard returns None before import."""
|
||||
assert extract_json("The inline code `{op: delete}` appears in prose.") is None
|
||||
assert extract_json("A bare mapping {op: delete} in prose.") is None
|
||||
assert [w for w in recwarn.list if issubclass(w.category, RuntimeWarning)] == []
|
||||
|
||||
def test_no_warning_on_plain_text(self, recwarn: pytest.WarningsRecorder) -> None:
|
||||
"""No json_repair warning for ordinary no-JSON replies (no candidate)."""
|
||||
assert extract_json("Just plain text without JSON.") is None
|
||||
|
||||
Reference in New Issue
Block a user