fix(json_utils): reject prose pseudo-JSON in single quotes/backticks (#82)

Follow-up to the string-aware brace scan: that change only skipped
double-quoted prose, so brace-shaped text in single quotes, backticks, or
bare prose (e.g. `{op: delete}`, '{x: 1}') still reached json_repair and was
fabricated into a bogus dict — strictly worse than None, since extract_json
feeds the optimizer's skill edits.

Add a _looks_json_like() guard before repair: a genuine JSON object's first
non-space char after `{` is `"` (a key) or `}` (empty). Prose pseudo-objects
start with a bare word and are rejected, while legitimate repair targets
(trailing commas, unescaped quotes inside string values) all begin with `"`
and pass — including objects whose string VALUES contain single quotes or
backticks, which must not be rejected.

Found by an independent GPT-5.5 re-review of the merged #79 code. Adds
regression tests for single-quoted / backticked / bare prose (-> None) and
for legitimate objects with quote/backtick string values (still repaired).
Tests: 30 pass (+3 skip) without json_repair, 33 pass with it, both clean
under -W error::RuntimeWarning.

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Yifan Yang
2026-06-23 20:31:39 +08:00
committed by GitHub
parent baad64a3b9
commit 2d7e37a395
2 changed files with 51 additions and 0 deletions

View File

@@ -71,6 +71,25 @@ def _top_level_brace_objects(text: str) -> list[str]:
return spans
def _looks_json_like(span: str) -> bool:
"""Heuristic: does ``span`` look like an intended JSON object (vs. prose)?
A genuine JSON object's first non-space character after ``{`` is either ``"``
(a string key) or ``}`` (an empty object). Prose pseudo-objects that the
repair pass would otherwise fabricate into bogus dicts — ``{op: delete}``,
``{x: 1}`` quoted in single quotes or backticks, etc. — start with a bare
word and are rejected. This complements the string-aware scan, which only
skips *double*-quoted prose; single-quoted / backticked / unquoted prose
braces are caught here instead. Legitimate repair targets (trailing commas,
unescaped quotes inside string values) all begin with ``"`` and pass.
"""
inner = span.strip()
if not (inner.startswith("{") and inner.endswith("}")):
return False
after_brace = inner[1:].lstrip()
return after_brace[:1] in ('"', '}')
def extract_json(text: str) -> dict | None:
"""Extract a JSON object from LLM response text.
@@ -111,6 +130,12 @@ def extract_json(text: str) -> dict | None:
# 0 or >1 top-level objects → too ambiguous to repair safely → None
if not candidate:
return None
# Final guard: only repair spans that actually look like an intended JSON
# object. Prose pseudo-objects in single quotes / backticks / bare text
# (e.g. `{op: delete}`) reach here because the scan only skips double-quoted
# prose; repairing them would fabricate a wrong dict (worse than None).
if not _looks_json_like(candidate):
return None
try:
from json_repair import repair_json
except ModuleNotFoundError:

View File

@@ -94,6 +94,32 @@ class TestExtractJsonTolerantFallback:
text = 'The literal string "{op: delete}" appears in prose, not as JSON.'
assert extract_json(text) is None
def test_single_quoted_and_backticked_prose_returns_none(self) -> None:
"""Regression: pseudo-JSON in single quotes / backticks / bare prose must
not be repaired into a bogus dict (the string-aware scan only skips
double-quoted prose; the JSON-like guard catches the rest)."""
for text in (
"The literal string '{op: delete}' appears in prose, not JSON.",
"The inline code `{op: delete}` appears in prose, not JSON.",
"The literal string 'set it to {x: 1}' appears in prose.",
"A bare mapping {op: delete} written in prose.",
):
assert extract_json(text) is None, text
def test_json_string_values_with_quotes_still_repair(self) -> None:
"""The JSON-like guard must NOT reject legitimate objects whose string
values contain single quotes or backticks."""
pytest.importorskip("json_repair")
assert extract_json('{"msg": "it\'s a test",}') == {"msg": "it's a test"}
assert extract_json('{"code": "use `backtick` here",}') == {"code": "use `backtick` here"}
def test_no_warning_on_quoted_prose(self, recwarn: pytest.WarningsRecorder) -> None:
"""Prose pseudo-JSON (no real candidate) must not warn even without
json_repair installed — the JSON-like guard returns None before import."""
assert extract_json("The inline code `{op: delete}` appears in prose.") is None
assert extract_json("A bare mapping {op: delete} in prose.") is None
assert [w for w in recwarn.list if issubclass(w.category, RuntimeWarning)] == []
def test_no_warning_on_plain_text(self, recwarn: pytest.WarningsRecorder) -> None:
"""No json_repair warning for ordinary no-JSON replies (no candidate)."""
assert extract_json("Just plain text without JSON.") is None