Files
microsoft-SkillOpt/skillopt/utils/json_utils.py
Yifan Yang 2d7e37a395 fix(json_utils): reject prose pseudo-JSON in single quotes/backticks (#82)
Follow-up to the string-aware brace scan: that change only skipped
double-quoted prose, so brace-shaped text in single quotes, backticks, or
bare prose (e.g. `{op: delete}`, '{x: 1}') still reached json_repair and was
fabricated into a bogus dict — strictly worse than None, since extract_json
feeds the optimizer's skill edits.

Add a _looks_json_like() guard before repair: a genuine JSON object's first
non-space char after `{` is `"` (a key) or `}` (empty). Prose pseudo-objects
start with a bare word and are rejected, while legitimate repair targets
(trailing commas, unescaped quotes inside string values) all begin with `"`
and pass — including objects whose string VALUES contain single quotes or
backticks, which must not be rejected.

Found by an independent GPT-5.5 re-review of the merged #79 code. Adds
regression tests for single-quoted / backticked / bare prose (-> None) and
for legitimate objects with quote/backtick string values (still repaired).
Tests: 30 pass (+3 skip) without json_repair, 33 pass with it, both clean
under -W error::RuntimeWarning.

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-23 20:31:39 +08:00

173 lines
6.3 KiB
Python

"""JSON extraction helpers for LLM responses."""
from __future__ import annotations
import json
import re
import warnings
def _top_level_brace_objects(text: str) -> list[str]:
"""Return every balanced *top-level* ``{...}`` span in ``text``.
Fully string/escape aware: braces inside quoted strings are ignored both
when scanning for an object start AND while tracking depth inside one, so a
``{`` that appears in prose (e.g. ``'set it to {x}'``) is never mistaken for
the start of a JSON object. Used to detect ambiguity: when a response carries
more than one top-level object we must not let a repair pass silently pick
one — it may pick the wrong (discarded) edit, strictly worse than None.
"""
spans: list[str] = []
i, n = 0, len(text)
outer_in_str = False
outer_esc = False
while i < n:
ch = text[i]
# Skip over braces that live *inside* a quoted string before any object
# has started — otherwise a `{` in prose like '"set it to {x}"' is wrongly
# treated as an object start, and the repair pass below turns non-JSON
# prose into a bogus dict (strictly worse than returning None).
if outer_in_str:
if outer_esc:
outer_esc = False
elif ch == "\\":
outer_esc = True
elif ch == '"':
outer_in_str = False
i += 1
continue
if ch == '"':
outer_in_str = True
i += 1
continue
if ch != "{":
i += 1
continue
depth = 0
in_str = False
esc = False
start = i
while i < n:
ch = text[i]
if in_str:
if esc:
esc = False
elif ch == "\\":
esc = True
elif ch == '"':
in_str = False
elif ch == '"':
in_str = True
elif ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
spans.append(text[start:i + 1])
i += 1
break
i += 1
else:
break # unterminated final object
return spans
def _looks_json_like(span: str) -> bool:
"""Heuristic: does ``span`` look like an intended JSON object (vs. prose)?
A genuine JSON object's first non-space character after ``{`` is either ``"``
(a string key) or ``}`` (an empty object). Prose pseudo-objects that the
repair pass would otherwise fabricate into bogus dicts — ``{op: delete}``,
``{x: 1}`` quoted in single quotes or backticks, etc. — start with a bare
word and are rejected. This complements the string-aware scan, which only
skips *double*-quoted prose; single-quoted / backticked / unquoted prose
braces are caught here instead. Legitimate repair targets (trailing commas,
unescaped quotes inside string values) all begin with ``"`` and pass.
"""
inner = span.strip()
if not (inner.startswith("{") and inner.endswith("}")):
return False
after_brace = inner[1:].lstrip()
return after_brace[:1] in ('"', '}')
def extract_json(text: str) -> dict | None:
"""Extract a JSON object from LLM response text.
Tries ```json fences first, then bare {...} patterns.
"""
m = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
if m:
try:
return json.loads(m.group(1))
except json.JSONDecodeError:
pass
m = re.search(r"\{.*\}", text, re.DOTALL)
if m:
try:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
# Tolerant fallback for non-OpenAI backends (Claude/Qwen, …) whose free-form
# JSON strict json.loads rejects — unescaped ASCII quotes inside CJK string
# values, trailing commas, etc. Repair so the analyst's edits aren't silently
# dropped, but ONLY a single unambiguous object: never feed the greedy `{.*}`
# span or the raw text, or json_repair would quietly return one of several
# objects (empirically the wrong/last one) — strictly worse than None, which
# the caller can detect and retry/skip.
#
# Pick the candidate FIRST, before importing json_repair, so the optional
# dependency only matters (and only warns) when there is genuinely a single
# malformed object we could have repaired. Ordinary no-JSON / prose replies
# have no candidate and return None silently.
candidate = None
fenced = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
if fenced and len(_top_level_brace_objects(fenced.group(1))) == 1:
candidate = fenced.group(1)
else:
objs = _top_level_brace_objects(text)
if len(objs) == 1:
candidate = objs[0]
# 0 or >1 top-level objects → too ambiguous to repair safely → None
if not candidate:
return None
# Final guard: only repair spans that actually look like an intended JSON
# object. Prose pseudo-objects in single quotes / backticks / bare text
# (e.g. `{op: delete}`) reach here because the scan only skips double-quoted
# prose; repairing them would fabricate a wrong dict (worse than None).
if not _looks_json_like(candidate):
return None
try:
from json_repair import repair_json
except ModuleNotFoundError:
warnings.warn(
"json_repair not installed; malformed-JSON recovery disabled — "
"a non-OpenAI analyst edit may be silently dropped. pip install json_repair",
RuntimeWarning,
stacklevel=2,
)
return None
try:
repaired = repair_json(candidate, return_objects=True)
if isinstance(repaired, dict) and repaired:
return repaired
except Exception: # noqa: BLE001 — repair is best-effort
pass
return None
def extract_json_array(text: str) -> list | None:
"""Extract a JSON array from LLM response text."""
m = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
if m:
try:
return json.loads(m.group(1))
except json.JSONDecodeError:
pass
m = re.search(r"\[.*\]", text, re.DOTALL)
if m:
try:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
return None