mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
Follow-up to the string-aware brace scan: that change only skipped
double-quoted prose, so brace-shaped text in single quotes, backticks, or
bare prose (e.g. `{op: delete}`, '{x: 1}') still reached json_repair and was
fabricated into a bogus dict — strictly worse than None, since extract_json
feeds the optimizer's skill edits.
Add a _looks_json_like() guard before repair: a genuine JSON object's first
non-space char after `{` is `"` (a key) or `}` (empty). Prose pseudo-objects
start with a bare word and are rejected, while legitimate repair targets
(trailing commas, unescaped quotes inside string values) all begin with `"`
and pass — including objects whose string VALUES contain single quotes or
backticks, which must not be rejected.
Found by an independent GPT-5.5 re-review of the merged #79 code. Adds
regression tests for single-quoted / backticked / bare prose (-> None) and
for legitimate objects with quote/backtick string values (still repaired).
Tests: 30 pass (+3 skip) without json_repair, 33 pass with it, both clean
under -W error::RuntimeWarning.
Co-authored-by: Claude <noreply@anthropic.com>
173 lines
6.3 KiB
Python
173 lines
6.3 KiB
Python
"""JSON extraction helpers for LLM responses."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import warnings
|
|
|
|
|
|
def _top_level_brace_objects(text: str) -> list[str]:
|
|
"""Return every balanced *top-level* ``{...}`` span in ``text``.
|
|
|
|
Fully string/escape aware: braces inside quoted strings are ignored both
|
|
when scanning for an object start AND while tracking depth inside one, so a
|
|
``{`` that appears in prose (e.g. ``'set it to {x}'``) is never mistaken for
|
|
the start of a JSON object. Used to detect ambiguity: when a response carries
|
|
more than one top-level object we must not let a repair pass silently pick
|
|
one — it may pick the wrong (discarded) edit, strictly worse than None.
|
|
"""
|
|
spans: list[str] = []
|
|
i, n = 0, len(text)
|
|
outer_in_str = False
|
|
outer_esc = False
|
|
while i < n:
|
|
ch = text[i]
|
|
# Skip over braces that live *inside* a quoted string before any object
|
|
# has started — otherwise a `{` in prose like '"set it to {x}"' is wrongly
|
|
# treated as an object start, and the repair pass below turns non-JSON
|
|
# prose into a bogus dict (strictly worse than returning None).
|
|
if outer_in_str:
|
|
if outer_esc:
|
|
outer_esc = False
|
|
elif ch == "\\":
|
|
outer_esc = True
|
|
elif ch == '"':
|
|
outer_in_str = False
|
|
i += 1
|
|
continue
|
|
if ch == '"':
|
|
outer_in_str = True
|
|
i += 1
|
|
continue
|
|
if ch != "{":
|
|
i += 1
|
|
continue
|
|
depth = 0
|
|
in_str = False
|
|
esc = False
|
|
start = i
|
|
while i < n:
|
|
ch = text[i]
|
|
if in_str:
|
|
if esc:
|
|
esc = False
|
|
elif ch == "\\":
|
|
esc = True
|
|
elif ch == '"':
|
|
in_str = False
|
|
elif ch == '"':
|
|
in_str = True
|
|
elif ch == "{":
|
|
depth += 1
|
|
elif ch == "}":
|
|
depth -= 1
|
|
if depth == 0:
|
|
spans.append(text[start:i + 1])
|
|
i += 1
|
|
break
|
|
i += 1
|
|
else:
|
|
break # unterminated final object
|
|
return spans
|
|
|
|
|
|
def _looks_json_like(span: str) -> bool:
|
|
"""Heuristic: does ``span`` look like an intended JSON object (vs. prose)?
|
|
|
|
A genuine JSON object's first non-space character after ``{`` is either ``"``
|
|
(a string key) or ``}`` (an empty object). Prose pseudo-objects that the
|
|
repair pass would otherwise fabricate into bogus dicts — ``{op: delete}``,
|
|
``{x: 1}`` quoted in single quotes or backticks, etc. — start with a bare
|
|
word and are rejected. This complements the string-aware scan, which only
|
|
skips *double*-quoted prose; single-quoted / backticked / unquoted prose
|
|
braces are caught here instead. Legitimate repair targets (trailing commas,
|
|
unescaped quotes inside string values) all begin with ``"`` and pass.
|
|
"""
|
|
inner = span.strip()
|
|
if not (inner.startswith("{") and inner.endswith("}")):
|
|
return False
|
|
after_brace = inner[1:].lstrip()
|
|
return after_brace[:1] in ('"', '}')
|
|
|
|
|
|
def extract_json(text: str) -> dict | None:
    """Extract a single JSON object from LLM response text.

    Strategies are tried in order:

    1. Strict ``json.loads`` of the first ```` ```json ```` fence.
    2. Strict ``json.loads`` of the greedy ``{...}`` span of the whole text.
    3. Tolerant repair through the optional ``json_repair`` package, only
       when the response carries exactly one top-level object that also
       looks like intended JSON.

    Args:
        text: Raw response text.

    Returns:
        The decoded object, or ``None`` when nothing usable is found. Only
        ``dict`` values are returned: a fence holding valid JSON of another
        type (array, string, number) is treated as "no object here" and the
        later strategies are tried instead. The repair path never returns an
        empty dict.

    Warns:
        RuntimeWarning: a single repairable candidate exists but
            ``json_repair`` is not installed.
    """
    fenced = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
    if fenced:
        try:
            parsed = json.loads(fenced.group(1))
        except json.JSONDecodeError:
            parsed = None
        # Valid JSON that is not an object would break the declared return
        # type for callers, so fall through to the other strategies.
        if isinstance(parsed, dict):
            return parsed

    greedy = re.search(r"\{.*\}", text, re.DOTALL)
    if greedy:
        try:
            parsed = json.loads(greedy.group(0))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    # Tolerant fallback for non-OpenAI backends (Claude/Qwen, ...) whose
    # free-form JSON strict json.loads rejects: unescaped ASCII quotes inside
    # CJK string values, trailing commas, etc. Repair so the analyst's edits
    # aren't silently dropped, but ONLY a single unambiguous object: never feed
    # the greedy `{.*}` span or the raw text, or json_repair would quietly
    # return one of several objects, which is strictly worse than None (the
    # caller can detect None and retry/skip).
    #
    # Pick the candidate FIRST, before importing json_repair, so the optional
    # dependency only matters (and only warns) when there is genuinely a single
    # malformed object we could have repaired. Ordinary no-JSON / prose replies
    # have no candidate and return None silently.
    candidate = None
    if fenced and len(_top_level_brace_objects(fenced.group(1))) == 1:
        candidate = fenced.group(1)
    else:
        objs = _top_level_brace_objects(text)
        if len(objs) == 1:
            candidate = objs[0]
    # 0 or >1 top-level objects: too ambiguous to repair safely.
    if not candidate:
        return None

    # Final guard: only repair spans that actually look like an intended JSON
    # object. Prose pseudo-objects in single quotes / backticks / bare text
    # (e.g. `{op: delete}`) reach here because the scan only skips
    # double-quoted prose; repairing them would fabricate a wrong dict.
    if not _looks_json_like(candidate):
        return None

    try:
        from json_repair import repair_json
    except ModuleNotFoundError:
        warnings.warn(
            "json_repair not installed; malformed-JSON recovery disabled — "
            "a non-OpenAI analyst edit may be silently dropped. pip install json_repair",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

    try:
        repaired = repair_json(candidate, return_objects=True)
        if isinstance(repaired, dict) and repaired:
            return repaired
    except Exception:  # noqa: BLE001 — repair is best-effort
        pass
    return None
|
|
|
|
|
|
def extract_json_array(text: str) -> list | None:
    """Extract a JSON array from LLM response text.

    Tries the first ```` ```json ```` fence, then the greedy ``[...]`` span of
    the whole text. Both are parsed strictly with ``json.loads``.

    Args:
        text: Raw response text.

    Returns:
        The decoded list, or ``None`` when no strictly valid array is found.
        Only ``list`` values are returned: a fence holding valid JSON of
        another type (object, string, number) is skipped and the bracket scan
        is tried instead, so the declared return type always holds.
    """
    fenced = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
    if fenced:
        try:
            parsed = json.loads(fenced.group(1))
        except json.JSONDecodeError:
            parsed = None
        # The fence may contain an object or scalar; that is not an array.
        if isinstance(parsed, list):
            return parsed

    bracketed = re.search(r"\[.*\]", text, re.DOTALL)
    if bracketed:
        try:
            parsed = json.loads(bracketed.group(0))
        except json.JSONDecodeError:
            return None
        if isinstance(parsed, list):
            return parsed
    return None
|