# microsoft-SkillOpt/skillopt/utils/json_utils.py

"""JSON extraction helpers for LLM responses."""
from __future__ import annotations

import json
import re
import warnings


def _top_level_brace_objects(text: str) -> list[str]:
    """Return every balanced *top-level* ``{...}`` span in ``text``.

    Fully string/escape aware: braces inside quoted strings are ignored both
    when scanning for an object start AND while tracking depth inside one, so a
    ``{`` that appears in prose (e.g. ``'set it to {x}'``) is never mistaken for
    the start of a JSON object. Used to detect ambiguity: when a response carries
    more than one top-level object we must not let a repair pass silently pick
    one — it may pick the wrong (discarded) edit, strictly worse than None.
    """
    spans: list[str] = []
    i, n = 0, len(text)
    outer_in_str = False
    outer_esc = False
    while i < n:
        ch = text[i]
        # Skip over braces that live *inside* a quoted string before any object
        # has started — otherwise a `{` in prose like '"set it to {x}"' is wrongly
        # treated as an object start, and the repair pass below turns non-JSON
        # prose into a bogus dict (strictly worse than returning None).
        if outer_in_str:
            if outer_esc:
                outer_esc = False
            elif ch == "\\":
                outer_esc = True
            elif ch == '"':
                outer_in_str = False
            i += 1
            continue
        if ch == '"':
            outer_in_str = True
            i += 1
            continue
        if ch != "{":
            i += 1
            continue
        depth = 0
        in_str = False
        esc = False
        start = i
        while i < n:
            ch = text[i]
            if in_str:
                if esc:
                    esc = False
                elif ch == "\\":
                    esc = True
                elif ch == '"':
                    in_str = False
            elif ch == '"':
                in_str = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    spans.append(text[start:i + 1])
                    i += 1
                    break
            i += 1
        else:
            break  # unterminated final object
    return spans


def _looks_json_like(span: str) -> bool:
    """Heuristic: does ``span`` look like an intended JSON object (vs. prose)?

    A genuine JSON object's first non-space character after ``{`` is either ``"``
    (a string key) or ``}`` (an empty object). Prose pseudo-objects that the
    repair pass would otherwise fabricate into bogus dicts — ``{op: delete}``,
    ``{x: 1}`` quoted in single quotes or backticks, etc. — start with a bare
    word and are rejected. This complements the string-aware scan, which only
    skips *double*-quoted prose; single-quoted / backticked / unquoted prose
    braces are caught here instead. Legitimate repair targets (trailing commas,
    unescaped quotes inside string values) all begin with ``"`` and pass.
    """
    inner = span.strip()
    if not (inner.startswith("{") and inner.endswith("}")):
        return False
    after_brace = inner[1:].lstrip()
    return after_brace[:1] in ('"', '}')


def extract_json(text: str) -> dict | None:
    """Extract a JSON object from LLM response text.

    Tries ```json fences first, then bare {...} patterns.
    """
    m = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1))
        except json.JSONDecodeError:
            pass
    m = re.search(r"\{.*\}", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(0))
        except json.JSONDecodeError:
            pass
    # Tolerant fallback for non-OpenAI backends (Claude/Qwen, …) whose free-form
    # JSON strict json.loads rejects — unescaped ASCII quotes inside CJK string
    # values, trailing commas, etc. Repair so the analyst's edits aren't silently
    # dropped, but ONLY a single unambiguous object: never feed the greedy `{.*}`
    # span or the raw text, or json_repair would quietly return one of several
    # objects (empirically the wrong/last one) — strictly worse than None, which
    # the caller can detect and retry/skip.
    #
    # Pick the candidate FIRST, before importing json_repair, so the optional
    # dependency only matters (and only warns) when there is genuinely a single
    # malformed object we could have repaired. Ordinary no-JSON / prose replies
    # have no candidate and return None silently.
    candidate = None
    fenced = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
    if fenced and len(_top_level_brace_objects(fenced.group(1))) == 1:
        candidate = fenced.group(1)
    else:
        objs = _top_level_brace_objects(text)
        if len(objs) == 1:
            candidate = objs[0]
        # 0 or >1 top-level objects → too ambiguous to repair safely → None
    if not candidate:
        return None
    # Final guard: only repair spans that actually look like an intended JSON
    # object. Prose pseudo-objects in single quotes / backticks / bare text
    # (e.g. `{op: delete}`) reach here because the scan only skips double-quoted
    # prose; repairing them would fabricate a wrong dict (worse than None).
    if not _looks_json_like(candidate):
        return None
    try:
        from json_repair import repair_json
    except ModuleNotFoundError:
        warnings.warn(
            "json_repair not installed; malformed-JSON recovery disabled — "
            "a non-OpenAI analyst edit may be silently dropped. pip install json_repair",
            RuntimeWarning,
            stacklevel=2,
        )
        return None
    try:
        repaired = repair_json(candidate, return_objects=True)
        if isinstance(repaired, dict) and repaired:
            return repaired
    except Exception:  # noqa: BLE001 — repair is best-effort
        pass
    return None


def extract_json_array(text: str) -> list | None:
    """Extract a JSON array from LLM response text."""
    m = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1))
        except json.JSONDecodeError:
            pass
    m = re.search(r"\[.*\]", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(0))
        except json.JSONDecodeError:
            pass
    return None