Files
microsoft-SkillOpt/skillopt/sleep/types.py
Yifan Yang 937bc1ec4d feat(sleep): real tool-loop replay for gbrain quick-answerer (tool_called judge)
The 4th gbrain seed (quick-answerer) is judged by tool_called=search: the agent
must ACTUALLY call a search tool. Add an honest tool loop:

  - Backend.attempt_with_tools(task, skill, memory, tools) -> (response, tools_called)
  - Claude: exposes a real ./search shell shim, runs with --allowedTools Bash in a
    clean cwd; detects the call from the shim's log (not a self-reported marker).
  - Codex: same shim under `exec --sandbox workspace-write`.
  - Mock: deterministic — "calls" a tool iff skill/memory instructs it (for CI).
  - replay_one routes tasks with a tool_called check through the tool loop and
    feeds detected calls to the rule judge; ReplayResult gains tools_called.

Verified live (Claude haiku): deficient skill -> tools_called=[] hard=0;
learned "must run ./search" rule -> tools_called=['search'] hard=1.0.
20 tests pass.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-06-08 14:31:51 +00:00

130 lines
4.7 KiB
Python

"""SkillOpt-Sleep — core data types.
These dataclasses are the interfaces between the sleep-cycle stages
(harvest -> mine -> replay -> consolidate -> stage). They are intentionally
plain (no slots, no heavy deps) so the package imports cleanly on any
Python 3.8+ interpreter and the deterministic experiment runs with zero
external dependencies.
"""
from __future__ import annotations

import copy
from dataclasses import asdict, dataclass, field, fields
from typing import Any, Dict, List, Optional
# ── Stage 1: harvest ──────────────────────────────────────────────────────────
@dataclass
class SessionDigest:
    """Normalized per-session summary emitted by the harvest stage.

    One instance summarizes a single Claude Code session transcript.
    :mod:`skillopt.sleep.harvest` builds it from a ``<sessionId>.jsonl``
    transcript together with ``history.jsonl`` entries.

    Only ``session_id`` and ``project`` are required; every other field
    defaults to an empty string, a fresh empty list, or zero.
    """

    # Required identity fields.
    session_id: str
    project: str

    # Optional string metadata; empty when not supplied.
    git_branch: str = ""
    started_at: str = ""
    ended_at: str = ""

    # Collected items; each instance gets its own empty list by default.
    user_prompts: List[str] = field(default_factory=list)
    assistant_finals: List[str] = field(default_factory=list)
    tools_used: List[str] = field(default_factory=list)
    files_touched: List[str] = field(default_factory=list)
    # Feedback phrases such as "still broken" or "perfect".
    feedback_signals: List[str] = field(default_factory=list)

    # Turn counters.
    n_user_turns: int = 0
    n_assistant_turns: int = 0

    # NOTE(review): presumably the path of the source transcript -- confirm
    # against the harvest stage.
    raw_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the digest as a plain ``dict`` built by ``asdict``."""
        as_plain = asdict(self)
        return as_plain
# ── Stage 2: mine ─────────────────────────────────────────────────────────────
@dataclass
class TaskRecord:
    """A self-contained recurring task mined from one or more sessions.

    This is the *training unit* of the sleep cycle -- the analogue of a
    SkillOpt benchmark item. ``id``, ``project`` and ``intent`` are
    required; all remaining fields have defaults.
    """

    id: str
    project: str
    intent: str  # what the user wanted (the "question")
    context_excerpt: str = ""  # minimal context needed to attempt it
    attempted_solution: str = ""  # what the agent produced before
    outcome: str = "unknown"  # success | fail | mixed | unknown
    reference_kind: str = "none"  # exact | rubric | rule | none
    reference: str = ""  # exact answer, or rubric text
    judge: Dict[str, Any] = field(default_factory=dict)  # gbrain-style rule judge
    tags: List[str] = field(default_factory=list)
    source_sessions: List[str] = field(default_factory=list)
    split: str = "replay"  # replay (train) | holdout (test)

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain ``dict`` (containers are copied by ``asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "TaskRecord":
        """Build a record from a mapping, ignoring keys that are not fields.

        Accepted values are deep-copied so the new record never shares
        mutable state (``judge``, ``tags``, ``source_sessions``) with ``d``;
        this mirrors :meth:`to_dict`, which also returns independent copies.

        Args:
            d: Mapping of field names to values. Unknown keys are dropped.

        Returns:
            A new ``TaskRecord`` populated from the recognized keys.

        Raises:
            TypeError: If a required field (``id``, ``project``, ``intent``)
                is missing from ``d``; raised by the dataclass constructor.
        """
        # Use the public dataclasses API rather than the class attribute.
        known = {f.name for f in fields(cls)}
        return cls(**{k: copy.deepcopy(v) for k, v in d.items() if k in known})
# ── Stage 3: replay ───────────────────────────────────────────────────────────
@dataclass
class ReplayResult:
    """Result of re-running a single TaskRecord offline under a skill+memory pair.

    Only ``id`` is required. Scores default to ``0.0``, text fields to the
    empty string (``task_type`` to ``"task"``), and ``tools_called`` to a
    fresh empty list.
    """

    id: str

    # Scores.
    hard: float = 0.0  # 0/1 for exact checks, or a continuous reward
    soft: float = 0.0  # partial credit / judge score in 0..1

    # Text produced during the replay and its judging.
    response: str = ""
    fail_reason: str = ""
    task_type: str = "task"
    judge_rationale: str = ""

    # Names of tools recorded as called; empty by default.
    tools_called: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain ``dict`` built by ``asdict``."""
        snapshot = asdict(self)
        return snapshot
# ── Stage 4/5: consolidation report ───────────────────────────────────────────
@dataclass
class EditRecord:
    """One bounded edit proposed/applied to skill or memory.

    ``target`` and ``op`` are required; the text fields default to the
    empty string.
    """

    target: str  # "skill" | "memory"
    op: str  # add | delete | replace
    content: str = ""
    anchor: str = ""  # for replace/delete: text being changed
    rationale: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the edit as a plain ``dict`` built by ``asdict``.

        Provided so a single edit can be serialized through the same
        ``to_dict()`` interface as the other dataclasses in this module.
        """
        return asdict(self)
@dataclass
class SleepReport:
    """Summary of everything a single night produced, written to staging for review.

    ``night`` and ``project`` are required. Counters default to zero, scores
    to ``0.0``, strings to empty, ``accepted`` to ``False``, and each list
    to a fresh empty list.
    """

    night: int
    project: str

    # Timestamps as strings; empty when not supplied.
    started_at: str = ""
    ended_at: str = ""

    # Volume counters.
    n_sessions: int = 0
    n_tasks: int = 0
    n_replayed: int = 0

    # Scores and the gate outcome.
    baseline_score: float = 0.0
    candidate_score: float = 0.0
    accepted: bool = False
    gate_action: str = ""

    # Edit lists; each instance gets its own empty list by default.
    edits: List[EditRecord] = field(default_factory=list)
    rejected_edits: List[EditRecord] = field(default_factory=list)

    tokens_used: int = 0
    notes: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the report as a plain ``dict``; ``asdict`` also converts nested dataclasses."""
        return asdict(self)