mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
The 4th gbrain seed (quick-answerer) is judged by tool_called=search: the agent
must ACTUALLY call a search tool. Add an honest tool loop:
- Backend.attempt_with_tools(task, skill, memory, tools) -> (response, tools_called)
- Claude: exposes a real ./search shell shim, runs with --allowedTools Bash in a
clean cwd; detects the call from the shim's log (not a self-reported marker).
- Codex: same shim under `exec --sandbox workspace-write`.
- Mock: deterministic — "calls" a tool iff skill/memory instructs it (for CI).
- replay_one routes tasks with a tool_called check through the tool loop and
feeds detected calls to the rule judge; ReplayResult gains tools_called.
Verified live (Claude haiku): deficient skill -> tools_called=[] hard=0;
learned "must run ./search" rule -> tools_called=['search'] hard=1.0.
20 tests pass.
Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
130 lines
4.7 KiB
Python
130 lines
4.7 KiB
Python
"""SkillOpt-Sleep — core data types.

These dataclasses are the interfaces between the sleep-cycle stages
(harvest -> mine -> replay -> consolidate -> stage). They are intentionally
plain (no slots, no heavy deps) so the package imports cleanly on any
Python 3.8+ interpreter and the deterministic experiment runs with zero
external dependencies.
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field, asdict
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
|
|
# ── Stage 1: harvest ──────────────────────────────────────────────────────────
|
|
|
|
@dataclass
class SessionDigest:
    """Normalized summary of a single Claude Code session transcript.

    :mod:`skillopt.sleep.harvest` builds one of these from a
    ``<sessionId>.jsonl`` transcript together with ``history.jsonl``
    entries. Only ``session_id`` and ``project`` are required; every other
    field falls back to an empty string, an empty list, or zero.
    """

    session_id: str
    project: str
    git_branch: str = ""
    started_at: str = ""
    ended_at: str = ""
    user_prompts: List[str] = field(default_factory=list)
    assistant_finals: List[str] = field(default_factory=list)
    tools_used: List[str] = field(default_factory=list)
    files_touched: List[str] = field(default_factory=list)
    # Example entries: "still broken", "perfect", ...
    feedback_signals: List[str] = field(default_factory=list)
    n_user_turns: int = 0
    n_assistant_turns: int = 0
    raw_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return this digest as a plain ``dict`` keyed by field name."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping
|
|
|
|
|
|
# ── Stage 2: mine ─────────────────────────────────────────────────────────────
|
|
|
|
@dataclass
class TaskRecord:
    """A self-contained recurring task mined from one or more sessions.

    This is the *training unit* of the sleep cycle — the analogue of a
    SkillOpt benchmark item. ``id``, ``project`` and ``intent`` are
    required; all remaining fields have defaults.
    """

    id: str
    project: str
    intent: str                          # what the user wanted (the "question")
    context_excerpt: str = ""            # minimal context needed to attempt it
    attempted_solution: str = ""         # what the agent produced before
    outcome: str = "unknown"             # success | fail | mixed | unknown
    reference_kind: str = "none"         # exact | rubric | rule | none
    reference: str = ""                  # exact answer, or rubric text
    judge: Dict[str, Any] = field(default_factory=dict)  # gbrain-style rule judge
    tags: List[str] = field(default_factory=list)
    source_sessions: List[str] = field(default_factory=list)
    split: str = "replay"                # replay (train) | holdout (test)

    def to_dict(self) -> Dict[str, Any]:
        """Return this record as a plain ``dict`` keyed by field name.

        ``dataclasses.asdict`` rebuilds nested containers, so the result
        shares no mutable state with the record.
        """
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "TaskRecord":
        """Build a record from a mapping, ignoring keys that are not fields.

        Accepted values are deep-copied so the new record never aliases the
        caller's lists/dicts (``tags``, ``source_sessions``, ``judge``);
        mutating the record afterwards leaves ``d`` untouched. This mirrors
        :meth:`to_dict`, which also returns independent copies.

        Raises:
            TypeError: if a required field (``id``, ``project``, ``intent``)
                is missing from ``d``.
        """
        # Function-local import so this method does not depend on the
        # module's top-level import block.
        import copy

        known = cls.__dataclass_fields__  # type: ignore[attr-defined]
        return cls(**{k: copy.deepcopy(v) for k, v in d.items() if k in known})
|
|
|
|
|
|
# ── Stage 3: replay ───────────────────────────────────────────────────────────
|
|
|
|
@dataclass
class ReplayResult:
    """Result of replaying one TaskRecord offline with a given skill+memory.

    Only ``id`` is required; scores default to ``0.0``, text fields to the
    empty string, and ``tools_called`` to an empty list.
    """

    id: str
    # Hard score: 0/1 for exact matching, or a continuous reward.
    hard: float = 0.0
    # Soft score: partial credit / judge score in 0..1.
    soft: float = 0.0
    response: str = ""
    fail_reason: str = ""
    task_type: str = "task"
    judge_rationale: str = ""
    tools_called: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return this result as a plain ``dict`` keyed by field name."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping
|
|
|
|
|
|
# ── Stage 4/5: consolidation report ───────────────────────────────────────────
|
|
|
|
@dataclass
class EditRecord:
    """A single bounded edit, proposed for or applied to skill or memory.

    ``target`` and ``op`` are required; the text fields default to the
    empty string.
    """

    # Which artifact the edit addresses: "skill" | "memory".
    target: str
    # Kind of edit: add | delete | replace.
    op: str
    content: str = ""
    # Used by replace/delete: the text being changed.
    anchor: str = ""
    rationale: str = ""
|
|
|
|
|
|
@dataclass
class SleepReport:
    """All output of one night's cycle — written to staging for review.

    ``night`` and ``project`` are required; counters and scores default to
    zero, ``accepted`` to ``False``, text fields to the empty string, and
    list fields to empty lists.
    """

    night: int
    project: str
    started_at: str = ""
    ended_at: str = ""
    n_sessions: int = 0
    n_tasks: int = 0
    n_replayed: int = 0
    baseline_score: float = 0.0
    candidate_score: float = 0.0
    accepted: bool = False
    gate_action: str = ""
    edits: List[EditRecord] = field(default_factory=list)
    rejected_edits: List[EditRecord] = field(default_factory=list)
    tokens_used: int = 0
    notes: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return this report as a plain ``dict`` keyed by field name.

        ``dataclasses.asdict`` recurses into dataclass instances held in
        ``edits`` / ``rejected_edits``, so those come back as dicts too.
        """
        return asdict(self)
|