From 31715a8b43d98ef00f6b2ff24c367739d4256075 Mon Sep 17 00:00:00 2001 From: Kirill Kostarev Date: Fri, 12 Jun 2026 16:37:23 +0300 Subject: [PATCH] Add Codex Desktop transcript harvesting --- README.md | 7 +- docs/sleep/CONTROLLABLE_DREAMING.md | 17 ++ plugins/codex/README.md | 15 +- plugins/codex/skills/skillopt-sleep/SKILL.md | 4 +- skillopt_sleep/__main__.py | 22 +- skillopt_sleep/config.py | 12 +- skillopt_sleep/cycle.py | 16 +- skillopt_sleep/harvest_codex.py | 253 +++++++++++++++++++ skillopt_sleep/harvest_sources.py | 41 +++ skillopt_sleep/memory.py | 19 +- skillopt_sleep/types.py | 11 +- tests/test_sleep_engine.py | 121 +++++++-- 12 files changed, 479 insertions(+), 59 deletions(-) create mode 100644 skillopt_sleep/harvest_codex.py create mode 100644 skillopt_sleep/harvest_sources.py diff --git a/README.md b/README.md index 15404ba..1e6470e 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Dreams** (offline consolidation; review-then-adopt), and the **agent sleep** idea (short-term experience → long-term competence). One "night": ``` -harvest session transcripts → mine recurring tasks → replay offline +harvest Claude Code / Codex Desktop transcripts → mine recurring tasks → replay offline → consolidate (reflect → bounded edit → GATE on real held-out tasks) → stage proposal → (you) adopt ``` @@ -99,6 +99,11 @@ positive, and the gate blocks regressions Deterministic proof (no API key): `python -m skillopt_sleep.experiments.run_experiment --persona researcher --assert-improves`. +For local sleep cycles, transcript source and replay backend are separate knobs: +use `--source claude` for Claude Code transcripts, `--source codex` for Codex +Desktop archived sessions under `~/.codex/archived_sessions`, and +`--backend codex` only when you want the replay/optimizer to spend Codex budget. + --- ## Extensibility & WebUI diff --git a/docs/sleep/CONTROLLABLE_DREAMING.md b/docs/sleep/CONTROLLABLE_DREAMING.md index b26fea2..9ba3a79 100644 --- a/docs/sleep/CONTROLLABLE_DREAMING.md +++ b/docs/sleep/CONTROLLABLE_DREAMING.md @@ -4,6 +4,23 @@ The sleep engine is no longer a single fixed pipeline. It is a controllable offline "dream / imagination" loop the user steers. This documents the knobs added in the four-stage refactor and how they map to the user's design. +## Transcript sources + +Sleep separates the source of past sessions from the backend used to replay and +optimize tasks: + +```bash +python -m skillopt_sleep dry-run --project "$(pwd)" --source claude --backend mock +python -m skillopt_sleep dry-run --project "$(pwd)" --source codex --backend mock +python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex +``` + +`--source claude` reads Claude Code transcripts from `~/.claude/projects`. +`--source codex` reads Codex Desktop archives from +`~/.codex/archived_sessions`. `--source auto` tries Codex archives first, then +falls back to Claude Code transcripts. Use `--codex-home /path/to/.codex` or +`--claude-home /path/to/.claude` to point at non-default homes. + ## The mental model > Sleep = an offline imagination rollout. Re-run the user's real diff --git a/plugins/codex/README.md b/plugins/codex/README.md index 376bc46..3c9ceb7 100644 --- a/plugins/codex/README.md +++ b/plugins/codex/README.md @@ -48,13 +48,18 @@ Use the skillopt-sleep skill to adopt the latest staged proposal. Or call the engine directly: ```bash -python -m skillopt_sleep run --project "$(pwd)" --backend codex +python -m skillopt_sleep dry-run --project "$(pwd)" --source codex --backend mock +python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex ``` -Default backend is `mock` (no API spend). `--backend codex` uses your Codex -budget for real improvement. All the controllable knobs (`--gate on|off`, -`--rollouts-k`, `--budget-tokens`, `--preferences`, optimizer/target split) work -identically — see [`../../docs/sleep/CONTROLLABLE_DREAMING.md`](../../docs/sleep/CONTROLLABLE_DREAMING.md). +`--source codex` reads Codex Desktop archived sessions from +`~/.codex/archived_sessions`. Use `--codex-home /path/to/.codex` to point at a +different Codex home, or `--source auto` to try Codex archives first and fall +back to Claude Code transcripts. Default backend is `mock` (no API spend). +`--backend codex` uses your Codex budget for real improvement. All the +controllable knobs (`--gate on|off`, `--rollouts-k`, `--budget-tokens`, +`--preferences`, optimizer/target split) work identically — see +[`../../docs/sleep/CONTROLLABLE_DREAMING.md`](../../docs/sleep/CONTROLLABLE_DREAMING.md). ## Notes / status diff --git a/plugins/codex/skills/skillopt-sleep/SKILL.md b/plugins/codex/skills/skillopt-sleep/SKILL.md index e47fadb..88b4113 100644 --- a/plugins/codex/skills/skillopt-sleep/SKILL.md +++ b/plugins/codex/skills/skillopt-sleep/SKILL.md @@ -44,11 +44,11 @@ finds the engine and a Python >= 3.10 automatically. ```bash # point at the repo if it isn't auto-detected from CWD: export SKILLOPT_SLEEP_REPO=/path/to/SkillOpt-Sleep - bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" status --project "$(pwd)" bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" harvest --project "$(pwd)" bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" dry-run --project "$(pwd)" --backend mock bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" run --project "$(pwd)" --backend codex +bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" run --project "$(pwd)" --source codex # harvest from Codex Desktop bash "$SKILLOPT_SLEEP_REPO/plugins/run-sleep.sh" adopt --project "$(pwd)" ``` @@ -56,6 +56,8 @@ Actions are `status`, `harvest`, `dry-run`, `run`, and `adopt`. - Default backend is `mock`, which is deterministic and spends no API budget. - `--backend codex` uses the user's Codex budget for real improvement. +- `--source codex` reads Codex Desktop archived sessions from `~/.codex/archived_sessions`; + use `--codex-home /path/to/.codex` if the archive lives elsewhere. - Keep `dry-run --backend mock` as the first smoke check unless the user explicitly asked for a real optimization run. diff --git a/skillopt_sleep/__main__.py b/skillopt_sleep/__main__.py index 4db47f1..2666ee6 100644 --- a/skillopt_sleep/__main__.py +++ b/skillopt_sleep/__main__.py @@ -9,7 +9,8 @@ Common flags: --project PATH project to evolve (default: cwd) --scope all|invoked harvest scope (default: invoked) - --backend mock|anthropic + --backend mock|claude|codex + --source claude|codex|auto --model NAME --lookback-hours N --auto-adopt @@ -25,10 +26,11 @@ from typing import Any, Dict from skillopt_sleep.config import load_config from skillopt_sleep.cycle import run_sleep_cycle -from skillopt_sleep.harvest import harvest +from skillopt_sleep.harvest_sources import harvest_for_config from skillopt_sleep.mine import mine +from skillopt_sleep.staging import adopt as adopt_staging +from skillopt_sleep.staging import latest_staging from skillopt_sleep.state import SleepState -from skillopt_sleep.staging import latest_staging, adopt as adopt_staging def _add_common(p: argparse.ArgumentParser) -> None: @@ -38,6 +40,9 @@ def _add_common(p: argparse.ArgumentParser) -> None: p.add_argument("--model", default="") p.add_argument("--codex-path", default="", help="path to the real @openai/codex binary") p.add_argument("--claude-home", default="", help="override ~/.claude (also isolates state)") + p.add_argument("--codex-home", default="", help="override ~/.codex for archived session harvest") + p.add_argument("--source", default="", choices=["", "claude", "codex", "auto"], + help="session transcript source") p.add_argument("--lookback-hours", type=int, default=0) p.add_argument("--edit-budget", type=int, default=0) p.add_argument("--auto-adopt", action="store_true") @@ -59,6 +64,10 @@ def _cfg_from_args(args) -> Any: overrides["codex_path"] = os.path.abspath(args.codex_path) if getattr(args, "claude_home", ""): overrides["claude_home"] = os.path.abspath(args.claude_home) + if getattr(args, "codex_home", ""): + overrides["codex_home"] = os.path.abspath(args.codex_home) + if getattr(args, "source", ""): + overrides["transcript_source"] = args.source if getattr(args, "lookback_hours", 0): overrides["lookback_hours"] = args.lookback_hours if getattr(args, "edit_budget", 0): @@ -143,12 +152,7 @@ def cmd_adopt(args) -> int: def cmd_harvest(args) -> int: cfg = _cfg_from_args(args) - digests = harvest( - cfg.transcripts_dir, - scope=cfg.get("projects", "invoked"), - invoked_project=cfg.get("invoked_project", ""), - limit=cfg.get("max_tasks_per_night", 40) * 3, - ) + digests = harvest_for_config(cfg, limit=cfg.get("max_tasks_per_night", 40) * 3) tasks = mine(digests, max_tasks=cfg.get("max_tasks_per_night", 40), holdout_fraction=cfg.get("holdout_fraction", 0.34), seed=cfg.get("seed", 42)) if args.json: diff --git a/skillopt_sleep/config.py b/skillopt_sleep/config.py index 7541527..9ddeff7 100644 --- a/skillopt_sleep/config.py +++ b/skillopt_sleep/config.py @@ -13,17 +13,19 @@ from __future__ import annotations import json import os -from dataclasses import dataclass, field, asdict -from typing import Any, Dict, List, Optional - +from dataclasses import dataclass, field +from typing import Any, Dict, Optional HOME_STATE_DIR = os.path.expanduser("~/.skillopt-sleep") CLAUDE_HOME = os.path.expanduser("~/.claude") +CODEX_HOME = os.path.expanduser("~/.codex") DEFAULTS: Dict[str, Any] = { # ── scope ────────────────────────────────────────────────────────────── "claude_home": CLAUDE_HOME, + "codex_home": CODEX_HOME, + "transcript_source": "claude", # "claude" | "codex" | "auto" "projects": "invoked", # "invoked" | "all" | [list of abs paths] "invoked_project": "", # filled at runtime (cwd) when projects == "invoked" "lookback_hours": 72, # harvest window when no prior sleep recorded @@ -94,6 +96,10 @@ class SleepConfig: def transcripts_dir(self) -> str: return os.path.join(self.data["claude_home"], "projects") + @property + def codex_archived_sessions_dir(self) -> str: + return os.path.join(self.data["codex_home"], "archived_sessions") + @property def history_path(self) -> str: return os.path.join(self.data["claude_home"], "history.jsonl") diff --git a/skillopt_sleep/cycle.py b/skillopt_sleep/cycle.py index 04baa03..c9f8a28 100644 --- a/skillopt_sleep/cycle.py +++ b/skillopt_sleep/cycle.py @@ -10,18 +10,18 @@ CI use. With backend="anthropic" it spends the user's budget for real lift. from __future__ import annotations import os -import time from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import List, Optional from skillopt_sleep.backend import get_backend from skillopt_sleep.config import SleepConfig, load_config from skillopt_sleep.consolidate import consolidate -from skillopt_sleep.harvest import harvest +from skillopt_sleep.harvest_sources import harvest_for_config from skillopt_sleep.memory import ensure_skill_scaffold from skillopt_sleep.mine import mine +from skillopt_sleep.staging import adopt as adopt_staging +from skillopt_sleep.staging import write_staging from skillopt_sleep.state import SleepState, _now_iso -from skillopt_sleep.staging import write_staging, adopt as adopt_staging from skillopt_sleep.types import SessionDigest, SleepReport, TaskRecord @@ -117,10 +117,8 @@ def run_sleep_cycle( n_sessions = 0 else: since = state.last_harvest_for(project) - digests = harvest( - cfg.transcripts_dir, - scope=cfg.get("projects", "invoked"), - invoked_project=cfg.get("invoked_project", ""), + digests = harvest_for_config( + cfg, since_iso=since, limit=cfg.get("max_tasks_per_night", 40) * 3, ) @@ -151,7 +149,7 @@ def run_sleep_cycle( if not skill: skill = ensure_skill_scaffold( "", name=cfg.get("managed_skill_name", "skillopt-sleep-learned"), - description="Preferences and procedures learned from past Claude Code sessions.", + description="Preferences and procedures learned from past local agent sessions.", ) report = SleepReport( diff --git a/skillopt_sleep/harvest_codex.py b/skillopt_sleep/harvest_codex.py new file mode 100644 index 0000000..8e97b31 --- /dev/null +++ b/skillopt_sleep/harvest_codex.py @@ -0,0 +1,253 @@ +"""SkillOpt-Sleep Codex Desktop session harvesting. + +Reads Codex Desktop archived session JSONL files and normalizes them into +``SessionDigest`` records without copying developer/system instructions, tool +arguments, or raw tool outputs. +""" +from __future__ import annotations + +import os +import re +from typing import Any, Dict, Iterable, List, Optional + +from skillopt_sleep.harvest import ( + _detect_feedback, + _is_meta_prompt, + _iter_jsonl, + _project_matches, +) +from skillopt_sleep.types import SessionDigest + +_SECRET_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = ( + (re.compile(r"sk-[A-Za-z0-9_-]{10,}"), "[REDACTED_OPENAI_KEY]"), + (re.compile(r"(?i)(Authorization:\s*Bearer\s+)[^\s\"']+"), r"\1[REDACTED]"), + (re.compile(r"(?i)(Authorization:\s*Basic\s+)[^\s\"']+"), r"\1[REDACTED]"), + ( + re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s*[:=]\s*)[^\s\"']+"), + r"\1\2[REDACTED]", + ), + ( + re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s+)[^\s\"']+"), + r"\1\2[REDACTED]", + ), + ( + re.compile( + r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----", + re.DOTALL, + ), + "[REDACTED_PRIVATE_KEY]", + ), +) + + +def _payload(rec: Dict[str, Any]) -> Dict[str, Any]: + payload = rec.get("payload") + return payload if isinstance(payload, dict) else {} + + +def _timestamp(rec: Dict[str, Any], payload: Dict[str, Any]) -> str: + for value in ( + payload.get("timestamp"), + rec.get("timestamp"), + payload.get("started_at"), + payload.get("completed_at"), + ): + if isinstance(value, str) and value: + return value + return "" + + +def _text_from_any(content: Any) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: List[str] = [] + for item in content: + if isinstance(item, str): + parts.append(item) + elif isinstance(item, dict): + if item.get("type") == "text" and item.get("text"): + parts.append(str(item["text"])) + elif item.get("text"): + parts.append(str(item["text"])) + return "\n".join(parts) + if isinstance(content, dict): + if content.get("text"): + return str(content["text"]) + if content.get("content"): + return _text_from_any(content["content"]) + return "" + + +def _strip_codex_meta(text: str) -> str: + stripped = text.strip() + if not stripped: + return "" + if stripped.startswith("", ""): + idx = stripped.rfind(marker) + if idx == -1: + continue + tail = stripped[idx + len(marker):].strip() + if tail and not tail.startswith("<"): + return tail + return "" + return stripped + + +def _sanitize_text(text: str) -> str: + sanitized = _strip_codex_meta(text).replace("\x00", "").strip() + if not sanitized or _is_meta_prompt(sanitized): + return "" + for pattern, replacement in _SECRET_PATTERNS: + sanitized = pattern.sub(replacement, sanitized) + return sanitized + + +def _sanitize_tool_name(name: str) -> str: + return re.sub(r"[^A-Za-z0-9_.:-]+", "_", name)[:80] + + +def _tool_name(payload: Dict[str, Any]) -> str: + payload_type = payload.get("type") + name = payload.get("name") + if isinstance(name, str) and name: + return _sanitize_tool_name(name) + if payload_type == "exec_command_end": + return "exec_command" + if payload_type == "patch_apply_end": + return "apply_patch" + if payload_type == "web_search_call": + return "web_search" + if payload_type == "tool_search_call": + return "tool_search" + if isinstance(payload_type, str) and payload_type.endswith("_tool_call"): + return _sanitize_tool_name(payload_type) + return "" + + +def _dedup(xs: Iterable[str]) -> List[str]: + seen = set() + out: List[str] = [] + for x in xs: + if x not in seen: + seen.add(x) + out.append(x) + return out + + +def digest_codex_archived_session(path: str, project: str = "") -> Optional[SessionDigest]: + """Build a ``SessionDigest`` from one Codex Desktop archived session.""" + session_id = os.path.splitext(os.path.basename(path))[0] + started = "" + ended = "" + session_project = "" + user_prompts: List[str] = [] + assistant_finals: List[str] = [] + tools: List[str] = [] + feedback: List[str] = [] + n_user = 0 + n_asst = 0 + + for rec in _iter_jsonl(path): + payload = _payload(rec) + payload_type = payload.get("type") + ts = _timestamp(rec, payload) + if ts: + if not started: + started = ts + ended = ts + cwd = payload.get("cwd") + if isinstance(cwd, str) and cwd: + if not session_project: + session_project = cwd + if project and _project_matches(cwd, "invoked", project): + session_project = cwd + + role = payload.get("role") + text = "" + output_role = "" + if payload_type == "user_message": + text = _text_from_any(payload.get("message")) + output_role = "user" + elif payload_type == "agent_message": + text = _text_from_any(payload.get("message")) + output_role = "assistant" + elif payload_type == "message" and role in {"user", "assistant"}: + text = _text_from_any(payload.get("content")) + output_role = str(role) + else: + tool = _tool_name(payload) + if tool: + tools.append(tool) + continue + + sanitized = _sanitize_text(text) + if not sanitized: + continue + if output_role == "user": + n_user += 1 + user_prompts.append(sanitized) + feedback.extend(_detect_feedback(sanitized)) + elif output_role == "assistant": + n_asst += 1 + assistant_finals.append(sanitized) + + if project and not _project_matches(session_project or "", "invoked", project): + return None + if n_user == 0 and n_asst == 0: + return None + + return SessionDigest( + session_id=session_id, + project=session_project, + started_at=started, + ended_at=ended, + user_prompts=user_prompts, + assistant_finals=assistant_finals[-5:], + tools_used=_dedup(tools), + files_touched=[], + feedback_signals=feedback, + n_user_turns=n_user, + n_assistant_turns=n_asst, + raw_path=path, + ) + + +def harvest_codex( + archived_sessions_dir: str, + *, + scope: Any = "all", + invoked_project: str = "", + since_iso: Optional[str] = None, + limit: int = 0, +) -> List[SessionDigest]: + """Walk ``~/.codex/archived_sessions`` and return matching digests.""" + digests: List[SessionDigest] = [] + if not os.path.isdir(archived_sessions_dir): + return digests + + paths = [ + os.path.join(archived_sessions_dir, fn) + for fn in os.listdir(archived_sessions_dir) + if fn.endswith(".jsonl") + ] + paths.sort(key=lambda p: os.path.getmtime(p), reverse=True) + + project_hint = invoked_project if scope == "invoked" else "" + for path in paths: + digest = digest_codex_archived_session(path, project=project_hint) + if digest is None: + continue + if not _project_matches(digest.project or "", scope, invoked_project): + continue + if since_iso and digest.ended_at and digest.ended_at < since_iso: + continue + digests.append(digest) + if limit and len(digests) >= limit: + break + return digests diff --git a/skillopt_sleep/harvest_sources.py b/skillopt_sleep/harvest_sources.py new file mode 100644 index 0000000..501aa28 --- /dev/null +++ b/skillopt_sleep/harvest_sources.py @@ -0,0 +1,41 @@ +"""Source selection for SkillOpt-Sleep transcript harvesting.""" +from __future__ import annotations + +from typing import Optional + +from skillopt_sleep.harvest import harvest +from skillopt_sleep.harvest_codex import harvest_codex +from skillopt_sleep.types import SessionDigest + + +def harvest_for_config(cfg, *, since_iso: Optional[str] = None, limit: int = 0) -> list[SessionDigest]: + source = cfg.get("transcript_source", "claude") + scope = cfg.get("projects", "invoked") + invoked_project = cfg.get("invoked_project", "") + + if source == "codex": + return harvest_codex( + cfg.codex_archived_sessions_dir, + scope=scope, + invoked_project=invoked_project, + since_iso=since_iso, + limit=limit, + ) + if source == "auto": + codex_digests = harvest_codex( + cfg.codex_archived_sessions_dir, + scope=scope, + invoked_project=invoked_project, + since_iso=since_iso, + limit=limit, + ) + if codex_digests: + return codex_digests + + return harvest( + cfg.transcripts_dir, + scope=scope, + invoked_project=invoked_project, + since_iso=since_iso, + limit=limit, + ) diff --git a/skillopt_sleep/memory.py b/skillopt_sleep/memory.py index 579d714..ef67f36 100644 --- a/skillopt_sleep/memory.py +++ b/skillopt_sleep/memory.py @@ -12,7 +12,6 @@ from typing import List, Tuple from skillopt_sleep.types import EditRecord - LEARNED_START = "" LEARNED_END = "" _BANNER = ( @@ -79,7 +78,7 @@ def apply_edits(doc: str, edits: List[EditRecord]) -> Tuple[str, List[EditRecord anchor substring. """ lines = current_learned_lines(doc) - norm_set = {_norm(l) for l in lines} + norm_set = {_norm(line) for line in lines} applied: List[EditRecord] = [] for e in edits: @@ -92,31 +91,31 @@ def apply_edits(doc: str, edits: List[EditRecord]) -> Tuple[str, List[EditRecord applied.append(e) elif op == "delete": anchor = _norm(e.anchor or e.content) - keep = [l for l in lines if anchor not in _norm(l)] + keep = [line for line in lines if anchor not in _norm(line)] if len(keep) != len(lines): lines = keep - norm_set = {_norm(l) for l in lines} + norm_set = {_norm(line) for line in lines} applied.append(e) elif op == "replace": anchor = _norm(e.anchor) new_lines = [] changed = False - for l in lines: - if anchor and anchor in _norm(l): + for line in lines: + if anchor and anchor in _norm(line): new_lines.append(e.content.strip()) changed = True else: - new_lines.append(l) + new_lines.append(line) if changed: lines = new_lines - norm_set = {_norm(l) for l in lines} + norm_set = {_norm(line) for line in lines} applied.append(e) return set_learned(doc, lines), applied def ensure_skill_scaffold(doc: str, *, name: str, description: str) -> str: - """Ensure a SKILL.md has YAML frontmatter so Claude Code loads it.""" + """Ensure a SKILL.md has YAML frontmatter so local agents load it.""" if doc.lstrip().startswith("---"): return doc fm = ( @@ -125,6 +124,6 @@ def ensure_skill_scaffold(doc: str, *, name: str, description: str) -> str: f"description: {description}\n" "---\n\n" f"# {name}\n\n" - "Preferences and procedures learned from your past Claude Code sessions.\n" + "Preferences and procedures learned from your past local agent sessions.\n" ) return fm + doc diff --git a/skillopt_sleep/types.py b/skillopt_sleep/types.py index 96a605b..849c170 100644 --- a/skillopt_sleep/types.py +++ b/skillopt_sleep/types.py @@ -8,18 +8,17 @@ external dependencies. """ from __future__ import annotations -from dataclasses import dataclass, field, asdict -from typing import Any, Dict, List, Optional - +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, List # ── Stage 1: harvest ────────────────────────────────────────────────────────── @dataclass class SessionDigest: - """A normalized summary of one Claude Code session transcript. + """A normalized summary of one local agent session transcript. - Produced by :mod:`skillopt_sleep.harvest` from a ``.jsonl`` - transcript plus ``history.jsonl`` entries. + Produced by source-specific harvesters from Claude Code transcripts or + Codex Desktop archived sessions. """ session_id: str diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py index 2a28dce..1992dc7 100644 --- a/tests/test_sleep_engine.py +++ b/tests/test_sleep_engine.py @@ -15,11 +15,11 @@ from skillopt_sleep.backend import MockBackend, exact_score, keyword_soft_score from skillopt_sleep.config import load_config from skillopt_sleep.consolidate import consolidate from skillopt_sleep.cycle import run_sleep_cycle -from skillopt_sleep.experiments.personas import researcher_persona, programmer_persona -from skillopt_sleep.harvest import digest_transcript, _detect_feedback, _is_meta_prompt +from skillopt_sleep.experiments.personas import programmer_persona, researcher_persona +from skillopt_sleep.harvest import _detect_feedback, _is_meta_prompt, digest_transcript from skillopt_sleep.memory import apply_edits, current_learned_lines, extract_learned, set_learned -from skillopt_sleep.mine import assign_splits, heuristic_mine, dedup_tasks -from skillopt_sleep.staging import adopt, latest_staging +from skillopt_sleep.mine import assign_splits, heuristic_mine +from skillopt_sleep.staging import adopt from skillopt_sleep.types import EditRecord, SessionDigest, TaskRecord @@ -89,6 +89,97 @@ class TestHarvest(unittest.TestCase): self.assertIsInstance(d.session_id, str) self.assertGreaterEqual(d.n_user_turns + d.n_assistant_turns, 0) + def _write_jsonl(self, path, records): + with open(path, "w", encoding="utf-8") as f: + for record in records: + f.write(json.dumps(record) + "\n") + + def test_digest_codex_archived_session_sanitizes_and_skips_meta(self): + from skillopt_sleep.harvest_codex import digest_codex_archived_session + + with tempfile.TemporaryDirectory() as tmp: + path = os.path.join(tmp, "rollout-example.jsonl") + self._write_jsonl(path, [ + {"type": "turn_context", "timestamp": "2026-06-12T10:00:00Z", + "payload": {"cwd": "/repo/Yoshi", "type": None}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:01Z", + "payload": {"type": "message", "role": "developer", + "content": [{"type": "text", "text": "do not copy"}]}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:02Z", + "payload": {"type": "user_message", + "message": "# AGENTS.md instructions for /repo/Yoshi\n" + "do not keep"}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:03Z", + "payload": {"type": "user_message", + "message": "run deploy with sk-1234567890abcdef and token local-secret"}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:04Z", + "payload": {"type": "function_call", "name": "exec_command", + "arguments": "raw args should not copy"}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:05Z", + "payload": {"type": "function_call_output", + "output": "raw output should not copy"}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:06Z", + "payload": {"type": "agent_message", "message": "done"}}, + ]) + + digest = digest_codex_archived_session(path, project="/repo/Yoshi") + + self.assertIsNotNone(digest) + joined = "\n".join(digest.user_prompts + digest.assistant_finals) + self.assertEqual(digest.project, "/repo/Yoshi") + self.assertIn("[REDACTED_OPENAI_KEY]", joined) + self.assertIn("token [REDACTED]", joined) + self.assertIn("exec_command", digest.tools_used) + self.assertNotIn("AGENTS.md instructions", joined) + self.assertNotIn("do not copy", joined) + self.assertNotIn("raw args should not copy", joined) + self.assertNotIn("raw output should not copy", joined) + + def test_harvest_codex_filters_project_and_cli_source(self): + from skillopt_sleep.__main__ import _cfg_from_args + from skillopt_sleep.harvest_sources import harvest_for_config + + with tempfile.TemporaryDirectory() as tmp: + codex_home = os.path.join(tmp, ".codex") + sessions = os.path.join(codex_home, "archived_sessions") + os.makedirs(sessions) + self._write_jsonl(os.path.join(sessions, "rollout-yoshi.jsonl"), [ + {"type": "turn_context", "timestamp": "2026-06-12T10:00:00Z", + "payload": {"cwd": "/repo/Yoshi", "type": None}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:01Z", + "payload": {"type": "user_message", "message": "fix Yoshi"}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:02Z", + "payload": {"type": "agent_message", "message": "fixed"}}, + ]) + self._write_jsonl(os.path.join(sessions, "rollout-other.jsonl"), [ + {"type": "turn_context", "timestamp": "2026-06-12T10:00:00Z", + "payload": {"cwd": "/repo/Other", "type": None}}, + {"type": "response_item", "timestamp": "2026-06-12T10:00:01Z", + "payload": {"type": "user_message", "message": "fix Other"}}, + ]) + + Args = type("Args", (), { + "project": "/repo/Yoshi", + "scope": "", + "backend": "", + "model": "", + "codex_path": "", + "claude_home": "", + "codex_home": codex_home, + "source": "codex", + "lookback_hours": 0, + "edit_budget": 0, + "auto_adopt": False, + }) + + cfg = _cfg_from_args(Args()) + digests = harvest_for_config(cfg, limit=10) + + self.assertEqual(cfg.get("transcript_source"), "codex") + self.assertEqual(len(digests), 1) + self.assertEqual(digests[0].session_id, "rollout-yoshi") + self.assertEqual(digests[0].user_prompts, ["fix Yoshi"]) + class TestMine(unittest.TestCase): def _digest(self, prompts, feedback): @@ -115,7 +206,6 @@ class TestMine(unittest.TestCase): def test_dream_never_in_val_or_test(self): # the anti-overfitting guarantee: origin='dream' tasks only ever land in train - from skillopt_sleep.types import TaskRecord real = researcher_persona() dream = [TaskRecord(id=f"d{i}", project="/p", intent=f"dream {i}", origin="dream", derived_from="r0") for i in range(5)] @@ -235,7 +325,7 @@ class TestLlmMiner(unittest.TestCase): class TestMultiObjectiveAndPrefs(unittest.TestCase): def test_multi_objective_reward(self): from skillopt_sleep.replay import multi_objective_reward - from skillopt_sleep.types import ReplayResult, TaskRecord + from skillopt_sleep.types import ReplayResult t = TaskRecord(id="t", project="/p", intent="x") expensive = [(t, ReplayResult(id="t", hard=1.0, tokens=4000, latency_ms=20000))] cheap = [(t, ReplayResult(id="t", hard=1.0, tokens=200, latency_ms=1000))] @@ -249,7 +339,7 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase): def test_preferences_injected_into_reflect(self): from skillopt_sleep.backend import CliBackend - from skillopt_sleep.types import TaskRecord, ReplayResult + from skillopt_sleep.types import ReplayResult captured = {} class CapBackend(CliBackend): @@ -269,7 +359,6 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase): def test_replay_records_cost(self): from skillopt_sleep.backend import MockBackend from skillopt_sleep.replay import replay_one - from skillopt_sleep.types import TaskRecord t = TaskRecord(id="t", project="/p", intent="hello world", reference_kind="exact", reference="hi") r = replay_one(MockBackend(), t, "some skill text", "") @@ -280,7 +369,7 @@ class TestMultiObjectiveAndPrefs(unittest.TestCase): class TestMultiRolloutAndBudget(unittest.TestCase): def test_rolloutset_stats(self): from skillopt_sleep.rollout import RolloutSet - from skillopt_sleep.types import ReplayResult, TaskRecord + from skillopt_sleep.types import ReplayResult rs = RolloutSet(task=TaskRecord(id="t", project="/p", intent="x"), attempts=[ReplayResult(id="t", hard=1.0), ReplayResult(id="t", hard=0.0), @@ -305,7 +394,7 @@ class TestMultiRolloutAndBudget(unittest.TestCase): def test_contrastive_reflect_with_stub(self): from skillopt_sleep.backend import Backend from skillopt_sleep.rollout import RolloutSet, contrastive_reflect - from skillopt_sleep.types import ReplayResult, TaskRecord + from skillopt_sleep.types import ReplayResult class StubBackend(Backend): name = "stub" @@ -323,8 +412,11 @@ class TestMultiRolloutAndBudget(unittest.TestCase): class TestSlowUpdate(unittest.TestCase): def test_protected_field_roundtrip(self): from skillopt_sleep.slow_update import ( - replace_slow_field, extract_slow_field, has_slow_field, - SLOW_UPDATE_START, SLOW_UPDATE_END, + SLOW_UPDATE_END, + SLOW_UPDATE_START, + extract_slow_field, + has_slow_field, + replace_slow_field, ) base = "# skill\nkeep me\n" doc = replace_slow_field(base, "durable lesson A") @@ -341,7 +433,7 @@ class TestSlowUpdate(unittest.TestCase): def test_run_slow_update_with_stub_backend(self): from skillopt_sleep.backend import Backend from skillopt_sleep.slow_update import run_slow_update - from skillopt_sleep.types import TaskRecord, ReplayResult + from skillopt_sleep.types import ReplayResult class StubBackend(Backend): name = "stub" @@ -366,9 +458,8 @@ class TestSlowUpdate(unittest.TestCase): class TestToolLoop(unittest.TestCase): def test_tool_called_judge_via_replay(self): from skillopt_sleep.backend import MockBackend - from skillopt_sleep.replay import replay_one, _required_tools from skillopt_sleep.memory import set_learned - from skillopt_sleep.types import TaskRecord + from skillopt_sleep.replay import _required_tools, replay_one task = TaskRecord( id="qa1", project="/p", intent="answer the question",