From 5487e2c426db8b75a0e8e2714641542325d55f9e Mon Sep 17 00:00:00 2001 From: Yif Yang Date: Tue, 30 Jun 2026 19:47:36 +0000 Subject: [PATCH] fix(skillopt-sleep): redact secrets before persisting cycle diagnostics PR #92 added a per-cycle diagnostics.json that surfaces backend stderr, optimizer replies, and task responses so a 0.0 night is self-diagnosing. Those free-text fields can carry credentials (e.g. a codex 401 stderr dump containing an auth token), so persisting them verbatim was a new on-disk leak surface. - Add a shared redact_secrets() in staging.py and route diagnostics.json's call_error / reflect_raw_head / holdout_detail through it before writing. - Redact the codex and Claude auth-error log lines too (a secondary sink when a file log handler is attached); last_call_error stays raw in memory so _AUTH_MARKERS matching is unaffected. - Centralize _SECRET_PATTERNS in staging.py (harvest_codex now reuses them) and extend coverage to AWS / GitHub / Slack / Google / JWT token shapes. - Tests: secret-shape coverage, private-key blocks, recursive/scalar passthrough, no over-redaction of plain prose, fail-fast auth-error log redaction, and an end-to-end check that diagnostics.json has no secret. Observability-only; the gate and learning algorithm are unchanged. Co-Authored-By: Claude --- skillopt_sleep/backend.py | 7 +- skillopt_sleep/cycle.py | 12 +++- skillopt_sleep/harvest_codex.py | 22 +----- skillopt_sleep/staging.py | 58 ++++++++++++++- tests/test_sleep_engine.py | 122 ++++++++++++++++++++++++++++++++ 5 files changed, 194 insertions(+), 27 deletions(-) diff --git a/skillopt_sleep/backend.py b/skillopt_sleep/backend.py index b1c7208..cf01b0a 100644 --- a/skillopt_sleep/backend.py +++ b/skillopt_sleep/backend.py @@ -582,9 +582,10 @@ class ClaudeCliBackend(CliBackend): combined = check_stdout + "\n" + stderr for marker in self._CLI_ERROR_MARKERS: if marker in combined: + from skillopt_sleep.staging import redact_secrets logging.getLogger("skillopt_sleep").warning( "Claude CLI returned a likely auth error: %s", - combined[:200].replace("\n", " "), + redact_secrets(combined[:200].replace("\n", " ")), ) self.last_call_error = combined[:500] return @@ -843,8 +844,10 @@ class CodexCliBackend(CliBackend): return out err = self.last_call_error or "" if any(m in err for m in self._AUTH_MARKERS): + from skillopt_sleep.staging import redact_secrets logging.getLogger("skillopt_sleep").error( - "codex auth error — re-login required (`codex login`): %s", err[:200] + "codex auth error — re-login required (`codex login`): %s", + redact_secrets(err[:200]), ) break # fail fast: retrying a 401 just burns calls if attempt < retries - 1: diff --git a/skillopt_sleep/cycle.py b/skillopt_sleep/cycle.py index ee726f3..6ad0d4f 100644 --- a/skillopt_sleep/cycle.py +++ b/skillopt_sleep/cycle.py @@ -21,6 +21,7 @@ from skillopt_sleep.harvest_sources import harvest_for_config from skillopt_sleep.memory import ensure_skill_scaffold from skillopt_sleep.mine import mine from skillopt_sleep.staging import adopt as adopt_staging +from skillopt_sleep.staging import redact_secrets from skillopt_sleep.staging import write_staging from skillopt_sleep.state import SleepState, _now_iso from skillopt_sleep.types import SessionDigest, SleepReport, TaskRecord @@ -281,6 +282,9 @@ def run_sleep_cycle( # cycle previously captured none of this, making the gate a black box (#learning-stall). try: import json as _json + # Backend stderr / optimizer replies / task responses can carry + # credentials (e.g. a codex 401 stderr dump), so scrub secret-looking + # substrings before persisting them to the on-disk diagnostics. with open(os.path.join(staging_dir, "diagnostics.json"), "w", encoding="utf-8") as _fh: _json.dump({ "night": night, @@ -292,9 +296,11 @@ def run_sleep_cycle( "accepted": result.accepted, "n_applied_edits": len(result.applied_edits), "n_rejected_edits": len(result.rejected_edits), - "call_error": getattr(result, "call_error", ""), - "reflect_raw_head": (getattr(result, "reflect_raw", "") or "")[:1200], - "holdout_detail": getattr(result, "holdout_detail", []), + "call_error": redact_secrets(getattr(result, "call_error", "")), + "reflect_raw_head": redact_secrets( + (getattr(result, "reflect_raw", "") or "")[:1200] + ), + "holdout_detail": redact_secrets(getattr(result, "holdout_detail", [])), }, _fh, indent=2) except Exception: pass diff --git a/skillopt_sleep/harvest_codex.py b/skillopt_sleep/harvest_codex.py index 8e97b31..c50a237 100644 --- a/skillopt_sleep/harvest_codex.py +++ b/skillopt_sleep/harvest_codex.py @@ -16,29 +16,9 @@ from skillopt_sleep.harvest import ( _iter_jsonl, _project_matches, ) +from skillopt_sleep.staging import _SECRET_PATTERNS from skillopt_sleep.types import SessionDigest -_SECRET_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = ( - (re.compile(r"sk-[A-Za-z0-9_-]{10,}"), "[REDACTED_OPENAI_KEY]"), - (re.compile(r"(?i)(Authorization:\s*Bearer\s+)[^\s\"']+"), r"\1[REDACTED]"), - (re.compile(r"(?i)(Authorization:\s*Basic\s+)[^\s\"']+"), r"\1[REDACTED]"), - ( - re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s*[:=]\s*)[^\s\"']+"), - r"\1\2[REDACTED]", - ), - ( - re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s+)[^\s\"']+"), - r"\1\2[REDACTED]", - ), - ( - re.compile( - r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----", - re.DOTALL, - ), - "[REDACTED_PRIVATE_KEY]", - ), -) - def _payload(rec: Dict[str, Any]) -> Dict[str, Any]: payload = rec.get("payload") diff --git a/skillopt_sleep/staging.py b/skillopt_sleep/staging.py index 2af5be9..49dd859 100644 --- a/skillopt_sleep/staging.py +++ b/skillopt_sleep/staging.py @@ -9,12 +9,68 @@ from __future__ import annotations import json import os +import re import shutil import time -from typing import List, Optional +from typing import Any, List, Optional from skillopt_sleep.types import SleepReport +# Secret patterns scrubbed from any free-text we persist to the staging dir +# (diagnostics, reports). Kept here so every on-disk artifact shares one +# redaction pass; harvest_codex reuses these for session text too. +_SECRET_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = ( + (re.compile(r"sk-[A-Za-z0-9_-]{10,}"), "[REDACTED_OPENAI_KEY]"), + # Distinctive vendor token prefixes (low false-positive: these prefixes do + # not occur in normal diagnostic prose). + (re.compile(r"\bAKIA[0-9A-Z]{16}\b"), "[REDACTED_AWS_KEY]"), + (re.compile(r"\bgh[pousr]_[A-Za-z0-9]{20,}\b"), "[REDACTED_GITHUB_TOKEN]"), + (re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b"), "[REDACTED_SLACK_TOKEN]"), + (re.compile(r"\bAIza[0-9A-Za-z_-]{20,}\b"), "[REDACTED_GOOGLE_KEY]"), + # Bare JWT (three base64url segments) — e.g. a leaked bearer body without + # the "Authorization:" prefix. + (re.compile(r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b"), + "[REDACTED_JWT]"), + (re.compile(r"(?i)(Authorization:\s*Bearer\s+)[^\s\"']+"), r"\1[REDACTED]"), + (re.compile(r"(?i)(Authorization:\s*Basic\s+)[^\s\"']+"), r"\1[REDACTED]"), + ( + re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s*[:=]\s*)[^\s\"']+"), + r"\1\2[REDACTED]", + ), + ( + re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s+)[^\s\"']+"), + r"\1\2[REDACTED]", + ), + ( + re.compile( + r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----", + re.DOTALL, + ), + "[REDACTED_PRIVATE_KEY]", + ), +) + + +def redact_secrets(value: Any) -> Any: + """Scrub secret-looking substrings (API keys, bearer tokens, private keys) + from a string, or recursively from the string leaves of a list/dict. + + Used before writing backend stderr / optimizer replies / task responses to + on-disk diagnostics: those are surfaced for debugging, but the underlying + text (e.g. a codex 401 stderr dump) can carry credentials. Non-string + scalars pass through unchanged. + """ + if isinstance(value, str): + out = value + for pattern, replacement in _SECRET_PATTERNS: + out = pattern.sub(replacement, out) + return out + if isinstance(value, list): + return [redact_secrets(v) for v in value] + if isinstance(value, dict): + return {k: redact_secrets(v) for k, v in value.items()} + return value + def _ts_dir() -> str: return time.strftime("%Y%m%d-%H%M%S", time.localtime()) diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py index fdd0d56..aee9b7d 100644 --- a/tests/test_sleep_engine.py +++ b/tests/test_sleep_engine.py @@ -1160,5 +1160,127 @@ class TestVerifierDiscipline(unittest.TestCase): self.assertGreater(len(res.rejected_edits), 0) self.assertIn("placeholder", res.rejected_edits[0].content) +class TestDiagnosticsRedaction(unittest.TestCase): + """diagnostics.json surfaces backend stderr / optimizer replies / task + responses for debugging — but those can carry credentials (e.g. a codex 401 + stderr dump). redact_secrets() must scrub them before anything is persisted.""" + + def test_redacts_common_secret_shapes(self): + from skillopt_sleep.staging import redact_secrets + cases = [ + ("error: used sk-ABCDEFGHIJ1234567890 to call", "sk-ABCDEFGHIJ1234567890"), + ("Authorization: Bearer eyJhbGciOi.JIUzI1Ni.qwerty", "eyJhbGciOi.JIUzI1Ni.qwerty"), + ("config api_key=super-secret-value here", "super-secret-value"), + ("token: abc123def456ghi", "abc123def456ghi"), + ("aws AKIAIOSFODNN7EXAMPLE creds", "AKIAIOSFODNN7EXAMPLE"), + ("github ghp_AbCdEf0123456789AbCdEf0123 push", "ghp_AbCdEf0123456789AbCdEf0123"), + ("jwt eyJhbGci0123.eyJzdWIi4567.SflKxwRJ89 here", "eyJhbGci0123.eyJzdWIi4567.SflKxwRJ89"), + ] + for text, secret in cases: + out = redact_secrets(text) + self.assertNotIn(secret, out, f"secret leaked: {text!r} -> {out!r}") + self.assertIn("REDACTED", out, f"no redaction marker in {out!r}") + + def test_does_not_over_redact_plain_prose(self): + """Redaction must not mangle ordinary diagnostic prose that happens to + mention security words without an actual secret value attached.""" + from skillopt_sleep.staging import redact_secrets + for benign in ( + "the gate rejected the edit", + "response was empty, judge scored 0.0", + "held-out 1.000 -> 0.000 reject", + ): + self.assertEqual(redact_secrets(benign), benign, f"over-redacted: {benign!r}") + + def test_redacts_private_key_block(self): + from skillopt_sleep.staging import redact_secrets + blob = ( + "-----BEGIN RSA PRIVATE KEY-----\n" + "MIIEowIBAAKCAQEA...secret...\n" + "-----END RSA PRIVATE KEY-----" + ) + out = redact_secrets("leaked:\n" + blob) + self.assertNotIn("MIIEowIBAAKCAQEA", out) + self.assertIn("[REDACTED_PRIVATE_KEY]", out) + + def test_redacts_recursively_in_lists_and_dicts(self): + from skillopt_sleep.staging import redact_secrets + payload = { + "call_error": "exit 1: api_key=leaked-key-123", + "holdout_detail": [ + {"id": "t1", "response_head": "uses sk-DEADBEEF0001cafe", "hard": 0.0}, + ], + "n_tasks": 3, # non-string scalars pass through untouched + "accepted": False, + } + out = redact_secrets(payload) + self.assertNotIn("leaked-key-123", out["call_error"]) + self.assertNotIn("sk-DEADBEEF0001cafe", out["holdout_detail"][0]["response_head"]) + self.assertEqual(out["n_tasks"], 3) + self.assertIs(out["accepted"], False) + + def test_non_string_scalars_unchanged(self): + from skillopt_sleep.staging import redact_secrets + self.assertEqual(redact_secrets(42), 42) + self.assertEqual(redact_secrets(0.5), 0.5) + self.assertIsNone(redact_secrets(None)) + + def test_diagnostics_json_on_disk_has_no_secret(self): + """End-to-end: a codex-style 401 stderr captured in call_error must not + reach diagnostics.json verbatim once written to the staging dir.""" + import json + from skillopt_sleep.staging import redact_secrets + # Mirror exactly what cycle.py writes (the fields that carry free text). + secret_stderr = ( + "codex exec exited 1: ERROR 401 Unauthorized " + "Authorization: Bearer sk-LEAKED99887766abcdef refresh_token_reused" + ) + diag = { + "night": 1, + "accepted": False, + "call_error": redact_secrets(secret_stderr), + "reflect_raw_head": redact_secrets("optimizer said api_key=should-not-persist"), + "holdout_detail": redact_secrets( + [{"id": "v1", "response_head": "sk-ANOTHERLEAK1234567", "hard": 0.0}] + ), + } + with tempfile.TemporaryDirectory() as tmp: + p = os.path.join(tmp, "diagnostics.json") + with open(p, "w", encoding="utf-8") as fh: + json.dump(diag, fh, indent=2) + with open(p, encoding="utf-8") as fh: + on_disk = fh.read() + for leak in ("sk-LEAKED99887766abcdef", "should-not-persist", "sk-ANOTHERLEAK1234567"): + self.assertNotIn(leak, on_disk, f"secret {leak!r} leaked to diagnostics.json") + # The diagnostic value is still there (we scrub, not drop). + self.assertIn("401 Unauthorized", on_disk) + self.assertIn("REDACTED", on_disk) + + def test_codex_auth_error_log_is_redacted(self): + """The codex auth-error log line (a secondary on-disk sink when a file + log handler is attached) must not emit the raw stderr token verbatim.""" + import logging + from skillopt_sleep.backend import CodexCliBackend + be = CodexCliBackend.__new__(CodexCliBackend) # no __init__ side effects + be.timeout = 1 + be._AUTH_MARKERS = CodexCliBackend._AUTH_MARKERS + secret = "sk-LOGLEAK0011223344aa" + calls = {"n": 0} + + def _fake_once(prompt, *, max_tokens=1024): + calls["n"] += 1 + be.last_call_error = f"401 Unauthorized Authorization: Bearer {secret}" + return "" + + be._call_once = _fake_once + with self.assertLogs("skillopt_sleep", level="ERROR") as cm: + out = be._call("p", retries=3) + self.assertEqual(out, "") + self.assertEqual(calls["n"], 1, "auth error must fail fast, not retry") + joined = "\n".join(cm.output) + self.assertNotIn(secret, joined, "raw token leaked into the log line") + self.assertIn("REDACTED", joined) + + if __name__ == "__main__": unittest.main(verbosity=2)