fix(skillopt-sleep): redact secrets before persisting cycle diagnostics

PR #92 added a per-cycle diagnostics.json that surfaces backend stderr, optimizer replies, and task responses so a 0.0 night is self-diagnosing. Those free-text fields can carry credentials (e.g. a codex 401 stderr dump containing an auth token), so persisting them verbatim was a new on-disk leak surface. - Add a shared redact_secrets() in staging.py and route diagnostics.json's call_error / reflect_raw_head / holdout_detail through it before writing. - Redact the codex and Claude auth-error log lines too (a secondary sink when a file log handler is attached); last_call_error stays raw in memory so _AUTH_MARKERS matching is unaffected. - Centralize _SECRET_PATTERNS in staging.py (harvest_codex now reuses them) and extend coverage to AWS / GitHub / Slack / Google / JWT token shapes. - Tests: secret-shape coverage, private-key blocks, recursive/scalar passthrough, no over-redaction of plain prose, fail-fast auth-error log redaction, and an end-to-end check that diagnostics.json has no secret. Observability-only; the gate and learning algorithm are unchanged. Co-Authored-By: Claude <noreply@anthropic.com>
2026-07-03 14:02:58 +08:00 · 2026-06-30 19:47:36 +00:00
parent b9142bad24
commit 5487e2c426
5 changed files with 194 additions and 27 deletions
--- a/skillopt_sleep/backend.py
+++ b/skillopt_sleep/backend.py
@@ -582,9 +582,10 @@ class ClaudeCliBackend(CliBackend):
        combined = check_stdout + "\n" + stderr
        for marker in self._CLI_ERROR_MARKERS:
            if marker in combined:
+                from skillopt_sleep.staging import redact_secrets
                logging.getLogger("skillopt_sleep").warning(
                    "Claude CLI returned a likely auth error: %s",
-                    combined[:200].replace("\n", " "),
+                    redact_secrets(combined[:200].replace("\n", " ")),
                )
                self.last_call_error = combined[:500]
                return
@@ -843,8 +844,10 @@ class CodexCliBackend(CliBackend):
                return out
            err = self.last_call_error or ""
            if any(m in err for m in self._AUTH_MARKERS):
+                from skillopt_sleep.staging import redact_secrets
                logging.getLogger("skillopt_sleep").error(
-                    "codex auth error — re-login required (`codex login`): %s", err[:200]
+                    "codex auth error — re-login required (`codex login`): %s",
+                    redact_secrets(err[:200]),
                )
                break  # fail fast: retrying a 401 just burns calls
            if attempt < retries - 1:
--- a/skillopt_sleep/cycle.py
+++ b/skillopt_sleep/cycle.py
@@ -21,6 +21,7 @@ from skillopt_sleep.harvest_sources import harvest_for_config
 from skillopt_sleep.memory import ensure_skill_scaffold
 from skillopt_sleep.mine import mine
 from skillopt_sleep.staging import adopt as adopt_staging
+from skillopt_sleep.staging import redact_secrets
 from skillopt_sleep.staging import write_staging
 from skillopt_sleep.state import SleepState, _now_iso
 from skillopt_sleep.types import SessionDigest, SleepReport, TaskRecord
@@ -281,6 +282,9 @@ def run_sleep_cycle(
        # cycle previously captured none of this, making the gate a black box (#learning-stall).
        try:
            import json as _json
+            # Backend stderr / optimizer replies / task responses can carry
+            # credentials (e.g. a codex 401 stderr dump), so scrub secret-looking
+            # substrings before persisting them to the on-disk diagnostics.
            with open(os.path.join(staging_dir, "diagnostics.json"), "w", encoding="utf-8") as _fh:
                _json.dump({
                    "night": night,
@@ -292,9 +296,11 @@ def run_sleep_cycle(
                    "accepted": result.accepted,
                    "n_applied_edits": len(result.applied_edits),
                    "n_rejected_edits": len(result.rejected_edits),
-                    "call_error": getattr(result, "call_error", ""),
-                    "reflect_raw_head": (getattr(result, "reflect_raw", "") or "")[:1200],
-                    "holdout_detail": getattr(result, "holdout_detail", []),
+                    "call_error": redact_secrets(getattr(result, "call_error", "")),
+                    "reflect_raw_head": redact_secrets(
+                        (getattr(result, "reflect_raw", "") or "")[:1200]
+                    ),
+                    "holdout_detail": redact_secrets(getattr(result, "holdout_detail", [])),
                }, _fh, indent=2)
        except Exception:
            pass
--- a/skillopt_sleep/harvest_codex.py
+++ b/skillopt_sleep/harvest_codex.py
@@ -16,29 +16,9 @@ from skillopt_sleep.harvest import (
    _iter_jsonl,
    _project_matches,
 )
+from skillopt_sleep.staging import _SECRET_PATTERNS
 from skillopt_sleep.types import SessionDigest

-_SECRET_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
-    (re.compile(r"sk-[A-Za-z0-9_-]{10,}"), "[REDACTED_OPENAI_KEY]"),
-    (re.compile(r"(?i)(Authorization:\s*Bearer\s+)[^\s\"']+"), r"\1[REDACTED]"),
-    (re.compile(r"(?i)(Authorization:\s*Basic\s+)[^\s\"']+"), r"\1[REDACTED]"),
-    (
-        re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s*[:=]\s*)[^\s\"']+"),
-        r"\1\2[REDACTED]",
-    ),
-    (
-        re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s+)[^\s\"']+"),
-        r"\1\2[REDACTED]",
-    ),
-    (
-        re.compile(
-            r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----",
-            re.DOTALL,
-        ),
-        "[REDACTED_PRIVATE_KEY]",
-    ),
-)
-

 def _payload(rec: Dict[str, Any]) -> Dict[str, Any]:
    payload = rec.get("payload")
--- a/skillopt_sleep/staging.py
+++ b/skillopt_sleep/staging.py
@@ -9,12 +9,68 @@ from __future__ import annotations

 import json
 import os
+import re
 import shutil
 import time
-from typing import List, Optional
+from typing import Any, List, Optional

 from skillopt_sleep.types import SleepReport

+# Secret patterns scrubbed from any free-text we persist to the staging dir
+# (diagnostics, reports). Kept here so every on-disk artifact shares one
+# redaction pass; harvest_codex reuses these for session text too.
+_SECRET_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
+    (re.compile(r"sk-[A-Za-z0-9_-]{10,}"), "[REDACTED_OPENAI_KEY]"),
+    # Distinctive vendor token prefixes (low false-positive: these prefixes do
+    # not occur in normal diagnostic prose).
+    (re.compile(r"\bAKIA[0-9A-Z]{16}\b"), "[REDACTED_AWS_KEY]"),
+    (re.compile(r"\bgh[pousr]_[A-Za-z0-9]{20,}\b"), "[REDACTED_GITHUB_TOKEN]"),
+    (re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b"), "[REDACTED_SLACK_TOKEN]"),
+    (re.compile(r"\bAIza[0-9A-Za-z_-]{20,}\b"), "[REDACTED_GOOGLE_KEY]"),
+    # Bare JWT (three base64url segments) — e.g. a leaked bearer body without
+    # the "Authorization:" prefix.
+    (re.compile(r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b"),
+     "[REDACTED_JWT]"),
+    (re.compile(r"(?i)(Authorization:\s*Bearer\s+)[^\s\"']+"), r"\1[REDACTED]"),
+    (re.compile(r"(?i)(Authorization:\s*Basic\s+)[^\s\"']+"), r"\1[REDACTED]"),
+    (
+        re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s*[:=]\s*)[^\s\"']+"),
+        r"\1\2[REDACTED]",
+    ),
+    (
+        re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s+)[^\s\"']+"),
+        r"\1\2[REDACTED]",
+    ),
+    (
+        re.compile(
+            r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----",
+            re.DOTALL,
+        ),
+        "[REDACTED_PRIVATE_KEY]",
+    ),
+)
+
+
+def redact_secrets(value: Any) -> Any:
+    """Scrub secret-looking substrings (API keys, bearer tokens, private keys)
+    from a string, or recursively from the string leaves of a list/dict.
+
+    Used before writing backend stderr / optimizer replies / task responses to
+    on-disk diagnostics: those are surfaced for debugging, but the underlying
+    text (e.g. a codex 401 stderr dump) can carry credentials. Non-string
+    scalars pass through unchanged.
+    """
+    if isinstance(value, str):
+        out = value
+        for pattern, replacement in _SECRET_PATTERNS:
+            out = pattern.sub(replacement, out)
+        return out
+    if isinstance(value, list):
+        return [redact_secrets(v) for v in value]
+    if isinstance(value, dict):
+        return {k: redact_secrets(v) for k, v in value.items()}
+    return value
+

 def _ts_dir() -> str:
    return time.strftime("%Y%m%d-%H%M%S", time.localtime())
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -1160,5 +1160,127 @@ class TestVerifierDiscipline(unittest.TestCase):
        self.assertGreater(len(res.rejected_edits), 0)
        self.assertIn("placeholder", res.rejected_edits[0].content)

+class TestDiagnosticsRedaction(unittest.TestCase):
+    """diagnostics.json surfaces backend stderr / optimizer replies / task
+    responses for debugging — but those can carry credentials (e.g. a codex 401
+    stderr dump). redact_secrets() must scrub them before anything is persisted."""
+
+    def test_redacts_common_secret_shapes(self):
+        from skillopt_sleep.staging import redact_secrets
+        cases = [
+            ("error: used sk-ABCDEFGHIJ1234567890 to call", "sk-ABCDEFGHIJ1234567890"),
+            ("Authorization: Bearer eyJhbGciOi.JIUzI1Ni.qwerty", "eyJhbGciOi.JIUzI1Ni.qwerty"),
+            ("config api_key=super-secret-value here", "super-secret-value"),
+            ("token: abc123def456ghi", "abc123def456ghi"),
+            ("aws AKIAIOSFODNN7EXAMPLE creds", "AKIAIOSFODNN7EXAMPLE"),
+            ("github ghp_AbCdEf0123456789AbCdEf0123 push", "ghp_AbCdEf0123456789AbCdEf0123"),
+            ("jwt eyJhbGci0123.eyJzdWIi4567.SflKxwRJ89 here", "eyJhbGci0123.eyJzdWIi4567.SflKxwRJ89"),
+        ]
+        for text, secret in cases:
+            out = redact_secrets(text)
+            self.assertNotIn(secret, out, f"secret leaked: {text!r} -> {out!r}")
+            self.assertIn("REDACTED", out, f"no redaction marker in {out!r}")
+
+    def test_does_not_over_redact_plain_prose(self):
+        """Redaction must not mangle ordinary diagnostic prose that happens to
+        mention security words without an actual secret value attached."""
+        from skillopt_sleep.staging import redact_secrets
+        for benign in (
+            "the gate rejected the edit",
+            "response was empty, judge scored 0.0",
+            "held-out 1.000 -> 0.000 reject",
+        ):
+            self.assertEqual(redact_secrets(benign), benign, f"over-redacted: {benign!r}")
+
+    def test_redacts_private_key_block(self):
+        from skillopt_sleep.staging import redact_secrets
+        blob = (
+            "-----BEGIN RSA PRIVATE KEY-----\n"
+            "MIIEowIBAAKCAQEA...secret...\n"
+            "-----END RSA PRIVATE KEY-----"
+        )
+        out = redact_secrets("leaked:\n" + blob)
+        self.assertNotIn("MIIEowIBAAKCAQEA", out)
+        self.assertIn("[REDACTED_PRIVATE_KEY]", out)
+
+    def test_redacts_recursively_in_lists_and_dicts(self):
+        from skillopt_sleep.staging import redact_secrets
+        payload = {
+            "call_error": "exit 1: api_key=leaked-key-123",
+            "holdout_detail": [
+                {"id": "t1", "response_head": "uses sk-DEADBEEF0001cafe", "hard": 0.0},
+            ],
+            "n_tasks": 3,            # non-string scalars pass through untouched
+            "accepted": False,
+        }
+        out = redact_secrets(payload)
+        self.assertNotIn("leaked-key-123", out["call_error"])
+        self.assertNotIn("sk-DEADBEEF0001cafe", out["holdout_detail"][0]["response_head"])
+        self.assertEqual(out["n_tasks"], 3)
+        self.assertIs(out["accepted"], False)
+
+    def test_non_string_scalars_unchanged(self):
+        from skillopt_sleep.staging import redact_secrets
+        self.assertEqual(redact_secrets(42), 42)
+        self.assertEqual(redact_secrets(0.5), 0.5)
+        self.assertIsNone(redact_secrets(None))
+
+    def test_diagnostics_json_on_disk_has_no_secret(self):
+        """End-to-end: a codex-style 401 stderr captured in call_error must not
+        reach diagnostics.json verbatim once written to the staging dir."""
+        import json
+        from skillopt_sleep.staging import redact_secrets
+        # Mirror exactly what cycle.py writes (the fields that carry free text).
+        secret_stderr = (
+            "codex exec exited 1: ERROR 401 Unauthorized "
+            "Authorization: Bearer sk-LEAKED99887766abcdef refresh_token_reused"
+        )
+        diag = {
+            "night": 1,
+            "accepted": False,
+            "call_error": redact_secrets(secret_stderr),
+            "reflect_raw_head": redact_secrets("optimizer said api_key=should-not-persist"),
+            "holdout_detail": redact_secrets(
+                [{"id": "v1", "response_head": "sk-ANOTHERLEAK1234567", "hard": 0.0}]
+            ),
+        }
+        with tempfile.TemporaryDirectory() as tmp:
+            p = os.path.join(tmp, "diagnostics.json")
+            with open(p, "w", encoding="utf-8") as fh:
+                json.dump(diag, fh, indent=2)
+            with open(p, encoding="utf-8") as fh:
+                on_disk = fh.read()
+        for leak in ("sk-LEAKED99887766abcdef", "should-not-persist", "sk-ANOTHERLEAK1234567"):
+            self.assertNotIn(leak, on_disk, f"secret {leak!r} leaked to diagnostics.json")
+        # The diagnostic value is still there (we scrub, not drop).
+        self.assertIn("401 Unauthorized", on_disk)
+        self.assertIn("REDACTED", on_disk)
+
+    def test_codex_auth_error_log_is_redacted(self):
+        """The codex auth-error log line (a secondary on-disk sink when a file
+        log handler is attached) must not emit the raw stderr token verbatim."""
+        import logging
+        from skillopt_sleep.backend import CodexCliBackend
+        be = CodexCliBackend.__new__(CodexCliBackend)  # no __init__ side effects
+        be.timeout = 1
+        be._AUTH_MARKERS = CodexCliBackend._AUTH_MARKERS
+        secret = "sk-LOGLEAK0011223344aa"
+        calls = {"n": 0}
+
+        def _fake_once(prompt, *, max_tokens=1024):
+            calls["n"] += 1
+            be.last_call_error = f"401 Unauthorized Authorization: Bearer {secret}"
+            return ""
+
+        be._call_once = _fake_once
+        with self.assertLogs("skillopt_sleep", level="ERROR") as cm:
+            out = be._call("p", retries=3)
+        self.assertEqual(out, "")
+        self.assertEqual(calls["n"], 1, "auth error must fail fast, not retry")
+        joined = "\n".join(cm.output)
+        self.assertNotIn(secret, joined, "raw token leaked into the log line")
+        self.assertIn("REDACTED", joined)
+
+
 if __name__ == "__main__":
    unittest.main(verbosity=2)