fix(skillopt-sleep): redact secrets before persisting cycle diagnostics

PR #92 added a per-cycle diagnostics.json that surfaces backend stderr,
optimizer replies, and task responses so a 0.0 night is self-diagnosing.
Those free-text fields can carry credentials (e.g. a codex 401 stderr dump
containing an auth token), so persisting them verbatim was a new on-disk
leak surface.

- Add a shared redact_secrets() in staging.py and route diagnostics.json's
  call_error / reflect_raw_head / holdout_detail through it before writing.
- Redact the codex and Claude auth-error log lines too (a secondary sink
  when a file log handler is attached); last_call_error stays raw in memory
  so _AUTH_MARKERS matching is unaffected.
- Centralize _SECRET_PATTERNS in staging.py (harvest_codex now reuses them)
  and extend coverage to AWS / GitHub / Slack / Google / JWT token shapes.
- Tests: secret-shape coverage, private-key blocks, recursive/scalar
  passthrough, no over-redaction of plain prose, fail-fast auth-error log
  redaction, and an end-to-end check that diagnostics.json has no secret.

Observability-only; the gate and learning algorithm are unchanged.

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Yif Yang
2026-06-30 19:47:36 +00:00
parent b9142bad24
commit 5487e2c426
5 changed files with 194 additions and 27 deletions

View File

@@ -582,9 +582,10 @@ class ClaudeCliBackend(CliBackend):
combined = check_stdout + "\n" + stderr
for marker in self._CLI_ERROR_MARKERS:
if marker in combined:
from skillopt_sleep.staging import redact_secrets
logging.getLogger("skillopt_sleep").warning(
"Claude CLI returned a likely auth error: %s",
combined[:200].replace("\n", " "),
redact_secrets(combined[:200].replace("\n", " ")),
)
self.last_call_error = combined[:500]
return
@@ -843,8 +844,10 @@ class CodexCliBackend(CliBackend):
return out
err = self.last_call_error or ""
if any(m in err for m in self._AUTH_MARKERS):
from skillopt_sleep.staging import redact_secrets
logging.getLogger("skillopt_sleep").error(
"codex auth error — re-login required (`codex login`): %s", err[:200]
"codex auth error — re-login required (`codex login`): %s",
redact_secrets(err[:200]),
)
break # fail fast: retrying a 401 just burns calls
if attempt < retries - 1:

View File

@@ -21,6 +21,7 @@ from skillopt_sleep.harvest_sources import harvest_for_config
from skillopt_sleep.memory import ensure_skill_scaffold
from skillopt_sleep.mine import mine
from skillopt_sleep.staging import adopt as adopt_staging
from skillopt_sleep.staging import redact_secrets
from skillopt_sleep.staging import write_staging
from skillopt_sleep.state import SleepState, _now_iso
from skillopt_sleep.types import SessionDigest, SleepReport, TaskRecord
@@ -281,6 +282,9 @@ def run_sleep_cycle(
# cycle previously captured none of this, making the gate a black box (#learning-stall).
try:
import json as _json
# Backend stderr / optimizer replies / task responses can carry
# credentials (e.g. a codex 401 stderr dump), so scrub secret-looking
# substrings before persisting them to the on-disk diagnostics.
with open(os.path.join(staging_dir, "diagnostics.json"), "w", encoding="utf-8") as _fh:
_json.dump({
"night": night,
@@ -292,9 +296,11 @@ def run_sleep_cycle(
"accepted": result.accepted,
"n_applied_edits": len(result.applied_edits),
"n_rejected_edits": len(result.rejected_edits),
"call_error": getattr(result, "call_error", ""),
"reflect_raw_head": (getattr(result, "reflect_raw", "") or "")[:1200],
"holdout_detail": getattr(result, "holdout_detail", []),
"call_error": redact_secrets(getattr(result, "call_error", "")),
"reflect_raw_head": redact_secrets(
(getattr(result, "reflect_raw", "") or "")[:1200]
),
"holdout_detail": redact_secrets(getattr(result, "holdout_detail", [])),
}, _fh, indent=2)
except Exception:
pass

View File

@@ -16,29 +16,9 @@ from skillopt_sleep.harvest import (
_iter_jsonl,
_project_matches,
)
from skillopt_sleep.staging import _SECRET_PATTERNS
from skillopt_sleep.types import SessionDigest
_SECRET_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
(re.compile(r"sk-[A-Za-z0-9_-]{10,}"), "[REDACTED_OPENAI_KEY]"),
(re.compile(r"(?i)(Authorization:\s*Bearer\s+)[^\s\"']+"), r"\1[REDACTED]"),
(re.compile(r"(?i)(Authorization:\s*Basic\s+)[^\s\"']+"), r"\1[REDACTED]"),
(
re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s*[:=]\s*)[^\s\"']+"),
r"\1\2[REDACTED]",
),
(
re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s+)[^\s\"']+"),
r"\1\2[REDACTED]",
),
(
re.compile(
r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----",
re.DOTALL,
),
"[REDACTED_PRIVATE_KEY]",
),
)
def _payload(rec: Dict[str, Any]) -> Dict[str, Any]:
payload = rec.get("payload")

View File

@@ -9,12 +9,68 @@ from __future__ import annotations
import json
import os
import re
import shutil
import time
from typing import List, Optional
from typing import Any, List, Optional
from skillopt_sleep.types import SleepReport
# Secret patterns scrubbed from any free-text we persist to the staging dir
# (diagnostics, reports). Kept here so every on-disk artifact shares one
# redaction pass; harvest_codex reuses these for session text too.
_SECRET_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
(re.compile(r"sk-[A-Za-z0-9_-]{10,}"), "[REDACTED_OPENAI_KEY]"),
# Distinctive vendor token prefixes (low false-positive: these prefixes do
# not occur in normal diagnostic prose).
(re.compile(r"\bAKIA[0-9A-Z]{16}\b"), "[REDACTED_AWS_KEY]"),
(re.compile(r"\bgh[pousr]_[A-Za-z0-9]{20,}\b"), "[REDACTED_GITHUB_TOKEN]"),
(re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b"), "[REDACTED_SLACK_TOKEN]"),
(re.compile(r"\bAIza[0-9A-Za-z_-]{20,}\b"), "[REDACTED_GOOGLE_KEY]"),
# Bare JWT (three base64url segments) — e.g. a leaked bearer body without
# the "Authorization:" prefix.
(re.compile(r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b"),
"[REDACTED_JWT]"),
(re.compile(r"(?i)(Authorization:\s*Bearer\s+)[^\s\"']+"), r"\1[REDACTED]"),
(re.compile(r"(?i)(Authorization:\s*Basic\s+)[^\s\"']+"), r"\1[REDACTED]"),
(
re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s*[:=]\s*)[^\s\"']+"),
r"\1\2[REDACTED]",
),
(
re.compile(r"(?i)\b(api[_-]?key|token|password|secret)\b(\s+)[^\s\"']+"),
r"\1\2[REDACTED]",
),
(
re.compile(
r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----",
re.DOTALL,
),
"[REDACTED_PRIVATE_KEY]",
),
)
def redact_secrets(value: Any) -> Any:
"""Scrub secret-looking substrings (API keys, bearer tokens, private keys)
from a string, or recursively from the string leaves of a list/dict.
Used before writing backend stderr / optimizer replies / task responses to
on-disk diagnostics: those are surfaced for debugging, but the underlying
text (e.g. a codex 401 stderr dump) can carry credentials. Non-string
scalars pass through unchanged.
"""
if isinstance(value, str):
out = value
for pattern, replacement in _SECRET_PATTERNS:
out = pattern.sub(replacement, out)
return out
if isinstance(value, list):
return [redact_secrets(v) for v in value]
if isinstance(value, dict):
return {k: redact_secrets(v) for k, v in value.items()}
return value
def _ts_dir() -> str:
return time.strftime("%Y%m%d-%H%M%S", time.localtime())

View File

@@ -1160,5 +1160,127 @@ class TestVerifierDiscipline(unittest.TestCase):
self.assertGreater(len(res.rejected_edits), 0)
self.assertIn("placeholder", res.rejected_edits[0].content)
class TestDiagnosticsRedaction(unittest.TestCase):
"""diagnostics.json surfaces backend stderr / optimizer replies / task
responses for debugging — but those can carry credentials (e.g. a codex 401
stderr dump). redact_secrets() must scrub them before anything is persisted."""
def test_redacts_common_secret_shapes(self):
from skillopt_sleep.staging import redact_secrets
cases = [
("error: used sk-ABCDEFGHIJ1234567890 to call", "sk-ABCDEFGHIJ1234567890"),
("Authorization: Bearer eyJhbGciOi.JIUzI1Ni.qwerty", "eyJhbGciOi.JIUzI1Ni.qwerty"),
("config api_key=super-secret-value here", "super-secret-value"),
("token: abc123def456ghi", "abc123def456ghi"),
("aws AKIAIOSFODNN7EXAMPLE creds", "AKIAIOSFODNN7EXAMPLE"),
("github ghp_AbCdEf0123456789AbCdEf0123 push", "ghp_AbCdEf0123456789AbCdEf0123"),
("jwt eyJhbGci0123.eyJzdWIi4567.SflKxwRJ89 here", "eyJhbGci0123.eyJzdWIi4567.SflKxwRJ89"),
]
for text, secret in cases:
out = redact_secrets(text)
self.assertNotIn(secret, out, f"secret leaked: {text!r} -> {out!r}")
self.assertIn("REDACTED", out, f"no redaction marker in {out!r}")
def test_does_not_over_redact_plain_prose(self):
"""Redaction must not mangle ordinary diagnostic prose that happens to
mention security words without an actual secret value attached."""
from skillopt_sleep.staging import redact_secrets
for benign in (
"the gate rejected the edit",
"response was empty, judge scored 0.0",
"held-out 1.000 -> 0.000 reject",
):
self.assertEqual(redact_secrets(benign), benign, f"over-redacted: {benign!r}")
def test_redacts_private_key_block(self):
from skillopt_sleep.staging import redact_secrets
blob = (
"-----BEGIN RSA PRIVATE KEY-----\n"
"MIIEowIBAAKCAQEA...secret...\n"
"-----END RSA PRIVATE KEY-----"
)
out = redact_secrets("leaked:\n" + blob)
self.assertNotIn("MIIEowIBAAKCAQEA", out)
self.assertIn("[REDACTED_PRIVATE_KEY]", out)
def test_redacts_recursively_in_lists_and_dicts(self):
from skillopt_sleep.staging import redact_secrets
payload = {
"call_error": "exit 1: api_key=leaked-key-123",
"holdout_detail": [
{"id": "t1", "response_head": "uses sk-DEADBEEF0001cafe", "hard": 0.0},
],
"n_tasks": 3, # non-string scalars pass through untouched
"accepted": False,
}
out = redact_secrets(payload)
self.assertNotIn("leaked-key-123", out["call_error"])
self.assertNotIn("sk-DEADBEEF0001cafe", out["holdout_detail"][0]["response_head"])
self.assertEqual(out["n_tasks"], 3)
self.assertIs(out["accepted"], False)
def test_non_string_scalars_unchanged(self):
from skillopt_sleep.staging import redact_secrets
self.assertEqual(redact_secrets(42), 42)
self.assertEqual(redact_secrets(0.5), 0.5)
self.assertIsNone(redact_secrets(None))
def test_diagnostics_json_on_disk_has_no_secret(self):
"""End-to-end: a codex-style 401 stderr captured in call_error must not
reach diagnostics.json verbatim once written to the staging dir."""
import json
from skillopt_sleep.staging import redact_secrets
# Mirror exactly what cycle.py writes (the fields that carry free text).
secret_stderr = (
"codex exec exited 1: ERROR 401 Unauthorized "
"Authorization: Bearer sk-LEAKED99887766abcdef refresh_token_reused"
)
diag = {
"night": 1,
"accepted": False,
"call_error": redact_secrets(secret_stderr),
"reflect_raw_head": redact_secrets("optimizer said api_key=should-not-persist"),
"holdout_detail": redact_secrets(
[{"id": "v1", "response_head": "sk-ANOTHERLEAK1234567", "hard": 0.0}]
),
}
with tempfile.TemporaryDirectory() as tmp:
p = os.path.join(tmp, "diagnostics.json")
with open(p, "w", encoding="utf-8") as fh:
json.dump(diag, fh, indent=2)
with open(p, encoding="utf-8") as fh:
on_disk = fh.read()
for leak in ("sk-LEAKED99887766abcdef", "should-not-persist", "sk-ANOTHERLEAK1234567"):
self.assertNotIn(leak, on_disk, f"secret {leak!r} leaked to diagnostics.json")
# The diagnostic value is still there (we scrub, not drop).
self.assertIn("401 Unauthorized", on_disk)
self.assertIn("REDACTED", on_disk)
def test_codex_auth_error_log_is_redacted(self):
"""The codex auth-error log line (a secondary on-disk sink when a file
log handler is attached) must not emit the raw stderr token verbatim."""
import logging
from skillopt_sleep.backend import CodexCliBackend
be = CodexCliBackend.__new__(CodexCliBackend) # no __init__ side effects
be.timeout = 1
be._AUTH_MARKERS = CodexCliBackend._AUTH_MARKERS
secret = "sk-LOGLEAK0011223344aa"
calls = {"n": 0}
def _fake_once(prompt, *, max_tokens=1024):
calls["n"] += 1
be.last_call_error = f"401 Unauthorized Authorization: Bearer {secret}"
return ""
be._call_once = _fake_once
with self.assertLogs("skillopt_sleep", level="ERROR") as cm:
out = be._call("p", retries=3)
self.assertEqual(out, "")
self.assertEqual(calls["n"], 1, "auth error must fail fast, not retry")
joined = "\n".join(cm.output)
self.assertNotIn(secret, joined, "raw token leaked into the log line")
self.assertIn("REDACTED", joined)
if __name__ == "__main__":
unittest.main(verbosity=2)