diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fbc81c48..39fabf86 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,7 @@ on: branches: [main] pull_request: branches: [main] + types: [opened, synchronize, reopened, edited] workflow_dispatch: permissions: @@ -70,6 +71,7 @@ jobs: - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 with: fetch-depth: 0 + persist-credentials: false - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 with: go-version-file: go.mod @@ -87,6 +89,23 @@ jobs: - name: Run errs/ lint guards (lintcheck) run: go run -C lint . --changed-from "$QUALITY_GATE_CHANGED_FROM" .. + script-test: + needs: fast-gate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + persist-credentials: false + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 + with: + go-version-file: go.mod + - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4 + with: + node-version: '22' + - name: Run script tests + run: make script-test + deterministic-gate: needs: fast-gate runs-on: ubuntu-latest @@ -109,8 +128,28 @@ jobs: env: QUALITY_GATE_CHANGED_FROM: ${{ github.event.pull_request.base.sha || github.event.before || 'origin/main' }} run: echo "QUALITY_GATE_CHANGED_FROM=$(bash scripts/resolve-changed-from.sh)" >> "$GITHUB_ENV" + - name: Write public content metadata + if: ${{ github.event_name == 'pull_request' }} + env: + PR_TITLE: ${{ github.event.pull_request.title }} + PR_BODY: ${{ github.event.pull_request.body }} + PR_BRANCH: ${{ github.head_ref }} + run: | + mkdir -p .tmp/quality-gate + python3 - <<'PY' + import json + import os + + with open(".tmp/quality-gate/public-content-metadata.json", "w", encoding="utf-8") as f: + json.dump({ + "title": os.environ.get("PR_TITLE", ""), + "body": os.environ.get("PR_BODY", ""), + "branch": os.environ.get("PR_BRANCH", ""), + }, f) + f.write("\n") + PY - name: Run CLI deterministic gate - run: make quality-gate + run: PUBLIC_CONTENT_METADATA=.tmp/quality-gate/public-content-metadata.json make quality-gate - name: Upload quality gate facts if: ${{ always() && github.event_name == 'pull_request' }} uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 @@ -220,7 +259,7 @@ jobs: # ── Layer 3: E2E Gate ────────────────────────────────────────────── e2e-dry-run: - needs: [unit-test, lint, deterministic-gate] + needs: [unit-test, lint, script-test, deterministic-gate] runs-on: ubuntu-latest steps: - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 @@ -241,7 +280,7 @@ jobs: run: go test -v -count=1 -timeout=5m ./tests/cli_e2e/... -run 'DryRun|Regression' e2e-live: - needs: [unit-test, lint, deterministic-gate] + needs: [unit-test, lint, script-test, deterministic-gate] if: ${{ github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork }} runs-on: ubuntu-latest permissions: @@ -333,7 +372,7 @@ jobs: # ── Results Gate (single required check for branch protection) ───── results: if: ${{ always() }} - needs: [fast-gate, unit-test, lint, deterministic-gate, coverage, deadcode, e2e-dry-run, e2e-live, security, license-header] + needs: [fast-gate, unit-test, lint, script-test, deterministic-gate, coverage, deadcode, e2e-dry-run, e2e-live, security, license-header] runs-on: ubuntu-latest steps: - name: Evaluate results @@ -345,6 +384,7 @@ jobs: echo "| L1 | fast-gate | ${{ needs.fast-gate.result }} |" >> $GITHUB_STEP_SUMMARY echo "| L2 | unit-test | ${{ needs.unit-test.result }} |" >> $GITHUB_STEP_SUMMARY echo "| L2 | lint | ${{ needs.lint.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| L2 | script-test | ${{ needs.script-test.result }} |" >> $GITHUB_STEP_SUMMARY echo "| L2 | deterministic-gate | ${{ needs.deterministic-gate.result }} |" >> $GITHUB_STEP_SUMMARY echo "| L2 | coverage | ${{ needs.coverage.result }} |" >> $GITHUB_STEP_SUMMARY echo "| L2 | deadcode | ${{ needs.deadcode.result }} |" >> $GITHUB_STEP_SUMMARY @@ -361,6 +401,7 @@ jobs: "${{ needs.fast-gate.result }}" \ "${{ needs.unit-test.result }}" \ "${{ needs.lint.result }}" \ + "${{ needs.script-test.result }}" \ "${{ needs.deterministic-gate.result }}" \ "${{ needs.coverage.result }}" \ "${{ needs.deadcode.result }}" \ diff --git a/.github/workflows/comment-audit.yml b/.github/workflows/comment-audit.yml new file mode 100644 index 00000000..0508fa52 --- /dev/null +++ b/.github/workflows/comment-audit.yml @@ -0,0 +1,28 @@ +name: Comment Audit + +on: + issue_comment: + types: [created, edited] + pull_request_review: + types: [submitted, edited] + pull_request_review_comment: + types: [created, edited] + +permissions: + contents: read + +jobs: + public-content-comment-audit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + persist-credentials: false + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 + with: + go-version-file: go.mod + - name: Post-publication comment audit + run: | + mkdir -p .tmp/comment-audit + cp "$GITHUB_EVENT_PATH" .tmp/comment-audit/event.json + go run ./internal/qualitygate/cmd/comment-audit --event .tmp/comment-audit/event.json --kind "$GITHUB_EVENT_NAME" diff --git a/.github/workflows/semantic-review.yml b/.github/workflows/semantic-review.yml index 7d39a4b4..2fcf298c 100644 --- a/.github/workflows/semantic-review.yml +++ b/.github/workflows/semantic-review.yml @@ -88,31 +88,44 @@ jobs: commit_sha: targetHeadSha, }); const candidatePRs = associatedPRs.filter((candidate) => - candidate.state === "open" && candidate.base?.repo?.id === context.payload.repository.id && candidate.head?.sha === targetHeadSha ); - if (candidatePRs.length > 1) { - throw new Error(`ambiguous open PRs for workflow_run head ${targetHeadSha}: ${candidatePRs.length}`); + const openCandidatePRs = candidatePRs.filter((candidate) => candidate.state === "open"); + if (openCandidatePRs.length > 1) { + throw new Error(`ambiguous open PRs for workflow_run head ${targetHeadSha}: ${openCandidatePRs.length}`); } - if (candidatePRs.length === 1) { - prNumber = candidatePRs[0].number; + if (openCandidatePRs.length === 1) { + prNumber = openCandidatePRs[0].number; + } else if (candidatePRs.length > 0) { + core.notice("PR quality summary skipped: workflow_run target PR is no longer open"); + core.setOutput("stale", "true"); + return; } } if (!prNumber) { const candidatePRs = await github.paginate(github.rest.pulls.list, { owner: context.repo.owner, repo: context.repo.repo, - state: "open", + state: "all", per_page: 100, }).then((prs) => prs.filter((candidate) => candidate.base?.repo?.id === context.payload.repository.id && candidate.head?.sha === targetHeadSha )); - if (candidatePRs.length !== 1) { + const openCandidatePRs = candidatePRs.filter((candidate) => candidate.state === "open"); + if (openCandidatePRs.length > 1) { + throw new Error(`ambiguous open PRs from pull list fallback for workflow_run head ${targetHeadSha}: ${openCandidatePRs.length}`); + } + if (openCandidatePRs.length === 1) { + prNumber = openCandidatePRs[0].number; + } else if (candidatePRs.length > 0) { + core.notice("PR quality summary skipped: workflow_run target PR is no longer open"); + core.setOutput("stale", "true"); + return; + } else { throw new Error(`expected one open PR from pull list fallback for workflow_run head ${targetHeadSha}, got ${candidatePRs.length}`); } - prNumber = candidatePRs[0].number; } if (!Number.isInteger(prNumber) || prNumber <= 0) throw new Error("missing pull request binding"); const { data: pr } = await github.rest.pulls.get({ @@ -121,6 +134,11 @@ jobs: pull_number: prNumber, }); if (pr.base.repo.id !== context.payload.repository.id) throw new Error("PR base repo mismatch"); + if (pr.state !== "open") { + core.notice("PR quality summary skipped: workflow_run target PR is no longer open"); + core.setOutput("stale", "true"); + return; + } if (pr.head.sha !== targetHeadSha) { core.notice("PR quality summary skipped: workflow_run is stale for this PR head"); core.setOutput("stale", "true"); @@ -299,31 +317,44 @@ jobs: commit_sha: targetHeadSha, }); const candidatePRs = associatedPRs.filter((candidate) => - candidate.state === "open" && candidate.base?.repo?.id === context.payload.repository.id && candidate.head?.sha === targetHeadSha ); - if (candidatePRs.length > 1) { - throw new Error(`ambiguous open PRs for workflow_run head ${targetHeadSha}: ${candidatePRs.length}`); + const openCandidatePRs = candidatePRs.filter((candidate) => candidate.state === "open"); + if (openCandidatePRs.length > 1) { + throw new Error(`ambiguous open PRs for workflow_run head ${targetHeadSha}: ${openCandidatePRs.length}`); } - if (candidatePRs.length === 1) { - prNumber = candidatePRs[0].number; + if (openCandidatePRs.length === 1) { + prNumber = openCandidatePRs[0].number; + } else if (candidatePRs.length > 0) { + core.notice("semantic review skipped: workflow_run target PR is no longer open"); + core.setOutput("stale", "true"); + return; } } if (!prNumber) { const candidatePRs = await github.paginate(github.rest.pulls.list, { owner: context.repo.owner, repo: context.repo.repo, - state: "open", + state: "all", per_page: 100, }).then((prs) => prs.filter((candidate) => candidate.base?.repo?.id === context.payload.repository.id && candidate.head?.sha === targetHeadSha )); - if (candidatePRs.length !== 1) { + const openCandidatePRs = candidatePRs.filter((candidate) => candidate.state === "open"); + if (openCandidatePRs.length > 1) { + throw new Error(`ambiguous open PRs from pull list fallback for workflow_run head ${targetHeadSha}: ${openCandidatePRs.length}`); + } + if (openCandidatePRs.length === 1) { + prNumber = openCandidatePRs[0].number; + } else if (candidatePRs.length > 0) { + core.notice("semantic review skipped: workflow_run target PR is no longer open"); + core.setOutput("stale", "true"); + return; + } else { throw new Error(`expected one open PR from pull list fallback for workflow_run head ${targetHeadSha}, got ${candidatePRs.length}`); } - prNumber = candidatePRs[0].number; } if (!Number.isInteger(prNumber) || prNumber <= 0) throw new Error("missing pull request binding"); const { data: pr } = await github.rest.pulls.get({ @@ -332,6 +363,16 @@ jobs: pull_number: prNumber, }); if (pr.base.repo.id !== context.payload.repository.id) throw new Error("PR base repo mismatch"); + if (pr.state !== "open") { + core.notice("semantic review skipped: workflow_run target PR is no longer open"); + core.setOutput("stale", "true"); + return; + } + if (!pr.head.repo) { + core.notice("semantic review skipped: workflow_run target PR head repository is unavailable"); + core.setOutput("stale", "true"); + return; + } if (pr.head.sha !== targetHeadSha) { core.notice("semantic review skipped: workflow_run is stale for this PR head"); core.setOutput("stale", "true"); @@ -389,6 +430,10 @@ jobs: repo: context.repo.repo, pull_number: pr, }); + if (pull.state !== "open") { + core.notice("semantic review skipped infrastructure failure check: PR is no longer open"); + return; + } if (pull.head.sha !== headSha) { core.notice("semantic review skipped infrastructure failure check: PR head changed"); return; diff --git a/Makefile b/Makefile index 3d8c9861..d401cd17 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,7 @@ QUALITY_GATE_DIR ?= .tmp/quality-gate QUALITY_GATE_MANIFEST_OUT ?= $(QUALITY_GATE_DIR)/command-manifest.json QUALITY_GATE_COMMAND_INDEX_OUT ?= $(QUALITY_GATE_DIR)/command-index.json QUALITY_GATE_FACTS_OUT ?= $(QUALITY_GATE_DIR)/facts.json +PUBLIC_CONTENT_METADATA ?= $(QUALITY_GATE_DIR)/public-content-metadata.json LDFLAGS := -s -w -X $(MODULE)/internal/build.Version=$(VERSION) -X $(MODULE)/internal/build.Date=$(DATE) PREFIX ?= /usr/local @@ -69,7 +70,8 @@ integration-test: build test: vet fmt-check script-test unit-test examples-build integration-test quality-gate: build - mkdir -p $(QUALITY_GATE_DIR) $(dir $(QUALITY_GATE_FACTS_OUT)) + mkdir -p $(QUALITY_GATE_DIR) $(dir $(QUALITY_GATE_FACTS_OUT)) $(dir $(PUBLIC_CONTENT_METADATA)) + test -f $(PUBLIC_CONTENT_METADATA) || printf '{}\n' > $(PUBLIC_CONTENT_METADATA) LARKSUITE_CLI_REMOTE_META=off \ LARKSUITE_CLI_NO_UPDATE_NOTIFIER=1 \ LARKSUITE_CLI_NO_SKILLS_NOTIFIER=1 \ @@ -89,6 +91,7 @@ quality-gate: build --changed-from $(QUALITY_GATE_CHANGED_FROM_RESOLVED) \ --manifest $(QUALITY_GATE_MANIFEST_OUT) \ --command-index $(QUALITY_GATE_COMMAND_INDEX_OUT) \ + --public-content-metadata $(PUBLIC_CONTENT_METADATA) \ --facts-out $(QUALITY_GATE_FACTS_OUT) install: build diff --git a/internal/qualitygate/cmd/comment-audit/main.go b/internal/qualitygate/cmd/comment-audit/main.go new file mode 100644 index 00000000..4425206d --- /dev/null +++ b/internal/qualitygate/cmd/comment-audit/main.go @@ -0,0 +1,92 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package main + +import ( + "encoding/json" + "flag" + "fmt" + "os" + + "github.com/larksuite/cli/errs" + "github.com/larksuite/cli/internal/qualitygate/publiccontent" + "github.com/larksuite/cli/internal/qualitygate/report" + "github.com/larksuite/cli/internal/validate" + "github.com/larksuite/cli/internal/vfs" +) + +type eventPayload struct { + Comment *struct { + Body string `json:"body"` + } `json:"comment"` + Review *struct { + Body string `json:"body"` + } `json:"review"` +} + +func main() { + eventPath := flag.String("event", os.Getenv("GITHUB_EVENT_PATH"), "GitHub event payload path") + kind := flag.String("kind", os.Getenv("GITHUB_EVENT_NAME"), "GitHub event kind") + flag.Parse() + + if *eventPath == "" { + fmt.Fprintln(os.Stderr, "comment-audit: --event or GITHUB_EVENT_PATH is required") + os.Exit(2) + } + body, err := commentBody(*eventPath) + if err != nil { + fmt.Fprintf(os.Stderr, "comment-audit: %v\n", err) + os.Exit(2) + } + diags := diagnostics(publiccontent.ScanComment(*kind, body)) + if len(diags) > 0 { + fmt.Fprintln(os.Stderr, auditFailureSummary(len(diags))) + } + report.Print(os.Stderr, diags) + os.Exit(report.ExitCode(diags)) +} + +func auditFailureSummary(count int) string { + return fmt.Sprintf("post-publication audit found public content findings: %d", count) +} + +func commentBody(path string) (string, error) { + safePath, err := validate.SafeInputPath(path) + if err != nil { + return "", errs.NewValidationError(errs.SubtypeInvalidArgument, "invalid --event: %v", err). + WithParam("--event"). + WithCause(err) + } + data, err := vfs.ReadFile(safePath) + if err != nil { + return "", err + } + var payload eventPayload + if err := json.Unmarshal(data, &payload); err != nil { + return "", err + } + switch { + case payload.Comment != nil: + return payload.Comment.Body, nil + case payload.Review != nil: + return payload.Review.Body, nil + default: + return "", nil + } +} + +func diagnostics(items []publiccontent.Finding) []report.Diagnostic { + out := make([]report.Diagnostic, 0, len(items)) + for _, item := range items { + out = append(out, report.Diagnostic{ + Rule: item.Rule, + Action: item.Action, + File: item.File, + Line: item.Line, + Message: item.Message, + Suggestion: item.Suggestion, + }) + } + return out +} diff --git a/internal/qualitygate/cmd/comment-audit/main_test.go b/internal/qualitygate/cmd/comment-audit/main_test.go new file mode 100644 index 00000000..5e7aea46 --- /dev/null +++ b/internal/qualitygate/cmd/comment-audit/main_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package main + +import ( + "errors" + "os" + "path/filepath" + "testing" + + "github.com/larksuite/cli/errs" +) + +func TestCommentBodyReadsSafeRelativeEventPath(t *testing.T) { + dir := t.TempDir() + if err := writeTestFile(filepath.Join(dir, "event.json"), `{"comment":{"body":"clean comment"}}`); err != nil { + t.Fatal(err) + } + origDir, err := os.Getwd() + if err != nil { + t.Fatal(err) + } + if err := os.Chdir(dir); err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + _ = os.Chdir(origDir) + }) + + got, err := commentBody("event.json") + if err != nil { + t.Fatalf("commentBody() error = %v", err) + } + if got != "clean comment" { + t.Fatalf("comment body = %q", got) + } +} + +func TestCommentBodyRejectsUnsafeEventPath(t *testing.T) { + path := filepath.Join(t.TempDir(), "event.json") + if err := writeTestFile(path, `{"comment":{"body":"clean"}}`); err != nil { + t.Fatal(err) + } + + _, err := commentBody(path) + problem, ok := errs.ProblemOf(err) + if err == nil || !ok { + t.Fatalf("commentBody(%q) error = %v, want unsafe path validation error", path, err) + } + if problem.Category != errs.CategoryValidation || problem.Subtype != errs.SubtypeInvalidArgument { + t.Fatalf("commentBody(%q) problem = %#v, want invalid argument validation", path, problem) + } + var validationErr *errs.ValidationError + if !errors.As(err, &validationErr) || validationErr.Param != "--event" { + t.Fatalf("commentBody(%q) error = %v, want --event validation param", path, err) + } +} + +func TestAuditFailureSummaryStatesPostPublicationAudit(t *testing.T) { + got := auditFailureSummary(2) + want := "post-publication audit found public content findings: 2" + if got != want { + t.Fatalf("auditFailureSummary() = %q, want %q", got, want) + } +} + +func writeTestFile(path, data string) error { + return os.WriteFile(path, []byte(data), 0o644) +} diff --git a/internal/qualitygate/cmd/quality-gate/main.go b/internal/qualitygate/cmd/quality-gate/main.go index 08cf3951..1ec3c90b 100644 --- a/internal/qualitygate/cmd/quality-gate/main.go +++ b/internal/qualitygate/cmd/quality-gate/main.go @@ -13,6 +13,7 @@ import ( "github.com/larksuite/cli/internal/qualitygate/manifest" "github.com/larksuite/cli/internal/qualitygate/report" "github.com/larksuite/cli/internal/qualitygate/rules" + "github.com/larksuite/cli/internal/validate" ) func main() { @@ -41,6 +42,7 @@ func runCheck(args []string) int { fs.StringVar(&opts.FactsOut, "facts-out", "", "write facts JSON to this path") fs.StringVar(&opts.ManifestPath, "manifest", "", "hand-authored command manifest JSON") fs.StringVar(&opts.CommandIndexPath, "command-index", "", "full command index JSON") + fs.StringVar(&opts.PublicContentMetadataPath, "public-content-metadata", "", "PR title/body metadata JSON for public content checks") fs.BoolVar(&printLegacyCommandCandidates, "print-legacy-command-candidates", false, "print current non-kebab-case hand-authored command candidates") fs.BoolVar(&printLegacyFlagCandidates, "print-legacy-flag-candidates", false, "print current non-kebab-case flag candidates") if err := fs.Parse(args); err != nil { @@ -48,6 +50,15 @@ func runCheck(args []string) int { return 2 } + if opts.PublicContentMetadataPath != "" { + safePath, err := validate.SafeInputPath(opts.PublicContentMetadataPath) + if err != nil { + fmt.Fprintf(os.Stderr, "quality-gate check: --public-content-metadata: %v\n", err) + return 2 + } + opts.PublicContentMetadataPath = safePath + } + if opts.ManifestPath == "" || opts.CommandIndexPath == "" { fmt.Fprintln(os.Stderr, "quality-gate check: --manifest and --command-index are required") return 2 diff --git a/internal/qualitygate/cmd/quality-gate/main_test.go b/internal/qualitygate/cmd/quality-gate/main_test.go index cc89695e..9420de89 100644 --- a/internal/qualitygate/cmd/quality-gate/main_test.go +++ b/internal/qualitygate/cmd/quality-gate/main_test.go @@ -37,6 +37,37 @@ func TestCheckRequiresManifestInputs(t *testing.T) { } } +func TestCheckAcceptsPublicContentMetadataFlag(t *testing.T) { + code, stderr := runCheckCaptureStderr(t, []string{ + "--repo", t.TempDir(), + "--cli-bin", "./lark-cli", + "--public-content-metadata", ".tmp/quality-gate/pr.json", + }) + if code != 2 { + t.Fatalf("exit code = %d, stderr=%s", code, stderr) + } + if strings.Contains(stderr, "flag provided but not defined") { + t.Fatalf("public content metadata flag was not registered: %s", stderr) + } + if !strings.Contains(stderr, "--manifest and --command-index are required") { + t.Fatalf("stderr = %s", stderr) + } +} + +func TestCheckRejectsUnsafePublicContentMetadataPath(t *testing.T) { + code, stderr := runCheckCaptureStderr(t, []string{ + "--repo", t.TempDir(), + "--cli-bin", "./lark-cli", + "--public-content-metadata", filepath.Join(t.TempDir(), "pr.json"), + }) + if code != 2 { + t.Fatalf("exit code = %d, stderr=%s", code, stderr) + } + if !strings.Contains(stderr, "--public-content-metadata") || !strings.Contains(stderr, "--file") { + t.Fatalf("stderr = %s, want unsafe public content metadata path error", stderr) + } +} + func TestCheckReportsManifestReadErrorsWithFlagName(t *testing.T) { dir := t.TempDir() manifestPath := filepath.Join(dir, "command-manifest.json") diff --git a/internal/qualitygate/cmd/semantic-review/main.go b/internal/qualitygate/cmd/semantic-review/main.go index 8d4603a0..9ef67674 100644 --- a/internal/qualitygate/cmd/semantic-review/main.go +++ b/internal/qualitygate/cmd/semantic-review/main.go @@ -56,6 +56,14 @@ func run(args []string) int { _ = semantic.WriteMarkdown(markdownOut, decision) return 0 } + if reviewPath == "" && !semantic.BuildInputView(f).HasReviewableFacts() { + decision := finalizeDecision(block, waiverDiags, semantic.Decision{}) + if err := writeSemanticOutputs(decisionOut, markdownOut, decision); err != nil { + fmt.Fprintf(os.Stderr, "semantic-review: %v\n", err) + return 2 + } + return decisionExitCode(decision) + } review, err := semantic.LoadOrReviewWithConfig(context.Background(), f, reviewPath, modelConfig) if err != nil { fmt.Fprintf(os.Stderr, "semantic-review: %v\n", err) @@ -72,6 +80,15 @@ func run(args []string) int { return 0 } decision := semantic.DecideWithWaivers(f, review, policy, waivers) + decision = finalizeDecision(block, waiverDiags, decision) + if err := writeSemanticOutputs(decisionOut, markdownOut, decision); err != nil { + fmt.Fprintf(os.Stderr, "semantic-review: %v\n", err) + return 2 + } + return decisionExitCode(decision) +} + +func finalizeDecision(block bool, waiverDiags []report.Diagnostic, decision semantic.Decision) semantic.Decision { decision.BlockMode = block if !block && len(decision.Blockers) > 0 { for i := range decision.Blockers { @@ -81,15 +98,21 @@ func run(args []string) int { decision.Blockers = nil } decision.SystemWarnings = append(diagnosticSystemWarnings(waiverDiags), decision.SystemWarnings...) + return decision +} + +func writeSemanticOutputs(decisionOut, markdownOut string, decision semantic.Decision) error { if err := semantic.WriteDecision(decisionOut, decision); err != nil { - fmt.Fprintf(os.Stderr, "semantic-review: write decision: %v\n", err) - return 2 + return fmt.Errorf("write decision: %w", err) } if err := semantic.WriteMarkdown(markdownOut, decision); err != nil { - fmt.Fprintf(os.Stderr, "semantic-review: write markdown: %v\n", err) - return 2 + return fmt.Errorf("write markdown: %w", err) } - if block && len(decision.Blockers) > 0 { + return nil +} + +func decisionExitCode(decision semantic.Decision) int { + if decision.BlockMode && len(decision.Blockers) > 0 { return 1 } return 0 diff --git a/internal/qualitygate/cmd/semantic-review/main_test.go b/internal/qualitygate/cmd/semantic-review/main_test.go index 6bf80a38..366e24ac 100644 --- a/internal/qualitygate/cmd/semantic-review/main_test.go +++ b/internal/qualitygate/cmd/semantic-review/main_test.go @@ -7,6 +7,7 @@ import ( "encoding/json" "os" "path/filepath" + "strings" "testing" "github.com/larksuite/cli/internal/qualitygate/facts" @@ -211,7 +212,19 @@ func TestRunWritesSkippedDecisionForUnavailableReviewer(t *testing.T) { "allowed_base_urls": ["https://ark.ap-southeast.bytepluses.com/api/v3"] }`, "") factsPath := filepath.Join(t.TempDir(), "facts.json") - if err := (facts.Facts{SchemaVersion: 1}).WriteFile(factsPath); err != nil { + f := facts.Facts{ + SchemaVersion: 1, + Skills: []facts.SkillFact{{ + SourceFile: "skills/lark-wiki/SKILL.md", + Line: 30, + Changed: true, + ReferencesInvalidCommand: true, + }}, + } + if !semantic.BuildInputView(f).HasReviewableFacts() { + t.Fatal("test setup must contain reviewable facts") + } + if err := f.WriteFile(factsPath); err != nil { t.Fatalf("write facts: %v", err) } decisionPath := filepath.Join(t.TempDir(), "decision.json") @@ -228,6 +241,71 @@ func TestRunWritesSkippedDecisionForUnavailableReviewer(t *testing.T) { } } +func TestRunShortCircuitsEmptySemanticInputWithoutReviewer(t *testing.T) { + t.Setenv("ARK_API_KEY", "") + t.Setenv("ARK_BASE_URL", "") + t.Setenv("ARK_MODEL", "") + + repo := t.TempDir() + writeSemanticConfig(t, repo, `{ + "schema_version": 1, + "default_enforcement": "observe", + "block_categories": ["skill_quality"] + }`, `{ + "allowed": ["semantic-review-v1"], + "allowed_base_urls": ["https://ark.ap-southeast.bytepluses.com/api/v3"] + }`, "") + factsPath := filepath.Join(t.TempDir(), "facts.json") + f := facts.Facts{ + SchemaVersion: 1, + Commands: []facts.CommandFact{{ + Path: "service command 1", + Domain: "service", + Changed: true, + Source: "service", + }}, + Outputs: []facts.OutputFact{{ + Command: "service command 1", + Domain: "service", + Changed: true, + Source: "service", + IsList: true, + HasDefaultLimit: true, + HasDecisionField: true, + }}, + } + if semantic.BuildInputView(f).HasReviewableFacts() { + t.Fatal("test setup must not contain reviewable facts") + } + if err := f.WriteFile(factsPath); err != nil { + t.Fatalf("write facts: %v", err) + } + decisionPath := filepath.Join(t.TempDir(), "decision.json") + markdownPath := filepath.Join(t.TempDir(), "semantic.md") + code := run([]string{"--repo", repo, "--facts", factsPath, "--decision-out", decisionPath, "--markdown-out", markdownPath, "--block"}) + if code != 0 { + t.Fatalf("run() = %d, want clean pass", code) + } + decision := readDecision(t, decisionPath) + if decision.Skipped || decision.Degraded || decision.InfrastructureFailure || !decision.BlockMode { + t.Fatalf("expected non-degraded pass decision: %#v", decision) + } + if len(decision.SystemWarnings) != 0 || len(decision.Warnings) != 0 || len(decision.Blockers) != 0 { + t.Fatalf("empty semantic view should not produce findings: %#v", decision) + } + data, err := os.ReadFile(markdownPath) + if err != nil { + t.Fatalf("read markdown: %v", err) + } + markdown := string(data) + if !strings.Contains(markdown, "No semantic blockers.") { + t.Fatalf("markdown missing pass summary: %s", markdown) + } + if strings.Contains(strings.ToLower(markdown), "skipped") || strings.Contains(strings.ToLower(markdown), "degraded") { + t.Fatalf("markdown should not report semantic review as skipped/degraded: %s", markdown) + } +} + func TestRunWritesInfrastructureFailureDecisionForInvalidReviewerConfig(t *testing.T) { t.Setenv("ARK_API_KEY", "test-key") t.Setenv("ARK_BASE_URL", "") @@ -243,7 +321,19 @@ func TestRunWritesInfrastructureFailureDecisionForInvalidReviewerConfig(t *testi "allowed_base_urls": ["https://ark.ap-southeast.bytepluses.com/api/v3"] }`, "") factsPath := filepath.Join(t.TempDir(), "facts.json") - if err := (facts.Facts{SchemaVersion: 1}).WriteFile(factsPath); err != nil { + f := facts.Facts{ + SchemaVersion: 1, + Skills: []facts.SkillFact{{ + SourceFile: "skills/lark-wiki/SKILL.md", + Line: 30, + Changed: true, + ReferencesInvalidCommand: true, + }}, + } + if !semantic.BuildInputView(f).HasReviewableFacts() { + t.Fatal("test setup must contain reviewable facts") + } + if err := f.WriteFile(factsPath); err != nil { t.Fatalf("write facts: %v", err) } decisionPath := filepath.Join(t.TempDir(), "decision.json") diff --git a/internal/qualitygate/config/semantic/policy.json b/internal/qualitygate/config/semantic/policy.json index fac5e11f..3fd4e9cb 100644 --- a/internal/qualitygate/config/semantic/policy.json +++ b/internal/qualitygate/config/semantic/policy.json @@ -5,7 +5,8 @@ "error_hint", "default_output", "naming", - "skill_quality" + "skill_quality", + "public_content_leakage" ], "rollout_groups": [ { @@ -16,7 +17,8 @@ }, "categories": [ "error_hint", - "skill_quality" + "skill_quality", + "public_content_leakage" ], "owner": "cli-owner", "reason": "first semantic blocking rollout only affects changed facts" diff --git a/internal/qualitygate/facts/schema.go b/internal/qualitygate/facts/schema.go index 8133dd66..a6e37ca7 100644 --- a/internal/qualitygate/facts/schema.go +++ b/internal/qualitygate/facts/schema.go @@ -13,14 +13,15 @@ import ( ) type Facts struct { - SchemaVersion int `json:"schema_version"` - Commands []CommandFact `json:"commands,omitempty"` - Skills []SkillFact `json:"skills,omitempty"` - SkillQuality []SkillQualityFact `json:"skill_quality,omitempty"` - Errors []ErrorFact `json:"errors,omitempty"` - Outputs []OutputFact `json:"outputs,omitempty"` - Examples []CommandExample `json:"examples,omitempty"` - Diagnostics []DiagnosticFact `json:"diagnostics,omitempty"` + SchemaVersion int `json:"schema_version"` + Commands []CommandFact `json:"commands,omitempty"` + Skills []SkillFact `json:"skills,omitempty"` + SkillQuality []SkillQualityFact `json:"skill_quality,omitempty"` + Errors []ErrorFact `json:"errors,omitempty"` + Outputs []OutputFact `json:"outputs,omitempty"` + Examples []CommandExample `json:"examples,omitempty"` + PublicContent []PublicContentFact `json:"public_content,omitempty"` + Diagnostics []DiagnosticFact `json:"diagnostics,omitempty"` } type CommandFact struct { @@ -109,6 +110,17 @@ type OutputFact struct { HasDecisionField bool `json:"has_decision_field,omitempty"` } +type PublicContentFact struct { + Rule string `json:"rule"` + Action report.Action `json:"action"` + File string `json:"file"` + Line int `json:"line"` + Source string `json:"source,omitempty"` + Excerpt string `json:"excerpt,omitempty"` + Message string `json:"message,omitempty"` + Suggestion string `json:"suggestion,omitempty"` +} + type DryRunRequest struct { Method string `json:"method"` URL string `json:"url"` @@ -206,6 +218,11 @@ func BuildWithCommandLookup(m manifest.Manifest, commandLookup manifest.Manifest } } +func WithPublicContent(f Facts, publicContent []PublicContentFact) Facts { + f.PublicContent = publicContent + return f +} + type commandScope struct { Domain string Source string diff --git a/internal/qualitygate/facts/schema_test.go b/internal/qualitygate/facts/schema_test.go index 551bed7b..0a790ea0 100644 --- a/internal/qualitygate/facts/schema_test.go +++ b/internal/qualitygate/facts/schema_test.go @@ -34,6 +34,7 @@ func TestFactsSchemaCarriesGatekeeperFields(t *testing.T) { Errors: []ErrorFact{{Code: "invalid_input", Message: "bad path", Hint: "pass --file", Retryable: false, HintActionCount: 1, RequiredHint: true}}, Outputs: []OutputFact{{Command: "im messages list", Fields: []string{"message_id", "sender", "create_time"}, IsList: true, HasDefaultLimit: true, HasDecisionField: true}}, Skills: []SkillFact{{SourceFile: "skills/lark-doc/SKILL.md", Line: 1, DestructiveWithoutGuard: true, ScopeConflict: true}}, + PublicContent: []PublicContentFact{{Rule: "public_content_generic_credential", Action: report.ActionReject, File: "docs/public.md", Line: 4, Excerpt: "api_key = "}}, } data, err := json.Marshal(f) if err != nil { @@ -43,7 +44,10 @@ func TestFactsSchemaCarriesGatekeeperFields(t *testing.T) { if err := json.Unmarshal(data, &got); err != nil { t.Fatalf("unmarshal facts: %v", err) } - if !got.Errors[0].RequiredHint || got.Outputs[0].Fields[0] != "message_id" || !got.Skills[0].ScopeConflict { + if !got.Errors[0].RequiredHint || + got.Outputs[0].Fields[0] != "message_id" || + !got.Skills[0].ScopeConflict || + got.PublicContent[0].Rule != "public_content_generic_credential" { t.Fatalf("facts lost gatekeeper fields: %#v", got) } } diff --git a/internal/qualitygate/publiccontent/collect.go b/internal/qualitygate/publiccontent/collect.go new file mode 100644 index 00000000..f21a0e5e --- /dev/null +++ b/internal/qualitygate/publiccontent/collect.go @@ -0,0 +1,343 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import ( + "bytes" + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "sort" + "strconv" + "strings" +) + +func Collect(ctx context.Context, opts Options) ([]Finding, error) { + metadata, err := LoadMetadata(opts.MetadataPath) + if err != nil { + return nil, err + } + + var out []Finding + changedFiles, base, err := changedFiles(ctx, opts.Repo, opts.ChangedFrom) + if err != nil { + return nil, err + } + patches := map[string][]changedChunk{} + if base != "" { + patches, err = changedPatches(ctx, opts.Repo, base) + if err != nil { + return nil, err + } + } + for _, file := range changedFiles { + if !scanChangedFile(file) { + continue + } + for _, chunk := range patches[file] { + findings := scanText(file, "file", chunk.Text, isDetectorRuleFile(file)) + for i := range findings { + findings[i].Line += chunk.StartLine - 1 + } + out = append(out, findings...) + out = append(out, semanticCandidate(file, "file", chunk.Text, chunk.StartLine)...) + } + privateKeyFindings, err := scanTouchedPrivateKeyBlocks(ctx, opts.Repo, file, patches[file]) + if err != nil { + return nil, err + } + out = appendUniqueFindings(out, privateKeyFindings...) + } + if base != "" { + commitFindings, err := scanCommitMessages(ctx, opts.Repo, base) + if err != nil { + return nil, err + } + out = append(out, commitFindings...) + } + branchName := opts.BranchName + if branchName == "" { + branchName = metadata.Branch + } + if branchName == "" { + branchName = branchFromEnv() + } + if branchName == "" { + branchName = currentBranch(ctx, opts.Repo) + } + if branchName != "" { + out = append(out, scanText("branch", "branch", branchName, false)...) + } + out = append(out, scanMetadata(metadata)...) + sort.SliceStable(out, func(i, j int) bool { + if out[i].File != out[j].File { + return out[i].File < out[j].File + } + if out[i].Line != out[j].Line { + return out[i].Line < out[j].Line + } + return out[i].Rule < out[j].Rule + }) + return out, nil +} + +func currentBranch(ctx context.Context, repo string) string { + data, err := gitOutput(ctx, repo, "branch", "--show-current") + if err != nil { + return "" + } + return strings.TrimSpace(string(data)) +} + +func branchFromEnv() string { + for _, key := range []string{"PR_BRANCH", "GITHUB_HEAD_REF", "GITHUB_REF_NAME"} { + if value := strings.TrimSpace(os.Getenv(key)); value != "" { + return value + } + } + return "" +} + +func changedFiles(ctx context.Context, repo, changedFrom string) ([]string, string, error) { + if changedFrom == "" { + return nil, "", nil + } + baseBytes, err := gitOutput(ctx, repo, "merge-base", changedFrom, "HEAD") + if err != nil { + return nil, "", err + } + base := strings.TrimSpace(string(baseBytes)) + files, err := diffFileNames(ctx, repo, base) + if err != nil { + return nil, "", err + } + sort.Strings(files) + return files, base, nil +} + +func diffFileNames(ctx context.Context, repo, base string) ([]string, error) { + data, err := gitOutput(ctx, repo, "diff", "--name-only", "-z", "--diff-filter=ACMR", base+"..HEAD") + if err != nil { + return nil, err + } + var files []string + for _, file := range bytes.Split(data, []byte{0}) { + if len(file) == 0 { + continue + } + files = append(files, filepath.ToSlash(string(file))) + } + return files, nil +} + +var detectorFixtureExclusions = map[string]bool{ + "internal/qualitygate/publiccontent/collect_test.go": true, + "internal/qualitygate/publiccontent/rules.go": true, + "internal/qualitygate/publiccontent/scan.go": true, + "internal/qualitygate/publiccontent/scan_test.go": true, +} + +func scanChangedFile(file string) bool { + normalized := strings.TrimPrefix(strings.ReplaceAll(file, "\\", "/"), "./") + return !detectorFixtureExclusions[normalized] +} + +type changedChunk struct { + StartLine int + Text string +} + +func (c changedChunk) endLine() int { + lines := strings.Count(strings.TrimRight(c.Text, "\n"), "\n") + 1 + if lines < 1 { + lines = 1 + } + return c.StartLine + lines - 1 +} + +func changedPatches(ctx context.Context, repo, base string) (map[string][]changedChunk, error) { + files, err := diffFileNames(ctx, repo, base) + if err != nil { + return nil, err + } + data, err := gitOutput(ctx, repo, "diff", "--no-ext-diff", "--unified=0", "--diff-filter=ACMR", base+"..HEAD") + if err != nil { + return nil, err + } + out := map[string][]changedChunk{} + var file string + var chunk *changedChunk + nextLine := 0 + nextFile := 0 + flush := func() { + if file == "" || chunk == nil || chunk.Text == "" { + chunk = nil + return + } + out[file] = append(out[file], *chunk) + chunk = nil + } + for _, raw := range strings.Split(string(data), "\n") { + switch { + case strings.HasPrefix(raw, "diff --git "): + flush() + file = "" + if nextFile < len(files) { + file = files[nextFile] + nextFile++ + } + case strings.HasPrefix(raw, "@@ "): + flush() + start, ok := parseNewHunkStart(raw) + if !ok { + nextLine = 0 + continue + } + nextLine = start + chunk = &changedChunk{StartLine: start} + case strings.HasPrefix(raw, "+") && !strings.HasPrefix(raw, "+++"): + if chunk == nil { + chunk = &changedChunk{StartLine: max(nextLine, 1)} + } + chunk.Text += strings.TrimPrefix(raw, "+") + "\n" + nextLine++ + case strings.HasPrefix(raw, "-"): + continue + default: + if chunk != nil && strings.HasPrefix(raw, `\ No newline at end of file`) { + continue + } + flush() + } + } + flush() + return out, nil +} + +func parseNewHunkStart(header string) (int, bool) { + parts := strings.Split(header, " ") + for _, part := range parts { + if !strings.HasPrefix(part, "+") { + continue + } + raw := strings.TrimPrefix(part, "+") + if before, _, ok := strings.Cut(raw, ","); ok { + raw = before + } + start, err := strconv.Atoi(raw) + return start, err == nil && start > 0 + } + return 0, false +} + +func scanCommitMessages(ctx context.Context, repo, base string) ([]Finding, error) { + data, err := gitOutput(ctx, repo, "log", "--format=%H%x00%B%x00", base+"..HEAD") + if err != nil { + return nil, err + } + parts := bytes.Split(data, []byte{0}) + var out []Finding + for i := 0; i+1 < len(parts); i += 2 { + sha := strings.TrimSpace(string(parts[i])) + body := string(parts[i+1]) + if sha == "" || body == "" { + continue + } + short := sha + if len(short) > 12 { + short = short[:12] + } + out = append(out, scanText("commit:"+short, "commit", body, false)...) + out = append(out, semanticCandidate("commit:"+short, "commit", body, 1)...) + } + return out, nil +} + +type lineRange struct { + Start int + End int +} + +func scanTouchedPrivateKeyBlocks(ctx context.Context, repo, file string, chunks []changedChunk) ([]Finding, error) { + if len(chunks) == 0 { + return nil, nil + } + data, err := gitOutput(ctx, repo, "show", "HEAD:"+file) + if err != nil { + return nil, err + } + var added []lineRange + for _, chunk := range chunks { + added = append(added, lineRange{Start: chunk.StartLine, End: chunk.endLine()}) + } + var out []Finding + for _, block := range privateKeyBlocks(string(data)) { + if !rangesIntersectAny(block, added) { + continue + } + out = append(out, newFinding("public_content_private_key_block", file, block.Start, "file", "private key block")) + } + return out, nil +} + +func privateKeyBlocks(text string) []lineRange { + lines := strings.Split(text, "\n") + var out []lineRange + inPrivateKey := false + start := 0 + for i, line := range lines { + lineNo := i + 1 + if !inPrivateKey && strings.Contains(line, privateKeyBeginPrefix) && strings.Contains(line, privateKeyMarker) { + inPrivateKey = true + start = lineNo + } + if inPrivateKey && strings.Contains(line, privateKeyEndPrefix) && strings.Contains(line, privateKeyMarker) { + out = append(out, lineRange{Start: start, End: lineNo}) + inPrivateKey = false + } + } + return out +} + +func rangesIntersectAny(block lineRange, ranges []lineRange) bool { + for _, r := range ranges { + if block.Start <= r.End && r.Start <= block.End { + return true + } + } + return false +} + +func appendUniqueFindings(items []Finding, additions ...Finding) []Finding { + for _, addition := range additions { + duplicate := false + for _, item := range items { + if item.Rule == addition.Rule && + item.File == addition.File && + item.Line == addition.Line && + item.Source == addition.Source { + duplicate = true + break + } + } + if !duplicate { + items = append(items, addition) + } + } + return items +} + +func gitOutput(ctx context.Context, repo string, args ...string) ([]byte, error) { + cmd := exec.CommandContext(ctx, "git", args...) + cmd.Dir = repo + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("git %s: %w\n%s", strings.Join(args, " "), err, stderr.Bytes()) + } + return stdout.Bytes(), nil +} diff --git a/internal/qualitygate/publiccontent/collect_test.go b/internal/qualitygate/publiccontent/collect_test.go new file mode 100644 index 00000000..5ea92779 --- /dev/null +++ b/internal/qualitygate/publiccontent/collect_test.go @@ -0,0 +1,885 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import ( + "context" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" +) + +func TestCollectScansOnlyCurrentContributionAndMetadata(t *testing.T) { + repo := t.TempDir() + runGit(t, repo, "init") + runGit(t, repo, "config", "user.email", "test@example.com") + runGit(t, repo, "config", "user.name", "Test User") + + writeFile(t, filepath.Join(repo, "baseline.md"), `BASE_`+`TOKEN="baseline-only" +`) + runGit(t, repo, "add", "baseline.md") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "public.md"), `# Public change + +api_`+`key = "example-public-key" +`) + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "add public doc", "-m", "Change"+"-Id: I0123456789abcdef0123456789abcdef01234567") + + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{"title":"publish public docs","body":"Reviewed`+`-on: https://review.example.test/c/project/+/123"}`) + + got, err := Collect(context.Background(), Options{ + Repo: repo, + ChangedFrom: "HEAD~1", + MetadataPath: metadataPath, + }) + if err != nil { + t.Fatalf("Collect() error = %v", err) + } + + rules := findingRules(got) + for _, want := range []string{ + "public_content_generic_credential", + "public_content_change_id_trailer", + "public_content_reviewed_on_trailer", + } { + if !rules[want] { + t.Fatalf("missing rule %s in findings %#v", want, got) + } + } + for _, item := range got { + if item.File == "baseline.md" { + t.Fatalf("collector scanned unchanged baseline file: %#v", got) + } + } +} + +func TestCollectScansOnlyChangedLinesInChangedFiles(t *testing.T) { + repo := t.TempDir() + runGit(t, repo, "init") + runGit(t, repo, "config", "user.email", "test@example.com") + runGit(t, repo, "config", "user.name", "Test User") + + writeFile(t, filepath.Join(repo, "docs", "workflow.md"), "SECRET_TOKEN=legacy-example\npublic baseline\n") + runGit(t, repo, "add", "docs/workflow.md") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "workflow.md"), "SECRET_TOKEN=legacy-example\npublic baseline\nnew public line\n") + runGit(t, repo, "add", "docs/workflow.md") + runGit(t, repo, "commit", "-m", "add public line") + + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{}`) + + got, err := Collect(context.Background(), Options{ + Repo: repo, + ChangedFrom: "HEAD~1", + MetadataPath: metadataPath, + }) + if err != nil { + t.Fatalf("Collect() error = %v", err) + } + for _, item := range got { + if item.Rule == "public_content_generic_credential" && item.File == "docs/workflow.md" { + t.Fatalf("collector scanned unchanged legacy line in changed file: %#v", got) + } + } +} + +func TestCollectSemanticCandidatesStoreSanitizedReviewText(t *testing.T) { + repo := t.TempDir() + runGit(t, repo, "init") + runGit(t, repo, "config", "user.email", "test@example.com") + runGit(t, repo, "config", "user.name", "Test User") + + writeFile(t, filepath.Join(repo, "docs", "public.md"), "base\n") + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "base") + + raw := "private launch plan for alpha-service rollout on Friday with SERVICE_" + "TOKEN=real-" + "secret-value" + writeFile(t, filepath.Join(repo, "docs", "public.md"), "base\n"+raw+"\n") + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "add semantic candidate") + + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{}`) + + got, err := Collect(context.Background(), Options{ + Repo: repo, + ChangedFrom: "HEAD~1", + MetadataPath: metadataPath, + }) + if err != nil { + t.Fatalf("Collect() error = %v", err) + } + var found bool + for _, item := range got { + if item.Rule != "public_content_semantic_candidate" || item.File != "docs/public.md" { + continue + } + found = true + if !strings.Contains(item.Excerpt, "alpha-service rollout on Friday") { + t.Fatalf("semantic candidate should include sanitized review text, got %#v", item) + } + if strings.Contains(item.Excerpt, "real-"+"secret-value") { + t.Fatalf("semantic candidate leaked credential value: %#v", item) + } + if !strings.Contains(item.Excerpt, "SERVICE_TOKEN=") { + t.Fatalf("semantic candidate should redact credentials in review text, got %#v", item) + } + if !strings.Contains(item.Excerpt, "semantic signals") || !strings.Contains(item.Excerpt, "roadmap_timing") { + t.Fatalf("semantic candidate excerpt should preserve semantic signals, got %#v", item) + } + } + if !found { + t.Fatalf("missing semantic candidate in findings %#v", got) + } +} + +func TestCollectSemanticCandidatesDoNotLeakWhitespaceCredentialTail(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "public.md"), "base\n") + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "base") + + raw := "private launch plan for internal rollout on Friday with SERVICE_" + "TOKEN=\"real " + "secret value\"" + writeFile(t, filepath.Join(repo, "docs", "public.md"), "base\n"+raw+"\n") + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "add semantic candidate") + + got := collectFromPreviousCommit(t, repo) + for _, item := range got { + if item.Rule != "public_content_semantic_candidate" || item.File != "docs/public.md" { + continue + } + if strings.Contains(item.Excerpt, "secret value") || strings.Contains(item.Excerpt, "real "+"secret value") { + t.Fatalf("semantic candidate leaked credential tail: %#v", item) + } + if !strings.Contains(item.Excerpt, "SERVICE_TOKEN=") { + t.Fatalf("semantic candidate should redact full credential assignment, got %#v", item) + } + return + } + t.Fatalf("missing semantic candidate in findings %#v", got) +} + +func TestCollectJSONBearerHeadersDoNotLeakIntoSemanticCandidates(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "public.md"), "base\n") + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "base") + + token := "abcdefghijklmnopqrstuvwxyz" + raw := "private launch plan for internal rollout on Friday with " + + `{"headers":{"Authorization":"Bearer ` + token + `"}}` + writeFile(t, filepath.Join(repo, "docs", "public.md"), "base\n"+raw+"\n") + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "add json bearer") + + got := collectFromPreviousCommit(t, repo) + requireFinding(t, got, "docs/public.md", "public_content_bearer_header") + for _, item := range got { + if item.File != "docs/public.md" { + continue + } + if strings.Contains(item.Excerpt, token) { + t.Fatalf("finding leaked JSON bearer token: %#v", item) + } + } +} + +func TestCollectDetectsQuotedJSONCredentialAssignments(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "public.json"), "{}\n") + runGit(t, repo, "add", "docs/public.json") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "public.json"), strings.Join([]string{ + `{"access_` + `token":"real-json-token"}`, + `{"client_` + `secret": "real ` + `secret value"}`, + `{"tenantAccess` + `Token":"real-tenant-camel-token"}`, + `{"github` + `Token":"real-github-token"}`, + `{"vendorApi` + `Key":"real-vendor-key"}`, + `{"slackBot` + `Token":"xoxb-real-token"}`, + }, "\n")+"\n") + runGit(t, repo, "add", "docs/public.json") + runGit(t, repo, "commit", "-m", "add json config") + + got := collectFromPreviousCommit(t, repo) + var count int + for _, item := range got { + if item.File == "docs/public.json" && item.Rule == "public_content_generic_credential" { + count++ + for _, forbidden := range []string{ + "real-json-token", + "real secret value", + "real-tenant-camel-token", + "real-github-token", + "real-vendor-key", + "xoxb-real-token", + } { + if strings.Contains(item.Excerpt, forbidden) { + t.Fatalf("JSON credential finding leaked value %q in excerpt %q", forbidden, item.Excerpt) + } + } + } + } + if count != 6 { + t.Fatalf("JSON credential findings = %d, want 6: %#v", count, got) + } +} + +func TestCollectAllowsBenignJSONTokenFields(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "public.json"), "{}\n") + runGit(t, repo, "add", "docs/public.json") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "public.json"), strings.Join([]string{ + `{"tokenizer":"cl100k_base"}`, + `{"token_count": 42}`, + `{"page_token":"next"}`, + `{"next_page_token":"next"}`, + `{"file_token":"file-example"}`, + `{"doc_token":"doc-example"}`, + `{"node_token":"node-example"}`, + `{"wiki_token":"wikcn_public_doc_example"}`, + `{"folder_token":"folder-example"}`, + `{"obj_token":"obj-example"}`, + `{"spreadsheet_token":"sheet-example"}`, + `{"parent_node_token":"parent-example"}`, + `{"origin_node_token":"origin-example"}`, + `{"drive_route_token":"route-example"}`, + `{"token":""}`, + `{"token":"wiki_token"}`, + `{"token_url":"https://example.com/oauth/token"}`, + `{"token_endpoint":"https://example.com/oauth/token"}`, + `{"token_format":"Bearer"}`, + `{"secret_name":"public-example-secret"}`, + `{"base_token":"base-example"}`, + `{"app_token":"app-example"}`, + `{"sync_token":"sync-example"}`, + `{"parent_token":"parent-example"}`, + `{"target_token":"target-example"}`, + `{"parent_file_token":"parent-file-example"}`, + `{"refresh_token_expires_in": 7200}`, + `{"access_token_expires_in": 7200}`, + `{"token_expires_in": 7200}`, + `{"token_status":"active"}`, + }, "\n")+"\n") + runGit(t, repo, "add", "docs/public.json") + runGit(t, repo, "commit", "-m", "add benign json token fields") + + got := collectFromPreviousCommit(t, repo) + for _, item := range got { + if item.File == "docs/public.json" && item.Rule == "public_content_generic_credential" { + t.Fatalf("benign JSON token field should not be credential finding: %#v", got) + } + } +} + +func TestCollectDetectsAngleWrappedRealisticCredentialValues(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), "base: true\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "base") + stripeLike := "sk_" + "live_1234567890abcdef" + patLike := "gh" + "p_1234567890abcdef1234567890abcdef1234" + + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), strings.Join([]string{ + "API_KEY: <" + stripeLike + ">", + "SECRET_TOKEN: <" + patLike + ">", + "CLIENT_SECRET: ", + }, "\n")+"\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "add credential config") + + got := collectFromPreviousCommit(t, repo) + var count int + for _, item := range got { + if item.File == "docs/config.yaml" && item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 3 { + t.Fatalf("angle-wrapped realistic credential findings = %d, want 3: %#v", count, got) + } +} + +func TestCollectDetectsCredentialShapedValuesUnderBenignKeys(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "public.json"), "{}\n") + runGit(t, repo, "add", "docs/public.json") + runGit(t, repo, "commit", "-m", "base") + stripeLike := "sk_" + "live_1234567890abcdef" + patLike := "gh" + "p_1234567890abcdef1234567890abcdef1234" + + writeFile(t, filepath.Join(repo, "docs", "public.json"), strings.Join([]string{ + `{"access_token_expires_in":"` + patLike + `"}`, + `{"refresh_token_expires_in":"` + stripeLike + `"}`, + `{"client_secret_status":"real-client-secret-value"}`, + `{"client_secret_name":"real-client-secret-value"}`, + `{"app_token":"` + patLike + `"}`, + `{"sync_token":"` + stripeLike + `"}`, + `{"target_token":"real-client-secret-value"}`, + }, "\n")+"\n") + runGit(t, repo, "add", "docs/public.json") + runGit(t, repo, "commit", "-m", "add credential-shaped benign fields") + + got := collectFromPreviousCommit(t, repo) + var count int + for _, item := range got { + if item.File == "docs/public.json" && item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 7 { + t.Fatalf("credential-shaped benign-key findings = %d, want 7: %#v", count, got) + } +} + +func TestCollectDetectsBareIdentifierCredentialsWithMetadataSuffixes(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), "base: true\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), strings.Join([]string{ + "API_KEY_NAME: prod_key", + "CLIENT_SECRET_NAME: prod_secret", + "SECRET_STATUS: prod_secret", + }, "\n")+"\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "add credential config") + + got := collectFromPreviousCommit(t, repo) + var count int + for _, item := range got { + if item.File == "docs/config.yaml" && item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 3 { + t.Fatalf("metadata-suffixed bare credential findings = %d, want 3: %#v", count, got) + } +} + +func TestCollectDetectsAccessKeyCredentials(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), "base: true\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "base") + accessKey := "AK" + "IAIOSFODNN7EXAMPX" + + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), strings.Join([]string{ + "AWS_ACCESS_KEY_ID: " + accessKey, + "ACCESS_KEY_ID: " + accessKey, + "ACCESS_KEY: " + accessKey, + }, "\n")+"\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "add access key config") + + got := collectFromPreviousCommit(t, repo) + var count int + for _, item := range got { + if item.File != "docs/config.yaml" || item.Rule != "public_content_generic_credential" { + continue + } + count++ + if strings.Contains(item.Excerpt, "AKIAIOSFODNN7EXAMPX") { + t.Fatalf("access key finding leaked value in excerpt %q", item.Excerpt) + } + } + if count != 3 { + t.Fatalf("access key credential findings = %d, want 3: %#v", count, got) + } +} + +func TestCollectDetectsPrivateKeyAssignments(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), "base: true\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "base") + + privateKey := "LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0t" + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), strings.Join([]string{ + "PRIVATE_KEY: " + privateKey, + "SSH_PRIVATE_KEY: " + privateKey, + "JWT_PRIVATE_KEY: " + privateKey, + "SIGNING_PRIVATE_KEY: " + privateKey, + }, "\n")+"\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "add private key config") + + got := collectFromPreviousCommit(t, repo) + var count int + for _, item := range got { + if item.File != "docs/config.yaml" || item.Rule != "public_content_generic_credential" { + continue + } + count++ + if strings.Contains(item.Excerpt, privateKey) { + t.Fatalf("private key finding leaked value in excerpt %q", item.Excerpt) + } + } + if count != 4 { + t.Fatalf("private key assignment findings = %d, want 4: %#v", count, got) + } +} + +func TestCollectDetectsCredentialValuesThatLookLikeBareIdentifiers(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), "base: true\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), strings.Join([]string{ + "API_KEY_OPENAI: prod_key", + "CLIENT_SECRET_GOOGLE: prod_secret", + "TOKEN_GITHUB: github_token", + "APP_PASSWORD_PROD: prod_password", + }, "\n")+"\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "add credential config") + + got := collectFromPreviousCommit(t, repo) + var count int + for _, item := range got { + if item.File == "docs/config.yaml" && item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 4 { + t.Fatalf("bare identifier credential findings = %d, want 4: %#v", count, got) + } +} + +func TestCollectAllowsBenignUnquotedTokenFields(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), "base: true\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), strings.Join([]string{ + "tokens: 128", + "token_type: bearer", + "max_tokens: 2000", + "completion_tokens: 200", + "prompt_tokens: 100", + }, "\n")+"\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "add benign token config") + + got := collectFromPreviousCommit(t, repo) + for _, item := range got { + if item.File == "docs/config.yaml" && item.Rule == "public_content_generic_credential" { + t.Fatalf("benign unquoted token field should not be credential finding: %#v", got) + } + } +} + +func TestCollectDetectsCredentialPhraseBeforeEnvironmentSuffix(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), "base: true\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "config.yaml"), strings.Join([]string{ + "API_KEY_OPENAI: real-openai-key", + "TOKEN_GITHUB: real-github-token", + "CLIENT_SECRET_GOOGLE: real-google-secret", + "SECRET_KEY_BASE: real-secret-key-base", + "APP_PASSWORD_PROD: real-prod-password", + }, "\n")+"\n") + runGit(t, repo, "add", "docs/config.yaml") + runGit(t, repo, "commit", "-m", "add credential config") + + got := collectFromPreviousCommit(t, repo) + var count int + for _, item := range got { + if item.File != "docs/config.yaml" || item.Rule != "public_content_generic_credential" { + continue + } + count++ + for _, forbidden := range []string{ + "real-openai-key", + "real-github-token", + "real-google-secret", + "real-secret-key-base", + "real-prod-password", + } { + if strings.Contains(item.Excerpt, forbidden) { + t.Fatalf("credential finding leaked value %q in excerpt %q", forbidden, item.Excerpt) + } + } + } + if count != 5 { + t.Fatalf("credential suffix variants findings = %d, want 5: %#v", count, got) + } +} + +func TestCollectDetectsPrivateKeyWhenOnlyEndIsAdded(t *testing.T) { + repo := newGitRepo(t) + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+"legacy-body\n") + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+"legacy-body\nnew-body\n"+privateKeyEnd()) + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "complete key") + + got := collectFromPreviousCommit(t, repo) + requireFinding(t, got, "docs/key.pem", "public_content_private_key_block") +} + +func TestCollectDetectsPrivateKeyWhenOnlyBeginIsAdded(t *testing.T) { + repo := newGitRepo(t) + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), "legacy-body\n"+privateKeyEnd()) + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+"legacy-body\n"+privateKeyEnd()) + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "complete key") + + got := collectFromPreviousCommit(t, repo) + requireFinding(t, got, "docs/key.pem", "public_content_private_key_block") +} + +func TestCollectDetectsPrivateKeyWhenOnlyBodyIsAdded(t *testing.T) { + repo := newGitRepo(t) + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+privateKeyEnd()) + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+"new-body\n"+privateKeyEnd()) + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "add body") + + got := collectFromPreviousCommit(t, repo) + requireFinding(t, got, "docs/key.pem", "public_content_private_key_block") +} + +func TestCollectIgnoresUntouchedHistoricalPrivateKey(t *testing.T) { + repo := newGitRepo(t) + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+"legacy-body\n"+privateKeyEnd()) + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+"legacy-body\n"+privateKeyEnd()) + writeFile(t, filepath.Join(repo, "docs", "public.md"), "public docs update\n") + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "docs update") + + got := collectFromPreviousCommit(t, repo) + for _, item := range got { + if item.File == "docs/key.pem" && item.Rule == "public_content_private_key_block" { + t.Fatalf("collector reported untouched historical private key: %#v", got) + } + } +} + +func TestCollectIgnoresDeletedPrivateKeyLine(t *testing.T) { + repo := newGitRepo(t) + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+"legacy-body\n"+privateKeyEnd()) + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "key.pem"), privateKeyBegin()+privateKeyEnd()) + runGit(t, repo, "add", "docs/key.pem") + runGit(t, repo, "commit", "-m", "remove body") + + got := collectFromPreviousCommit(t, repo) + for _, item := range got { + if item.File == "docs/key.pem" && item.Rule == "public_content_private_key_block" { + t.Fatalf("collector reported delete-only private key cleanup: %#v", got) + } + } +} + +func TestCollectSkipsOnlyKnownQualityGateFixtureFiles(t *testing.T) { + repo := t.TempDir() + runGit(t, repo, "init") + runGit(t, repo, "config", "user.email", "test@example.com") + runGit(t, repo, "config", "user.name", "Test User") + + writeFile(t, filepath.Join(repo, "README.md"), "base\n") + runGit(t, repo, "add", "README.md") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "internal", "qualitygate", "publiccontent", "collect_test.go"), "SECRET_TOKEN=fixture\n") + writeFile(t, filepath.Join(repo, "internal", "qualitygate", "publiccontent", "scan_test.go"), "SECRET_TOKEN=fixture\n") + writeFile(t, filepath.Join(repo, "internal", "qualitygate", "publiccontent", "scan.go"), "const privateKeyFixture = \""+privateKeyBeginPrefix+privateKeyMarker+"\"\n") + writeFile(t, filepath.Join(repo, "internal", "qualitygate", "publiccontent", "rules.go"), "markers := []string{\"generated with automation\"}\n") + writeFile(t, filepath.Join(repo, "tests", "e2e", "new-public-workflow.test.sh"), "SECRET_TOKEN=real-leak\n") + runGit(t, repo, "add", ".") + runGit(t, repo, "commit", "-m", "add scanner fixtures") + + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{}`) + + got, err := Collect(context.Background(), Options{ + Repo: repo, + ChangedFrom: "HEAD~1", + MetadataPath: metadataPath, + }) + if err != nil { + t.Fatalf("Collect() error = %v", err) + } + var foundOrdinaryTestLeak bool + for _, item := range got { + switch item.File { + case "internal/qualitygate/publiccontent/collect_test.go", + "internal/qualitygate/publiccontent/scan.go", + "internal/qualitygate/publiccontent/scan_test.go", + "internal/qualitygate/publiccontent/rules.go": + t.Fatalf("collector scanned known fixture or detector implementation file: %#v", got) + } + if item.File == "tests/e2e/new-public-workflow.test.sh" && item.Rule == "public_content_generic_credential" { + foundOrdinaryTestLeak = true + } + } + if !foundOrdinaryTestLeak { + t.Fatalf("collector should still scan ordinary test files for real leaks: %#v", got) + } +} + +func TestScanChangedFileDocumentsFixtureExclusions(t *testing.T) { + excluded := []string{ + "internal/qualitygate/publiccontent/collect_test.go", + "internal/qualitygate/publiccontent/rules.go", + "internal/qualitygate/publiccontent/scan.go", + "internal/qualitygate/publiccontent/scan_test.go", + } + for _, file := range excluded { + if scanChangedFile(file) { + t.Fatalf("scanChangedFile(%q) = true, want false for detector fixture/implementation path", file) + } + } + + included := []string{ + "internal/qualitygate/publiccontent/new_test.go", + "tests/e2e/new-public-workflow.test.sh", + "docs/public.md", + } + for _, file := range included { + if !scanChangedFile(file) { + t.Fatalf("scanChangedFile(%q) = false, want true", file) + } + } +} + +func TestCollectScansAddedLinesInSpecialPathNames(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "old.md"), "base\n") + runGit(t, repo, "add", ".") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "has space.md"), "SECRET_TOKEN=space-value\n") + writeFile(t, filepath.Join(repo, `weird"quote.md`), "SECRET_TOKEN=quote-value\n") + runGit(t, repo, "mv", "docs/old.md", "docs/new name.md") + writeFile(t, filepath.Join(repo, "docs", "new name.md"), "base\nSECRET_TOKEN=rename-value\n") + runGit(t, repo, "add", ".") + runGit(t, repo, "commit", "-m", "add special paths") + + got := collectFromPreviousCommit(t, repo) + requireFinding(t, got, "docs/has space.md", "public_content_generic_credential") + requireFinding(t, got, `weird"quote.md`, "public_content_generic_credential") + requireFinding(t, got, "docs/new name.md", "public_content_generic_credential") +} + +func TestCollectScansBranchNameAsWarning(t *testing.T) { + repo := t.TempDir() + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{"branch":"bot/public-doc-update"}`) + got, err := Collect(context.Background(), Options{ + Repo: repo, + MetadataPath: metadataPath, + }) + if err != nil { + t.Fatalf("Collect() error = %v", err) + } + if len(got) != 1 || got[0].Rule != "public_content_automation_branch" { + t.Fatalf("branch findings = %#v", got) + } +} + +func TestCollectUsesExplicitBranchNameWhenDetached(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "README.md"), "base\n") + runGit(t, repo, "add", "README.md") + runGit(t, repo, "commit", "-m", "base") + runGit(t, repo, "checkout", "-b", "bot/public-doc-update") + writeFile(t, filepath.Join(repo, "docs.md"), "safe docs\n") + runGit(t, repo, "add", "docs.md") + runGit(t, repo, "commit", "-m", "docs") + head := strings.TrimSpace(string(runGitOutput(t, repo, "rev-parse", "HEAD"))) + runGit(t, repo, "checkout", "--detach", head) + + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{}`) + got, err := Collect(context.Background(), Options{ + Repo: repo, + MetadataPath: metadataPath, + BranchName: "bot/public-doc-update", + }) + if err != nil { + t.Fatalf("Collect() error = %v", err) + } + requireFinding(t, got, "branch", "public_content_automation_branch") +} + +func TestCollectUsesBranchEnvironmentWhenDetached(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "README.md"), "base\n") + runGit(t, repo, "add", "README.md") + runGit(t, repo, "commit", "-m", "base") + runGit(t, repo, "checkout", "-b", "bot/public-env-update") + writeFile(t, filepath.Join(repo, "docs.md"), "safe docs\n") + runGit(t, repo, "add", "docs.md") + runGit(t, repo, "commit", "-m", "docs") + head := strings.TrimSpace(string(runGitOutput(t, repo, "rev-parse", "HEAD"))) + runGit(t, repo, "checkout", "--detach", head) + t.Setenv("GITHUB_HEAD_REF", "bot/public-env-update") + + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{}`) + got, err := Collect(context.Background(), Options{ + Repo: repo, + MetadataPath: metadataPath, + }) + if err != nil { + t.Fatalf("Collect() error = %v", err) + } + requireFinding(t, got, "branch", "public_content_automation_branch") +} + +func TestCollectPreservesFindingAttributionForChangedLines(t *testing.T) { + repo := newGitRepo(t) + writeFile(t, filepath.Join(repo, "docs", "auth.md"), "intro\n") + runGit(t, repo, "add", "docs/auth.md") + runGit(t, repo, "commit", "-m", "base") + + writeFile(t, filepath.Join(repo, "docs", "auth.md"), "intro\nAuthorization: Bearer abcdefghijklmnopqrstuvwxyz\n") + runGit(t, repo, "add", "docs/auth.md") + runGit(t, repo, "commit", "-m", "add auth docs") + + got := collectFromPreviousCommit(t, repo) + for _, item := range got { + if item.Rule == "public_content_bearer_header" { + if item.File != "docs/auth.md" || item.Line != 2 || item.Source != "file" { + t.Fatalf("changed-line attribution = %#v", item) + } + return + } + } + t.Fatalf("missing bearer finding: %#v", got) +} + +func TestAppendUniqueFindingsDeduplicatesByRuleFileLineAndSource(t *testing.T) { + base := []Finding{newFinding("public_content_private_key_block", "docs/key.pem", 1, "file", "private key block")} + got := appendUniqueFindings(base, + newFinding("public_content_private_key_block", "docs/key.pem", 1, "file", "private key block"), + newFinding("public_content_private_key_block", "docs/key.pem", 2, "file", "private key block"), + ) + if len(got) != 2 { + t.Fatalf("appendUniqueFindings len = %d, want 2: %#v", len(got), got) + } +} + +func newGitRepo(t *testing.T) string { + t.Helper() + repo := t.TempDir() + runGit(t, repo, "init") + runGit(t, repo, "config", "user.email", "test@example.com") + runGit(t, repo, "config", "user.name", "Test User") + return repo +} + +func privateKeyBegin() string { + return privateKeyBeginPrefix + privateKeyMarker + "\n" +} + +func privateKeyEnd() string { + return privateKeyEndPrefix + privateKeyMarker + "\n" +} + +func collectFromPreviousCommit(t *testing.T, repo string) []Finding { + t.Helper() + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{}`) + got, err := Collect(context.Background(), Options{ + Repo: repo, + ChangedFrom: "HEAD~1", + MetadataPath: metadataPath, + }) + if err != nil { + t.Fatalf("Collect() error = %v", err) + } + return got +} + +func requireFinding(t *testing.T, got []Finding, file, rule string) { + t.Helper() + for _, item := range got { + if item.File == file && item.Rule == rule { + return + } + } + t.Fatalf("missing %s in %s findings: %#v", rule, file, got) +} + +func TestCollectRequiresValidMetadataJSON(t *testing.T) { + repo := t.TempDir() + metadataPath := filepath.Join(repo, "pr-metadata.json") + writeFile(t, metadataPath, `{"title":`) + + _, err := Collect(context.Background(), Options{Repo: repo, MetadataPath: metadataPath}) + if err == nil || !strings.Contains(err.Error(), "public content metadata") { + t.Fatalf("Collect() error = %v, want metadata parse error", err) + } +} + +func runGit(t *testing.T, repo string, args ...string) { + t.Helper() + if len(args) > 0 && args[0] == "commit" { + args = append([]string{"commit", "--no-verify"}, args[1:]...) + } + cmd := exec.Command("git", args...) + cmd.Dir = repo + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %v failed: %v\n%s", args, err, out) + } +} + +func runGitOutput(t *testing.T, repo string, args ...string) []byte { + t.Helper() + cmd := exec.Command("git", args...) + cmd.Dir = repo + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %v failed: %v\n%s", args, err, out) + } + return out +} + +func writeFile(t *testing.T, path, data string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(path, []byte(data), 0o644); err != nil { + t.Fatal(err) + } +} diff --git a/internal/qualitygate/publiccontent/comment_audit.go b/internal/qualitygate/publiccontent/comment_audit.go new file mode 100644 index 00000000..760fdcf9 --- /dev/null +++ b/internal/qualitygate/publiccontent/comment_audit.go @@ -0,0 +1,11 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +func ScanComment(kind, body string) []Finding { + if kind == "" { + kind = "comment" + } + return scanText(kind, "comment", body, false) +} diff --git a/internal/qualitygate/publiccontent/comment_audit_test.go b/internal/qualitygate/publiccontent/comment_audit_test.go new file mode 100644 index 00000000..6d05e675 --- /dev/null +++ b/internal/qualitygate/publiccontent/comment_audit_test.go @@ -0,0 +1,19 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import "testing" + +func TestScanCommentAuditsPublishedCommentBodies(t *testing.T) { + got := ScanComment("issue_comment", `The published comment included /tmp/harness`+`-agent/run and CCM`+`-Harness: stage-4`) + rules := findingRules(got) + if !rules["public_content_harness_metadata"] || !rules["public_content_ccm_harness_trailer"] { + t.Fatalf("comment audit findings = %#v", got) + } + for _, item := range got { + if item.File != "issue_comment" { + t.Fatalf("comment finding file = %q, want issue_comment", item.File) + } + } +} diff --git a/internal/qualitygate/publiccontent/metadata.go b/internal/qualitygate/publiccontent/metadata.go new file mode 100644 index 00000000..14fd9907 --- /dev/null +++ b/internal/qualitygate/publiccontent/metadata.go @@ -0,0 +1,45 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import ( + "encoding/json" + "fmt" + + "github.com/larksuite/cli/internal/vfs" +) + +func LoadMetadata(path string) (Metadata, error) { + if path == "" { + return Metadata{}, nil + } + data, err := vfs.ReadFile(path) + if err != nil { + return Metadata{}, fmt.Errorf("public content metadata: %w", err) + } + if len(data) == 0 { + return Metadata{}, nil + } + var out Metadata + if err := json.Unmarshal(data, &out); err != nil { + return Metadata{}, fmt.Errorf("public content metadata: %w", err) + } + return out, nil +} + +func scanMetadata(m Metadata) []Finding { + text := "" + if m.Title != "" { + text += "title: " + m.Title + "\n" + } + if m.Body != "" { + text += "body:\n" + m.Body + "\n" + } + if text == "" { + return nil + } + out := scanText("pull_request_metadata", "metadata", text, false) + out = append(out, semanticCandidate("pull_request_metadata", "metadata", text, 1)...) + return out +} diff --git a/internal/qualitygate/publiccontent/metadata_test.go b/internal/qualitygate/publiccontent/metadata_test.go new file mode 100644 index 00000000..a9e6616c --- /dev/null +++ b/internal/qualitygate/publiccontent/metadata_test.go @@ -0,0 +1,22 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import ( + "path/filepath" + "testing" +) + +func TestLoadMetadataReadsTitleAndBody(t *testing.T) { + path := filepath.Join(t.TempDir(), "metadata.json") + writeFile(t, path, `{"title":"public change","body":"pass`+`word = \"example-password\""}`) + + got, err := LoadMetadata(path) + if err != nil { + t.Fatalf("LoadMetadata() error = %v", err) + } + if got.Title != "public change" || got.Body == "" { + t.Fatalf("metadata = %#v", got) + } +} diff --git a/internal/qualitygate/publiccontent/rules.go b/internal/qualitygate/publiccontent/rules.go new file mode 100644 index 00000000..0f9c1804 --- /dev/null +++ b/internal/qualitygate/publiccontent/rules.go @@ -0,0 +1,421 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import ( + "net/url" + "path/filepath" + "regexp" + "strings" + + "github.com/larksuite/cli/internal/qualitygate/report" +) + +var ( + credentialAssignmentRE = regexp.MustCompile(`(?i)["']?\b[A-Za-z0-9_-]*(?:api[_-]?key|access[_-]?key|private[_-]?key|secret|password|passwd|token|webhook|access[_-]?token|client[_-]?secret)[A-Za-z0-9_-]*\b["']?\s*[:=]\s*(?:"((?:\\.|[^"\\])*)"|'((?:\\.|[^'\\])*)'|(\$\([^)]*\))|(\$\{\{[^}]+\}\})|([^"'\s,}\]]+))`) + jwtLikeRE = regexp.MustCompile(`\b[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b`) + credentialURLRE = regexp.MustCompile(`(?i)\b[a-z][a-z0-9+.-]*://[^/\s:@]*:[^@\s/]+@[^)\s]+`) + bearerHeaderRE = regexp.MustCompile(`(?i)(?:\bAuthorization\s*:\s*Bearer\s+|["']Authorization["']\s*:\s*["']Bearer\s+)[A-Za-z0-9._+/=-]{12,}`) + semanticBearerHeaderRE = regexp.MustCompile(`(?i)(?:\bAuthorization\s*:\s*Bearer\s+[^"'\s,}\]]+|["']Authorization["']\s*:\s*["']Bearer\s+[^"'\\\s,}\]]+)`) + changeIDTrailerRE = regexp.MustCompile(`(?i)^\s*Change-Id:\s*\S+`) + reviewedOnTrailerRE = regexp.MustCompile(`(?i)^\s*Reviewed-on:\s*\S+`) + ccmHarnessTrailerRE = regexp.MustCompile(`(?i)\bCCM-Harness:\s*\S+`) + privateIPv4RE = regexp.MustCompile(`\b(?:10\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|192\.168\.[0-9]{1,3}\.[0-9]{1,3}|172\.(?:1[6-9]|2[0-9]|3[0-1])\.[0-9]{1,3}\.[0-9]{1,3})\b`) + automationBranchRE = regexp.MustCompile(`(?i)(^|/)(bot|automation)[-/]`) +) + +func actionForRule(rule string) report.Action { + switch rule { + case "public_content_generic_credential", + "public_content_private_key_block", + "public_content_jwt_like_token", + "public_content_bearer_header", + "public_content_credential_url", + "public_content_change_id_trailer", + "public_content_reviewed_on_trailer", + "public_content_provenance_marker", + "public_content_detector_fingerprint", + "public_content_harness_metadata", + "public_content_ccm_harness_trailer": + return report.ActionReject + case "public_content_private_ipv4", + "public_content_automation_branch": + return report.ActionWarning + default: + return report.ActionWarning + } +} + +func isPlaceholderValue(value string) bool { + trimmed := strings.Trim(value, `"'`) + normalized := strings.ToLower(trimmed) + if normalized == "" || + normalized == "=" || + percentWrappedPlaceholder(normalized) || + angleWrappedPlaceholder(normalized) || + urlWithAnglePlaceholder(normalized) || + isCredentialReferenceValue(trimmed) { + return true + } + return namedPlaceholderValue(normalized) +} + +func namedPlaceholderValue(value string) bool { + switch value { + case "placeholder", "redacted", "", "xxxx": + return true + } + return strings.Contains(value, "cli_example") || allXPlaceholder(value) +} + +func allXPlaceholder(value string) bool { + if len(value) < 4 { + return false + } + for _, r := range value { + if r != 'x' { + return false + } + } + return true +} + +func urlWithAnglePlaceholder(value string) bool { + if !strings.Contains(value, "://") || + !strings.Contains(value, "<") || + !strings.Contains(value, ">") { + return false + } + return !urlRemainderLooksCredentialLike(removeAnglePlaceholders(value)) +} + +func removeAnglePlaceholders(value string) string { + var out strings.Builder + for len(value) > 0 { + start := strings.Index(value, "<") + if start < 0 { + out.WriteString(value) + break + } + out.WriteString(value[:start]) + end := strings.Index(value[start+1:], ">") + if end < 0 { + out.WriteString(value[start:]) + break + } + value = value[start+end+2:] + } + return out.String() +} + +func urlRemainderLooksCredentialLike(value string) bool { + normalized := strings.ToLower(value) + for _, marker := range []string{ + "secret", + "token", + "password", + "passwd", + "api_key", + "apikey", + "private_key", + "privatekey", + "client_secret", + "clientsecret", + } { + if strings.Contains(normalized, marker) { + return true + } + } + for _, part := range strings.FieldsFunc(normalized, func(r rune) bool { + return !((r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_' || r == '-') + }) { + if credentialShapedIdentifier(part) || longCredentialSegment(part) { + return true + } + } + return false +} + +func longCredentialSegment(value string) bool { + if len(value) < 16 { + return false + } + var hasLetter, hasDigit bool + for _, r := range value { + switch { + case r >= 'a' && r <= 'z': + hasLetter = true + case r >= '0' && r <= '9': + hasDigit = true + case r == '_' || r == '-': + default: + return false + } + } + return hasLetter || hasDigit +} + +func isCredentialReferenceValue(value string) bool { + normalized := strings.ToLower(value) + switch { + case strings.HasPrefix(normalized, "${{"): + return githubExpressionReference(normalized) + case strings.HasPrefix(normalized, "$("): + return !commandSubstitutionLooksCredentialLike(normalized) + case strings.HasPrefix(normalized, "process.env."): + return credentialReferenceIdentifier(strings.TrimPrefix(normalized, "process.env.")) + case strings.HasPrefix(normalized, "${"): + return credentialReferenceIdentifier(strings.TrimSuffix(strings.TrimPrefix(normalized, "${"), "}")) + case strings.HasPrefix(value, "$"): + return credentialReferenceIdentifier(strings.TrimPrefix(normalized, "$")) + default: + return false + } +} + +func commandSubstitutionLooksCredentialLike(value string) bool { + if !strings.HasPrefix(value, "$(") || !strings.HasSuffix(value, ")") { + return false + } + inner := strings.TrimSuffix(strings.TrimPrefix(value, "$("), ")") + for _, part := range strings.FieldsFunc(inner, func(r rune) bool { + return !((r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_' || r == '-') + }) { + if credentialShapedIdentifier(part) || longCredentialSegment(part) { + return true + } + } + return false +} + +func githubExpressionReference(value string) bool { + if !strings.HasPrefix(value, "${{") || !strings.HasSuffix(value, "}}") { + return false + } + expr := strings.TrimSpace(strings.TrimSuffix(strings.TrimPrefix(value, "${{"), "}}")) + switch { + case strings.HasPrefix(expr, "secrets."): + return dottedReferenceIdentifier(strings.TrimPrefix(expr, "secrets.")) + case strings.HasPrefix(expr, "env."): + return dottedReferenceIdentifier(strings.TrimPrefix(expr, "env.")) + case strings.HasPrefix(expr, "vars."): + return dottedReferenceIdentifier(strings.TrimPrefix(expr, "vars.")) + case expr == "github.token": + return true + default: + return false + } +} + +func dottedReferenceIdentifier(value string) bool { + if value == "" { + return false + } + for _, part := range strings.Split(value, ".") { + if !referenceIdentifier(part) { + return false + } + } + return true +} + +func credentialReferenceIdentifier(value string) bool { + return referenceIdentifier(value) && !credentialShapedIdentifier(value) +} + +func referenceIdentifier(value string) bool { + if value == "" { + return false + } + for i, r := range value { + switch { + case r >= 'a' && r <= 'z': + case r >= '0' && r <= '9' && i > 0: + case r == '_' && i > 0: + default: + return false + } + } + return true +} + +func angleWrappedPlaceholder(value string) bool { + if len(value) < 3 || !strings.HasPrefix(value, "<") || !strings.HasSuffix(value, ">") { + return false + } + return anglePlaceholderIdentifier(strings.Trim(value, "<>")) +} + +func percentWrappedPlaceholder(value string) bool { + if len(value) < 3 || !strings.HasPrefix(value, "%") || !strings.HasSuffix(value, "%") { + return false + } + inner := strings.Trim(value, "%") + return delimitedPlaceholderIdentifier(inner) && !credentialShapedIdentifier(inner) +} + +func delimitedPlaceholderIdentifier(value string) bool { + if value == "" { + return false + } + for _, r := range value { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_' || r == '-' { + continue + } + return false + } + return true +} + +func anglePlaceholderIdentifier(value string) bool { + if value == "" { + return false + } + for _, r := range value { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_' || r == '-' { + continue + } + return false + } + if credentialShapedIdentifier(value) { + return false + } + switch value { + case "token", + "id", + "key", + "secret", + "password", + "api-key", + "client-secret", + "access-token", + "refresh-token", + "auth-token", + "bearer-token", + "session-token", + "service-token": + return true + } + for _, suffix := range []string{"_token", "_id", "_key", "_secret", "_password"} { + if strings.HasSuffix(value, suffix) { + return true + } + } + for _, suffix := range []string{"-token", "-id", "-key", "-secret", "-password"} { + if strings.HasSuffix(value, suffix) { + return true + } + } + return false +} + +func credentialShapedValue(value string) bool { + normalized := strings.ToLower(strings.Trim(value, `"'<>`)) + return credentialShapedIdentifier(normalized) +} + +func credentialShapedIdentifier(value string) bool { + switch { + case strings.HasPrefix(value, "sk_live_"), + strings.HasPrefix(value, "sk_test_"), + strings.HasPrefix(value, "ghp_"), + strings.HasPrefix(value, "gho_"), + strings.HasPrefix(value, "ghu_"), + strings.HasPrefix(value, "github_pat_"), + strings.HasPrefix(value, "xoxb_"), + strings.HasPrefix(value, "xoxp_"), + strings.HasPrefix(value, "xoxa_"): + return true + case strings.HasPrefix(value, "real-") && + (strings.Contains(value, "secret") || + strings.Contains(value, "token") || + strings.Contains(value, "key") || + strings.Contains(value, "password")): + return true + default: + return false + } +} + +func resourceTokenPlaceholderValue(value string) bool { + normalized := strings.ToLower(strings.Trim(value, `"'`)) + switch normalized { + case "wiki_token", + "folder_token", + "obj_token", + "spreadsheet_token", + "file_token", + "doc_token", + "node_token", + "parent_node_token", + "origin_node_token", + "drive_route_token": + return true + default: + return false + } +} + +func provenanceMarker(line string) bool { + normalized := strings.ToLower(line) + markers := []string{ + "generat" + "ed by tool", + "creat" + "ed by tool", + "generat" + "ed by automation", + "creat" + "ed by automation", + "machine-" + "generated", + "generated with automated", + "generated with automation", + "🤖 generated", + } + for _, marker := range markers { + if strings.Contains(normalized, marker) { + return true + } + } + if strings.HasPrefix(normalized, "co-authored-by:") && + (strings.Contains(normalized, "" + } + u.User = url.UserPassword("", "") + return u.String() +} diff --git a/internal/qualitygate/publiccontent/scan.go b/internal/qualitygate/publiccontent/scan.go new file mode 100644 index 00000000..b9220bac --- /dev/null +++ b/internal/qualitygate/publiccontent/scan.go @@ -0,0 +1,656 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import ( + "fmt" + "path/filepath" + "sort" + "strings" + "unicode" +) + +const ( + privateKeyBeginPrefix = "-----" + "BEGIN " + privateKeyEndPrefix = "-----" + "END " + privateKeyMarker = "PRIVATE " + "KEY-----" +) + +func ScanFile(path string, data []byte) []Finding { + return scanText(filepath.ToSlash(path), "file", string(data), isDetectorRuleFile(path)) +} + +func semanticCandidate(file, source, text string, line int) []Finding { + excerpt := redactedSemanticExcerpt(text) + if excerpt == "" { + return nil + } + return []Finding{newFinding("public_content_semantic_candidate", file, line, source, excerpt)} +} + +func scanText(file, source, text string, detectorFile bool) []Finding { + var out []Finding + lines := strings.Split(text, "\n") + inPrivateKey := false + privateKeyLine := 0 + for i, line := range lines { + lineNo := i + 1 + if strings.Contains(line, privateKeyBeginPrefix) && strings.Contains(line, privateKeyMarker) { + inPrivateKey = true + privateKeyLine = lineNo + } + if inPrivateKey && strings.Contains(line, privateKeyEndPrefix) && strings.Contains(line, privateKeyMarker) { + out = append(out, newFinding("public_content_private_key_block", file, privateKeyLine, source, "private key block")) + inPrivateKey = false + } + for _, match := range credentialAssignmentRE.FindAllStringSubmatch(line, -1) { + if !isCredentialAssignmentMatch(match[0]) { + continue + } + value := credentialAssignmentValue(match) + keyName, _ := normalizedCredentialAssignmentKey(match[0]) + if value == "" || + isNonSecretLiteralValue(value) || + isPlaceholderValue(value) || + isResourceTokenPlaceholderAssignment(keyName, value) { + continue + } + if looksLikeEqualityComparison(value) { + continue + } + out = append(out, newFinding("public_content_generic_credential", file, lineNo, source, redactAssignment(match[0]))) + } + for _, match := range jwtLikeRE.FindAllString(line, -1) { + out = append(out, newFinding("public_content_jwt_like_token", file, lineNo, source, redactToken(match))) + } + for range bearerHeaderRE.FindAllString(line, -1) { + out = append(out, newFinding("public_content_bearer_header", file, lineNo, source, "Authorization: Bearer ")) + } + for _, match := range credentialURLRE.FindAllString(line, -1) { + if isPlaceholderCredentialURL(match) { + continue + } + out = append(out, newFinding("public_content_credential_url", file, lineNo, source, redactCredentialURL(match))) + } + for _, match := range privateIPv4RE.FindAllString(line, -1) { + out = append(out, newFinding("public_content_private_ipv4", file, lineNo, source, match)) + } + if source == "branch" && automationBranchRE.MatchString(line) { + out = append(out, newFinding("public_content_automation_branch", file, lineNo, source, "automation branch marker")) + } + switch { + case changeIDTrailerRE.MatchString(line): + out = append(out, newFinding("public_content_change_id_trailer", file, lineNo, source, "Change-Id: ")) + case reviewedOnTrailerRE.MatchString(line): + out = append(out, newFinding("public_content_reviewed_on_trailer", file, lineNo, source, "Reviewed-on: ")) + case ccmHarnessTrailerRE.MatchString(line): + out = append(out, newFinding("public_content_ccm_harness_trailer", file, lineNo, source, "CCM-Harness: ")) + } + if provenanceMarker(line) { + out = append(out, newFinding("public_content_provenance_marker", file, lineNo, source, "provenance marker")) + } + if strings.Contains(line, "/tmp/harness-agent") { + out = append(out, newFinding("public_content_harness_metadata", file, lineNo, source, "/tmp/harness-agent")) + } + if detectorFile && detectorFingerprint(line) { + out = append(out, newFinding("public_content_detector_fingerprint", file, lineNo, source, "public detector fingerprint")) + } + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].File != out[j].File { + return out[i].File < out[j].File + } + if out[i].Line != out[j].Line { + return out[i].Line < out[j].Line + } + return out[i].Rule < out[j].Rule + }) + return out +} + +func isCredentialAssignmentMatch(match string) bool { + name, value, ok := normalizedCredentialAssignment(match) + if !ok { + return false + } + if isWebhookCredentialKey(name) && webhookAssignmentValueLooksCredentialLike(value) { + return true + } + if isBenignTokenField(name) && !credentialShapedValue(value) { + return false + } + return isExplicitCredentialKey(name) +} + +func normalizedCredentialAssignmentKey(match string) (string, bool) { + key, _, ok := normalizedCredentialAssignment(match) + return key, ok +} + +func normalizedCredentialAssignment(match string) (string, string, bool) { + key, ok := credentialAssignmentKey(match) + if !ok { + return "", "", false + } + key = strings.TrimSpace(key) + if key == "" { + return "", "", false + } + submatches := credentialAssignmentRE.FindStringSubmatch(match) + return normalizedCredentialKey(strings.Trim(key, `"'`)), credentialAssignmentValue(submatches), true +} + +func normalizedCredentialKey(key string) string { + key = strings.TrimSpace(key) + var out []rune + var prev rune + for i, r := range key { + if r == '-' { + r = '_' + } + if i > 0 && isCredentialKeyBoundary(prev, r) { + out = append(out, '_') + } + out = append(out, unicode.ToLower(r)) + prev = r + } + key = string(out) + key = strings.ReplaceAll(key, "-", "_") + return key +} + +func isCredentialKeyBoundary(prev, current rune) bool { + if prev == '_' || current == '_' { + return false + } + return (unicode.IsLower(prev) || unicode.IsDigit(prev)) && unicode.IsUpper(current) +} + +func isBenignTokenField(key string) bool { + if isTokenMetricField(key) || + isTokenMetadataField(key) || + isResourceTokenField(key) || + isPaginationOrSyncTokenField(key) { + return true + } + return false +} + +func isTokenMetricField(key string) bool { + switch key { + case "tokenizer", + "token_count", + "tokens", + "max_tokens", + "completion_tokens", + "prompt_tokens": + return true + default: + return false + } +} + +func isTokenMetadataField(key string) bool { + switch key { + case "access_token_expires_in", + "refresh_token_expires_in", + "token_expires_in", + "token_status", + "token_type", + "token_url", + "token_endpoint", + "token_format", + "secret_name": + return true + default: + return false + } +} + +func isPaginationOrSyncTokenField(key string) bool { + switch key { + case "page_token", + "next_page_token", + "sync_token": + return true + default: + return false + } +} + +func isResourceTokenField(key string) bool { + if !strings.HasSuffix(key, "_token") { + return false + } + prefix := strings.TrimSuffix(key, "_token") + switch prefix { + case "app", + "base", + "board", + "doc", + "drive_route", + "file", + "folder", + "host_node", + "minute", + "node", + "obj", + "origin_node", + "parent", + "parent_file", + "parent_node", + "share", + "spreadsheet", + "target", + "wiki": + return true + default: + return false + } +} + +func isResourceTokenPlaceholderAssignment(key, value string) bool { + return key == "token" && resourceTokenPlaceholderValue(value) +} + +func isNonSecretLiteralValue(value string) bool { + switch strings.ToLower(strings.TrimSpace(strings.Trim(value, `"'`))) { + case "true", "false", "null", "nil": + return true + default: + return false + } +} + +func isWebhookCredentialKey(key string) bool { + return strings.Contains(strings.ReplaceAll(key, "_", ""), "webhook") +} + +func webhookAssignmentValueLooksCredentialLike(value string) bool { + normalized := strings.ToLower(strings.Trim(value, `"'`)) + if normalized == "" || isPlaceholderValue(normalized) || isNonSecretLiteralValue(normalized) { + return false + } + return urlRemainderLooksCredentialLike(removeAnglePlaceholders(normalized)) || + credentialShapedIdentifier(strings.Trim(normalized, "$")) +} + +func isExplicitCredentialKey(key string) bool { + compact := strings.ReplaceAll(key, "_", "") + switch compact { + case "token", + "accesstoken", + "refreshtoken", + "authtoken", + "bearertoken", + "sessiontoken", + "servicetoken", + "apikey", + "accesskey", + "privatekey", + "apisecret", + "secret", + "secretkey", + "clientsecret", + "password", + "passwd": + return true + } + for _, phrase := range []string{ + "accesstoken", + "refreshtoken", + "authtoken", + "bearertoken", + "sessiontoken", + "servicetoken", + "bottoken", + "apikey", + "accesskey", + "privatekey", + "apisecret", + "clientsecret", + "secretkey", + } { + if strings.Contains(compact, phrase) { + return true + } + } + parts := credentialKeyParts(key) + for _, phrase := range [][2]string{ + {"access", "token"}, + {"refresh", "token"}, + {"auth", "token"}, + {"bearer", "token"}, + {"session", "token"}, + {"service", "token"}, + {"bot", "token"}, + {"api", "key"}, + {"access", "key"}, + {"private", "key"}, + {"api", "secret"}, + {"client", "secret"}, + {"secret", "key"}, + } { + if hasAdjacentCredentialParts(parts, phrase[0], phrase[1]) { + return true + } + } + for _, part := range parts { + switch part { + case "token", "secret", "password", "passwd": + return true + } + } + for _, suffix := range []string{ + "token", + "accesstoken", + "refreshtoken", + "authtoken", + "bearertoken", + "sessiontoken", + "servicetoken", + "bottoken", + "apikey", + "accesskey", + "privatekey", + "apisecret", + "clientsecret", + "secret", + "secretkey", + "password", + "passwd", + } { + if strings.HasSuffix(compact, suffix) { + return true + } + } + for _, suffix := range []string{ + "_access_token", + "_refresh_token", + "_auth_token", + "_bearer_token", + "_session_token", + "_service_token", + "_api_key", + "_access_key", + "_private_key", + "_api_secret", + "_client_secret", + "_secret", + "_secret_key", + "_password", + "_passwd", + } { + if strings.HasSuffix(key, suffix) { + return true + } + } + return false +} + +func credentialKeyParts(key string) []string { + var parts []string + for _, part := range strings.Split(key, "_") { + if part != "" { + parts = append(parts, part) + } + } + return parts +} + +func hasAdjacentCredentialParts(parts []string, first, second string) bool { + for i := 0; i+1 < len(parts); i++ { + if parts[i] == first && parts[i+1] == second { + return true + } + } + return false +} + +func credentialAssignmentValue(match []string) string { + for _, value := range match[1:] { + if value != "" { + return value + } + } + return "" +} + +func looksLikeEqualityComparison(value string) bool { + return strings.HasPrefix(strings.TrimSpace(value), "=") +} + +func isPlaceholderCredentialURL(raw string) bool { + userInfo, ok := credentialURLUserInfo(raw) + if !ok { + return false + } + _, password, ok := strings.Cut(userInfo, ":") + if !ok { + return false + } + return credentialURLPasswordPlaceholder(password) +} + +func credentialURLPasswordPlaceholder(password string) bool { + normalized := strings.ToLower(password) + decoded := strings.ReplaceAll(normalized, "%3c", "<") + decoded = strings.ReplaceAll(decoded, "%3e", ">") + switch decoded { + case "placeholder", "redacted", "", "xxxx": + return true + } + return angleWrappedPlaceholder(decoded) || percentWrappedPlaceholder(decoded) +} + +func credentialURLUserInfo(raw string) (string, bool) { + schemeIdx := strings.Index(raw, "://") + if schemeIdx < 0 { + return "", false + } + rest := raw[schemeIdx+len("://"):] + atIdx := strings.Index(rest, "@") + if atIdx < 0 { + return "", false + } + return rest[:atIdx], true +} + +func newFinding(rule, file string, line int, source, excerpt string) Finding { + return Finding{ + Rule: rule, + Action: actionForRule(rule), + File: file, + Line: line, + Source: source, + Excerpt: excerpt, + Message: messageForRule(rule), + Suggestion: suggestionForRule(rule), + } +} + +func messageForRule(rule string) string { + switch rule { + case "public_content_generic_credential": + return "public contribution contains a generic credential assignment" + case "public_content_private_key_block": + return "public contribution contains a private key block" + case "public_content_jwt_like_token": + return "public contribution contains a JWT-like token" + case "public_content_bearer_header": + return "public contribution contains an Authorization bearer token" + case "public_content_credential_url": + return "public contribution contains credentials embedded in a URL" + case "public_content_private_ipv4": + return "public contribution contains a private-network IP address" + case "public_content_automation_branch": + return "public contribution uses an automation-shaped branch name" + case "public_content_change_id_trailer": + return "public contribution contains a Change-Id trailer" + case "public_content_reviewed_on_trailer": + return "public contribution contains a Reviewed-on trailer" + case "public_content_provenance_marker": + return "public contribution contains a prohibited provenance marker" + case "public_content_detector_fingerprint": + return "public rule/config content exposes public detector fingerprints" + case "public_content_harness_metadata": + return "public contribution contains visible harness pipeline metadata" + case "public_content_ccm_harness_trailer": + return "public contribution contains a CCM-Harness trailer" + case "public_content_semantic_candidate": + return "public contribution contains text for semantic public content review" + default: + return "public contribution contains content that should not be published" + } +} + +func suggestionForRule(rule string) string { + switch actionForRule(rule) { + case "REJECT": + return "remove the value from the public contribution and replace it with a non-sensitive placeholder" + default: + return "remove private workflow metadata before publishing the public contribution" + } +} + +func redactAssignment(match string) string { + key, ok := credentialAssignmentKey(match) + if !ok { + return "" + } + return fmt.Sprintf("%s= ", strings.TrimSpace(key)) +} + +func credentialAssignmentKey(match string) (string, bool) { + idx := -1 + for _, sep := range []string{":", "="} { + if candidate := strings.Index(match, sep); candidate >= 0 && (idx < 0 || candidate < idx) { + idx = candidate + } + } + if idx < 0 { + return "", false + } + return match[:idx], true +} + +func redactToken(_ string) string { + return "" +} + +func redactedSemanticExcerpt(text string) string { + normalized := strings.Join(strings.Fields(text), " ") + if normalized == "" { + return "" + } + signals := semanticSignals(normalized) + if len(signals) == 0 { + return "" + } + sanitized := truncateRunes(sanitizeSemanticExcerpt(text), 600) + return fmt.Sprintf("semantic signals: %s; excerpt: %q", strings.Join(signals, ","), sanitized) +} + +func semanticSignals(normalized string) []string { + lower := strings.ToLower(normalized) + var signals []string + add := func(signal string) { + for _, existing := range signals { + if existing == signal { + return + } + } + signals = append(signals, signal) + } + + hasPrivateScope := strings.Contains(lower, "private") || strings.Contains(lower, "internal-only") + hasRequestMetadata := strings.Contains(lower, "request header") || strings.Contains(lower, "request headers") || strings.Contains(lower, "authorization header") || strings.Contains(lower, "metadata header") + hasTrustBoundary := strings.Contains(lower, "spoof") || strings.Contains(lower, "trust") || strings.Contains(lower, "risk scoring") || strings.Contains(lower, "classification") + hasRoadmap := strings.Contains(lower, "roadmap") || strings.Contains(lower, "migration") || strings.Contains(lower, "rollout") || strings.Contains(lower, "cutover") || strings.Contains(lower, "unpublished") + hasTiming := strings.Contains(lower, "target date") || strings.Contains(lower, "friday") || strings.Contains(lower, "monday") || strings.Contains(lower, "tuesday") || strings.Contains(lower, "wednesday") || strings.Contains(lower, "thursday") || strings.Contains(lower, "customer-visible") + hasImplementation := strings.Contains(lower, "server-side") || strings.Contains(lower, "implementation") + + if hasPrivateScope && hasRequestMetadata && hasTrustBoundary { + add("private_scope") + add("request_metadata") + add("trust_boundary_detail") + } + if hasRoadmap && (hasPrivateScope || hasTiming) { + add("roadmap_detail") + if hasPrivateScope { + add("private_scope") + } + if hasTiming { + add("roadmap_timing") + } + } + if hasPrivateScope && hasImplementation && hasTrustBoundary { + add("private_scope") + add("implementation_detail") + add("trust_boundary_detail") + } + + return signals +} + +func sanitizeSemanticExcerpt(text string) string { + text = redactPrivateKeyBlocks(text) + text = credentialAssignmentRE.ReplaceAllStringFunc(text, sanitizeCredentialAssignment) + text = strings.ReplaceAll(text, `"`, ``) + text = strings.ReplaceAll(text, `'`, ``) + text = semanticBearerHeaderRE.ReplaceAllString(text, "Authorization: Bearer ") + text = jwtLikeRE.ReplaceAllString(text, "") + text = credentialURLRE.ReplaceAllStringFunc(text, sanitizeCredentialURL) + return strings.Join(strings.Fields(text), " ") +} + +func redactPrivateKeyBlocks(text string) string { + lines := strings.Split(text, "\n") + var out []string + inPrivateKey := false + for _, line := range lines { + if strings.Contains(line, privateKeyBeginPrefix) && strings.Contains(line, privateKeyMarker) { + out = append(out, "") + inPrivateKey = true + if strings.Contains(line, privateKeyEndPrefix) && strings.Contains(line, privateKeyMarker) { + inPrivateKey = false + } + continue + } + if inPrivateKey { + if strings.Contains(line, privateKeyEndPrefix) && strings.Contains(line, privateKeyMarker) { + inPrivateKey = false + } + continue + } + out = append(out, line) + } + return strings.Join(out, "\n") +} + +func sanitizeCredentialAssignment(match string) string { + key, ok := credentialAssignmentKey(match) + if !ok { + return "" + } + return strings.TrimSpace(key) + "=" +} + +func sanitizeCredentialURL(raw string) string { + redacted := redactCredentialURL(raw) + redacted = strings.ReplaceAll(redacted, "%3Cuser%3E", "") + redacted = strings.ReplaceAll(redacted, "%3Credacted%3E", "") + return redacted +} + +func truncateRunes(text string, limit int) string { + if limit <= 0 { + return "" + } + runes := []rune(text) + if len(runes) <= limit { + return text + } + return string(runes[:limit]) + "..." +} diff --git a/internal/qualitygate/publiccontent/scan_test.go b/internal/qualitygate/publiccontent/scan_test.go new file mode 100644 index 00000000..84bd14ed --- /dev/null +++ b/internal/qualitygate/publiccontent/scan_test.go @@ -0,0 +1,915 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import ( + "strings" + "testing" +) + +func TestScanFileDetectsPublicLeakSignalsInPRDocs(t *testing.T) { + text := `# Pull Request + +The public README accidentally contains realistic leak-shaped content: + +` + "```bash\n" + `export SERVICE_` + `PASSWORD="example-password" +curl https://user:pass@` + `example.com/repo.git +` + "```\n" + ` + ` + privateKeyBeginPrefix + privateKeyMarker + ` + example-key-body + ` + privateKeyEndPrefix + privateKeyMarker + ` + +session_token: "` + jwtFixture("ZXhhbXBsZQ") + `" + +Change` + `-Id: I0123456789abcdef0123456789abcdef01234567 +Reviewed` + `-on: https://review.example.test/c/project/+/123 +` + "Generated by " + "auto" + "mation" + ` +/tmp/harness` + `-agent/work +CCM` + `-Harness: stage-17 +` + + got := ScanFile("docs/public-pr.md", []byte(text)) + rules := findingRules(got) + for _, want := range []string{ + "public_content_generic_credential", + "public_content_private_key_block", + "public_content_jwt_like_token", + "public_content_credential_url", + "public_content_change_id_trailer", + "public_content_reviewed_on_trailer", + "public_content_provenance_marker", + "public_content_harness_metadata", + "public_content_ccm_harness_trailer", + } { + if !rules[want] { + t.Fatalf("missing rule %s in findings %#v", want, got) + } + } +} + +func TestScanFileWarnsForPrivateIPv4Examples(t *testing.T) { + got := ScanFile("docs/network.md", []byte("Local lab address: 192.168."+"0.10\n")) + rules := findingRules(got) + if !rules["public_content_private_ipv4"] { + t.Fatalf("missing private IPv4 warning, got %#v", got) + } + for _, item := range got { + if item.Rule == "public_content_private_ipv4" && string(item.Action) != "WARNING" { + t.Fatalf("private IPv4 action = %s, want WARNING", item.Action) + } + } +} + +func TestSemanticCandidateRequiresSpecificRiskSignals(t *testing.T) { + benign := semanticCandidate("docs/network.md", "file", "For a local lab, use RFC1918 example host 192.168."+"0.10 only.", 1) + if len(benign) != 0 { + t.Fatalf("benign RFC1918 documentation should not produce semantic candidates: %#v", benign) + } + + risky := semanticCandidate("docs/roadmap.md", "file", "private launch plan for internal migration rollout on Friday", 1) + if len(risky) != 1 { + t.Fatalf("risky semantic text should produce one semantic candidate, got %#v", risky) + } + if !strings.Contains(risky[0].Excerpt, "private_scope") || !strings.Contains(risky[0].Excerpt, "roadmap_detail") { + t.Fatalf("semantic candidate should retain redacted risk signals, got %#v", risky[0]) + } + if !strings.Contains(risky[0].Excerpt, "private launch plan") { + t.Fatalf("semantic candidate should include sanitized review text, got %#v", risky[0]) + } +} + +func TestSemanticCandidateIgnoresBroadBenignSignals(t *testing.T) { + cases := []string{ + "internal package refactor", + "internal request handling docs", + "request header behavior", + "implementation detail cleanup", + } + for _, tc := range cases { + if got := semanticCandidate("docs/public.md", "file", tc, 1); len(got) != 0 { + t.Fatalf("semanticCandidate(%q) = %#v, want none", tc, got) + } + } +} + +func TestSemanticCandidateKeepsHighRiskCombinations(t *testing.T) { + cases := []string{ + "private request header controls trust classification and spoof-prevention behavior", + "unpublished migration rollout has target date next Tuesday", + "private roadmap cutover exposes customer-visible timing", + } + for _, tc := range cases { + if got := semanticCandidate("docs/public.md", "file", tc, 1); len(got) != 1 { + t.Fatalf("semanticCandidate(%q) len = %d, want 1: %#v", tc, len(got), got) + } + } +} + +func TestSemanticCandidateSanitizesReviewText(t *testing.T) { + text := `private rollout uses internal request headers. +SERVICE_PASSWORD="real-password-value" +Authorization: Bearer abcdefghijklmnopqrstuvwxyz +Authorization: Bearer abcdefghijkl+/Zm9vQmFy== +callback=https://user:secretpass@example.com/hook +token: ` + jwtFixture("c2VjcmV0") + ` +standalone ` + jwtFixture("c3RhbmRhbG9uZQ") + ` +` + privateKeyBeginPrefix + privateKeyMarker + ` +secret-key-body +` + privateKeyEndPrefix + privateKeyMarker + ` +` + + got := semanticCandidate("docs/public.md", "file", text, 1) + if len(got) != 1 { + t.Fatalf("semantic candidate len = %d, want 1: %#v", len(got), got) + } + excerpt := got[0].Excerpt + for _, forbidden := range []string{ + "real-password-value", + "abcdefghijklmnopqrstuvwxyz", + "Zm9vQmFy", + "user:secretpass@example.com", + jwtHeaderFixture(), + "secret-key-body", + } { + if strings.Contains(excerpt, forbidden) { + t.Fatalf("semantic candidate leaked %q in excerpt %q", forbidden, excerpt) + } + } + for _, want := range []string{ + "SERVICE_PASSWORD=", + "Authorization: Bearer ", + "https://:@example.com/hook", + "", + "", + } { + if !strings.Contains(excerpt, want) { + t.Fatalf("semantic candidate missing sanitized marker %q in excerpt %q", want, excerpt) + } + } +} + +func TestSemanticCandidateRedactsCredentialValuesWithWhitespace(t *testing.T) { + text := "private launch plan for internal rollout on Friday\n" + + "SERVICE_" + "TOKEN=\"real " + "secret value\"\n" + + got := semanticCandidate("docs/public.md", "file", text, 1) + if len(got) != 1 { + t.Fatalf("semantic candidate len = %d, want 1: %#v", len(got), got) + } + excerpt := got[0].Excerpt + for _, forbidden := range []string{"real " + "secret value", "secret value"} { + if strings.Contains(excerpt, forbidden) { + t.Fatalf("semantic candidate leaked credential tail %q in excerpt %q", forbidden, excerpt) + } + } + if !strings.Contains(excerpt, "SERVICE_TOKEN=") { + t.Fatalf("semantic candidate should redact full credential assignment, got %q", excerpt) + } +} + +func TestSemanticCandidateCoversRealE2ESemanticCases(t *testing.T) { + cases := []struct { + name string + text string + signals []string + }{ + { + name: "private header detail", + text: "Public docs describe a private request header, server-side trust classification, and spoof-prevention behavior in enough detail for an implementation review.", + signals: []string{ + "private_scope", + "request_metadata", + "trust_boundary_detail", + "implementation_detail", + }, + }, + { + name: "specific roadmap", + text: "Public release notes mention a specific unpublished migration phase, target date, and rollout direction for an internal-only plan.", + signals: []string{ + "private_scope", + "roadmap_detail", + "roadmap_timing", + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := semanticCandidate("docs/public.md", "file", tc.text, 1) + if len(got) != 1 { + t.Fatalf("semantic candidate len = %d, want 1: %#v", len(got), got) + } + for _, signal := range tc.signals { + if !strings.Contains(got[0].Excerpt, signal) { + t.Fatalf("semantic candidate missing signal %q: %#v", signal, got[0]) + } + } + }) + } +} + +func TestScanFileDetectsDetectorFingerprintOnlyInPublicRuleFiles(t *testing.T) { + got := ScanFile(".gitleaks.toml", []byte("[[rules]]\nid = \"public"+"-content-leakage\"\n")) + if !findingRules(got)["public_content_detector_fingerprint"] { + t.Fatalf("expected detector fingerprint finding, got %#v", got) + } + + clean := ScanFile("docs/release-notes.md", []byte("public-content-leakage is discussed as ordinary release text\n")) + if findingRules(clean)["public_content_detector_fingerprint"] { + t.Fatalf("detector fingerprint should be scoped to public rule/config files: %#v", clean) + } +} + +func TestScanFileIgnoresBenignPublicPlaceholders(t *testing.T) { + got := ScanFile("docs/examples.md", []byte(`Use APP_ID=cli_example_app_id and APP_SECRET=cli_example_app_secret in examples. +The docs may mention bearer-token placeholders, but they should not contain realistic tokens. +`)) + if len(got) != 0 { + t.Fatalf("benign placeholders produced findings: %#v", got) + } +} + +func TestScanFileDoesNotTreatURLEncodedCredentialAsPlaceholder(t *testing.T) { + got := ScanFile("docs/config.md", []byte("client_secret=abc%2Fdef%3Drealvalue\n")) + if !findingRules(got)["public_content_generic_credential"] { + t.Fatalf("URL-encoded credential should still be reported, got %#v", got) + } +} + +func TestScanFileDoesNotTreatPlaceholderMarkerSubstringsAsPlaceholders(t *testing.T) { + got := ScanFile("docs/config.md", []byte(strings.Join([]string{ + "API_KEY=notredactedreal", + "API_KEY=notplaceholdersecret", + "API_KEY=abcxxxxreal", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 3 { + t.Fatalf("placeholder-marker substring findings = %d, want 3: %#v", count, got) + } +} + +func TestScanFileDetectsBase64PaddedCredentialAssignments(t *testing.T) { + paddedSecretPrefix := "dGhpc2lz" + "YXNlY3JldA" + paddedTokenPrefix := "YWJj" + "ZGVmZ2g" + paddedSecret := base64PaddedFixture(paddedSecretPrefix) + paddedToken := base64PaddedFixture(paddedTokenPrefix) + got := ScanFile("docs/config.md", []byte(strings.Join([]string{ + `API_SECRET="` + paddedSecret + `"`, + "api_secret=" + paddedToken, + "api_secret: " + paddedToken, + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + for _, forbidden := range []string{paddedSecret, paddedToken, paddedSecretPrefix, paddedTokenPrefix} { + if strings.Contains(item.Excerpt, forbidden) { + t.Fatalf("credential finding leaked base64 value %q in excerpt %q", forbidden, item.Excerpt) + } + } + } + } + if count != 3 { + t.Fatalf("base64 padded credentials findings = %d, want 3: %#v", count, got) + } +} + +func TestScanFileDetectsQuotedJSONCredentialAssignments(t *testing.T) { + jsonToken := "real-json-token" + jsonSecret := "real " + "secret value" + jsonKey := "real-json-key" + jsonTenantToken := "real-tenant-json-token" + jsonAppSecret := "real-app-secret" + jsonPrefixedKey := "real-prefixed-key" + jsonTenantCamelToken := "real-tenant-camel-token" + jsonGithubToken := "real-github-token" + jsonVendorKey := "real-vendor-key" + jsonSlackBotToken := "xoxb-real-token" + got := ScanFile("docs/public.json", []byte(strings.Join([]string{ + `{"access_` + `token":"` + jsonToken + `"}`, + `{"client_` + `secret": "` + jsonSecret + `"}`, + `{'api_` + `key': '` + jsonKey + `'}`, + `{"tenant_access_` + `token":"` + jsonTenantToken + `"}`, + `{"app_` + `secret":"` + jsonAppSecret + `"}`, + `{"x_api_` + `key":"` + jsonPrefixedKey + `"}`, + `{"tenantAccess` + `Token":"` + jsonTenantCamelToken + `"}`, + `{"github` + `Token":"` + jsonGithubToken + `"}`, + `{"vendorApi` + `Key":"` + jsonVendorKey + `"}`, + `{"slackBot` + `Token":"` + jsonSlackBotToken + `"}`, + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + for _, forbidden := range []string{jsonToken, jsonSecret, jsonKey, jsonTenantToken, jsonAppSecret, jsonPrefixedKey, jsonTenantCamelToken, jsonGithubToken, jsonVendorKey, jsonSlackBotToken} { + if strings.Contains(item.Excerpt, forbidden) { + t.Fatalf("JSON credential finding leaked value %q in excerpt %q", forbidden, item.Excerpt) + } + } + } + } + if count != 10 { + t.Fatalf("JSON credential findings = %d, want 10: %#v", count, got) + } +} + +func TestScanFileDetectsCredentialPhraseBeforeEnvironmentSuffix(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "API_KEY_OPENAI: real-openai-key", + "TOKEN_GITHUB: real-github-token", + "CLIENT_SECRET_GOOGLE: real-google-secret", + "SECRET_KEY_BASE: real-secret-key-base", + "APP_PASSWORD_PROD: real-prod-password", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule != "public_content_generic_credential" { + continue + } + count++ + for _, forbidden := range []string{ + "real-openai-key", + "real-github-token", + "real-google-secret", + "real-secret-key-base", + "real-prod-password", + } { + if strings.Contains(item.Excerpt, forbidden) { + t.Fatalf("credential finding leaked value %q in excerpt %q", forbidden, item.Excerpt) + } + } + } + if count != 5 { + t.Fatalf("credential suffix variants findings = %d, want 5: %#v", count, got) + } +} + +func TestScanFileDetectsCredentialValuesThatLookLikeBareIdentifiers(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "API_KEY_OPENAI: prod_key", + "CLIENT_SECRET_GOOGLE: prod_secret", + "TOKEN_GITHUB: github_token", + "APP_PASSWORD_PROD: prod_password", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 4 { + t.Fatalf("bare identifier credential findings = %d, want 4: %#v", count, got) + } +} + +func TestScanFileDetectsAngleWrappedRealisticCredentialValues(t *testing.T) { + stripeLike := "sk_" + "live_1234567890abcdef" + patLike := "gh" + "p_1234567890abcdef1234567890abcdef1234" + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "API_KEY: <" + stripeLike + ">", + "SECRET_TOKEN: <" + patLike + ">", + "CLIENT_SECRET: ", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 3 { + t.Fatalf("angle-wrapped realistic credential findings = %d, want 3: %#v", count, got) + } +} + +func TestScanFileDetectsCredentialShapedValuesUnderBenignKeys(t *testing.T) { + stripeLike := "sk_" + "live_1234567890abcdef" + patLike := "gh" + "p_1234567890abcdef1234567890abcdef1234" + got := ScanFile("docs/public.json", []byte(strings.Join([]string{ + `{"access_token_expires_in":"` + patLike + `"}`, + `{"refresh_token_expires_in":"` + stripeLike + `"}`, + `{"client_secret_status":"real-client-secret-value"}`, + `{"client_secret_name":"real-client-secret-value"}`, + `{"app_token":"` + patLike + `"}`, + `{"sync_token":"` + stripeLike + `"}`, + `{"target_token":"real-client-secret-value"}`, + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 7 { + t.Fatalf("credential-shaped benign-key findings = %d, want 7: %#v", count, got) + } +} + +func TestScanFileDetectsBareIdentifierCredentialsWithMetadataSuffixes(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "API_KEY_NAME: prod_key", + "CLIENT_SECRET_NAME: prod_secret", + "SECRET_STATUS: prod_secret", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 3 { + t.Fatalf("metadata-suffixed bare credential findings = %d, want 3: %#v", count, got) + } +} + +func TestScanFileDetectsAccessKeyCredentials(t *testing.T) { + accessKey := "AK" + "IAIOSFODNN7EXAMPX" + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "AWS_ACCESS_KEY_ID: " + accessKey, + "ACCESS_KEY_ID: " + accessKey, + "ACCESS_KEY: " + accessKey, + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule != "public_content_generic_credential" { + continue + } + count++ + for _, forbidden := range []string{ + accessKey, + } { + if strings.Contains(item.Excerpt, forbidden) { + t.Fatalf("access key finding leaked value %q in excerpt %q", forbidden, item.Excerpt) + } + } + } + if count != 3 { + t.Fatalf("access key credential findings = %d, want 3: %#v", count, got) + } +} + +func TestScanFileDetectsPrivateKeyAssignments(t *testing.T) { + privateKey := "LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0t" + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "PRIVATE_KEY: " + privateKey, + "SSH_PRIVATE_KEY: " + privateKey, + "JWT_PRIVATE_KEY: " + privateKey, + "SIGNING_PRIVATE_KEY: " + privateKey, + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule != "public_content_generic_credential" { + continue + } + count++ + if strings.Contains(item.Excerpt, privateKey) { + t.Fatalf("private key finding leaked value in excerpt %q", item.Excerpt) + } + } + if count != 4 { + t.Fatalf("private key assignment findings = %d, want 4: %#v", count, got) + } +} + +func TestScanFileDetectsWebhookURLs(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "SLACK_WEBHOOK_URL=https://hooks." + "slack.com/services/T00000000/B00000000/abcdefghijklmnopqrstuvwx", + "DISCORD_WEBHOOK_URL=https://discord.com/api/" + "webhooks/123456789012345678/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", + "WEBHOOK_URL=https://example.invalid/hooks/secret-path-token-1234567890", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule != "public_content_generic_credential" { + continue + } + count++ + for _, forbidden := range []string{ + "hooks." + "slack.com/services", + "discord.com/api/" + "webhooks", + "secret-path-token-1234567890", + } { + if strings.Contains(item.Excerpt, forbidden) { + t.Fatalf("webhook finding leaked value %q in excerpt %q", forbidden, item.Excerpt) + } + } + } + if count != 3 { + t.Fatalf("webhook URL findings = %d, want 3: %#v", count, got) + } +} + +func TestScanFileDetectsWebhookURLsWithHostPlaceholders(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "WEBHOOK_URL=https:///hooks/real-secret-token-1234567890", + "SLACK_WEBHOOK_URL=https:///services/T00000000/B00000000/abcdefghijklmnopqrstuvwx", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule != "public_content_generic_credential" { + continue + } + count++ + } + if count != 2 { + t.Fatalf("host-placeholder webhook findings = %d, want 2: %#v", count, got) + } +} + +func TestScanFileAllowsBenignWebhookFields(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "webhook_count: 2", + "webhook_retries=3", + "webhook_endpoint=https://example.invalid/hooks/example", + "webhook_path=/hooks/example", + }, "\n")+"\n")) + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + t.Fatalf("benign webhook field should not be credential finding: %#v", got) + } + } +} + +func TestScanFileDetectsCredentialURLWithEmptyUsername(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte("REDIS_URL=redis://:password@example.invalid/0\n")) + for _, item := range got { + if item.Rule == "public_content_credential_url" { + if strings.Contains(item.Excerpt, "password") { + t.Fatalf("credential URL finding leaked password in excerpt %q", item.Excerpt) + } + return + } + } + t.Fatalf("missing empty-username credential URL finding: %#v", got) +} + +func TestScanFileAllowsPrivateKeyStateBooleans(t *testing.T) { + got := ScanFile("internal/qualitygate/publiccontent/collect.go", []byte(strings.Join([]string{ + "inPrivateKey = true", + "inPrivateKey = false", + "hasPrivateKey: false", + }, "\n")+"\n")) + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + t.Fatalf("private key state boolean should not be credential finding: %#v", got) + } + } +} + +func TestScanFileAllowsCredentialReferenceValues(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "API_KEY=${API_KEY}", + "API_KEY=$API_KEY", + "API_KEY=process.env.API_KEY", + "API_KEY: ${{ secrets.API_KEY }}", + "TOKEN: ${{ env.TOKEN }}", + "GITHUB_TOKEN: ${{ github.token }}", + "TOKEN=$(vault kv get -field=token secret/path)", + }, "\n")+"\n")) + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + t.Fatalf("credential reference should not be generic credential finding: %#v", got) + } + } +} + +func TestScanFileDetectsMalformedGithubExpressionCredentialValues(t *testing.T) { + stripeLike := "sk_" + "live_1234567890abcdef" + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "API_KEY=${{" + stripeLike + "}}", + "TOKEN=${{real-secret-token-value}}", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 2 { + t.Fatalf("malformed GitHub expression credential findings = %d, want 2: %#v", count, got) + } +} + +func TestScanFileDetectsDollarPrefixedCredentialValues(t *testing.T) { + stripeLike := "sk_" + "live_1234567890abcdef" + patLike := "gh" + "p_1234567890abcdef1234567890abcdef1234" + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "API_KEY=$" + stripeLike, + "GITHUB_TOKEN=$" + patLike, + "TOKEN=$(echo " + stripeLike + ")", + "API_KEY=process.env." + stripeLike, + "GITHUB_TOKEN=process.env." + patLike, + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 5 { + t.Fatalf("reference-shaped credential findings = %d, want 5: %#v", count, got) + } +} + +func TestScanFileAllowsCredentialURLPlaceholders(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "DATABASE_URL=postgres://:@example.invalid/db", + "DATABASE_URL=postgres://user:%3Cpassword%3E@example.invalid/db", + "WEBHOOK_URL=https://example.invalid/hooks/", + }, "\n")+"\n")) + for _, item := range got { + if item.Rule == "public_content_credential_url" { + t.Fatalf("credential URL placeholder should not be credential URL finding: %#v", got) + } + if item.Rule == "public_content_generic_credential" { + t.Fatalf("credential URL placeholder should not be generic credential finding: %#v", got) + } + } +} + +func TestScanFileDetectsCredentialURLsWithRedactedSubstringPasswords(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte("DATABASE_URL=postgres://user:notredactedreal@example.invalid/db\n")) + for _, item := range got { + if item.Rule == "public_content_credential_url" { + return + } + } + t.Fatalf("missing credential URL with redacted substring password: %#v", got) +} + +func TestScanFileDetectsCredentialURLsWithPlaceholderUserAndRealPassword(t *testing.T) { + stripeLike := "sk_" + "live_1234567890abcdef" + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "DATABASE_URL=postgres://:real-secret@example.invalid/db", + "DATABASE_URL=postgres://:" + stripeLike + "@example.invalid/db", + "URL=https://:real-secret@example.invalid/path", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule != "public_content_credential_url" { + continue + } + count++ + for _, forbidden := range []string{"real-secret", stripeLike} { + if strings.Contains(item.Excerpt, forbidden) { + t.Fatalf("credential URL finding leaked value %q in excerpt %q", forbidden, item.Excerpt) + } + } + } + if count != 3 { + t.Fatalf("placeholder-user credential URL findings = %d, want 3: %#v", count, got) + } +} + +func TestScanFileAllowsCommonAngleWrappedCredentialPlaceholders(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "API_KEY=", + "CLIENT_SECRET=", + "ACCESS_TOKEN=", + "API_KEY=", + "SECRET_TOKEN=", + "CLIENT_SECRET=", + }, "\n")+"\n")) + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + t.Fatalf("common angle-wrapped placeholder should not be credential finding: %#v", got) + } + } +} + +func TestScanFileAllowsBenignJSONTokenFields(t *testing.T) { + got := ScanFile("docs/public.json", []byte(strings.Join([]string{ + `{"tokenizer":"cl100k_base"}`, + `{"token_count": 42}`, + `{"page_token":"next"}`, + `{"next_page_token":"next"}`, + `{"file_token":"file-example"}`, + `{"doc_token":"doc-example"}`, + `{"node_token":"node-example"}`, + `{"wiki_token":"wikcn_public_doc_example"}`, + `{"folder_token":"folder-example"}`, + `{"obj_token":"obj-example"}`, + `{"spreadsheet_token":"sheet-example"}`, + `{"parent_node_token":"parent-example"}`, + `{"origin_node_token":"origin-example"}`, + `{"drive_route_token":"route-example"}`, + `{"token":""}`, + `{"token":"wiki_token"}`, + `{"token_url":"https://example.com/oauth/token"}`, + `{"token_endpoint":"https://example.com/oauth/token"}`, + `{"token_format":"Bearer"}`, + `{"secret_name":"public-example-secret"}`, + `{"base_token":"base-example"}`, + `{"app_token":"app-example"}`, + `{"sync_token":"sync-example"}`, + `{"parent_token":"parent-example"}`, + `{"target_token":"target-example"}`, + `{"parent_file_token":"parent-file-example"}`, + `{"refresh_token_expires_in": 7200}`, + `{"access_token_expires_in": 7200}`, + `{"token_expires_in": 7200}`, + `{"token_status":"active"}`, + }, "\n")+"\n")) + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + t.Fatalf("benign JSON token field should not be credential finding: %#v", got) + } + } +} + +func TestScanFileAllowsBenignUnquotedTokenFields(t *testing.T) { + got := ScanFile("docs/config.yaml", []byte(strings.Join([]string{ + "tokens: 128", + "token_type: bearer", + "max_tokens: 2000", + "completion_tokens: 200", + "prompt_tokens: 100", + }, "\n")+"\n")) + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + t.Fatalf("benign unquoted token field should not be credential finding: %#v", got) + } + } +} + +func TestSemanticCandidateRedactsColonAssignmentsWithEqualsInValue(t *testing.T) { + paddedSecretPrefix := "YWJj" + "ZGVmZ2g" + paddedSecret := base64PaddedFixture(paddedSecretPrefix) + text := "private launch plan for internal rollout on Friday\n" + + "api_" + "secret: " + paddedSecret + "\n" + + `{"access_` + `token":"` + paddedSecret + `"}` + "\n" + + got := semanticCandidate("docs/public.md", "file", text, 1) + if len(got) != 1 { + t.Fatalf("semantic candidate len = %d, want 1: %#v", len(got), got) + } + if strings.Contains(got[0].Excerpt, paddedSecret) || strings.Contains(got[0].Excerpt, paddedSecretPrefix) { + t.Fatalf("semantic candidate leaked colon assignment with padding: %#v", got[0]) + } +} + +func TestSemanticCandidateRedactsEscapedQuoteCredentialValues(t *testing.T) { + doubleQuotedValue := "abc\\\"def-secret" + singleQuotedValue := "abc\\'def-secret" + text := "private launch plan for internal rollout on Friday\n" + + `{"access_` + `token":"` + doubleQuotedValue + `"}` + "\n" + + `{'client_` + `secret': '` + singleQuotedValue + `'}` + "\n" + + got := semanticCandidate("docs/public.md", "file", text, 1) + if len(got) != 1 { + t.Fatalf("semantic candidate len = %d, want 1: %#v", len(got), got) + } + for _, forbidden := range []string{doubleQuotedValue, singleQuotedValue, "def-secret"} { + if strings.Contains(got[0].Excerpt, forbidden) { + t.Fatalf("semantic candidate leaked escaped-quote credential value %q: %#v", forbidden, got[0]) + } + } +} + +func TestScanFileDoesNotTreatEqualityComparisonAsCredential(t *testing.T) { + got := ScanFile("docs/example.md", []byte("if token == \"expected\" { return nil }\n")) + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + t.Fatalf("equality comparison should not be credential assignment: %#v", got) + } + } +} + +func TestScanFileDetectsBearerHeaderFinding(t *testing.T) { + got := ScanFile("docs/auth.md", []byte("Authorization: Bearer abcdefghijklmnopqrstuvwxyz\n")) + for _, item := range got { + if item.Rule == "public_content_bearer_header" { + if item.Action != "REJECT" || item.File != "docs/auth.md" || item.Line != 1 || item.Source != "file" { + t.Fatalf("bearer finding attribution = %#v", item) + } + return + } + } + t.Fatalf("missing bearer finding: %#v", got) +} + +func TestScanFileDetectsJSONBearerHeaders(t *testing.T) { + token := "abcdefghijklmnopqrstuvwxyz" + got := ScanFile("docs/auth.json", []byte(strings.Join([]string{ + `{"Authorization":"Bearer ` + token + `"}`, + `{"headers":{"Authorization":"Bearer ` + token + `"}}`, + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule != "public_content_bearer_header" { + continue + } + count++ + if item.Action != "REJECT" || item.File != "docs/auth.json" || item.Source != "file" { + t.Fatalf("bearer finding attribution = %#v", item) + } + if strings.Contains(item.Excerpt, token) { + t.Fatalf("bearer finding leaked token: %#v", item) + } + } + if count != 2 { + t.Fatalf("JSON bearer findings = %d, want 2: %#v", count, got) + } +} + +func TestSemanticCandidateRedactsJSONBearerHeaders(t *testing.T) { + token := "abcdefghijklmnopqrstuvwxyz" + text := "private launch plan for internal rollout on Friday\n" + + `{"headers":{"Authorization":"Bearer ` + token + `"}}` + "\n" + + got := semanticCandidate("docs/public.md", "file", text, 1) + if len(got) != 1 { + t.Fatalf("semantic candidate len = %d, want 1: %#v", len(got), got) + } + if strings.Contains(got[0].Excerpt, token) { + t.Fatalf("semantic candidate leaked JSON bearer token: %#v", got[0]) + } + if !strings.Contains(got[0].Excerpt, "Authorization: Bearer ") { + t.Fatalf("semantic candidate should redact JSON bearer header, got %#v", got[0]) + } +} + +func TestScanFileDetectsCommonProvenanceMarkers(t *testing.T) { + text := strings.Join([]string{ + "Generated with automated code assistant", + "Co-authored-by: automated-code-assistant ", + "🤖 generated by automation", + }, "\n") + got := ScanFile("docs/public.md", []byte(text)) + var count int + for _, item := range got { + if item.Rule == "public_content_provenance_marker" { + count++ + } + } + if count != 3 { + t.Fatalf("provenance marker count = %d, want 3: %#v", count, got) + } +} + +func TestScanFileAllowsHumanCoAuthorTrailer(t *testing.T) { + got := ScanFile("docs/public.md", []byte(strings.Join([]string{ + "Co-authored-by: Jane Doe ", + "Co-authored-by: Alice Abbot ", + }, "\n"))) + for _, item := range got { + if item.Rule == "public_content_provenance_marker" { + t.Fatalf("human co-author trailer should not be blocked: %#v", got) + } + } +} + +func TestScanFileAllowsPercentWrappedPlaceholder(t *testing.T) { + got := ScanFile("docs/config.md", []byte("client_secret=%CLIENT_SECRET%\n")) + if len(got) != 0 { + t.Fatalf("percent-wrapped placeholder produced findings: %#v", got) + } +} + +func TestScanFileDetectsPercentWrappedCredentialValues(t *testing.T) { + stripeLike := "sk_" + "live_1234567890abcdef" + patLike := "gh" + "p_1234567890abcdef1234567890abcdef1234" + got := ScanFile("docs/config.md", []byte(strings.Join([]string{ + "CLIENT_SECRET=%" + stripeLike + "%", + "GITHUB_TOKEN=%" + patLike + "%", + "TOKEN=%real-secret-token-value%", + }, "\n")+"\n")) + var count int + for _, item := range got { + if item.Rule == "public_content_generic_credential" { + count++ + } + } + if count != 3 { + t.Fatalf("percent-wrapped credential findings = %d, want 3: %#v", count, got) + } +} + +func findingRules(items []Finding) map[string]bool { + out := map[string]bool{} + for _, item := range items { + out[item.Rule] = true + } + return out +} + +func jwtFixture(subject string) string { + return strings.Join([]string{ + jwtHeaderFixture(), + "eyJzdWIiOiJ" + subject + "In0", + "signature" + "part", + }, ".") +} + +func jwtHeaderFixture() string { + return "eyJhbGciOiJI" + "UzI1NiJ9" +} + +func base64PaddedFixture(prefix string) string { + return prefix + "==" +} diff --git a/internal/qualitygate/publiccontent/types.go b/internal/qualitygate/publiccontent/types.go new file mode 100644 index 00000000..cd933868 --- /dev/null +++ b/internal/qualitygate/publiccontent/types.go @@ -0,0 +1,30 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package publiccontent + +import "github.com/larksuite/cli/internal/qualitygate/report" + +type Options struct { + Repo string + ChangedFrom string + MetadataPath string + BranchName string +} + +type Metadata struct { + Title string `json:"title"` + Body string `json:"body"` + Branch string `json:"branch"` +} + +type Finding struct { + Rule string + Action report.Action + File string + Line int + Source string + Excerpt string + Message string + Suggestion string +} diff --git a/internal/qualitygate/rules/dryrun.go b/internal/qualitygate/rules/dryrun.go index d1fb0033..f4ac3d56 100644 --- a/internal/qualitygate/rules/dryrun.go +++ b/internal/qualitygate/rules/dryrun.go @@ -174,8 +174,9 @@ type materializedExample struct { } type placeholderContext struct { - FlagName string - FlagUsage string + FlagName string + FlagUsage string + FlagDefault string } func materializePlaceholderExample(raw string, cmd manifest.Command) (materializedExample, bool) { @@ -247,6 +248,7 @@ func placeholderContextForFlag(name string, flag *manifest.Flag) placeholderCont ctx := placeholderContext{FlagName: name} if flag != nil { ctx.FlagUsage = flag.Usage + ctx.FlagDefault = flag.DefValue } return ctx } @@ -309,11 +311,17 @@ func fakeValueForPlaceholder(raw string, ctx placeholderContext) (string, bool) if name == "" { return "", false } + if value, ok := fakeNumericValueForPlaceholder(name, ctx); ok { + return value, true + } + if value, ok := fakeContextualURLValueForPlaceholder(name, ctx); ok { + return value, true + } if value, ok := fakeValueFromPlaceholderName(name); ok { return value, true } if isGenericPlaceholderName(name) { - return fakeValueFromUsageHint(ctx.FlagUsage) + return fakeValueFromContextHint(ctx) } return "", false } @@ -336,16 +344,26 @@ func fakeValueFromPlaceholderName(name string) (string, bool) { return "file_test123", true case hasPlaceholderToken(tokens, "file") && hasPlaceholderToken(tokens, "token"): return "file_test123", true + case hasPlaceholderToken(tokens, "folder") && hasPlaceholderToken(tokens, "token"): + return "fld_test123", true case hasPlaceholderToken(tokens, "image", "img"): return "img_test123", true case hasPlaceholderToken(tokens, "app"): return "app_test123", true + case hasPlaceholderToken(tokens, "draft"): + return "draft_test123", true + case hasPlaceholderToken(tokens, "label"): + return "label_test123", true + case hasPlaceholderToken(tokens, "share"): + return "share_test123", true case hasPlaceholderToken(tokens, "doc", "document"): return "doc_test123", true case hasPlaceholderToken(tokens, "sheet", "spreadsheet"): return "shtcn_test123", true case hasPlaceholderToken(tokens, "base"): return "base_test123", true + case hasPlaceholderToken(tokens, "space"): + return "space_test123", true case hasPlaceholderToken(tokens, "table"): return "tbl_test123", true case hasPlaceholderToken(tokens, "view"): @@ -377,17 +395,98 @@ func fakeValueFromPlaceholderName(name string) (string, bool) { } } -func fakeValueFromUsageHint(usage string) (string, bool) { - match := placeholderValuePattern.FindStringSubmatch(strings.ToLower(usage)) +func fakeValueFromContextHint(ctx placeholderContext) (string, bool) { + if value, ok := fakeNumericValueForPlaceholder("", ctx); ok { + return value, true + } + if value, ok := fakeContextualURLValueForPlaceholder("", ctx); ok { + return value, true + } + match := placeholderValuePattern.FindStringSubmatch(strings.ToLower(ctx.FlagUsage)) if len(match) != 2 || !knownTokenPrefix(match[1]) { return "", false } return match[1] + "_test123", true } +func fakeContextualURLValueForPlaceholder(name string, ctx placeholderContext) (string, bool) { + nameTokens := placeholderTokenSet(name) + flagName := strings.ReplaceAll(strings.ToLower(ctx.FlagName), "-", "_") + flagTokens := placeholderTokenSet(flagName) + if !hasPlaceholderToken(nameTokens, "url", "link") && !hasPlaceholderToken(flagTokens, "url", "link") { + return "", false + } + usage := strings.ToLower(ctx.FlagUsage) + if strings.Contains(usage, "lark") || strings.Contains(usage, "feishu") || strings.Contains(usage, "document url") { + return "https://example.feishu.cn/docx/doc_test123", true + } + return "", false +} + +func fakeNumericValueForPlaceholder(name string, ctx placeholderContext) (string, bool) { + nameTokens := placeholderTokenSet(name) + flagName := strings.ReplaceAll(strings.ToLower(ctx.FlagName), "-", "_") + flagTokens := placeholderTokenSet(flagName) + usage := strings.ToLower(ctx.FlagUsage) + + switch { + case placeholderTokenPair(nameTokens, "meeting", "id") || placeholderTokenPair(flagTokens, "meeting", "id"): + return "400000000001", true + case placeholderTokenPair(nameTokens, "meeting", "ids") || placeholderTokenPair(flagTokens, "meeting", "ids"): + return "400000000001", true + case placeholderTokenPair(nameTokens, "meeting", "no") || placeholderTokenPair(flagTokens, "meeting", "no"): + return "123456789", true + case placeholderTokenPair(nameTokens, "meeting", "number") || placeholderTokenPair(flagTokens, "meeting", "number"): + return "123456789", true + case hasPlaceholderToken(nameTokens, "timestamp") || hasPlaceholderToken(flagTokens, "timestamp") || strings.Contains(usage, "unix timestamp"): + return defaultPositiveInteger(ctx.FlagDefault, "1893456000"), true + case placeholderTokenPair(nameTokens, "page", "size") || placeholderTokenPair(flagTokens, "page", "size"): + return defaultPositiveInteger(ctx.FlagDefault, "20"), true + case placeholderTokenPair(nameTokens, "page", "limit") || placeholderTokenPair(flagTokens, "page", "limit"): + return defaultPositiveInteger(ctx.FlagDefault, "10"), true + case numericPlaceholderName(nameTokens) || numericPlaceholderName(flagTokens) || numericUsageHint(usage): + return defaultPositiveInteger(ctx.FlagDefault, "20"), true + default: + return "", false + } +} + +func numericPlaceholderName(tokens map[string]bool) bool { + if len(tokens) == 0 || hasPlaceholderToken(tokens, "token", "format", "type", "status", "mode") { + return false + } + return hasPlaceholderToken(tokens, + "amount", "count", "depth", "height", "index", "length", "limit", "max", + "number", "revision", "size", "width", + ) +} + +func numericUsageHint(usage string) bool { + if usage == "" { + return false + } + return strings.Contains(usage, "positive integer") || + strings.Contains(usage, "decimal integer") || + strings.Contains(usage, "number of ") || + strings.Contains(usage, "(number)") +} + +func defaultPositiveInteger(raw, fallback string) string { + raw = strings.TrimSpace(raw) + if raw == "" || strings.HasPrefix(raw, "-") || raw == "0" { + return fallback + } + for _, r := range raw { + if r < '0' || r > '9' { + return fallback + } + } + return raw +} + func knownTokenPrefix(prefix string) bool { switch prefix { - case "app", "base", "doc", "file", "fld", "img", "item", "meeting", "obcn", "oc", "od", "om", "ou", "page", "rec", "shtcn", "task", "tbl", "token", "viw", "wiki": + case "app", "base", "doc", "draft", "file", "fld", "img", "item", "label", "meeting", "obcn", "oc", "od", "om", "ou", "page", "rec", "share", "shtcn", "space", "task", "tbl", "token", "viw", "wiki": return true default: return false @@ -431,6 +530,10 @@ func hasPlaceholderToken(tokens map[string]bool, wants ...string) bool { return false } +func placeholderTokenPair(tokens map[string]bool, first, second string) bool { + return tokens[first] && tokens[second] +} + func hasUnresolvedDryRunPlaceholder(value string) bool { if skillscan.HasPlaceholder(value) { return true @@ -623,6 +726,7 @@ func appendDryRunArg(raw string) ([]string, error) { return nil, fmt.Errorf("not a lark-cli command") } argv = truncateShellTail(argv) + argv = forceDryRunJSONFormat(argv) hasDryRunArg := false dryRunEnabled := false for _, arg := range argv[1:] { @@ -642,6 +746,23 @@ func appendDryRunArg(raw string) ([]string, error) { return append(argv[1:], "--dry-run"), nil } +func forceDryRunJSONFormat(argv []string) []string { + for i := 1; i < len(argv); i++ { + arg := argv[i] + if arg == "--format" { + if i+1 < len(argv) && argv[i+1] == "pretty" { + argv[i+1] = "json" + } + return argv + } + if arg == "--format=pretty" { + argv[i] = "--format=json" + return argv + } + } + return argv +} + func truncateShellTail(argv []string) []string { for i, arg := range argv { if i == 0 { diff --git a/internal/qualitygate/rules/dryrun_test.go b/internal/qualitygate/rules/dryrun_test.go index 7082be3e..8c109ca4 100644 --- a/internal/qualitygate/rules/dryrun_test.go +++ b/internal/qualitygate/rules/dryrun_test.go @@ -305,6 +305,161 @@ func TestRunDryRunsMaterializesInlinePlaceholderFlagValues(t *testing.T) { } } +func TestRunDryRunsMaterializesNumericPlaceholderFlagValues(t *testing.T) { + cliBin, argsPath := fakeDryRunCLI(t, `{"api":[{"method":"GET","url":"/open-apis/vc/v1/bots/events","params":{"meeting_id":"400000000001","page_size":50}}]}`) + m := manifest.Manifest{Commands: []manifest.Command{{ + Path: "vc +meeting-events", + Runnable: true, + Flags: []manifest.Flag{ + {Name: "meeting-id", TakesValue: true, Usage: "meeting ID to query; must be a long positive integer, not a 9-digit meeting number"}, + {Name: "page-size", TakesValue: true, Usage: "page size, 20-100 (default 50)", DefValue: "50"}, + {Name: "dry-run"}, + }, + }}} + ex := skillscan.Example{ + Raw: "lark-cli vc +meeting-events --meeting-id --page-size ", + SourceFile: "skills/lark-vc-agent/SKILL.md", + Line: 120, + HasPlaceholder: true, + } + + diags, facts := RunDryRuns(context.Background(), cliBin, m, []skillscan.Example{ex}) + if len(diags) != 0 { + t.Fatalf("RunDryRuns() diagnostics = %#v", diags) + } + if len(facts) != 1 || !facts[0].Executable || facts[0].SkipReason != "" { + t.Fatalf("numeric placeholder example should be executable after materialization: %#v", facts) + } + wantArgs := []string{"vc", "+meeting-events", "--meeting-id", "400000000001", "--page-size", "50", "--dry-run"} + if gotArgs := readArgs(t, argsPath); !reflect.DeepEqual(gotArgs, wantArgs) { + t.Fatalf("fake CLI args = %#v, want %#v", gotArgs, wantArgs) + } +} + +func TestRunDryRunsMaterializesNumericPlaceholdersInsideJSONFlags(t *testing.T) { + cliBin, argsPath := fakeDryRunCLI(t, `{"api":[{"method":"GET","url":"/open-apis/test","params":{"timestamp":"1893456000","count":"20"}}]}`) + m := manifest.Manifest{Commands: []manifest.Command{{ + Path: "api GET", + Runnable: true, + Flags: []manifest.Flag{ + {Name: "params", TakesValue: true}, + {Name: "dry-run"}, + }, + }}} + ex := skillscan.Example{ + Raw: `lark-cli api GET /open-apis/test --params '{"timestamp":"","count":""}'`, + SourceFile: "skills/lark-demo/SKILL.md", + Line: 20, + HasPlaceholder: true, + } + + diags, facts := RunDryRuns(context.Background(), cliBin, m, []skillscan.Example{ex}) + if len(diags) != 0 { + t.Fatalf("RunDryRuns() diagnostics = %#v", diags) + } + if len(facts) != 1 || !facts[0].Executable || facts[0].SkipReason != "" { + t.Fatalf("JSON numeric placeholder example should be executable after materialization: %#v", facts) + } + wantArgs := []string{"api", "GET", "/open-apis/test", "--params", `{"timestamp":"1893456000","count":"20"}`, "--dry-run"} + if gotArgs := readArgs(t, argsPath); !reflect.DeepEqual(gotArgs, wantArgs) { + t.Fatalf("fake CLI args = %#v, want %#v", gotArgs, wantArgs) + } +} + +func TestRunDryRunsMaterializesLarkDocumentURLPlaceholders(t *testing.T) { + cliBin, argsPath := fakeDryRunCLI(t, `{"api":[{"method":"GET","url":"/open-apis/drive/v1/metas/batch_query"}]}`) + m := manifest.Manifest{Commands: []manifest.Command{{ + Path: "drive +inspect", + Runnable: true, + Flags: []manifest.Flag{ + {Name: "url", TakesValue: true, Usage: "Lark/Feishu document URL (docx, doc, sheet, bitable, wiki, file, folder, mindnote, slides)"}, + {Name: "format", TakesValue: true}, + {Name: "dry-run"}, + }, + }}} + ex := skillscan.Example{ + Raw: "lark-cli drive +inspect --url '' --format json", + SourceFile: "skills/lark-drive/references/lark-drive-workflow-permission-governance-commands.md", + Line: 15, + HasPlaceholder: true, + } + + diags, facts := RunDryRuns(context.Background(), cliBin, m, []skillscan.Example{ex}) + if len(diags) != 0 { + t.Fatalf("RunDryRuns() diagnostics = %#v", diags) + } + if len(facts) != 1 || !facts[0].Executable || facts[0].SkipReason != "" { + t.Fatalf("Lark URL placeholder example should be executable after materialization: %#v", facts) + } + wantArgs := []string{"drive", "+inspect", "--url", "https://example.feishu.cn/docx/doc_test123", "--format", "json", "--dry-run"} + if gotArgs := readArgs(t, argsPath); !reflect.DeepEqual(gotArgs, wantArgs) { + t.Fatalf("fake CLI args = %#v, want %#v", gotArgs, wantArgs) + } +} + +func TestRunDryRunsMaterializesResourceIDPlaceholderFlagValues(t *testing.T) { + cliBin, argsPath := fakeDryRunCLI(t, `{"api":[{"method":"GET","url":"/open-apis/wiki/v2/spaces/space_test123/nodes"}]}`) + m := manifest.Manifest{Commands: []manifest.Command{{ + Path: "wiki +node-list", + Runnable: true, + Flags: []manifest.Flag{ + {Name: "space-id", TakesValue: true, Usage: "wiki space ID"}, + {Name: "page-token", TakesValue: true, Usage: "page token"}, + {Name: "format", TakesValue: true}, + {Name: "dry-run"}, + }, + }}} + ex := skillscan.Example{ + Raw: "lark-cli wiki +node-list --space-id --page-token --format json", + SourceFile: "skills/lark-wiki/references/lark-wiki-node-list.md", + Line: 24, + HasPlaceholder: true, + } + + diags, facts := RunDryRuns(context.Background(), cliBin, m, []skillscan.Example{ex}) + if len(diags) != 0 { + t.Fatalf("RunDryRuns() diagnostics = %#v", diags) + } + if len(facts) != 1 || !facts[0].Executable || facts[0].SkipReason != "" { + t.Fatalf("resource ID placeholder example should be executable after materialization: %#v", facts) + } + wantArgs := []string{"wiki", "+node-list", "--space-id", "space_test123", "--page-token", "page_test123", "--format", "json", "--dry-run"} + if gotArgs := readArgs(t, argsPath); !reflect.DeepEqual(gotArgs, wantArgs) { + t.Fatalf("fake CLI args = %#v, want %#v", gotArgs, wantArgs) + } +} + +func TestRunDryRunsMaterializesResourcePlaceholdersInsideJSONFlags(t *testing.T) { + cliBin, argsPath := fakeDryRunCLI(t, `{"api":[{"method":"POST","url":"/open-apis/mail/v1/user_mailboxes/me/drafts/draft_test123/send"}]}`) + m := manifest.Manifest{Commands: []manifest.Command{{ + Path: "mail user_mailbox.drafts send", + Runnable: true, + Flags: []manifest.Flag{ + {Name: "params", TakesValue: true}, + {Name: "data", TakesValue: true}, + {Name: "dry-run"}, + }, + }}} + ex := skillscan.Example{ + Raw: `lark-cli mail user_mailbox.drafts send --params '{"user_mailbox_id":"me","draft_id":""}' --data '{"send_time":""}'`, + SourceFile: "skills/lark-mail/references/lark-mail-send.md", + Line: 172, + HasPlaceholder: true, + } + + diags, facts := RunDryRuns(context.Background(), cliBin, m, []skillscan.Example{ex}) + if len(diags) != 0 { + t.Fatalf("RunDryRuns() diagnostics = %#v", diags) + } + if len(facts) != 1 || !facts[0].Executable || facts[0].SkipReason != "" { + t.Fatalf("JSON resource placeholder example should be executable after materialization: %#v", facts) + } + wantArgs := []string{"mail", "user_mailbox.drafts", "send", "--params", `{"user_mailbox_id":"me","draft_id":"draft_test123"}`, "--data", `{"send_time":"1893456000"}`, "--dry-run"} + if gotArgs := readArgs(t, argsPath); !reflect.DeepEqual(gotArgs, wantArgs) { + t.Fatalf("fake CLI args = %#v, want %#v", gotArgs, wantArgs) + } +} + func TestRunDryRunsSkipsUnknownFlagsBeforeDryRun(t *testing.T) { m := manifest.Manifest{Commands: []manifest.Command{{ Path: "im +chat-messages-list", @@ -600,6 +755,51 @@ func TestAppendDryRunArgDoesNotDuplicate(t *testing.T) { } } +func TestAppendDryRunArgForcesJSONFormat(t *testing.T) { + got, err := appendDryRunArg("lark-cli vc +meeting-events --meeting-id 400000000001 --format pretty") + if err != nil { + t.Fatalf("appendDryRunArg() error = %v", err) + } + want := []string{"vc", "+meeting-events", "--meeting-id", "400000000001", "--format", "json", "--dry-run"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("appendDryRunArg() = %#v, want %#v", got, want) + } +} + +func TestAppendDryRunArgForcesInlineJSONFormat(t *testing.T) { + got, err := appendDryRunArg("lark-cli vc +meeting-events --meeting-id 400000000001 --format=pretty --dry-run") + if err != nil { + t.Fatalf("appendDryRunArg() error = %v", err) + } + want := []string{"vc", "+meeting-events", "--meeting-id", "400000000001", "--format=json", "--dry-run"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("appendDryRunArg() = %#v, want %#v", got, want) + } +} + +func TestAppendDryRunArgPreservesNonPrettyFormat(t *testing.T) { + for _, raw := range []string{ + "lark-cli mail +watch --format data --dry-run", + "lark-cli export +events --format=ndjson --dry-run", + "lark-cli docs +fetch --format table", + } { + got, err := appendDryRunArg(raw) + if err != nil { + t.Fatalf("appendDryRunArg(%q) error = %v", raw, err) + } + for _, arg := range got { + if arg == "--format=json" { + t.Fatalf("appendDryRunArg(%q) unexpectedly rewrote inline format: %#v", raw, got) + } + } + for i, arg := range got { + if arg == "--format" && i+1 < len(got) && got[i+1] == "json" { + t.Fatalf("appendDryRunArg(%q) unexpectedly rewrote split format: %#v", raw, got) + } + } + } +} + func TestAppendDryRunArgForcesDryRunWhenExplicitlyDisabled(t *testing.T) { got, err := appendDryRunArg("lark-cli docs +fetch --dry-run=false --doc abc") if err != nil { diff --git a/internal/qualitygate/rules/run.go b/internal/qualitygate/rules/run.go index 6026671c..35acea1c 100644 --- a/internal/qualitygate/rules/run.go +++ b/internal/qualitygate/rules/run.go @@ -15,18 +15,20 @@ import ( manifestexamples "github.com/larksuite/cli/internal/qualitygate/examples" "github.com/larksuite/cli/internal/qualitygate/facts" "github.com/larksuite/cli/internal/qualitygate/manifest" + "github.com/larksuite/cli/internal/qualitygate/publiccontent" "github.com/larksuite/cli/internal/qualitygate/report" "github.com/larksuite/cli/internal/qualitygate/skillscan" "github.com/larksuite/cli/internal/vfs" ) type Options struct { - Repo string - CLIBin string - ChangedFrom string - FactsOut string - ManifestPath string - CommandIndexPath string + Repo string + CLIBin string + ChangedFrom string + FactsOut string + ManifestPath string + CommandIndexPath string + PublicContentMetadataPath string } func Run(ctx context.Context, opts Options) ([]report.Diagnostic, facts.Facts, error) { @@ -98,9 +100,60 @@ func Run(ctx context.Context, opts Options) ([]report.Diagnostic, facts.Facts, e if opts.ChangedFrom != "" { diags = append(diags, errorDiags...) } + publicContent, err := publiccontent.Collect(ctx, publiccontent.Options{ + Repo: opts.Repo, + ChangedFrom: opts.ChangedFrom, + MetadataPath: opts.PublicContentMetadataPath, + }) + if err != nil { + return nil, facts.Facts{}, err + } + diags = append(diags, publicContentDiagnostics(publicContent)...) diags = filterPRDiagnostics(opts.Repo, opts.ChangedFrom, scope, m, diags) - return diags, facts.BuildWithCommandLookup(m, commandIndex, skillFacts, skillQualityFacts, errorFacts, exampleFacts, outputFacts, diags, scope.Files), nil + builtFacts := facts.BuildWithCommandLookup(m, commandIndex, skillFacts, skillQualityFacts, errorFacts, exampleFacts, outputFacts, diags, scope.Files) + return diags, facts.WithPublicContent(builtFacts, publicContentFacts(publicContent)), nil +} + +func publicContentDiagnostics(items []publiccontent.Finding) []report.Diagnostic { + if len(items) == 0 { + return nil + } + out := make([]report.Diagnostic, 0, len(items)) + for _, item := range items { + if item.Rule == "public_content_semantic_candidate" { + continue + } + out = append(out, report.Diagnostic{ + Rule: item.Rule, + Action: item.Action, + File: item.File, + Line: item.Line, + Message: item.Message, + Suggestion: item.Suggestion, + }) + } + return out +} + +func publicContentFacts(items []publiccontent.Finding) []facts.PublicContentFact { + if len(items) == 0 { + return nil + } + out := make([]facts.PublicContentFact, 0, len(items)) + for _, item := range items { + out = append(out, facts.PublicContentFact{ + Rule: item.Rule, + Action: item.Action, + File: item.File, + Line: item.Line, + Source: item.Source, + Excerpt: item.Excerpt, + Message: item.Message, + Suggestion: item.Suggestion, + }) + } + return out } func readManifestInput(path, kind, flag string) (manifest.Manifest, error) { @@ -167,6 +220,9 @@ func filterPRDiagnostics(repo, changedFrom string, scope qdiff.Scope, m manifest } func prDiagnosticRelevant(repo string, changedFiles map[string]bool, commandScope diagnosticCommandScope, m manifest.Manifest, diag report.Diagnostic) bool { + if strings.HasPrefix(diag.Rule, "public_content_") { + return true + } file := normalizeDiagnosticFile(repo, diag.File) if file != "" && changedFiles[file] { return true diff --git a/internal/qualitygate/rules/run_test.go b/internal/qualitygate/rules/run_test.go index 99188b44..b60a2c63 100644 --- a/internal/qualitygate/rules/run_test.go +++ b/internal/qualitygate/rules/run_test.go @@ -189,6 +189,99 @@ description: Manage Drive comments with service command references. } } +func TestRunCollectsPublicContentFindingsIntoDiagnosticsAndFacts(t *testing.T) { + repo := t.TempDir() + runGit(t, repo, "init") + runGit(t, repo, "config", "user.email", "test@example.com") + runGit(t, repo, "config", "user.name", "Test User") + if err := vfs.WriteFile(filepath.Join(repo, "README.md"), []byte("# test\n"), 0o644); err != nil { + t.Fatal(err) + } + runGit(t, repo, "add", "README.md") + runGit(t, repo, "commit", "-m", "base") + + if err := vfs.MkdirAll(filepath.Join(repo, "docs"), 0o755); err != nil { + t.Fatal(err) + } + publicDoc := "api_" + "key = \"example-public-key\"\n" + + "Public docs describe a pri" + "vate request header and trust classification detail.\n" + if err := vfs.WriteFile(filepath.Join(repo, "docs", "public.md"), []byte(publicDoc), 0o644); err != nil { + t.Fatal(err) + } + runGit(t, repo, "add", "docs/public.md") + runGit(t, repo, "commit", "-m", "add public doc") + + metadataPath := filepath.Join(repo, "pr-metadata.json") + if err := vfs.WriteFile(metadataPath, []byte(`{"title":"public docs","body":"Change`+`-Id: I0123456789abcdef0123456789abcdef01234567"}`), 0o644); err != nil { + t.Fatal(err) + } + + manifestPath := filepath.Join(repo, "command-manifest.json") + indexPath := filepath.Join(repo, "command-index.json") + m := manifest.Manifest{SchemaVersion: 1, Commands: []manifest.Command{{ + Path: "docs +fetch", + CanonicalPath: "docs +fetch", + Domain: "docs", + Source: manifest.SourceShortcut, + }}} + if err := manifest.WriteFile(manifestPath, manifest.KindCommandManifest, m); err != nil { + t.Fatal(err) + } + idx := manifest.Manifest{SchemaVersion: 1, Commands: append([]manifest.Command{}, m.Commands...)} + idx.Commands = append(idx.Commands, manifest.Command{ + Path: "drive files get", + CanonicalPath: "drive files get", + Domain: "drive", + Source: manifest.SourceService, + Generated: true, + Runnable: true, + }) + if err := manifest.WriteFile(indexPath, manifest.KindCommandIndex, idx); err != nil { + t.Fatal(err) + } + + diags, gotFacts, err := Run(context.Background(), Options{ + Repo: repo, + CLIBin: "./lark-cli", + ChangedFrom: "HEAD~1", + ManifestPath: manifestPath, + CommandIndexPath: indexPath, + PublicContentMetadataPath: metadataPath, + }) + if err != nil { + t.Fatalf("Run() error = %v", err) + } + actions := map[string]report.Action{} + for _, diag := range diags { + actions[diag.Rule] = diag.Action + } + if actions["public_content_generic_credential"] != report.ActionReject { + t.Fatalf("generic credential diagnostic action = %q, diagnostics=%#v", actions["public_content_generic_credential"], diags) + } + if actions["public_content_change_id_trailer"] != report.ActionReject { + t.Fatalf("change-id diagnostic action = %q, diagnostics=%#v", actions["public_content_change_id_trailer"], diags) + } + if actions["public_content_semantic_candidate"] != "" { + t.Fatalf("semantic candidates should not become deterministic diagnostics: %#v", diags) + } + factRules := map[string]bool{} + for _, item := range gotFacts.PublicContent { + factRules[item.Rule] = true + } + for _, want := range []string{ + "public_content_generic_credential", + "public_content_change_id_trailer", + "public_content_semantic_candidate", + } { + if !factRules[want] { + t.Fatalf("missing public content fact %s: %#v", want, gotFacts.PublicContent) + } + } + if len(gotFacts.PublicContent) < 3 { + t.Fatalf("public content facts = %#v", gotFacts.PublicContent) + } +} + func TestLoadBaseReferenceManifestReadsCommandGolden(t *testing.T) { repo := t.TempDir() runGit(t, repo, "init") @@ -506,7 +599,7 @@ func TestNormalizeDiagnosticFileHandlesAbsoluteRepo(t *testing.T) { func runGit(t *testing.T, repo string, args ...string) { t.Helper() - cmd := exec.Command("git", append([]string{"-C", repo}, args...)...) + cmd := exec.Command("git", append([]string{"-c", "core.hooksPath=/dev/null", "-C", repo}, args...)...) cmd.Env = append(os.Environ(), "GIT_AUTHOR_DATE=2026-06-17T00:00:00Z", "GIT_COMMITTER_DATE=2026-06-17T00:00:00Z") out, err := cmd.CombinedOutput() if err != nil { diff --git a/internal/qualitygate/semantic/client.go b/internal/qualitygate/semantic/client.go index e66f2862..5a7c873f 100644 --- a/internal/qualitygate/semantic/client.go +++ b/internal/qualitygate/semantic/client.go @@ -339,7 +339,7 @@ func jsonSchemaResponseFormat() map[string]any { "properties": map[string]any{ "category": map[string]any{ "type": "string", - "enum": []string{"error_hint", "default_output", "naming", "skill_quality"}, + "enum": []string{"error_hint", "default_output", "naming", "skill_quality", "public_content_leakage"}, }, "severity": map[string]any{ "type": "string", diff --git a/internal/qualitygate/semantic/gatekeeper.go b/internal/qualitygate/semantic/gatekeeper.go index 23da39f4..57e0cbcb 100644 --- a/internal/qualitygate/semantic/gatekeeper.go +++ b/internal/qualitygate/semantic/gatekeeper.go @@ -10,9 +10,10 @@ import ( "strings" "github.com/larksuite/cli/internal/qualitygate/facts" + "github.com/larksuite/cli/internal/qualitygate/report" ) -var evidencePattern = regexp.MustCompile(`^facts\.(commands|skills|errors|outputs)\[(\d+)\]$`) +var evidencePattern = regexp.MustCompile(`^facts\.(commands|skills|errors|outputs|public_content)\[(\d+)\]$`) func Decide(f facts.Facts, r Review, p Policy) Decision { return DecideWithWaivers(f, r, p, Waivers{}) @@ -172,6 +173,16 @@ func evidenceFingerprint(f facts.Facts, ev string) string { "has_default_limit:" + strconv.FormatBool(out.HasDefaultLimit), "has_decision_field:" + strconv.FormatBool(out.HasDecisionField), }, ":") + case "public_content": + item := f.PublicContent[idx] + return strings.Join([]string{ + "public_content", + "rule:" + item.Rule, + "action:" + string(item.Action), + "file:" + item.File, + "line:" + strconv.Itoa(item.Line), + "source:" + item.Source, + }, ":") default: return "ref:" + ev } @@ -201,7 +212,7 @@ func validFinding(f Finding) bool { func allowedCategory(category string) bool { switch category { - case "error_hint", "default_output", "naming", "skill_quality": + case "error_hint", "default_output", "naming", "skill_quality", "public_content_leakage": return true default: return false @@ -247,6 +258,12 @@ func reproducibleEvidence(f facts.Facts, category, kind string, idx int) bool { } skill := f.Skills[idx] return skill.ReferencesInvalidCommand + case "public_content_leakage": + if kind != "public_content" { + return false + } + item := f.PublicContent[idx] + return item.Action == report.ActionReject || item.Rule == "public_content_semantic_candidate" default: return false } @@ -277,6 +294,8 @@ func evidenceExists(f facts.Facts, kind string, idx int) bool { return idx < len(f.Errors) case "outputs": return idx < len(f.Outputs) + case "public_content": + return idx < len(f.PublicContent) default: return false } diff --git a/internal/qualitygate/semantic/gatekeeper_test.go b/internal/qualitygate/semantic/gatekeeper_test.go index c1d3e897..8e8ac676 100644 --- a/internal/qualitygate/semantic/gatekeeper_test.go +++ b/internal/qualitygate/semantic/gatekeeper_test.go @@ -242,6 +242,7 @@ func TestGatekeeperBlockerMatrix(t *testing.T) { Outputs: []facts.OutputFact{{Command: "im messages list", IsList: true, HasDefaultLimit: false, HasDecisionField: false}}, Commands: []facts.CommandFact{{Path: "docs fetch", NameConflictsExisting: true}}, Skills: []facts.SkillFact{{SourceFile: "skills/lark-doc/SKILL.md", Line: 3, ReferencesInvalidCommand: true}}, + PublicContent: []facts.PublicContentFact{{Rule: "public_content_generic_credential", Action: "REJECT", File: "docs/public.md", Line: 4, Source: "metadata"}}, } for _, tc := range []struct { category string @@ -251,6 +252,7 @@ func TestGatekeeperBlockerMatrix(t *testing.T) { {"default_output", "facts.outputs[0]"}, {"naming", "facts.commands[0]"}, {"skill_quality", "facts.skills[0]"}, + {"public_content_leakage", "facts.public_content[0]"}, } { t.Run(tc.category, func(t *testing.T) { r := Review{Findings: []Finding{{ @@ -268,6 +270,59 @@ func TestGatekeeperBlockerMatrix(t *testing.T) { } } +func TestGatekeeperDoesNotPromotePublicContentWarningsToBlockers(t *testing.T) { + f := facts.Facts{ + SchemaVersion: 1, + PublicContent: []facts.PublicContentFact{{ + Rule: "public_content_" + "pri" + "vate_ipv4", + Action: "WARNING", + File: "docs/network.md", + Line: 1, + Source: "file", + }}, + } + review := Review{Findings: []Finding{{ + Category: "public_content_leakage", + Severity: "minor", + Evidence: []string{"facts.public_content[0]"}, + Message: "pri" + "vate network address appears in public docs", + SuggestedAction: "confirm the public docs do not expose pri" + "vate deployment details", + }}} + + got := Decide(f, review, DefaultPolicy()) + if len(got.Blockers) != 0 || len(got.Warnings) != 1 { + t.Fatalf("public content warning should not become a blocker: %#v", got) + } + if got.Warnings[0].ReviewAction != ReviewActionObserve { + t.Fatalf("review action = %q, want %q", got.Warnings[0].ReviewAction, ReviewActionObserve) + } +} + +func TestGatekeeperAllowsPublicContentSemanticCandidatesAsBlockers(t *testing.T) { + f := facts.Facts{ + SchemaVersion: 1, + PublicContent: []facts.PublicContentFact{{ + Rule: "public_content_semantic_candidate", + Action: "WARNING", + File: "docs/public.md", + Line: 1, + Source: "file", + }}, + } + review := Review{Findings: []Finding{{ + Category: "public_content_leakage", + Severity: "major", + Evidence: []string{"facts.public_content[0]"}, + Message: "semantic review found pri" + "vate rollout detail", + SuggestedAction: "remove pri" + "vate rollout detail from public docs", + }}} + + got := Decide(f, review, DefaultPolicy()) + if len(got.Blockers) != 1 { + t.Fatalf("semantic candidate should remain blockable, got %#v", got) + } +} + func TestGatekeeperSkillQualityOnlyBlocksInvalidCommandReferences(t *testing.T) { f := facts.Facts{ SchemaVersion: 1, diff --git a/internal/qualitygate/semantic/prompt.go b/internal/qualitygate/semantic/prompt.go index 29c5d0e5..e2ae17f7 100644 --- a/internal/qualitygate/semantic/prompt.go +++ b/internal/qualitygate/semantic/prompt.go @@ -24,7 +24,7 @@ func BuildPrompt(f facts.Facts) []Message { "Use only the provided JSON view.", "The changed_summary may summarize broad changed surfaces; review only listed facts, not omitted summarized items.", "Use fact_ref values exactly when writing finding evidence.", - "Only facts.commands, facts.skills, facts.errors, and facts.outputs fact_ref values may be blocker evidence.", + "Only facts.commands, facts.skills, facts.errors, facts.outputs, and facts.public_content fact_ref values may be blocker evidence.", "Evidence entries must be exact fact_ref strings such as \"facts.commands[0]\" with no explanations, labels, or suffix text.", "facts.examples and facts.skill_quality entries are context only.", "Report an error_hint finding for any facts.errors item where boundary is true, required_hint is true, and hint_action_count is 0.", @@ -38,6 +38,9 @@ func BuildPrompt(f facts.Facts) []Message { "For naming findings, use category \"naming\" and evidence containing that facts.commands fact_ref.", "Report a skill_quality finding for any facts.skills item where references_invalid_command is true.", "For skill_quality findings, use category \"skill_quality\" and evidence containing that facts.skills fact_ref.", + "Review public content leakage findings and semantic candidates without private dictionaries.", + "Do not reveal internal rule lists when explaining public content leakage.", + "For public_content_leakage findings, preserve the deterministic finding source and excerpt.", "Report each distinct issue as a separate finding.", "The verdict value must be \"pass\" when findings is empty and \"warn\" when findings is non-empty; never use \"fail\".", "Severity must be one of \"minor\", \"major\", or \"critical\"; never use \"error\", \"warning\", \"medium\", or \"high\".", diff --git a/internal/qualitygate/semantic/prompt_contract_test.go b/internal/qualitygate/semantic/prompt_contract_test.go index 98948383..fefa07ef 100644 --- a/internal/qualitygate/semantic/prompt_contract_test.go +++ b/internal/qualitygate/semantic/prompt_contract_test.go @@ -23,7 +23,10 @@ func TestBuildPromptContainsSemanticReviewContract(t *testing.T) { "A facts.outputs item with is_list true, has_default_limit false, and has_decision_field true must still produce a default_output finding.", "Report a naming finding for any facts.commands item where name_conflicts_existing is true or flag_alias_conflict is true.", "Report a skill_quality finding for any facts.skills item where references_invalid_command is true.", - "Only facts.commands, facts.skills, facts.errors, and facts.outputs fact_ref values may be blocker evidence.", + "Review public content leakage findings and semantic candidates without private dictionaries.", + "Do not reveal internal rule lists when explaining public content leakage.", + "For public_content_leakage findings, preserve the deterministic finding source and excerpt.", + "Only facts.commands, facts.skills, facts.errors, facts.outputs, and facts.public_content fact_ref values may be blocker evidence.", "Evidence entries must be exact fact_ref strings such as \"facts.commands[0]\" with no explanations, labels, or suffix text.", "facts.examples and facts.skill_quality entries are context only.", "Report each distinct issue as a separate finding.", diff --git a/internal/qualitygate/semantic/schema.go b/internal/qualitygate/semantic/schema.go index 2ab4572b..5097a870 100644 --- a/internal/qualitygate/semantic/schema.go +++ b/internal/qualitygate/semantic/schema.go @@ -78,11 +78,11 @@ func DefaultPolicy() Policy { return Policy{ SchemaVersion: 1, DefaultEnforcement: "observe", - BlockCategories: []string{"error_hint", "default_output", "naming", "skill_quality"}, + BlockCategories: []string{"error_hint", "default_output", "naming", "skill_quality", "public_content_leakage"}, RolloutGroups: []RolloutGroup{{ ID: "all", Enforcement: "blocking", - Categories: []string{"error_hint", "default_output", "naming", "skill_quality"}, + Categories: []string{"error_hint", "default_output", "naming", "skill_quality", "public_content_leakage"}, Owner: "test", Reason: "default in-memory policy", }}, diff --git a/internal/qualitygate/semantic/scope.go b/internal/qualitygate/semantic/scope.go index 88a2cf93..d1738d17 100644 --- a/internal/qualitygate/semantic/scope.go +++ b/internal/qualitygate/semantic/scope.go @@ -82,6 +82,15 @@ func factScope(f facts.Facts, kind string, idx int) (FactScope, bool) { Source: item.Source, CommandPath: item.Command, }, true + case "public_content": + item := f.PublicContent[idx] + return FactScope{ + FactKind: "public_content", + Changed: true, + Source: item.Source, + SourceFile: item.File, + Line: item.Line, + }, true default: return FactScope{}, false } @@ -195,7 +204,7 @@ func containsString(values []string, want string) bool { func allowedFactKind(kind string) bool { switch kind { - case "skill", "command", "error", "output": + case "skill", "command", "error", "output", "public_content": return true default: return false diff --git a/internal/qualitygate/semantic/scope_test.go b/internal/qualitygate/semantic/scope_test.go index 863dbdbd..e5e50dc7 100644 --- a/internal/qualitygate/semantic/scope_test.go +++ b/internal/qualitygate/semantic/scope_test.go @@ -81,6 +81,30 @@ func TestGatekeeperSkillQualityUsesSkillEvidence(t *testing.T) { } } +func TestGatekeeperUsesPublicContentEvidence(t *testing.T) { + f := facts.Facts{ + SchemaVersion: 1, + PublicContent: []facts.PublicContentFact{{ + Rule: "public_content_generic_credential", + Action: "REJECT", + File: "docs/public.md", + Line: 12, + Source: "metadata", + }}, + } + review := Review{Findings: []Finding{{ + Category: "public_content_leakage", + Severity: "critical", + Evidence: []string{"facts.public_content[0]"}, + Message: "public content finding needs review", + SuggestedAction: "remove the sensitive public content", + }}} + got := Decide(f, review, DefaultPolicy()) + if len(got.Blockers) != 1 || got.Blockers[0].RolloutGroups[0] != "all" { + t.Fatalf("expected public content blocker, got %#v", got) + } +} + func TestGatekeeperAppliesSharedWaiverID(t *testing.T) { f := facts.Facts{ SchemaVersion: 1, diff --git a/internal/qualitygate/semantic/view.go b/internal/qualitygate/semantic/view.go index a0e6f6dc..1bf61065 100644 --- a/internal/qualitygate/semantic/view.go +++ b/internal/qualitygate/semantic/view.go @@ -13,27 +13,29 @@ import ( ) type InputView struct { - SchemaVersion int `json:"schema_version"` - ChangedSummary ChangedSummary `json:"changed_summary"` - RuleSummary []RuleSummaryItem `json:"rule_summary,omitempty"` - Commands []CommandInput `json:"commands,omitempty"` - Skills []SkillInput `json:"skills,omitempty"` - SkillQuality []SkillQualityInput `json:"skill_quality,omitempty"` - Errors []ErrorInput `json:"errors,omitempty"` - Outputs []OutputInput `json:"outputs,omitempty"` - Examples []ExampleInput `json:"examples,omitempty"` - Diagnostics []facts.DiagnosticFact `json:"diagnostics,omitempty"` + SchemaVersion int `json:"schema_version"` + ChangedSummary ChangedSummary `json:"changed_summary"` + RuleSummary []RuleSummaryItem `json:"rule_summary,omitempty"` + Commands []CommandInput `json:"commands,omitempty"` + Skills []SkillInput `json:"skills,omitempty"` + SkillQuality []SkillQualityInput `json:"skill_quality,omitempty"` + Errors []ErrorInput `json:"errors,omitempty"` + Outputs []OutputInput `json:"outputs,omitempty"` + Examples []ExampleInput `json:"examples,omitempty"` + PublicContentLeakage []PublicContentInput `json:"public_content_leakage,omitempty"` + Diagnostics []facts.DiagnosticFact `json:"diagnostics,omitempty"` } type ChangedSummary struct { - Commands int `json:"commands,omitempty"` - Skills int `json:"skills,omitempty"` - SkillQuality int `json:"skill_quality,omitempty"` - Errors int `json:"errors,omitempty"` - Outputs int `json:"outputs,omitempty"` - Examples int `json:"examples,omitempty"` - Domains []string `json:"domains,omitempty"` - Sources []string `json:"sources,omitempty"` + Commands int `json:"commands,omitempty"` + Skills int `json:"skills,omitempty"` + SkillQuality int `json:"skill_quality,omitempty"` + Errors int `json:"errors,omitempty"` + Outputs int `json:"outputs,omitempty"` + Examples int `json:"examples,omitempty"` + PublicContent int `json:"public_content,omitempty"` + Domains []string `json:"domains,omitempty"` + Sources []string `json:"sources,omitempty"` } type RuleSummaryItem struct { @@ -86,6 +88,22 @@ type ExampleInput struct { facts.CommandExample } +type PublicContentInput struct { + FactRef string `json:"fact_ref"` + facts.PublicContentFact +} + +func (v InputView) HasReviewableFacts() bool { + return len(v.Commands) > 0 || + len(v.Skills) > 0 || + len(v.SkillQuality) > 0 || + len(v.Errors) > 0 || + len(v.Outputs) > 0 || + len(v.Examples) > 0 || + len(v.PublicContentLeakage) > 0 || + len(v.Diagnostics) > 0 +} + func BuildInputView(f facts.Facts) InputView { selected := newInputSelection(f) selected.addChangedReviewCandidates() @@ -104,16 +122,17 @@ func BuildInputView(f facts.Facts) InputView { } return InputView{ - SchemaVersion: f.SchemaVersion, - ChangedSummary: changedSummary(f), - RuleSummary: ruleSummary(f.Diagnostics), - Commands: selected.commandInputs(), - Skills: selected.skillInputs(), - SkillQuality: selected.skillQualityInputs(), - Errors: selected.errorInputs(), - Outputs: selected.outputInputs(), - Examples: selected.exampleInputs(), - Diagnostics: viewDiagnostics, + SchemaVersion: f.SchemaVersion, + ChangedSummary: changedSummary(f), + RuleSummary: ruleSummary(f.Diagnostics), + Commands: selected.commandInputs(), + Skills: selected.skillInputs(), + SkillQuality: selected.skillQualityInputs(), + Errors: selected.errorInputs(), + Outputs: selected.outputInputs(), + Examples: selected.exampleInputs(), + PublicContentLeakage: selected.publicContentInputs(), + Diagnostics: viewDiagnostics, } } @@ -138,6 +157,11 @@ func (s *inputSelection) addChangedReviewCandidates() { s.outputs[i] = true } } + for i, item := range s.f.PublicContent { + if publicContentReviewCandidate(item) { + s.publicContent[i] = true + } + } } func commandReviewCandidate(cmd facts.CommandFact) bool { @@ -157,25 +181,31 @@ func outputReviewCandidate(_ facts.OutputFact) bool { return false } +func publicContentReviewCandidate(item facts.PublicContentFact) bool { + return item.Rule == "public_content_semantic_candidate" +} + type inputSelection struct { - f facts.Facts - commands []bool - skills []bool - skillQuality []bool - errors []bool - outputs []bool - examples []bool + f facts.Facts + commands []bool + skills []bool + skillQuality []bool + errors []bool + outputs []bool + examples []bool + publicContent []bool } func newInputSelection(f facts.Facts) *inputSelection { return &inputSelection{ - f: f, - commands: make([]bool, len(f.Commands)), - skills: make([]bool, len(f.Skills)), - skillQuality: make([]bool, len(f.SkillQuality)), - errors: make([]bool, len(f.Errors)), - outputs: make([]bool, len(f.Outputs)), - examples: make([]bool, len(f.Examples)), + f: f, + commands: make([]bool, len(f.Commands)), + skills: make([]bool, len(f.Skills)), + skillQuality: make([]bool, len(f.SkillQuality)), + errors: make([]bool, len(f.Errors)), + outputs: make([]bool, len(f.Outputs)), + examples: make([]bool, len(f.Examples)), + publicContent: make([]bool, len(f.PublicContent)), } } @@ -194,6 +224,8 @@ func (s *inputSelection) diagnosticContext(diag facts.DiagnosticFact) *inputSele s.addDiagnosticExamples(out, diag) case diag.Rule == "no_bare_helper_error": s.addDiagnosticErrors(out, diag) + case strings.HasPrefix(diag.Rule, "public_content_"): + s.addDiagnosticPublicContent(out, diag) } return out } @@ -256,6 +288,15 @@ func (s *inputSelection) addDiagnosticExamples(out *inputSelection, diag facts.D } } +func (s *inputSelection) addDiagnosticPublicContent(out *inputSelection, diag facts.DiagnosticFact) { + for i, item := range s.f.PublicContent { + if diagnosticLocationMatches(diag.File, diag.Line, item.File, item.Line) || + diag.Rule == item.Rule { + out.publicContent[i] = true + } + } +} + func includeDiagnosticInView(diag facts.DiagnosticFact, selected, context *inputSelection) bool { if diag.Action == report.ActionReject { return true @@ -270,6 +311,7 @@ func (s *inputSelection) merge(other *inputSelection) { mergeSelections(s.errors, other.errors) mergeSelections(s.outputs, other.outputs) mergeSelections(s.examples, other.examples) + mergeSelections(s.publicContent, other.publicContent) } func (s *inputSelection) intersects(other *inputSelection) bool { @@ -278,7 +320,8 @@ func (s *inputSelection) intersects(other *inputSelection) bool { selectionsIntersect(s.skillQuality, other.skillQuality) || selectionsIntersect(s.errors, other.errors) || selectionsIntersect(s.outputs, other.outputs) || - selectionsIntersect(s.examples, other.examples) + selectionsIntersect(s.examples, other.examples) || + selectionsIntersect(s.publicContent, other.publicContent) } func (s *inputSelection) commandInputs() []CommandInput { @@ -351,6 +394,16 @@ func (s *inputSelection) exampleInputs() []ExampleInput { return out } +func (s *inputSelection) publicContentInputs() []PublicContentInput { + out := make([]PublicContentInput, 0, countSelected(s.publicContent)) + for i, ok := range s.publicContent { + if ok { + out = append(out, PublicContentInput{FactRef: factRef("public_content", i), PublicContentFact: s.f.PublicContent[i]}) + } + } + return out +} + func changedSummary(f facts.Facts) ChangedSummary { domains := map[string]bool{} sources := map[string]bool{} @@ -402,6 +455,10 @@ func changedSummary(f facts.Facts) ChangedSummary { addNonEmpty(domains, example.Domain) addNonEmpty(sources, example.Source) } + for _, item := range f.PublicContent { + out.PublicContent++ + addNonEmpty(sources, item.Source) + } out.Domains = sortedViewSetKeys(domains) out.Sources = sortedViewSetKeys(sources) return out @@ -434,7 +491,8 @@ func semanticDiagnosticRule(rule string) bool { strings.HasPrefix(rule, "default_output") || strings.HasPrefix(rule, "skill_") || strings.HasPrefix(rule, "example_dry_run") || - rule == "no_bare_helper_error" + rule == "no_bare_helper_error" || + strings.HasPrefix(rule, "public_content_") } func diagnosticCommandMatches(diag facts.DiagnosticFact, values ...string) bool { diff --git a/internal/qualitygate/semantic/view_test.go b/internal/qualitygate/semantic/view_test.go index 05200933..0da1c11f 100644 --- a/internal/qualitygate/semantic/view_test.go +++ b/internal/qualitygate/semantic/view_test.go @@ -77,6 +77,122 @@ func TestInputViewKeepsChangedReviewCandidatesWithOriginalRefs(t *testing.T) { } } +func TestInputViewIncludesPublicContentLeakage(t *testing.T) { + f := facts.Facts{ + SchemaVersion: 1, + PublicContent: []facts.PublicContentFact{{ + Rule: "public_content_generic_credential", + Action: report.ActionReject, + File: "docs/public.md", + Line: 4, + Excerpt: "api_key = ", + Message: "generic credential assignment", + }}, + Diagnostics: []facts.DiagnosticFact{{ + Rule: "public_content_generic_credential", + Action: report.ActionReject, + File: "docs/public.md", + Line: 4, + Message: "generic credential assignment", + }}, + } + + view := BuildInputView(f) + if len(view.PublicContentLeakage) != 1 { + t.Fatalf("public content leakage len = %d, want 1", len(view.PublicContentLeakage)) + } + if got := view.PublicContentLeakage[0].FactRef; got != "facts.public_content[0]" { + t.Fatalf("public content fact ref = %q", got) + } + if len(view.Diagnostics) != 1 { + t.Fatalf("diagnostics len = %d, want 1", len(view.Diagnostics)) + } +} + +func TestInputViewIncludesPublicContentSemanticCandidatesWithoutDiagnostics(t *testing.T) { + f := facts.Facts{ + SchemaVersion: 1, + PublicContent: []facts.PublicContentFact{{ + Rule: "public_content_semantic_candidate", + Action: report.ActionWarning, + File: "docs/public.md", + Line: 1, + Source: "file", + Excerpt: "public prose that needs semantic review", + Message: "public contribution contains text for semantic public content review", + }}, + } + + view := BuildInputView(f) + if len(view.PublicContentLeakage) != 1 { + t.Fatalf("semantic candidate len = %d, want 1", len(view.PublicContentLeakage)) + } + if got := view.PublicContentLeakage[0].FactRef; got != "facts.public_content[0]" { + t.Fatalf("semantic candidate fact ref = %q", got) + } + if len(view.Diagnostics) != 0 { + t.Fatalf("semantic candidate should not require diagnostics, got %#v", view.Diagnostics) + } +} + +func TestPromptIncludesSanitizedPublicContentExcerpt(t *testing.T) { + scopeText := "pri" + "vate rollout" + f := facts.Facts{ + SchemaVersion: 1, + PublicContent: []facts.PublicContentFact{{ + Rule: "public_content_semantic_candidate", + Action: report.ActionWarning, + File: "docs/public.md", + Line: 1, + Source: "file", + Excerpt: `semantic signals: pri` + `vate_scope,roadmap_detail; excerpt: "` + scopeText + ` token="`, + Message: "public contribution contains text for semantic public content review", + }}, + } + + view := BuildInputView(f) + if len(view.PublicContentLeakage) != 1 { + t.Fatalf("semantic candidate len = %d, want 1", len(view.PublicContentLeakage)) + } + if got := view.PublicContentLeakage[0].Excerpt; !strings.Contains(got, scopeText) || !strings.Contains(got, "token=") { + t.Fatalf("semantic candidate excerpt missing from view: %q", got) + } + + messages := BuildPrompt(f) + if len(messages) != 2 { + t.Fatalf("messages len = %d, want 2", len(messages)) + } + if !strings.Contains(messages[1].Content, scopeText) || !strings.Contains(messages[1].Content, "redacted") { + t.Fatalf("prompt missing sanitized public content excerpt: %s", messages[1].Content) + } + if strings.Contains(messages[1].Content, "real-"+"secret-value") { + t.Fatalf("prompt leaked raw sensitive value %q", messages[1].Content) + } +} + +func TestInputViewExcludesPublicContentWarningsWithoutSemanticCandidate(t *testing.T) { + f := facts.Facts{ + SchemaVersion: 1, + PublicContent: []facts.PublicContentFact{{ + Rule: "public_content_" + "pri" + "vate_ipv4", + Action: report.ActionWarning, + File: "docs/network.md", + Line: 1, + Source: "file", + Excerpt: "192.168." + "0.10", + Message: "public contribution contains a pri" + "vate-network IP address", + }}, + } + + view := BuildInputView(f) + if len(view.PublicContentLeakage) != 0 { + t.Fatalf("warning-only public content should not enter semantic view: %#v", view.PublicContentLeakage) + } + if len(view.Diagnostics) != 0 { + t.Fatalf("warning-only public content should not add diagnostics: %#v", view.Diagnostics) + } +} + func TestInputViewSummarizesBroadChangedCommandSurface(t *testing.T) { f := broadChangedFacts(434, 44) diff --git a/internal/qualitygate/semantic/waiver.go b/internal/qualitygate/semantic/waiver.go index 971a8432..7d914d2c 100644 --- a/internal/qualitygate/semantic/waiver.go +++ b/internal/qualitygate/semantic/waiver.go @@ -138,6 +138,10 @@ func parseWaiver(parts []string, lineNo int) (Waiver, error) { if item.SourceFile == "" || item.Line == 0 { return Waiver{}, fmt.Errorf("%s:%d: %s waiver requires source_file and line", waiverPath, lineNo, item.FactKind) } + case "public_content": + if item.SourceFile == "" || item.Line == 0 || item.CommandPath != "" { + return Waiver{}, fmt.Errorf("%s:%d: public_content waiver requires source_file and line only", waiverPath, lineNo) + } case "command", "output": if item.CommandPath == "" { return Waiver{}, fmt.Errorf("%s:%d: %s waiver requires command_path", waiverPath, lineNo, item.FactKind) diff --git a/internal/qualitygate/semantic/waiver_test.go b/internal/qualitygate/semantic/waiver_test.go index 601315f5..9ac12066 100644 --- a/internal/qualitygate/semantic/waiver_test.go +++ b/internal/qualitygate/semantic/waiver_test.go @@ -21,24 +21,27 @@ func TestLoadWaivers(t *testing.T) { writeSemanticFile(t, repo, "waivers.txt", "# waiver_id\tcategory\tfact_kind\tsource_file\tline\tcommand_path\towner\treason\tadded_at\texpires_at\n"+ "wiki-move-202606\tskill_quality\tskill\tskills/lark-wiki/SKILL.md\t30\t\twiki-owner\tmigration\t2026-06-08\t2026-07-15\n"+ - "wiki-move-202606\tskill_quality\tskill\tskills/lark-wiki/references/move.md\t12\t\twiki-owner\tmigration\t2026-06-08\t2026-07-15\n") + "wiki-move-202606\tskill_quality\tskill\tskills/lark-wiki/references/move.md\t12\t\twiki-owner\tmigration\t2026-06-08\t2026-07-15\n"+ + "public-doc-202606\tpublic_content_leakage\tpublic_content\tdocs/public.md\t4\t\tsecurity-owner\treviewed false positive\t2026-06-08\t2026-07-15\n") w, diags, err = LoadWaivers(repo, now) if err != nil { t.Fatalf("LoadWaivers() error = %v", err) } - if len(diags) != 0 || len(w.Items) != 2 { + if len(diags) != 0 || len(w.Items) != 3 { t.Fatalf("LoadWaivers() = %#v %#v", w, diags) } for name, body := range map[string]string{ - "bad columns": "one\ttoo-few\n", - "bad id": "BAD\terror_hint\terror\tcmd/root.go\t1\t\to\tr\t2026-06-08\t2026-07-15\n", - "bad fact kind": "id1\terror_hint\tskill_quality\tcmd/root.go\t1\t\to\tr\t2026-06-08\t2026-07-15\n", - "missing owner": "id1\terror_hint\terror\tcmd/root.go\t1\t\t\tr\t2026-06-08\t2026-07-15\n", - "missing line": "id1\terror_hint\terror\tcmd/root.go\t\t\to\tr\t2026-06-08\t2026-07-15\n", - "missing command": "id1\tdefault_output\toutput\t\t\t\to\tr\t2026-06-08\t2026-07-15\n", - "bad source path": "id1\terror_hint\terror\t../cmd/root.go\t1\t\to\tr\t2026-06-08\t2026-07-15\n", - "bad date format": "id1\terror_hint\terror\tcmd/root.go\t1\t\to\tr\t20260608\t2026-07-15\n", + "bad columns": "one\ttoo-few\n", + "bad id": "BAD\terror_hint\terror\tcmd/root.go\t1\t\to\tr\t2026-06-08\t2026-07-15\n", + "bad fact kind": "id1\terror_hint\tskill_quality\tcmd/root.go\t1\t\to\tr\t2026-06-08\t2026-07-15\n", + "missing owner": "id1\terror_hint\terror\tcmd/root.go\t1\t\t\tr\t2026-06-08\t2026-07-15\n", + "missing line": "id1\terror_hint\terror\tcmd/root.go\t\t\to\tr\t2026-06-08\t2026-07-15\n", + "missing command": "id1\tdefault_output\toutput\t\t\t\to\tr\t2026-06-08\t2026-07-15\n", + "public content missing line": "id1\tpublic_content_leakage\tpublic_content\tdocs/public.md\t\t\to\tr\t2026-06-08\t2026-07-15\n", + "public content command selector": "id1\tpublic_content_leakage\tpublic_content\t\t\tcmd/foo\to\tr\t2026-06-08\t2026-07-15\n", + "bad source path": "id1\terror_hint\terror\t../cmd/root.go\t1\t\to\tr\t2026-06-08\t2026-07-15\n", + "bad date format": "id1\terror_hint\terror\tcmd/root.go\t1\t\to\tr\t20260608\t2026-07-15\n", } { t.Run(name, func(t *testing.T) { writeSemanticFile(t, repo, "waivers.txt", body) diff --git a/scripts/ci-quality-summary-publish.js b/scripts/ci-quality-summary-publish.js index 900fe0f3..07e8d8fb 100644 --- a/scripts/ci-quality-summary-publish.js +++ b/scripts/ci-quality-summary-publish.js @@ -45,6 +45,10 @@ async function publishTargetStillCurrent(github, context, core, target, phase = repo: context.repo.repo, pull_number: target.pr, }); + if (pr.state !== "open") { + core.notice(`PR quality summary skipped: PR is no longer open before ${phase}`); + return false; + } if (pr.head.sha !== target.headSha) { core.notice(`PR quality summary skipped: PR head changed before ${phase}`); return false; diff --git a/scripts/ci-quality-summary-publish.test.js b/scripts/ci-quality-summary-publish.test.js index fb20fdb0..fb2c9b8c 100644 --- a/scripts/ci-quality-summary-publish.test.js +++ b/scripts/ci-quality-summary-publish.test.js @@ -152,6 +152,25 @@ describe("ci-quality-summary-publish", () => { }); }); + it("does not publish a summary when the PR closes before comment creation", async () => { + await withPublishTempDir(async ({ calls }) => { + await publish({ + github: fakeGithub(calls, { + jobs: [{ name: "unit-test", conclusion: "failure", html_url: "https://github.example/jobs/1" }], + pullResponses: [ + currentPullResponse(), + currentPullResponse({ state: "closed" }), + ], + }), + context: workflowRunContext({ conclusion: "failure" }), + core: silentCore(calls), + }); + + assert.equal(calls.comments.length, 0); + assert.match(calls.notices.join("\n"), /PR is no longer open/); + }); + }); + it("does not delete an existing summary when the PR base changes before cleanup", async () => { await withPublishTempDir(async ({ calls }) => { await publish({ @@ -338,6 +357,7 @@ function fakeGithub(calls, options = {}) { function currentPullResponse(overrides = {}) { return { data: { + state: overrides.state || "open", head: { sha: overrides.headSha || process.env.CI_QUALITY_SUMMARY_HEAD_SHA }, base: { sha: overrides.baseSha || process.env.CI_QUALITY_SUMMARY_BASE_SHA, diff --git a/scripts/ci-workflow.test.sh b/scripts/ci-workflow.test.sh index d5bbc2a2..b8bdcd98 100644 --- a/scripts/ci-workflow.test.sh +++ b/scripts/ci-workflow.test.sh @@ -5,26 +5,42 @@ set -euo pipefail workflow=".github/workflows/ci.yml" +job_section() { + local job="$1" + awk -v job="$job" ' + $0 == " " job ":" { in_job = 1; print; next } + in_job && /^ [A-Za-z0-9_-]+:/ { exit } + in_job { print } + ' "$workflow" +} workflow_permissions="$(awk ' /^permissions:/ { in_permissions = 1; print; next } in_permissions && /^[^[:space:]]/ { exit } in_permissions { print } ' "$workflow")" +fast_gate_section="$(job_section fast-gate)" +unit_test_section="$(job_section unit-test)" lint_section="$(awk ' /^ lint:/ { in_job = 1 } in_job { print } - /^ deterministic-gate:/ { exit } + /^ script-test:/ { exit } ' "$workflow")" +script_test_section="$(job_section script-test)" deterministic_section="$(awk ' /^ deterministic-gate:/ { in_job = 1 } in_job { print } /^ coverage:/ { exit } ' "$workflow")" +coverage_job_section="$(job_section coverage)" +deadcode_section="$(job_section deadcode)" +dry_run_section="$(job_section e2e-dry-run)" section="$(awk ' /^ e2e-live:/ { in_job = 1 } in_job { print } /^ security:/ { exit } ' "$workflow")" +security_section="$(job_section security)" +license_header_section="$(job_section license-header)" results_section="$(awk ' /^ results:/ { in_job = 1 } in_job { print } @@ -98,13 +114,94 @@ if ! grep -Fq "make quality-gate" <<<"$deterministic_section"; then exit 1 fi +if ! grep -Fq "Write public content metadata" <<<"$deterministic_section"; then + echo "deterministic-gate should write PR title/body metadata before quality-gate" + exit 1 +fi + +if ! grep -Fq "types: [opened, synchronize, reopened, edited]" "$workflow"; then + echo "CI pull_request trigger should include edited so PR title/body changes are rescanned" + exit 1 +fi + +if ! grep -Fq "script-test:" <<<"$script_test_section"; then + echo "CI should run make script-test so workflow and publisher contract tests are not local-only" + exit 1 +fi + +if ! grep -Fq "make script-test" <<<"$script_test_section"; then + echo "script-test job should invoke make script-test" + exit 1 +fi + +if ! grep -Fq "actions/setup-node" <<<"$script_test_section"; then + echo "script-test job should install Node for JavaScript workflow tests" + exit 1 +fi + +if grep -Fq '${{ secrets.' <<<"$script_test_section"; then + echo "script-test must not reference secrets" + exit 1 +fi + +if grep -Fq "metadata-gate:" "$workflow"; then + echo "metadata-gate should not run alongside deterministic-gate because both would upload the same facts artifact" + exit 1 +fi + +if grep -Fq "github.event.action != 'edited'" <<<"$fast_gate_section"; then + echo "fast-gate must run on pull_request edited events so title/body edits cannot replace failed CI with a light success" + exit 1 +fi + +for full_job in \ + "$unit_test_section" \ + "$lint_section" \ + "$script_test_section" \ + "$deterministic_section" \ + "$coverage_job_section" \ + "$dry_run_section" \ + "$security_section"; do + if grep -Fq "github.event.action != 'edited'" <<<"$full_job"; then + echo "full CI jobs must run on pull_request edited events; do not skip title/body-only edits" + exit 1 + fi +done + +for pull_request_job in "$deadcode_section" "$license_header_section"; do + if grep -Fq "github.event.action != 'edited'" <<<"$pull_request_job"; then + echo "pull_request-only CI jobs must run on edited events" + exit 1 + fi +done + +if grep -Fq '${{ secrets.' <<<"$deterministic_section"; then + echo "deterministic-gate must not reference secrets" + exit 1 +fi + +if ! grep -Fq "PUBLIC_CONTENT_METADATA=" <<<"$deterministic_section"; then + echo "deterministic-gate should pass public content metadata into make quality-gate" + exit 1 +fi + +if ! grep -Fq "PR_BRANCH:" <<<"$deterministic_section"; then + echo "deterministic-gate should pass the pull request branch into public content metadata" + exit 1 +fi + if ! grep -Fq "name: quality-gate-facts-\${{ github.event.pull_request.base.sha }}-\${{ github.event.pull_request.head.sha }}" <<<"$deterministic_section"; then echo "deterministic-gate should upload base/head-bound quality-gate-facts for semantic review" exit 1 fi -if ! grep -Fq "needs: [unit-test, lint, deterministic-gate]" "$workflow"; then - echo "E2E jobs should wait for deterministic-gate" +if ! grep -Fq "needs: [unit-test, lint, script-test, deterministic-gate]" "$workflow"; then + echo "E2E jobs should wait for script-test and deterministic-gate" + exit 1 +fi + +if ! grep -Fq "script-test" <<<"$results_section"; then + echo "results job should include script-test" exit 1 fi @@ -210,6 +307,11 @@ if ! grep -Fq "go run ./internal/qualitygate/cmd/manifest-export" <<<"$make_outp exit 1 fi +if ! grep -Fq -- "--public-content-metadata .tmp/quality-gate/public-content-metadata.json" <<<"$make_output"; then + echo "quality-gate check should consume public content metadata" + exit 1 +fi + if ! grep -Fq -- "--manifest .tmp/quality-gate/command-manifest.json" <<<"$make_output" || ! grep -Fq -- "--command-index .tmp/quality-gate/command-index.json" <<<"$make_output"; then echo "quality-gate check should consume both exported command snapshots" diff --git a/scripts/semantic-review-publish.js b/scripts/semantic-review-publish.js index 61a57592..afae3132 100644 --- a/scripts/semantic-review-publish.js +++ b/scripts/semantic-review-publish.js @@ -175,7 +175,7 @@ function inlineCode(value) { } function parseEvidenceRef(ref) { - const match = /^facts\.(commands|skills|errors|outputs)\[(\d+)\]$/.exec(String(ref || "")); + const match = /^facts\.(commands|skills|errors|outputs|public_content)\[(\d+)\]$/.exec(String(ref || "")); if (!match) { return null; } @@ -230,6 +230,20 @@ function evidenceLocation(facts, ref) { return { kind: parsed.kind, command: item.path, label: item.path }; } return null; + case "public_content": + if (item.file && Number.isInteger(item.line) && item.line > 0) { + const label = `${item.file}:${item.line}`; + if (item.file === "branch" || item.file === "pull_request_metadata" || String(item.file).startsWith("commit:")) { + return { kind: parsed.kind, label }; + } + return { + kind: parsed.kind, + path: item.file, + line: item.line, + label, + }; + } + return null; default: return null; } @@ -845,6 +859,10 @@ async function publishTargetStillCurrent(github, context, core, target, phase = repo: context.repo.repo, pull_number: target.pr, }); + if (pr.state !== "open") { + core.notice(`semantic review skipped: PR is no longer open before ${phase}`); + return false; + } if (pr.head.sha !== target.headSha) { core.notice(`semantic review skipped: PR head changed before ${phase}`); return false; diff --git a/scripts/semantic-review-publish.test.js b/scripts/semantic-review-publish.test.js index d0a9b635..987cc9ac 100644 --- a/scripts/semantic-review-publish.test.js +++ b/scripts/semantic-review-publish.test.js @@ -202,6 +202,100 @@ describe("semantic-review-publish", () => { assert.equal(selectInlineTarget({ evidence: ["facts.errors[0]"] }, facts, changedLineIndex), null); }); + it("maps public content evidence to changed files but not virtual metadata", () => { + const restrictedScope = "pri" + "vate"; + const facts = { + public_content: [ + { + rule: "public_content_semantic_candidate", + action: "WARNING", + file: "docs/public-roadmap.md", + line: 4, + source: "file", + }, + { + rule: "public_content_semantic_candidate", + action: "WARNING", + file: "pull_request_metadata", + line: 1, + source: "metadata", + }, + { + rule: "public_content_automation_branch", + action: "WARNING", + file: "branch", + line: 1, + source: "branch", + }, + { + rule: "public_content_change_id_trailer", + action: "REJECT", + file: "commit:1234abc", + line: 3, + source: "commit", + }, + ], + }; + const changedLineIndex = buildChangedLineIndex([{ + filename: "docs/public-roadmap.md", + patch: [ + "@@ -3,2 +3,3 @@", + " context", + "+Specific " + restrictedScope + " roadmap detail", + ].join("\n"), + }]); + + assert.deepEqual( + selectInlineTarget({ evidence: ["facts.public_content[0]"] }, facts, changedLineIndex), + { path: "docs/public-roadmap.md", line: 4 }, + ); + assert.equal(selectInlineTarget({ evidence: ["facts.public_content[1]"] }, facts, changedLineIndex), null); + assert.equal(selectInlineTarget({ evidence: ["facts.public_content[2]"] }, facts, changedLineIndex), null); + assert.equal(selectInlineTarget({ evidence: ["facts.public_content[3]"] }, facts, changedLineIndex), null); + + const markdown = buildSummaryMarkdown({ + block_mode: true, + blockers: [{ + category: "public_content_leakage", + severity: "major", + review_action: "must_fix", + evidence: ["facts.public_content[1]"], + fingerprint: "public-content-metadata", + message: "PR metadata contains " + restrictedScope + " rollout detail", + suggested_action: "Move " + restrictedScope + " detail to an internal channel.", + }], + warnings: [], + }, facts); + assert.match(markdown, /pull_request_metadata:1/); + + const virtualMarkdown = buildSummaryMarkdown({ + block_mode: true, + blockers: [ + { + category: "public_content_leakage", + severity: "major", + review_action: "must_fix", + evidence: ["facts.public_content[2]"], + fingerprint: "public-content-branch", + message: "Branch name looks automation-owned.", + suggested_action: "Use a maintainer-owned public branch name.", + }, + { + category: "public_content_leakage", + severity: "major", + review_action: "must_fix", + evidence: ["facts.public_content[3]"], + fingerprint: "public-content-commit", + message: "Commit trailer contains " + restrictedScope + " review metadata.", + suggested_action: "Remove " + restrictedScope + " review metadata from commits.", + }, + ], + warnings: [], + }, facts); + assert.match(virtualMarkdown, /branch:1/); + assert.match(virtualMarkdown, /commit:1234abc:3/); + }); + it("builds finding markers from stable fingerprints and evidence identity", () => { const factsA = { skills: [{ @@ -615,6 +709,35 @@ describe("semantic-review-publish", () => { }); }); + it("skips publishing when the PR closes after verification", async () => { + await withPublishTempDir(async ({ calls }) => { + fs.writeFileSync("decision.json", JSON.stringify({ + block_mode: true, + blockers: [], + warnings: [], + }), "utf8"); + + await publish({ + github: fakeGithub(calls, { + currentPullRequest: { + state: "closed", + head: { sha: "0123456789abcdef0123456789abcdef01234567" }, + base: { + sha: "fedcba9876543210fedcba9876543210fedcba98", + repo: { id: 123 }, + }, + }, + }), + context: workflowRunContext(), + core: silentCore(calls), + }); + + assert.equal(calls.checks.length, 0); + assert.equal(calls.comments.length, 0); + assert.match(calls.notices[0], /PR is no longer open before publishing/); + }); + }); + it("rejects publishing when the PR base repo changed after verification", async () => { await withPublishTempDir(async ({ calls }) => { fs.writeFileSync("decision.json", JSON.stringify({ @@ -2223,8 +2346,8 @@ function fakeGithub(calls, options = {}) { }, }, pulls: { - get: async () => ({ - data: Array.isArray(options.currentPullRequests) + get: async () => { + const pull = Array.isArray(options.currentPullRequests) ? options.currentPullRequests[Math.min(pullGetCount++, options.currentPullRequests.length - 1)] : options.currentPullRequest || { head: { sha: process.env.SEMANTIC_REVIEW_HEAD_SHA }, @@ -2232,8 +2355,9 @@ function fakeGithub(calls, options = {}) { sha: process.env.SEMANTIC_REVIEW_BASE_SHA, repo: { id: 123 }, }, - }, - }), + }; + return { data: { state: "open", ...pull } }; + }, listFiles() {}, listReviewComments() {}, createReviewComment: async (args) => { diff --git a/scripts/semantic-review-verify-artifact.js b/scripts/semantic-review-verify-artifact.js index d8831a0c..309c812d 100644 --- a/scripts/semantic-review-verify-artifact.js +++ b/scripts/semantic-review-verify-artifact.js @@ -229,6 +229,36 @@ function requireSafePath(value, path) { return file; } +function requirePublicContentFile(value, path) { + const file = requireString(value, path); + if (file === "branch" || file === "pull_request_metadata" || /^commit:[0-9a-f]{7,40}$/.test(file)) { + return file; + } + if (file.startsWith("commit:")) { + throw new Error(`facts JSON ${path} must be a valid public content location`); + } + requireSafePath(file, path); + if ( + file === "" || + file === "." || + file.startsWith("./") || + file.includes("\\") || + file.includes("\0") || + file.split("/").includes(".git") || + /^[A-Za-z][A-Za-z0-9+.-]*:/.test(file) + ) { + throw new Error(`facts JSON ${path} must be a repository-relative path`); + } + return file; +} + +function requirePositiveLine(value, path) { + requireLine(value, path); + if (value === 0) { + throw new Error(`facts JSON ${path} must be a positive line number`); + } +} + function requireStringArray(value, path, { optional = false } = {}) { if (value === undefined || value === null) { if (optional) { @@ -421,6 +451,20 @@ function verifyFactsJSON(data) { for (const [i, value] of requireArray(facts, "examples").entries()) { verifyCommandExample(value, `examples[${i}]`); } + for (const [i, value] of requireArray(facts, "public_content").entries()) { + const item = requireObject(value, `public_content[${i}]`); + requireString(item.rule, `public_content[${i}].rule`); + const action = requireString(item.action, `public_content[${i}].action`); + if (!VALID_ACTIONS.has(action)) { + throw new Error(`facts JSON public_content[${i}].action is invalid`); + } + requirePublicContentFile(item.file, `public_content[${i}].file`); + requirePositiveLine(item.line, `public_content[${i}].line`); + requireString(item.source, `public_content[${i}].source`, { optional: true }); + requireString(item.excerpt, `public_content[${i}].excerpt`, { optional: true }); + requireString(item.message, `public_content[${i}].message`, { optional: true }); + requireString(item.suggestion, `public_content[${i}].suggestion`, { optional: true }); + } for (const [i, value] of requireArray(facts, "diagnostics").entries()) { const item = requireObject(value, `diagnostics[${i}]`); requireString(item.rule, `diagnostics[${i}].rule`); diff --git a/scripts/semantic-review-verify-artifact.test.js b/scripts/semantic-review-verify-artifact.test.js index 91e6413a..87edcf2e 100644 --- a/scripts/semantic-review-verify-artifact.test.js +++ b/scripts/semantic-review-verify-artifact.test.js @@ -67,7 +67,43 @@ describe("verifyZipEntries", () => { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "semantic-review-zip-")); const zipPath = path.join(dir, "facts.zip"); const outPath = path.join(dir, "facts.json"); - const facts = Buffer.from('{"schema_version":1}\n'); + const restrictedScope = "pri" + "vate"; + const facts = Buffer.from(JSON.stringify({ + schema_version: 1, + public_content: [ + { + rule: "public_content_semantic_candidate", + action: "WARNING", + file: "pull_request_metadata", + line: 1, + source: "metadata", + excerpt: "public release notes mention an internal rollout plan", + message: "public contribution may contain sensitive implementation detail", + suggestion: "move internal detail to " + restrictedScope + " discussion", + }, + { + rule: "public_content_change_id_trailer", + action: "REJECT", + file: "commit:1234abc", + line: 3, + source: "commit", + }, + { + rule: "public_content_automation_branch", + action: "WARNING", + file: "branch", + line: 1, + source: "branch", + }, + { + rule: "public_content_" + "pri" + "vate_ipv4", + action: "WARNING", + file: "docs/public-network.md", + line: 7, + source: "file", + }, + ], + }) + "\n"); const zip = makeZip([{ fileName: "facts.json", data: facts, mode: 0o100644 }]); fs.writeFileSync(zipPath, zip); @@ -103,6 +139,19 @@ describe("verifyZipEntries", () => { ["bad-error-path", Buffer.from('{"schema_version":1,"errors":[{"file":"../x.go","line":1,"boundary":true,"uses_structured_error":false,"has_hint":false,"hint_action_count":0,"required_hint":true,"retryable":false}]}'), /errors\[0\]\.file/], ["bad-example-dry-run", Buffer.from('{"schema_version":1,"examples":[{"raw":"lark-cli docs +fetch","source_file":"skills/lark-doc/SKILL.md","line":3,"executable":true,"dry_run":{"method":"GET","url":"/open-apis/docx","query":{"page_size":["20",1]}}}]}'), /examples\[0\]\.dry_run\.query\.page_size\[1\]/], ["bad-output-field", Buffer.from(JSON.stringify({ schema_version: 1, outputs: [{ command: "drive files list", fields: ["ok", "x".repeat(9000)] }] })), /outputs\[0\]\.fields\[1\]/], + ["non-array-public-content", Buffer.from('{"schema_version":1,"public_content":{}}'), /public_content must be an array/], + ["bad-public-content-item", Buffer.from('{"schema_version":1,"public_content":["not-object"]}'), /public_content\[0\]/], + ["bad-public-content-action", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"BLOCK","file":"pull_request_metadata","line":1}]}'), /public_content\[0\]\.action/], + ["bad-public-content-path", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":"../x","line":1}]}'), /public_content\[0\]\.file/], + ["dot-slash-public-content-path", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":"./foo","line":1}]}'), /public_content\[0\]\.file/], + ["empty-public-content-path", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":"","line":1}]}'), /public_content\[0\]\.file/], + ["dot-public-content-path", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":".","line":1}]}'), /public_content\[0\]\.file/], + ["url-public-content-path", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":"https://example.invalid/x","line":1}]}'), /public_content\[0\]\.file/], + ["dotgit-public-content-path", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":".git/config","line":1}]}'), /public_content\[0\]\.file/], + ["windows-public-content-path", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":"C:\\\\tmp\\\\x","line":1}]}'), /public_content\[0\]\.file/], + ["bad-public-content-commit-ref", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_change_id_trailer","action":"REJECT","file":"commit:notasha","line":1}]}'), /public_content\[0\]\.file/], + ["bad-public-content-line", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":"pull_request_metadata","line":"1"}]}'), /public_content\[0\]\.line/], + ["zero-public-content-line", Buffer.from('{"schema_version":1,"public_content":[{"rule":"public_content_semantic_candidate","action":"WARNING","file":"pull_request_metadata","line":0}]}'), /public_content\[0\]\.line/], ["bad-diagnostic-action", Buffer.from('{"schema_version":1,"diagnostics":[{"rule":"r","action":"BLOCK","file":"x.go","line":1,"message":"m"}]}'), /diagnostics.*action/], ["long-message", Buffer.from(JSON.stringify({ schema_version: 1, diagnostics: [{ rule: "r", action: "REJECT", file: "x.go", line: 1, message: "x".repeat(9000) }] })), /too long/], ]) { diff --git a/scripts/semantic-review-workflow.test.sh b/scripts/semantic-review-workflow.test.sh index 0ce4cc45..9e664ce6 100644 --- a/scripts/semantic-review-workflow.test.sh +++ b/scripts/semantic-review-workflow.test.sh @@ -184,6 +184,10 @@ require_in_step "$summary_verify_step" 'eventHeadSha && eventHeadSha.toLowerCase require_in_step "$summary_verify_step" 'factsArtifactPattern' "PR quality summary should use the base-bound facts artifact name when available" require_in_step "$summary_verify_step" 'const baseSha = artifactBaseSha || eventBaseSha || pr.base.sha' "PR quality summary must prefer the CI-time artifact base SHA" require_in_step "$summary_verify_step" 'core.setOutput("artifact_error"' "PR quality summary must expose artifact binding failures" +require_in_step "$summary_verify_step" 'state: "all"' "PR quality summary fallback must inspect closed PRs before failing" +require_in_step "$summary_verify_step" 'candidate.state === "open"' "PR quality summary fallback must still prefer open PRs" +require_in_step "$summary_verify_step" 'workflow_run target PR is no longer open' "PR quality summary must skip stale workflow_run events after PR closure" +require_in_step "$summary_verify_step" 'pr.state !== "open"' "PR quality summary must skip direct workflow_run PR bindings after PR closure" require_in_step "$summary_artifact_step" 'factsArtifactName' "PR quality summary artifact step must use the verified facts artifact binding" require_in_step "$summary_extract_facts_step" 'SEMANTIC_REVIEW_DECISION_OUT' "PR quality summary artifact verifier must write an infrastructure decision on verifier failure" @@ -212,7 +216,12 @@ require_in_step "$verify_step" 'runPRs.length > 1' "semantic-review must fail cl require_in_step "$verify_step" 'listPullRequestsAssociatedWithCommit' "semantic-review must resolve fork workflow_run PRs when pull_requests is empty" require_in_step "$verify_step" 'commit_sha: targetHeadSha' "semantic-review fallback must resolve PRs by the workflow_run PR head SHA" require_in_step "$verify_step" 'github.rest.pulls.list' "semantic-review must have a pull-list fallback when commit association is empty" -require_in_step "$verify_step" 'candidatePRs.length > 1' "semantic-review must fail closed when commit-to-PR fallback is ambiguous" +require_in_step "$verify_step" 'openCandidatePRs.length > 1' "semantic-review must fail closed when commit-to-PR fallback is ambiguous" +require_in_step "$verify_step" 'state: "all"' "semantic-review fallback must inspect closed PRs before failing" +require_in_step "$verify_step" 'candidate.state === "open"' "semantic-review fallback must still prefer open PRs" +require_in_step "$verify_step" 'workflow_run target PR is no longer open' "semantic-review must skip stale workflow_run events after PR closure" +require_in_step "$verify_step" 'pr.state !== "open"' "semantic-review must skip direct workflow_run PR bindings after PR closure" +require_in_step "$verify_step" '!pr.head.repo' "semantic-review must skip unavailable PR head repositories before reading owner/repo" require_in_step "$verify_step" 'pr.head.sha !== targetHeadSha' "semantic-review must skip stale PR heads" require_in_step "$verify_step" 'eventBaseSha && parsedBaseSha.toLowerCase() !== eventBaseSha.toLowerCase()' "semantic-review should tolerate mutable workflow_run PR base metadata" require_in_step "$verify_step" 'const baseSha = artifactBaseSha || eventBaseSha || pr.base.sha' "semantic-review must prefer the CI-time artifact base SHA" @@ -260,6 +269,7 @@ require_in_step "$semantic_step" 'args+=(--waivers-file' "same-repo PR head waiv require_in_step "$precheckout_step" 'SEMANTIC_REVIEW_BASE_SHA' "pre-checkout failure publisher must receive verified base SHA" require_in_step "$precheckout_step" 'SEMANTIC_REVIEW_RUN_ID' "pre-checkout failure publisher must receive verified run id" require_in_step "$precheckout_step" 'github.rest.pulls.get' "pre-checkout failure publisher must recheck PR target before writing" +require_in_step "$precheckout_step" 'pull.state !== "open"' "pre-checkout failure publisher must skip closed PRs before writing" require_in_step "$precheckout_step" 'pull.head.sha !== headSha' "pre-checkout failure publisher must skip stale PR heads" require_in_step "$precheckout_step" 'pull.base.sha !== baseSha' "pre-checkout failure publisher must skip stale PR bases"