feat: check shortcut example commands against the live CLI tree (#1244)

Validate the example commands embedded in shortcut definitions (the "Example: lark-cli ..." lines in each shortcut's Tips, shown in --help) against the real command tree built by cmd.Build. Implemented entirely as test-only code in cmd/ (package cmd_test), so it ships in no binary and is not importable by product code; the truth source is cmd.Build, the same tree the binary uses, so the check cannot drift. It runs in the standard unit-test CI job (go test ./cmd/...); a renamed command or unaccepted flag in an example fails that job.
2026-07-03 14:02:43 +08:00 · 2026-06-05 10:59:55 +08:00
parent ac116e7ca3
commit a4a4bd6ee0
5 changed files with 788 additions and 0 deletions
--- a/cmd/cmdexample_parse_test.go
+++ b/cmd/cmdexample_parse_test.go
@@ -0,0 +1,222 @@
+// Copyright (c) 2026 Lark Technologies Pte. Ltd.
+// SPDX-License-Identifier: MIT
+
+package cmd_test
+
+import (
+	"regexp"
+	"strings"
+)
+
+// ref is one lark-cli command reference extracted from a shortcut example by parseLine.
+type ref struct {
+	line  int      // 1-based line number where the (possibly backslash-continued) command starts
+	raw   string   // "lark-cli" plus the kept tokens, rebuilt for error display
+	words []string // command words before the first flag (subcommand candidates), trailing punctuation stripped
+	flags []string // normalized flag names in order of use, e.g. "--query", "-q" (any "=value" removed)
+}
+
+const cliToken = "lark-cli" // literal program name parseLine searches for to find command references
+
+// subcommandStart guards against false positives from prose: a real command's
+// first word starts with an ASCII letter (a service name) or '+' (a shortcut).
+// A first word starting with anything else (CJK, digit, punctuation) is treated as narration.
+var subcommandStart = regexp.MustCompile(`^[A-Za-z+]`)
+
+// shellStops are standalone tokens that terminate a command (pipes, redirects,
+// separators). Only ';' and '|' glued to a token (`get;`, `foo|`) are split off inline by parseCmd.
+var shellStops = map[string]bool{
+	"|": true, "||": true, "&&": true, "&": true, ";": true,
+	">": true, ">>": true, "<": true, "2>": true, "2>&1": true,
+}
+
+// wordTrailPunct is the strings.TrimRight cutset of sentence / CJK punctuation
+// that can cling to a command word in prose ("auth login." / "auth login，");
+// it is stripped so the word still resolves instead of ending the command as narration.
+const wordTrailPunct = `.,;:!?"')]}，。、；：！？）】」』`
+
+// parseRefs extracts every lark-cli command reference from content (a
+// shortcut's Tips text, which may embed an "Example: lark-cli ..." command).
+// It is format-agnostic: it keys on the "lark-cli" token whether it sits in a
+// ```bash fence, an inline `code` span, or bare prose. Backslash continuations
+// are joined first, so a multi-line invocation (flags included) parses as one
+// command reported at its first line; a dangling final backslash is dropped.
+func parseRefs(content string) []ref {
+	var refs []ref
+	lines := strings.Split(content, "\n")
+	for i := 0; i < len(lines); i++ {
+		lineNo := i + 1
+		logical := lines[i]
+		for endsWithBackslash(logical) {
+			logical = strings.TrimRight(logical, " \t")
+			logical = logical[:len(logical)-1] // drop the trailing backslash
+			if i+1 == len(lines) {
+				break // nothing left to join: the backslash must not survive as a command word
+			}
+			i++
+			logical += " " + lines[i]
+		}
+		refs = append(refs, parseLine(logical, lineNo)...)
+	}
+	return refs
+}
+
+func endsWithBackslash(s string) bool { // true when s, ignoring trailing spaces/tabs, ends in a backslash
+	return strings.HasSuffix(strings.TrimRight(s, " \t"), `\`)
+}
+
+// parseLine returns a ref (tagged with lineNo) for each standalone "lark-cli" token in line. Boundaries are
+// checked against the full line, so a token glued to a previous match ("lark-clilark-cli x") is skipped.
+func parseLine(line string, lineNo int) []ref {
+	var refs []ref
+	for pos := 0; ; {
+		idx := strings.Index(line[pos:], cliToken)
+		if idx < 0 {
+			return refs
+		}
+		start := pos + idx
+		pos = start + len(cliToken) // resume the scan right after this occurrence
+		after := line[pos:]
+		if (start > 0 && !isBoundary(line[start-1])) || (after != "" && !isBoundary(after[0])) {
+			continue // embedded in a longer word, e.g. "my-lark-cli" or "lark-cli-x"
+		}
+		if words, flags, raw, ok := parseCmd(after); ok {
+			refs = append(refs, ref{line: lineNo, raw: cliToken + raw, words: words, flags: flags})
+		}
+	}
+}
+
+// parseCmd tokenizes the text following "lark-cli" into leading command words
+// (the subcommand path, up to the first flag) and normalized flag names, plus
+// raw: the kept tokens re-joined for display. It stops at a shell separator, an
+// inline-code backtick, a '#' token, or a placeholder/prose word; ok=false filters out non-commands.
+func parseCmd(after string) (words, flags []string, raw string, ok bool) {
+	// An inline code span ends at the next backtick; a command never spans one.
+	if i := strings.IndexByte(after, '`'); i >= 0 {
+		after = after[:i]
+	}
+	// Drop $(...) command substitutions so flags belonging to the inner command
+	// (e.g. `--data "$(jq -n --arg x ...)"`) are not mistaken for lark-cli flags.
+	after = stripCmdSubst(after)
+
+	var kept []string // tokens that go into raw, in their original (unstripped) form
+	inFlags := false  // becomes true once the first flag-shaped token is seen
+	for _, orig := range strings.Fields(after) {
+		tok := orig
+		if shellStops[tok] || strings.HasPrefix(tok, "#") {
+			break
+		}
+		// A ';' or '|' anywhere in a token (command word or flag value alike) ends
+		// the command mid-token ("get;", "foo|next"): keep the part before it, handle it, then stop.
+		stop := false
+		if i := strings.IndexAny(tok, ";|"); i >= 0 {
+			tok, stop = tok[:i], true
+		}
+		switch {
+		case tok == "" || tok == "-":
+			// empty (after a glued separator) or a bare stdin marker — skip
+		case strings.HasPrefix(tok, "-"):
+			if f := normalizeFlag(tok); f != "" { // dash tokens that are not flag-shaped (e.g. "-1") are dropped
+				inFlags = true
+				flags = append(flags, f)
+				kept = append(kept, tok)
+			}
+		case inFlags:
+			// positional / flag value after the first flag — not a command word
+			kept = append(kept, tok)
+		default:
+			// Command-path word. ASCII placeholder markers (<x>, [x], {x|y},
+			// +<verb>, ...) end the command — checked on the RAW token so the
+			// trailing-punct stripping below cannot erase a "..." ellipsis
+			// ("base +..." must stay a placeholder, not become "+").
+			if strings.ContainsAny(tok, "<>[]{}|") || strings.Contains(tok, "...") {
+				stop = true
+				break // exits the switch only; the stop check below ends the loop
+			}
+			// Strip trailing sentence/CJK punctuation so "login." / "login，"
+			// resolve to "login"; non-ASCII narration ends the command.
+			w := strings.TrimRight(tok, wordTrailPunct)
+			if w == "" || hasNonASCII(w) {
+				stop = true
+				break // exits the switch only; the stop check below ends the loop
+			}
+			words = append(words, w)
+			kept = append(kept, tok)
+		}
+		if stop {
+			break
+		}
+	}
+	if len(kept) > 0 {
+		raw = " " + strings.Join(kept, " ")
+	}
+	// Keep root-only refs ("lark-cli --help") and refs whose first word looks
+	// like a subcommand; drop prose ("lark-cli 就能搞定 ...").
+	if len(words) == 0 {
+		return words, flags, raw, len(flags) > 0
+	}
+	if !subcommandStart.MatchString(words[0]) {
+		return nil, nil, "", false
+	}
+	return words, flags, raw, true
+}
+
+// stripCmdSubst removes $(...) command substitutions (including nested ones)
+// from s, leaving the surrounding text intact; an unterminated "$(" drops the
+// rest of s. Backticks are handled upstream (parseCmd cuts at the first backtick).
+func stripCmdSubst(s string) string {
+	var b strings.Builder
+	depth := 0 // parenthesis nesting inside the substitution being skipped; 0 = outside
+	for i := 0; i < len(s); i++ {
+		if depth == 0 && i+1 < len(s) && s[i] == '$' && s[i+1] == '(' {
+			depth = 1
+			i++ // skip '('
+			continue
+		}
+		if depth > 0 {
+			switch s[i] {
+			case '(':
+				depth++
+			case ')':
+				depth--
+			}
+			continue // every byte inside the substitution, parentheses included, is discarded
+		}
+		b.WriteByte(s[i])
+	}
+	return b.String()
+}
+
+// isPlaceholderOrProse reports whether w is a doc placeholder (<resource>,
+// [flags], {a|b}, +<verb>, ...) or narration (any non-ASCII rune) rather than a
+// literal command token. NOTE(review): not called in this file; parseCmd inlines similar checks.
+func isPlaceholderOrProse(w string) bool {
+	if hasNonASCII(w) {
+		return true
+	}
+	return strings.ContainsAny(w, "<>[]{}|") || strings.Contains(w, "...")
+}
+
+func hasNonASCII(s string) bool { // reports whether s contains any rune above U+007F
+	return strings.IndexFunc(s, func(r rune) bool { return r > 127 }) >= 0
+}
+
+// flagShape matches the leading flag name of a token (one or two dashes, an
+// ASCII letter, then letters, digits, '_' or '-'), leaving out trailing junk such
+// as a "=value" suffix or punctuation that bled in from the surrounding markdown
+// ("--help\"", "--help;", "--params={}"). '_' is allowed because real flags use it ("--input_format").
+var flagShape = regexp.MustCompile(`^--?[A-Za-z][A-Za-z0-9_-]*`)
+
+// normalizeFlag returns the leading flagShape match of tok (the canonical flag
+// name), or "" if tok is not a real flag (e.g. a shell-string fragment like "-草稿'").
+func normalizeFlag(tok string) string {
+	return flagShape.FindString(tok)
+}
+
+func isBoundary(b byte) bool { // reports whether b may directly precede or follow a standalone "lark-cli" token
+	switch b {
+	case ' ', '\t', '`', '(', ')', '\'', '"', '*':
+		return true
+	}
+	return false
+}