mirror of
https://github.com/larksuite/cli.git
synced 2026-07-06 00:06:28 +08:00
* feat(mail): HTML lint library + Larksuite-native autofix + lark-mail skill 为 lark-cli mail 域写信链路引入 HTML lint 能力,提升邮件 HTML 的兼容性、 安全性与 Larksuite-native 格式适配。 lint 库(shortcuts/mail/lint/): - 四档分类:pass / native-autofix / warn-autofix / error-strip - 安全规则覆盖 script / iframe / on* 事件处理器 / javascript: 及其它 危险 URL scheme 等 XSS 向量,未知 scheme 一律删除并归 error - Larksuite-native 格式自动修复:双层 div 段落、原生多级列表结构、 灰边引用、Larksuite 蓝链接 - cleaned_html 输出确定性稳定(位置索引派生 data-ol-id),便于 golden-file 测试与缓存 +lint-html 独立预检 shortcut: - 只读、不调 API、不建草稿,供 AI / 用户 / CI 在写信前预览 lint 结果 写入路径内置 lint(6 个 compose shortcut): - +send / +draft-create / +draft-edit / +reply / +reply-all / +forward 在 emlbuilder 之前强制 lint 净化 HTML - 默认 envelope 对 lint 改动透明(无 lint 字段),保持小巧供 AI 消费; --show-lint-details 显式取证返回 lint_applied[] / original_blocked[] - --body-file 支持从文件读取 body(32MB 上限),与 --body 互斥 预制 HTML 邮件模板(skills/lark-mail/assets/templates/): - 资讯周报 / 个人周报 / 团队周报 / 调研报告 / 求职简历 5 套 - 按 Larksuite mail-editor 原生格式编写,含正确的多级列表嵌套结构 lark-mail skill 文档: - references/lark-mail-html.md:邮件 HTML 写法指南(24 个格式 section + 颜色调色盘 + URL scheme + 官方模板套用流程) - references/lark-mail-lint-html.md:+lint-html 用法 - SKILL.md 顶部 CRITICAL 引导 * fix(mail): remove unused readAttr func and apply gofmt Drop the unused `readAttr` helper in shortcuts/mail/lint/linter.go that was flagged by golangci-lint (unused linter). Apply gofmt to linter.go and rules.go which had minor formatting issues. * fix(mail): address compose lint and guidance
354 lines
12 KiB
Go
354 lines
12 KiB
Go
// Copyright (c) 2026 Lark Technologies Pte. Ltd.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package lint
|
|
|
|
import "strings"
|
|
|
|
// Rule IDs surfaced through Finding.RuleID. UPPER_SNAKE_CASE naming is the
// contract for the stdout envelope. New rules MUST keep this naming convention
// so AI / test consumers can pattern-match reliably.
const (
	// Tag-level rules.
	RuleTagFontToSpan      = "TAG_FONT_TO_SPAN"
	RuleTagCenterToDiv     = "TAG_CENTER_TO_DIV"
	RuleTagMarqueeToText   = "TAG_MARQUEE_TO_TEXT"
	RuleTagBlinkToText     = "TAG_BLINK_TO_TEXT"
	RuleTagScriptBlocked   = "TAG_SCRIPT_BLOCKED"
	RuleTagIframeBlocked   = "TAG_IFRAME_BLOCKED"
	RuleTagObjectBlocked   = "TAG_OBJECT_BLOCKED"
	RuleTagEmbedBlocked    = "TAG_EMBED_BLOCKED"
	RuleTagFormBlocked     = "TAG_FORM_BLOCKED"
	RuleTagInputBlocked    = "TAG_INPUT_BLOCKED"
	RuleTagLinkBlocked     = "TAG_LINK_BLOCKED"
	RuleTagMetaBlocked     = "TAG_META_BLOCKED"
	RuleTagBaseBlocked     = "TAG_BASE_BLOCKED"
	RuleTagUnknownStripped = "TAG_UNKNOWN_STRIPPED"

	// Attribute-level rules.
	RuleAttrEventHandlerBlocked = "ATTR_EVENT_HANDLER_BLOCKED"
	RuleAttrJSURLBlocked        = "ATTR_JS_URL_BLOCKED"
	RuleAttrUnsafeSchemeBlocked = "ATTR_UNSAFE_SCHEME_BLOCKED"

	// Style-level rules.
	RuleStylePropertyDropped = "STYLE_PROPERTY_DROPPED"

	// Feishu-native autofix rules. These autofix the inline style /
	// class / nesting shape of common elements so AI-authored HTML
	// matches what Feishu mail-editor itself emits, fixing the visual
	// "extra blank line between blocks", "list bullets/numbers missing",
	// "link color wrong" etc. classes of issues. The rewrite is purely
	// additive — user-supplied inline styles take precedence; the lib
	// only fills the missing properties.
	RuleStyleListNative       = "STYLE_LIST_NATIVE_INLINE_APPLIED"
	RuleStyleListItemNative   = "STYLE_LIST_ITEM_NATIVE_INLINE_APPLIED"
	RuleStyleBlockquoteNative = "STYLE_BLOCKQUOTE_NATIVE_INLINE_APPLIED"
	RuleStyleLinkNative       = "STYLE_LINK_NATIVE_INLINE_APPLIED"
	RuleStyleParaWrapper      = "STYLE_PARA_WRAPPER_REWRITTEN"

	// RuleListDirectChildNonLI fires when a <ul> or <ol> has a non-<li>
	// element child (e.g. nested <ul><ul>). HTML spec requires list children
	// to be <li>; browsers silently hoist the nested list out and the visual
	// nesting falls apart. The lib autofixes by wrapping the offending child
	// in a synthetic <li>.
	RuleListDirectChildNonLI = "LIST_DIRECT_CHILD_NON_LI"
)
|
|
|
|
// Tag classification ----------------------------------------------------------
|
|
|
|
// allowedTags enumerates tags that pass through verbatim (tag classification row "通过").
// Lower-case canonical names; the parser normalises tag names so we don't need
// case-insensitive comparison at lookup time.
var allowedTags = map[string]bool{
	"p":          true,
	"div":        true,
	"span":       true,
	"br":         true,
	"hr":         true,
	"a":          true,
	"img":        true,
	"table":      true,
	"thead":      true,
	"tbody":      true,
	"tfoot":      true,
	"tr":         true,
	"td":         true,
	"th":         true,
	"ul":         true,
	"ol":         true,
	"li":         true,
	"blockquote": true,
	"pre":        true,
	"code":       true,
	"b":          true,
	"i":          true,
	"em":         true,
	"strong":     true,
	"u":          true,
	"s":          true,
	"strike":     true,
	"h1":         true,
	"h2":         true,
	"h3":         true,
	"h4":         true,
	"h5":         true,
	"h6":         true,
	"sub":        true,
	"sup":        true,
	"section":    true,
	"article":    true,
	"header":     true,
	"footer":     true,
	"nav":        true,
	"main":       true,
	"figure":     true,
	"figcaption": true,
	"caption":    true,
	"colgroup":   true,
	"col":        true,
	// Document structural tags (golang.org/x/net/html always wraps fragments
	// in <html><head><body>); we treat them as transparent so the wrapper
	// nodes the parser inserts don't generate spurious findings.
	"html": true,
	"head": true,
	"body": true,
}
|
|
|
|
// blockedTags enumerates tags whose content is removed in full and a
|
|
// SeverityError finding is emitted (tag classification row "错误(删除)"). Each entry
|
|
// maps to the rule id surfaced in Finding.RuleID.
|
|
var blockedTags = map[string]string{
|
|
"script": RuleTagScriptBlocked,
|
|
"iframe": RuleTagIframeBlocked,
|
|
"object": RuleTagObjectBlocked,
|
|
"embed": RuleTagEmbedBlocked,
|
|
"form": RuleTagFormBlocked,
|
|
"input": RuleTagInputBlocked,
|
|
"select": RuleTagInputBlocked,
|
|
"option": RuleTagInputBlocked,
|
|
"button": RuleTagInputBlocked,
|
|
"link": RuleTagLinkBlocked,
|
|
"meta": RuleTagMetaBlocked,
|
|
"base": RuleTagBaseBlocked,
|
|
}
|
|
|
|
// warnAutofixTags enumerates tags rewritten when AutoFix is true (tag
|
|
// classification row "警告 + 自动修复"). The replacement strategy is per-tag.
|
|
var warnAutofixTags = map[string]string{
|
|
"font": RuleTagFontToSpan,
|
|
"center": RuleTagCenterToDiv,
|
|
"marquee": RuleTagMarqueeToText,
|
|
"blink": RuleTagBlinkToText,
|
|
}
|
|
|
|
// classifyTag returns the rule kind for the given lower-case tag name.
|
|
//
|
|
// kind is one of "allow", "warn", "error", "unknown". For "warn" / "error",
|
|
// ruleID names the firing rule; for "unknown", the caller falls back to
|
|
// allow-list-by-default but emits a hint via RuleTagUnknownStripped only when
|
|
// the tag is structurally suspect (e.g. <object>-like). The cli's existing
|
|
// `htmlTagRe` regex is the de-facto allow-list shipping with the codebase, so
|
|
// we don't aggressively flag anything outside `allowedTags` — drop-through
|
|
// preserves user intent for niche tags (e.g. `<details>` / `<summary>`) that
|
|
// browsers + Feishu native renderer already handle.
|
|
func classifyTag(tag string) (kind, ruleID string) {
|
|
tag = strings.ToLower(tag)
|
|
if allowedTags[tag] {
|
|
return "allow", ""
|
|
}
|
|
if id, ok := blockedTags[tag]; ok {
|
|
return "error", id
|
|
}
|
|
if id, ok := warnAutofixTags[tag]; ok {
|
|
return "warn", id
|
|
}
|
|
// Unknown / niche tags: pass through silently. The cli's existing
|
|
// `htmlTagRe` (mail_quote.go:333) tolerates them too. Users authoring
|
|
// HTML in Feishu native classes (`adit-html-block*`, `history-quote-*`,
|
|
// `lark-mail-doc-quote`) hit this path — they MUST pass through unchanged
|
|
// so reply / forward quote markup survives lint round-trips.
|
|
return "allow", ""
|
|
}
|
|
|
|
// Attribute / URL / style classification --------------------------------------
|
|
|
|
// allowedURLSchemes lists URL schemes that pass through hyperlink-bearing
// attrs (`href`, `src`, `cite`, `formaction` etc.): http(s), mailto and cid.
// data:image/* is also accepted, but that case is handled separately in
// classifyURLValue rather than through this map. Empty / relative URLs (no
// scheme) are always allowed because they resolve relatively at render time
// and pose no injection vector.
var allowedURLSchemes = map[string]bool{
	"http":   true,
	"https":  true,
	"mailto": true,
	"cid":    true,
}
|
|
|
|
// blockedURLSchemes is the explicit deny-list of lower-case scheme names.
// The data: scheme is not listed here; it is special-cased in
// classifyURLValue, which accepts only data:image/*.
var blockedURLSchemes = map[string]bool{
	"javascript": true,
	"vbscript":   true,
	"file":       true,
}
|
|
|
|
// classifyURLValue returns ("ok", "") if the URL value is acceptable, or
|
|
// ("error", ruleID) when it must be removed (javascript:/vbscript:/file:),
|
|
// or ("warn", ruleID) when the scheme is unrecognised but not actively
|
|
// dangerous. Empty values pass through (browsers ignore them).
|
|
func classifyURLValue(raw string) (kind, ruleID string) {
|
|
value := strings.TrimSpace(raw)
|
|
if value == "" {
|
|
return "ok", ""
|
|
}
|
|
// Strip leading whitespace + control bytes that could obscure the
|
|
// scheme (e.g. "java\tscript:..."). The html-parser already strips
|
|
// stray whitespace at attribute boundaries; this is defence-in-depth
|
|
// for older clients that paste from Word with U+0009 / U+0020 inside
|
|
// the scheme prefix.
|
|
value = strings.Map(func(r rune) rune {
|
|
if r < 0x20 || r == 0x7F {
|
|
return -1
|
|
}
|
|
return r
|
|
}, value)
|
|
|
|
// Find the colon delimiter; everything before it is the scheme.
|
|
colon := strings.IndexByte(value, ':')
|
|
if colon < 0 {
|
|
// No scheme → relative URL → allow.
|
|
return "ok", ""
|
|
}
|
|
scheme := strings.ToLower(value[:colon])
|
|
rest := value[colon+1:]
|
|
|
|
switch {
|
|
case allowedURLSchemes[scheme]:
|
|
return "ok", ""
|
|
case scheme == "data":
|
|
// data:image/* is whitelisted; anything else (e.g. data:text/html;...)
|
|
// is rejected. The check tolerates any subtype under image/* (png /
|
|
// jpeg / gif / svg+xml / webp) so users embedding base64 thumbnails
|
|
// don't trip the rule.
|
|
rest = strings.TrimSpace(rest)
|
|
if strings.HasPrefix(strings.ToLower(rest), "image/") {
|
|
return "ok", ""
|
|
}
|
|
return "error", RuleAttrJSURLBlocked
|
|
case blockedURLSchemes[scheme]:
|
|
return "error", RuleAttrJSURLBlocked
|
|
default:
|
|
// Unknown scheme: surface a warning so users see it but don't
|
|
// drop legitimate webcal:/tel: / similar in case downstream
|
|
// renders eventually support them.
|
|
return "warn", RuleAttrUnsafeSchemeBlocked
|
|
}
|
|
}
|
|
|
|
// urlAttributes lists attributes whose value is a URL and must therefore
// pass classifyURLValue. Lower-case canonical names.
var urlAttributes = map[string]bool{
	"href":       true,
	"src":        true,
	"cite":       true,
	"formaction": true,
	"action":     true,
	"background": true,
	"poster":     true,
}
|
|
|
|
// allowedStyleProps enumerates CSS property names that pass through the
// inline `style="..."` attribute. Everything else is removed from the
// property list and surfaced via STYLE_PROPERTY_DROPPED.
//
// `border-*` / `padding-*` / `margin-*` are treated as prefix matches by
// classifyStyleProperty so the four directional variants (border-top etc.)
// are all admitted without enumerating each.
var allowedStyleProps = map[string]bool{
	"color":            true,
	"background-color": true,
	"font-size":        true,
	"font-weight":      true,
	"font-style":       true,
	"text-align":       true,
	"text-decoration":  true,
	"line-height":      true,
	"padding":          true,
	"margin":           true,
	"border":           true,
	"width":            true,
	"height":           true,
	"display":          true,
	"text-indent":      true,
	// Quote-block / native Feishu styles (tag classification "通过").
	// Whitespace + word-break are part of the existing `<pre>` / quote
	// wrapper styles in mail_quote.go (e.g. `bodyDivStyle`).
	"white-space":         true,
	"word-break":          true,
	"word-wrap":           true,
	"overflow":            true,
	"overflow-wrap":       true,
	"vertical-align":      true,
	"list-style":          true,
	"list-style-type":     true,
	"list-style-position": true,
	"transition":          true,
	"font-family":         true,
	"text-transform":      true,
	"hyphens":             true,
	"max-width":           true,
	"min-width":           true,
	"max-height":          true,
	"min-height":          true,
	"border-radius":       true,
	"box-sizing":          true,
	"opacity":             true,
	"cursor":              true,
}
|
|
|
|
// stylePropAllowedPrefixes enumerates property name prefixes treated as
// allowed regardless of suffix (e.g. "border-*"). A trailing "-" makes the
// prefix self-documenting.
//
// NOTE(review): because the match is on the prefix alone, every name that
// starts with one of these (not only the directional variants) is admitted —
// confirm that is the intended breadth.
var stylePropAllowedPrefixes = []string{
	"border-",
	"padding-",
	"margin-",
}
|
|
|
|
// classifyStyleProperty reports whether the given lower-case property name
|
|
// is in the allow-list (incl. prefix matches).
|
|
func classifyStyleProperty(name string) bool {
|
|
name = strings.ToLower(strings.TrimSpace(name))
|
|
if name == "" {
|
|
return false
|
|
}
|
|
if allowedStyleProps[name] {
|
|
return true
|
|
}
|
|
for _, p := range stylePropAllowedPrefixes {
|
|
if strings.HasPrefix(name, p) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// isEventHandlerAttr reports whether the attribute name is a DOM event
// handler (`on*`). The lib removes every such attribute regardless of its
// value (tag classification row "错误(删除)" + the well-known XSS vector).
//
// The name is trimmed and lower-cased, then must be "on" followed by at
// least one more byte, and that byte must be an ASCII letter or digit.
func isEventHandlerAttr(name string) bool {
	name = strings.ToLower(strings.TrimSpace(name))
	// Need the "on" prefix plus at least one following byte to inspect.
	if len(name) <= 2 || !strings.HasPrefix(name, "on") {
		return false
	}
	// Only "on" + letter/digit counts as a handler (onclick, onerror, ...).
	// Names that continue with punctuation, such as "on-something", are
	// deliberately not matched.
	c := name[2]
	return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')
}
|