diff --git a/shortcuts/doc/docs_fetch_im_markdown.go b/shortcuts/doc/docs_fetch_im_markdown.go new file mode 100644 index 00000000..13b37002 --- /dev/null +++ b/shortcuts/doc/docs_fetch_im_markdown.go @@ -0,0 +1,861 @@ +// Copyright (c) 2026 Lark Technologies Pte. Ltd. +// SPDX-License-Identifier: MIT + +package doc + +import ( + "fmt" + "html" + "net/url" + "regexp" + "strings" + "unicode/utf8" +) + +type imMarkdownContext struct { + baseURL string + blockquoteDepth int +} + +type imMarkdownHandleFunc func(segment, inner string, attrs map[string]string, imCtx imMarkdownContext) string + +type imMarkdownTagHandler struct { + closeRE *regexp.Regexp + handle imMarkdownHandleFunc +} + +func registerIMMarkdownHandler(tag string, handle imMarkdownHandleFunc) { + imMarkdownHandlers[tag] = imMarkdownTagHandler{ + closeRE: regexp.MustCompile(`(?is)<(/?)` + regexp.QuoteMeta(tag) + `(?:\s[^<>]*?)?\s*/?>`), + handle: handle, + } +} + +var ( + imMarkdownTagStartRE = regexp.MustCompile(`(?s)<([A-Za-z][A-Za-z0-9:_-]*)(?:\s[^<>]*?)?\s*/?>`) + imMarkdownAttrRE = regexp.MustCompile(`([A-Za-z_:][A-Za-z0-9_:.-]*)\s*=\s*(?:"([^"]*)"|'([^']*)')`) + imMarkdownRowTagRE = regexp.MustCompile(`(?is)<(/?)tr\b[^>]*?\s*/?>`) + imMarkdownCellTagRE = regexp.MustCompile(`(?is)<(/?)t[dh]\b[^>]*?\s*/?>`) + imMarkdownCellBreakRE = regexp.MustCompile(`(?i)`) + imMarkdownAnyTagRE = regexp.MustCompile(`(?s)]*?)?>`) + imMarkdownLinkRE = regexp.MustCompile(`(?is)]*\bhref=(?:"([^"]*)"|'([^']*)')[^>]*>(.*?)`) + imMarkdownCodeBlockRE = regexp.MustCompile(`(?is)^\s*]*?)?>(.*?)\s*$`) + imMarkdownLiOpenRE = regexp.MustCompile(`(?is)]*?)?>`) + imMarkdownLiCloseRE = regexp.MustCompile(`(?is)<(/?)li(?:\s[^<>]*?)?\s*/?>`) +) + +var imMarkdownHandlers = map[string]imMarkdownTagHandler{} + +func init() { + registerIMMarkdownHandler("title", handleIMMarkdownTitle) + for level := 1; level <= 9; level++ { + registerIMMarkdownHandler(fmt.Sprintf("h%d", level), handleIMMarkdownHeading(level)) + } + 
registerIMMarkdownHandler("p", handleIMMarkdownParagraph) + registerIMMarkdownHandler("ul", handleIMMarkdownUnorderedList) + registerIMMarkdownHandler("ol", handleIMMarkdownOrderedList) + registerIMMarkdownHandler("li", handleIMMarkdownListItem) + registerIMMarkdownHandler("callout", handleIMMarkdownCallout) + registerIMMarkdownHandler("blockquote", handleIMMarkdownBlockquote) + registerIMMarkdownHandler("grid", handleIMMarkdownPassthroughContainer) + registerIMMarkdownHandler("column", handleIMMarkdownColumn) + registerIMMarkdownHandler("table", handleIMMarkdownTable) + registerIMMarkdownHandler("colgroup", handleIMMarkdownDiscard) + registerIMMarkdownHandler("col", handleIMMarkdownDiscard) + registerIMMarkdownHandler("pre", handleIMMarkdownPre) + registerIMMarkdownHandler("code", handleIMMarkdownCode) + registerIMMarkdownHandler("latex", handleIMMarkdownLatex) + registerIMMarkdownHandler("hr", handleIMMarkdownHorizontalRule) + registerIMMarkdownHandler("img", handleIMMarkdownImage) + registerIMMarkdownHandler("figure", handleIMMarkdownDiscard) + registerIMMarkdownHandler("source", handleIMMarkdownSource) + registerIMMarkdownHandler("button", handleIMMarkdownDiscard) + registerIMMarkdownHandler("time", handleIMMarkdownDiscard) + registerIMMarkdownHandler("whiteboard", handleIMMarkdownInlineCode) + registerIMMarkdownHandler("sheet", handleIMMarkdownSheet) + registerIMMarkdownHandler("task", handleIMMarkdownConditionalResourceLabel("任务", "task-id", "guid", "token", "id")) + registerIMMarkdownHandler("chat_card", handleIMMarkdownConditionalResourceLabel("群聊卡片", "chat-id", "chat_id", "id")) + registerIMMarkdownHandler("bitable", handleIMMarkdownResourceLabel("多维表格")) + registerIMMarkdownHandler("base_refer", handleIMMarkdownResourceLabel("多维表格")) + registerIMMarkdownHandler("okr", handleIMMarkdownResourceLabel("OKR")) + registerIMMarkdownHandler("poll", handleIMMarkdownDiscard) + registerIMMarkdownHandler("agenda", handleIMMarkdownDiscard) + 
// isIMMarkdownFetch reports whether the runtime's "doc-format" option, with
// surrounding whitespace ignored, is exactly "im-markdown". The comparison is
// case-sensitive.
func isIMMarkdownFetch(runtime interface{ Str(string) string }) bool {
	format := runtime.Str("doc-format")
	format = strings.TrimSpace(format)
	return format == "im-markdown"
}
// imMarkdownBaseURLFromInput derives a "scheme://host" base from raw.
//
// An absolute URL yields its own scheme and host. Otherwise the text in front
// of the first "/docx/", "/wiki/" or "/doc/" marker (tried in that order) is
// examined: it is accepted as-is when it parses with a scheme and host, or
// assumed to be an https host when the resulting host contains a dot. The
// second result is false when no base could be derived.
func imMarkdownBaseURLFromInput(raw string) (string, bool) {
	if raw == "" {
		return "", false
	}

	// schemeAndHost returns "scheme://host" when s parses as a URL carrying both parts.
	schemeAndHost := func(s string) (string, bool) {
		parsed, err := url.Parse(s)
		if err != nil || parsed.Scheme == "" || parsed.Host == "" {
			return "", false
		}
		return parsed.Scheme + "://" + parsed.Host, true
	}

	if base, ok := schemeAndHost(raw); ok {
		return base, true
	}

	for _, marker := range [...]string{"/docx/", "/wiki/", "/doc/"} {
		cut := strings.Index(raw, marker)
		if cut < 1 {
			// Marker absent, or nothing precedes it.
			continue
		}
		prefix := strings.Trim(raw[:cut], "/")
		if prefix == "" {
			continue
		}
		if base, ok := schemeAndHost(prefix); ok {
			return base, true
		}
		// Scheme-less input: accept it only when the host looks like a domain name.
		assumed, err := url.Parse("https://" + prefix)
		if err == nil && assumed.Host != "" && strings.Contains(assumed.Host, ".") {
			return "https://" + assumed.Host, true
		}
	}
	return "", false
}
+ out.WriteString(content[offset:openEnd]) + offset = openEnd + continue + } + + out.WriteString(content[offset:start]) + opening := content[start:openEnd] + attrs := parseIMMarkdownAttrs(opening) + if isSelfClosingIMMarkdownTag(opening) { + out.WriteString(handler.handle(opening, "", attrs, imCtx)) + offset = openEnd + continue + } + + // Use the handler's precompiled close regexp to find the matching end tag. + // Depth tracking keeps nested same-name containers paired correctly. + closeStart, closeEnd, found := findIMMarkdownClosingTag(content, openEnd, handler) + if !found { + // Malformed or truncated fragments are preserved as-is from the opening + // tag onward; do not drop content when the XML-ish structure is incomplete. + out.WriteString(content[start:]) + break + } + segment := content[start:closeEnd] + inner := content[openEnd:closeStart] + out.WriteString(handler.handle(segment, inner, attrs, imCtx)) + offset = closeEnd + } + return out.String() +} + +func findIMMarkdownClosingTag(content string, from int, handler imMarkdownTagHandler) (int, int, bool) { + depth := 1 + for _, loc := range handler.closeRE.FindAllStringSubmatchIndex(content[from:], -1) { + start := from + loc[0] + end := from + loc[1] + token := content[start:end] + if loc[2] >= 0 && content[from+loc[2]:from+loc[3]] == "/" { + depth-- + if depth == 0 { + return start, end, true + } + continue + } + if !isSelfClosingIMMarkdownTag(token) { + depth++ + } + } + return 0, 0, false +} + +func parseIMMarkdownAttrs(opening string) map[string]string { + attrs := map[string]string{} + for _, match := range imMarkdownAttrRE.FindAllStringSubmatch(opening, -1) { + value := match[2] + if value == "" { + value = match[3] + } + attrs[strings.ToLower(match[1])] = html.UnescapeString(value) + } + return attrs +} + +func isSelfClosingIMMarkdownTag(tag string) bool { + return strings.HasSuffix(strings.TrimSpace(tag), "/>") +} + +func handleIMMarkdownTitle(_ string, inner string, _ map[string]string, imCtx 
imMarkdownContext) string { + text := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + if text == "" { + return "" + } + return "# " + text +} + +func handleIMMarkdownHeading(level int) imMarkdownHandleFunc { + return func(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + text := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + if text == "" { + return "" + } + markdownLevel := level + if markdownLevel > 6 { + markdownLevel = 6 + } + return strings.Repeat("#", markdownLevel) + " " + text + } +} + +func handleIMMarkdownParagraph(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + body := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + if body == "" { + return "" + } + if imCtx.inBlockquote() { + return body + "\n" + } + return body +} + +func handleIMMarkdownUnorderedList(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + return convertIMMarkdownListItems(inner, false, imCtx) +} + +func handleIMMarkdownOrderedList(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + return convertIMMarkdownListItems(inner, true, imCtx) +} + +func handleIMMarkdownListItem(_ string, inner string, attrs map[string]string, imCtx imMarkdownContext) string { + prefix := "-" + if seq := strings.TrimSpace(attrs["seq"]); seq != "" && seq != "auto" { + prefix = strings.TrimSuffix(seq, ".") + "." 
+ } + body := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + if body == "" { + return "" + } + return prefix + " " + indentIMMarkdownListContinuation(body) + "\n" +} + +func handleIMMarkdownCallout(_ string, inner string, attrs map[string]string, imCtx imMarkdownContext) string { + body := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + emoji := strings.TrimSpace(attrs["emoji"]) + if emoji != "" { + if body == "" { + body = emoji + } else { + body = emoji + " " + body + } + } + if body == "" { + return "---\n---" + } + return fmt.Sprintf("---\n%s\n---", body) +} + +func handleIMMarkdownBlockquote(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + body := strings.TrimSpace(convertToIMMarkdown(inner, imCtx.withBlockquote())) + if body == "" { + return "" + } + lines := strings.Split(body, "\n") + for i, line := range lines { + if strings.TrimSpace(line) == "" { + lines[i] = ">" + continue + } + lines[i] = "> " + line + } + return strings.Join(lines, "\n") +} + +func handleIMMarkdownPassthroughContainer(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + return strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) +} + +func handleIMMarkdownColumn(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + body := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + if body == "" { + return "" + } + return body + "\n" +} + +func handleIMMarkdownDiscard(_ string, _ string, _ map[string]string, _ imMarkdownContext) string { + return "" +} + +func handleIMMarkdownInlineCode(segment string, _ string, _ map[string]string, _ imMarkdownContext) string { + return imMarkdownInlineCode(segment) +} + +func handleIMMarkdownPre(_ string, inner string, attrs map[string]string, _ imMarkdownContext) string { + lang := strings.TrimSpace(attrs["lang"]) + code := strings.TrimSpace(inner) + if match := imMarkdownCodeBlockRE.FindStringSubmatch(code); match != nil { + code = match[1] + } + 
return imMarkdownFencedCode(html.UnescapeString(code), lang) +} + +func handleIMMarkdownCode(_ string, inner string, _ map[string]string, _ imMarkdownContext) string { + return imMarkdownInlineCode(markdownPlainText(inner)) +} + +func handleIMMarkdownLatex(_ string, inner string, _ map[string]string, _ imMarkdownContext) string { + expr := strings.TrimSpace(markdownPlainText(inner)) + if expr == "" { + return "" + } + return "$" + strings.ReplaceAll(expr, "$", `\$`) + "$" +} + +func handleIMMarkdownHorizontalRule(_ string, _ string, _ map[string]string, _ imMarkdownContext) string { + return "---" +} + +func handleIMMarkdownImage(_ string, _ string, attrs map[string]string, _ imMarkdownContext) string { + href := firstNonEmpty(attrs["href"], attrs["src"], attrs["url"]) + if href == "" { + return "" + } + alt := firstNonEmpty(attrs["alt"], attrs["name"], attrs["title"]) + return fmt.Sprintf("![%s](%s)", escapeMarkdownLinkText(alt), escapeMarkdownLinkDestination(href)) +} + +func handleIMMarkdownSource(_ string, _ string, attrs map[string]string, _ imMarkdownContext) string { + name := strings.TrimSpace(attrs["name"]) + if name == "" { + return "" + } + return imMarkdownInlineCode(name) +} + +func handleIMMarkdownResourceLabel(label string) imMarkdownHandleFunc { + return func(_ string, _ string, _ map[string]string, _ imMarkdownContext) string { + return imMarkdownInlineCode(label) + } +} + +func handleIMMarkdownConditionalResourceLabel(label string, attrNames ...string) imMarkdownHandleFunc { + return func(_ string, _ string, attrs map[string]string, _ imMarkdownContext) string { + for _, attrName := range attrNames { + if strings.TrimSpace(attrs[attrName]) != "" { + return imMarkdownInlineCode(label) + } + } + return "" + } +} + +func handleIMMarkdownSheet(segment string, _ string, attrs map[string]string, imCtx imMarkdownContext) string { + token := strings.TrimSpace(attrs["token"]) + if token == "" { + return imMarkdownInlineCode(segment) + } + label := "sheet" 
+ if sheetID := strings.TrimSpace(attrs["sheet-id"]); sheetID != "" { + label = "sheet " + sheetID + } + return markdownLink(label, strings.TrimRight(imCtx.baseURL, "/")+"/sheets/"+token) +} + +func handleIMMarkdownBookmark(segment string, inner string, attrs map[string]string, imCtx imMarkdownContext) string { + href := strings.TrimSpace(attrs["href"]) + name := firstNonEmpty(attrs["name"], attrs["title"], markdownLinkLabelText(convertToIMMarkdown(inner, imCtx)), href) + if href == "" { + return name + } + return markdownLink(name, href) +} + +func handleIMMarkdownStrong(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + body := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + if body == "" { + return "" + } + return "**" + body + "**" +} + +func handleIMMarkdownEmphasis(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + body := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + if body == "" { + return "" + } + return "*" + body + "*" +} + +func handleIMMarkdownDelete(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + body := strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) + if body == "" { + return "" + } + return "~~" + body + "~~" +} + +func handleIMMarkdownPlainInline(_ string, inner string, _ map[string]string, imCtx imMarkdownContext) string { + return strings.TrimSpace(convertToIMMarkdown(inner, imCtx)) +} + +func handleIMMarkdownAnchor(_ string, inner string, attrs map[string]string, imCtx imMarkdownContext) string { + href := strings.TrimSpace(attrs["href"]) + text := firstNonEmpty(markdownLinkLabelText(convertToIMMarkdown(inner, imCtx)), attrs["name"], attrs["title"], href) + if href == "" { + return text + } + return markdownLink(text, href) +} + +func handleIMMarkdownCite(segment string, inner string, attrs map[string]string, imCtx imMarkdownContext) string { + switch strings.ToLower(strings.TrimSpace(attrs["type"])) { + case "user": + userID := 
// handleIMMarkdownTable renders a table element as a pipe-delimited Markdown
// table. The first extracted row is written above a "-" separator row and the
// remaining rows follow it. Short rows are padded to the widest row. When no
// row, or no row containing a cell, can be extracted, the whole segment is
// emitted as inline code instead.
func handleIMMarkdownTable(segment string, inner string, _ map[string]string, imCtx imMarkdownContext) string {
	// Rows and cells are matched with tag-depth tracking instead of non-greedy
	// regex captures. A table nested inside a cell can contain its own row and
	// cell tags; treating those as the outer row/cell boundary corrupts the table.
	rowBodies := extractIMMarkdownElementBodies(inner, imMarkdownRowTagRE)
	if len(rowBodies) == 0 {
		return imMarkdownInlineCode(segment)
	}

	// Convert every cell; rows without any cell are dropped.
	rows := make([][]string, 0, len(rowBodies))
	for _, rowBody := range rowBodies {
		cellBodies := extractIMMarkdownElementBodies(rowBody, imMarkdownCellTagRE)
		if len(cellBodies) == 0 {
			continue
		}
		row := make([]string, 0, len(cellBodies))
		for _, cellBody := range cellBodies {
			row = append(row, normalizeIMMarkdownTableCell(convertToIMMarkdown(cellBody, imCtx)))
		}
		rows = append(rows, row)
	}
	if len(rows) == 0 {
		return imMarkdownInlineCode(segment)
	}

	// The column count is the width of the widest row.
	cols := 0
	for _, row := range rows {
		if len(row) > cols {
			cols = len(row)
		}
	}
	var out strings.Builder
	writeIMMarkdownTableRow(&out, padIMMarkdownTableRow(rows[0], cols))
	separator := make([]string, cols)
	for i := range separator {
		separator[i] = "-"
	}
	writeIMMarkdownTableRow(&out, separator)
	for _, row := range rows[1:] {
		writeIMMarkdownTableRow(&out, padIMMarkdownTableRow(row, cols))
	}
	// Each row ends with a newline; the table itself does not.
	return strings.TrimRight(out.String(), "\n")
}
+func extractIMMarkdownElementBodies(content string, tagRE *regexp.Regexp) []string { + var bodies []string + for offset := 0; offset < len(content); { + loc := tagRE.FindStringSubmatchIndex(content[offset:]) + if loc == nil { + break + } + openStart := offset + loc[0] + openEnd := offset + loc[1] + opening := content[openStart:openEnd] + if loc[2] >= 0 && content[offset+loc[2]:offset+loc[3]] == "/" { + offset = openEnd + continue + } + if isSelfClosingIMMarkdownTag(opening) { + bodies = append(bodies, "") + offset = openEnd + continue + } + closeStart, closeEnd, found := findIMMarkdownElementClosingTag(content, openEnd, tagRE) + if !found { + break + } + bodies = append(bodies, content[openEnd:closeStart]) + offset = closeEnd + } + return bodies +} + +func findIMMarkdownElementClosingTag(content string, from int, tagRE *regexp.Regexp) (int, int, bool) { + depth := 1 + for _, loc := range tagRE.FindAllStringSubmatchIndex(content[from:], -1) { + start := from + loc[0] + end := from + loc[1] + token := content[start:end] + if loc[2] >= 0 && content[from+loc[2]:from+loc[3]] == "/" { + depth-- + if depth == 0 { + return start, end, true + } + continue + } + if !isSelfClosingIMMarkdownTag(token) { + depth++ + } + } + return 0, 0, false +} + +func normalizeIMMarkdownTableCell(cell string) string { + const brPlaceholder = "\x00BR\x00" + cell = imMarkdownCellBreakRE.ReplaceAllString(cell, brPlaceholder) + cell = imMarkdownAnyTagRE.ReplaceAllStringFunc(cell, func(tag string) string { + name := strings.ToLower(strings.TrimPrefix(imMarkdownAnyTagRE.FindStringSubmatch(tag)[1], "/")) + if name == "at" { + return tag + } + return "" + }) + cell = html.UnescapeString(cell) + cell = strings.ReplaceAll(cell, brPlaceholder, "
") + cell = strings.ReplaceAll(cell, " \n", "
") + cell = strings.ReplaceAll(cell, "\n", "
") + cell = strings.ReplaceAll(cell, "|", `\|`) + lines := strings.Fields(cell) + if len(lines) == 0 { + return "" + } + return strings.Join(lines, " ") +} + +func writeIMMarkdownTableRow(out *strings.Builder, row []string) { + out.WriteString("| ") + out.WriteString(strings.Join(row, " | ")) + out.WriteString(" |\n") +} + +func padIMMarkdownTableRow(row []string, cols int) []string { + if len(row) >= cols { + return row + } + padded := make([]string, cols) + copy(padded, row) + return padded +} + +func convertIMMarkdownListItems(inner string, ordered bool, imCtx imMarkdownContext) string { + var out strings.Builder + for offset, index := 0, 1; offset < len(inner); { + loc := imMarkdownLiOpenRE.FindStringIndex(inner[offset:]) + if loc == nil { + break + } + openStart := offset + loc[0] + openEnd := offset + loc[1] + opening := inner[openStart:openEnd] + closeStart, closeEnd, found := findIMMarkdownListItemClosingTag(inner, openEnd) + if !found { + break + } + body := strings.TrimSpace(convertToIMMarkdown(inner[openEnd:closeStart], imCtx)) + if body != "" { + prefix := "-" + if ordered { + attrs := parseIMMarkdownAttrs(opening) + if seq := strings.TrimSpace(attrs["seq"]); seq != "" && seq != "auto" { + prefix = strings.TrimSuffix(seq, ".") + "." 
+ } else { + prefix = fmt.Sprintf("%d.", index) + } + index++ + } + out.WriteString(prefix) + out.WriteString(" ") + out.WriteString(indentIMMarkdownListContinuation(body)) + out.WriteString("\n") + } + offset = closeEnd + } + return strings.TrimRight(out.String(), "\n") +} + +func findIMMarkdownListItemClosingTag(content string, from int) (int, int, bool) { + depth := 1 + for _, loc := range imMarkdownLiCloseRE.FindAllStringSubmatchIndex(content[from:], -1) { + start := from + loc[0] + end := from + loc[1] + token := content[start:end] + if loc[2] >= 0 && content[from+loc[2]:from+loc[3]] == "/" { + depth-- + if depth == 0 { + return start, end, true + } + continue + } + if !isSelfClosingIMMarkdownTag(token) { + depth++ + } + } + return 0, 0, false +} + +func indentIMMarkdownListContinuation(body string) string { + return strings.ReplaceAll(body, "\n", "\n ") +} + +func extractIMMarkdownInnerLink(inner string) (string, string, bool) { + match := imMarkdownLinkRE.FindStringSubmatch(inner) + if match == nil { + return "", "", false + } + href := match[1] + if href == "" { + href = match[2] + } + text := strings.TrimSpace(markdownPlainText(match[3])) + if text == "" { + text = href + } + return text, html.UnescapeString(href), true +} + +func markdownPlainText(s string) string { + s = imMarkdownCellBreakRE.ReplaceAllString(s, "\n") + s = imMarkdownAnyTagRE.ReplaceAllString(s, "") + return strings.TrimSpace(html.UnescapeString(s)) +} + +func markdownLinkLabelText(s string) string { + text := markdownPlainText(s) + if !strings.Contains(text, "---") { + return text + } + lines := strings.Split(text, "\n") + kept := lines[:0] + for _, line := range lines { + if strings.TrimSpace(line) == "---" { + continue + } + kept = append(kept, line) + } + return strings.TrimSpace(strings.Join(kept, "\n")) +} + +func markdownLink(text, href string) string { + cleanHref := strings.TrimSpace(href) + return fmt.Sprintf("[%s](%s)", escapeMarkdownLinkText(firstNonEmpty(text, cleanHref)), 
// escapeMarkdownLinkText backslash-escapes the characters that would end or
// distort a Markdown link label: backslash and both square brackets.
func escapeMarkdownLinkText(text string) string {
	var out strings.Builder
	for i := 0; i < len(text); i++ {
		switch text[i] {
		case '\\', '[', ']':
			out.WriteByte('\\')
		}
		out.WriteByte(text[i])
	}
	return out.String()
}

// escapeMarkdownLinkDestination percent-encodes the bytes of href that can
// terminate or split a Markdown link destination, leaving URL delimiters such
// as :/?#&= untouched.
//
// Existing "%XX" escapes are copied through unchanged, a "%" that does not
// start a valid escape becomes "%25", and every byte outside ASCII is encoded
// individually, whether or not it belongs to a valid UTF-8 sequence.
func escapeMarkdownLinkDestination(href string) string {
	var out strings.Builder
	out.Grow(len(href))
	for i := 0; i < len(href); i++ {
		b := href[i]
		switch {
		case b == '%':
			if i+2 < len(href) && isHexDigit(href[i+1]) && isHexDigit(href[i+2]) {
				out.WriteString(href[i : i+3])
				i += 2 // the loop increment skips the third byte
			} else {
				writePercentEncodedByte(&out, b)
			}
		case b >= utf8.RuneSelf, shouldPercentEncodeIMMarkdownURLByte(b):
			writePercentEncodedByte(&out, b)
		default:
			out.WriteByte(b)
		}
	}
	return out.String()
}

// shouldPercentEncodeIMMarkdownURLByte reports whether b must be
// percent-encoded: control characters, space, DEL and above, and a fixed set
// of punctuation.
func shouldPercentEncodeIMMarkdownURLByte(b byte) bool {
	if b <= ' ' || b >= 0x7f {
		return true
	}
	return strings.IndexByte("()<>\"\\^`{|}", b) >= 0
}

// writePercentEncodedByte appends b to out as "%" followed by two uppercase
// hexadecimal digits.
func writePercentEncodedByte(out *strings.Builder, b byte) {
	const digits = "0123456789ABCDEF"
	out.Write([]byte{'%', digits[b>>4], digits[b&0x0f]})
}

// isHexDigit reports whether b is an ASCII hexadecimal digit of either case.
func isHexDigit(b byte) bool {
	switch {
	case b >= '0' && b <= '9', b >= 'a' && b <= 'f', b >= 'A' && b <= 'F':
		return true
	}
	return false
}
// imMarkdownFencedCode wraps code in a backtick-fenced block labelled with
// lang (trimmed). Leading and trailing newlines of code are removed.
//
// The fence is at least three backticks and always one longer than the longest
// backtick run that starts a line of code. Leading spaces and tabs on a line
// are ignored when measuring that run, because a fence line is still
// recognized as a fence when it is indented; without this, an indented run of
// backticks inside code could close the generated block early.
func imMarkdownFencedCode(code, lang string) string {
	maxRun := 0
	for _, line := range strings.Split(code, "\n") {
		if run := leadingBacktickRun(strings.TrimLeft(line, " \t")); run > maxRun {
			maxRun = run
		}
	}
	fenceLen := maxRun + 1
	if fenceLen < 3 {
		fenceLen = 3
	}
	fence := strings.Repeat("`", fenceLen)
	return fence + strings.TrimSpace(lang) + "\n" + strings.Trim(code, "\n") + "\n" + fence
}

// leadingBacktickRun returns the number of consecutive backticks at the very
// start of s.
func leadingBacktickRun(s string) int {
	run := 0
	for _, r := range s {
		if r != '`' {
			break
		}
		run++
	}
	return run
}
+// SPDX-License-Identifier: MIT + +package doc + +import ( + "reflect" + "strings" + "testing" +) + +func TestApplyFetchIMMarkdown(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + data map[string]interface{} + docInput string + want map[string]interface{} + }{ + { + name: "missing document leaves data unchanged", + data: map[string]interface{}{ + "content": `Roadmap`, + }, + docInput: "https://tenant.example.com/docx/doc_token", + want: map[string]interface{}{ + "content": `Roadmap`, + }, + }, + { + name: "non string content leaves data unchanged", + data: map[string]interface{}{ + "document": map[string]interface{}{ + "content": 123, + }, + }, + docInput: "https://tenant.example.com/docx/doc_token", + want: map[string]interface{}{ + "document": map[string]interface{}{ + "content": 123, + }, + }, + }, + { + name: "converts content with tenant base url", + data: map[string]interface{}{ + "document": map[string]interface{}{ + "content": `Roadmap` + "\n" + ``, + }, + }, + docInput: "https://tenant.example.com/docx/doc_token", + want: map[string]interface{}{ + "document": map[string]interface{}{ + "content": "# Roadmap\n[sheet S1](https://tenant.example.com/sheets/sht_token)", + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + applyFetchIMMarkdown(tt.data, tt.docInput) + if !reflect.DeepEqual(tt.data, tt.want) { + t.Fatalf("data = %#v, want %#v", tt.data, tt.want) + } + }) + } +} + +func TestConvertToIMMarkdownTitle(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "plain title", + input: `Roadmap`, + want: "# Roadmap", + }, + { + name: "trim title whitespace", + input: "\n Roadmap \n", + want: "# Roadmap", + }, + { + name: "convert title inner markup", + input: `<b>Bold</b> Title`, + want: "# **Bold** Title", + }, + { + name: "empty title", + input: ` `, + want: "", + }, + { + name: "title followed by text", + input: `Roadmaptail`, + want: "# 
// TestConvertToIMMarkdownCallout pins the conversion of callout tags: the
// expected output wraps the body between two "---" lines, places the emoji
// attribute in front of the body, and leaves an unclosed callout untouched.
func TestConvertToIMMarkdownCallout(t *testing.T) {
	t.Parallel()

	assertIMMarkdownCases(t, []imMarkdownCase{
		{
			name:  "emoji and body",
			input: `<callout emoji="💡">Read **this**.</callout>`,
			want:  "---\n💡 Read **this**.\n---",
		},
		{
			name:  "body without emoji",
			input: `<callout>Plain body</callout>`,
			want:  "---\nPlain body\n---",
		},
		{
			name:  "emoji only",
			input: `<callout emoji="✅"></callout>`,
			want:  "---\n✅\n---",
		},
		{
			// An empty callout still emits both rule lines.
			name:  "empty callout",
			input: `<callout></callout>`,
			want:  "---\n---",
		},
		{
			// The inner callout is rendered in place, directly after the outer text.
			name:  "nested callout",
			input: `<callout emoji="✅">Outer <callout emoji="💡">Inner</callout></callout>`,
			want:  "---\n✅ Outer ---\n💡 Inner\n---\n---",
		},
		{
			name:  "callout contains registered tags",
			input: `<callout emoji="📝"><bookmark name="Spec" href="https://example.com"></bookmark></callout>`,
			want:  "---\n📝 [Spec](https://example.com)\n---",
		},
		{
			name:  "callout contains grid and cite",
			input: `<callout emoji="📣"><grid><column><cite type="user" user-id="ou_1" user-name="Alice"></cite></column><column><bookmark name="Spec" href="https://example.com"></bookmark></column></grid></callout>`,
			want:  "---\n📣 <at user_id=\"ou_1\">Alice</at>\n[Spec](https://example.com)\n---",
		},
		{
			// The outer callout must pair with the last end tag, not the inner one.
			name:  "same-name nested callout with trailing text",
			input: `<callout emoji="1">a<callout emoji="2">b</callout>c</callout>d`,
			want:  "---\n1 a---\n2 b\n---c\n---d",
		},
		{
			name:  "missing closing callout is preserved",
			input: `before<callout emoji="💡">body`,
			want:  `before<callout emoji="💡">body`,
		},
	})
}
`<blockquote><p>quote <a href="https://example.com">link</a></p></blockquote>`, + want: "> quote [link](https://example.com)", + }, + { + name: "multiple paragraphs keep line breaks", + input: `<blockquote><p>first</p><p><b>second</b></p></blockquote>`, + want: "> first\n> **second**", + }, + { + name: "nested blockquote keeps nested markers", + input: `<blockquote><p>outer</p><blockquote><p>inner</p></blockquote></blockquote>`, + want: "> outer\n> > inner", + }, + { + name: "blank line keeps quote marker", + input: "<blockquote>first\n\nsecond</blockquote>", + want: "> first\n>\n> second", + }, + { + name: "empty blockquote", + input: `<blockquote> </blockquote>`, + want: "", + }, + { + name: "plain adjacent paragraphs outside blockquote stay compact", + input: `<p>first</p><p>second</p>`, + want: "firstsecond", + }, + }) +} + +func TestConvertToIMMarkdownParagraphHeadingAndListItemEdges(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "empty heading", + input: `<h2> </h2>`, + want: "", + }, + { + name: "empty paragraph", + input: `<p> </p>`, + want: "", + }, + { + name: "top level list item uses seq", + input: "<li seq=\"7\">first\nsecond</li>", + want: "7. 
first\n second\n", + }, + { + name: "top level empty list item", + input: `<li></li>`, + want: "", + }, + { + name: "unordered list skips non item text and empty items", + input: `<ul>prefix<li>first</li><li> </li><li>second</li></ul>`, + want: "- first\n- second", + }, + { + name: "unclosed list item stops list scan", + input: `<ul><li>first</li><li>second</ul>`, + want: "- first", + }, + }) +} + +func TestConvertToIMMarkdownGridAndColumn(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "two columns", + input: `<grid><column width-ratio="0.5">Left</column><column width-ratio="0.5">Right</column></grid>`, + want: "Left\nRight", + }, + { + name: "column converts nested registered tags", + input: `<column><bookmark name="Spec" href="https://example.com"></bookmark></column>`, + want: "[Spec](https://example.com)\n", + }, + { + name: "empty column", + input: `<column> </column>`, + want: "", + }, + { + name: "nested grid", + input: `<grid><column>A</column><column><grid><column>B</column><column>C</column></grid></column></grid>`, + want: "A\nB\nC", + }, + { + name: "grid inside callout", + input: `<callout emoji="📌"><grid><column>A</column><column>B</column></grid></callout>`, + want: "---\n📌 A\nB\n---", + }, + { + name: "adjacent grids do not merge", + input: `<grid><column>A</column></grid><grid><column>B</column></grid>`, + want: "AB", + }, + { + name: "column with nested callout keeps recursive output", + input: `<column><callout emoji="💡">Tip</callout></column>`, + want: "---\n💡 Tip\n---\n", + }, + { + name: "missing closing grid is preserved", + input: `<grid><column>A</column>`, + want: `<grid><column>A</column>`, + }, + }) +} + +func TestConvertToIMMarkdownTable(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "basic table", + input: `<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>`, + want: "| A | B |\n| - | - |\n| 1 | 2 |", + }, + { + name: "table 
strips attrs and preserves cell line break", + input: `<table><tr><th vertical-align="top">A</th><th>B</th></tr><tr><td rowspan="2">1</td><td><b>two</b><br/>lines</td></tr></table>`, + want: "| A | B |\n| - | - |\n| 1 | **two**<br>lines |", + }, + { + name: "table escapes pipe", + input: `<table><tr><th>A|B</th></tr><tr><td>x|y</td></tr></table>`, + want: "| A\\|B |\n| - |\n| x\\|y |", + }, + { + name: "table pads ragged rows", + input: `<table><tr><th>A</th><th>B</th></tr><tr><td>1</td></tr></table>`, + want: "| A | B |\n| - | - |\n| 1 | |", + }, + { + name: "table converts nested cite", + input: `<table><tr><th>User</th></tr><tr><td><cite type="user" user-id="ou_1" user-name="Alice"></cite></td></tr></table>`, + want: "| User |\n| - |\n| <at user_id=\"ou_1\">Alice</at> |", + }, + { + name: "table converts nested bookmark and sheet", + input: `<table><tr><th>Link</th><th>Sheet</th></tr><tr><td><bookmark name="Spec" href="https://example.com"></bookmark></td><td><sheet token="sht_1" sheet-id="S1"></sheet></td></tr></table>`, + want: "| Link | Sheet |\n| - | - |\n| [Spec](https://example.com) | [sheet S1](https://larkoffice.com/sheets/sht_1) |", + }, + { + name: "table strips nested unknown html but preserves text", + input: `<table><tr><th>A</th></tr><tr><td><span color="red">red</span> <u>under</u></td></tr></table>`, + want: "| A |\n| - |\n| red under |", + }, + { + name: "table normalizes markdown hard breaks", + input: "<table><tr><th>A</th></tr><tr><td>line1 \nline2</td></tr></table>", + want: "| A |\n| - |\n| line1<br>line2 |", + }, + { + name: "table cell keeps nested table whole", + input: `<table><tr><th>Outer</th></tr><tr><td>before <table><tr><th>Inner</th></tr><tr><td>x</td></tr></table> after</td></tr></table>`, + want: "| Outer |\n| - |\n| before \\| Inner \\|<br>\\| - \\|<br>\\| x \\| after |", + }, + { + name: "table with only data row treats first row as header", + input: `<table><tr><td>A</td><td>B</td></tr></table>`, + want: "| A | B |\n| - | - 
|", + }, + { + name: "table without rows falls back to inline code", + input: `<table><tbody></tbody></table>`, + want: "`<table><tbody></tbody></table>`", + }, + { + name: "table row without cells falls back to inline code", + input: `<table><tr></tr></table>`, + want: "`<table><tr></tr></table>`", + }, + { + name: "table self closing row falls back to inline code", + input: `<table><tr/></table>`, + want: "`<table><tr/></table>`", + }, + { + name: "table empty cell stays empty", + input: `<table><tr><td> </td></tr></table>`, + want: "| |\n| - |", + }, + { + name: "missing closing table is preserved", + input: `before<table><tr><td>A</td></tr>`, + want: `before<table><tr><td>A</td></tr>`, + }, + }) +} + +func TestIMMarkdownElementExtractionEdges(t *testing.T) { + t.Parallel() + + bodies := extractIMMarkdownElementBodies(`</tr><tr/> <tr><td>x</td></tr><tr>open`, imMarkdownRowTagRE) + if want := []string{"", "<td>x</td>"}; !reflect.DeepEqual(bodies, want) { + t.Fatalf("extractIMMarkdownElementBodies() = %#v, want %#v", bodies, want) + } + + if _, _, ok := findIMMarkdownElementClosingTag(`<tr><td>x`, len("<tr>"), imMarkdownRowTagRE); ok { + t.Fatal("findIMMarkdownElementClosingTag() found closing tag, want false") + } + + start, end, ok := findIMMarkdownListItemClosingTag(`<li>outer<li/>tail</li>`, len("<li>")) + if !ok { + t.Fatal("findIMMarkdownListItemClosingTag() did not find closing tag") + } + if got, want := `<li>outer<li/>tail</li>`[start:end], "</li>"; got != want { + t.Fatalf("closing tag = %q, want %q", got, want) + } + + if _, _, ok := findIMMarkdownListItemClosingTag(`<li>open`, len("<li>")); ok { + t.Fatal("findIMMarkdownListItemClosingTag() found closing tag, want false") + } + + start, end, ok = findIMMarkdownListItemClosingTag(`<li>outer<li>inner</li>tail</li>`, len("<li>")) + if !ok { + t.Fatal("findIMMarkdownListItemClosingTag() did not find nested closing tag") + } + if got, want := `<li>outer<li>inner</li>tail</li>`[start:end], "</li>"; got != 
want { + t.Fatalf("nested closing tag = %q, want %q", got, want) + } + + if got := convertIMMarkdownListItems("plain text", false, imMarkdownContext{}); got != "" { + t.Fatalf("convertIMMarkdownListItems() = %q, want empty", got) + } +} + +func TestNormalizeIMMarkdownTableCellStripsUnknownTags(t *testing.T) { + t.Parallel() + + got := normalizeIMMarkdownTableCell(`<span style="x">red</span>`) + if want := "red"; got != want { + t.Fatalf("normalizeIMMarkdownTableCell() = %q, want %q", got, want) + } +} + +func TestConvertToIMMarkdownDiscardTags(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "figure discarded", + input: `before<figure view-type="Card">hidden</figure>after`, + want: "beforeafter", + }, + { + name: "figure with source discarded", + input: `<figure view-type="Preview"><source href="https://example.com/a.md"/></figure>`, + want: "", + }, + { + name: "self-closing source discarded", + input: `a<source href="https://example.com/a.md"/>b`, + want: "ab", + }, + { + name: "source name becomes inline code", + input: "a<source name=\"report`v1`.pdf\" href=\"https://example.com/a.md\"/>b", + want: "a``report`v1`.pdf``b", + }, + { + name: "button discarded", + input: `a<button>Click</button>b`, + want: "ab", + }, + { + name: "time discarded", + input: `a<time expire-time="123"></time>b`, + want: "ab", + }, + { + name: "colgroup discarded", + input: `a<colgroup><col width="120"/></colgroup>b`, + want: "ab", + }, + { + name: "col discarded", + input: `a<col width="120"/>b`, + want: "ab", + }, + { + name: "self-closing button discarded", + input: `a<button/>b`, + want: "ab", + }, + { + name: "missing closing discard tag is preserved", + input: `a<figure>hidden`, + want: `a<figure>hidden`, + }, + }) +} + +func TestConvertToIMMarkdownWhiteboard(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "paired whiteboard", + input: `<whiteboard token="wb_token"></whiteboard>`, + want: 
"`<whiteboard token=\"wb_token\"></whiteboard>`", + }, + { + name: "self-closing whiteboard", + input: `<whiteboard token="wb_token"/>`, + want: "`<whiteboard token=\"wb_token\"/>`", + }, + { + name: "whiteboard with backticks", + input: "<whiteboard token=\"`wb`\"></whiteboard>", + want: "``<whiteboard token=\"`wb`\"></whiteboard>``", + }, + { + name: "whiteboard preserves inner text as opaque", + input: `<whiteboard token="wb">not exported</whiteboard>`, + want: "`<whiteboard token=\"wb\">not exported</whiteboard>`", + }, + { + name: "missing closing whiteboard is preserved", + input: `<whiteboard token="wb">`, + want: `<whiteboard token="wb">`, + }, + }) +} + +func TestConvertToIMMarkdownSheet(t *testing.T) { + t.Parallel() + + assertIMMarkdownCasesWithContext(t, imMarkdownContext{baseURL: "https://bytedance.larkoffice.com"}, []imMarkdownCase{ + { + name: "sheet with sheet id", + input: `<sheet token="sht_token" sheet-id="S1"></sheet>`, + want: "[sheet S1](https://bytedance.larkoffice.com/sheets/sht_token)", + }, + { + name: "sheet without sheet id", + input: `<sheet token="sht_token"></sheet>`, + want: "[sheet](https://bytedance.larkoffice.com/sheets/sht_token)", + }, + { + name: "sheet without token falls back to inline code", + input: `<sheet sheet-id="S1"></sheet>`, + want: "`<sheet sheet-id=\"S1\"></sheet>`", + }, + { + name: "self-closing sheet", + input: `<sheet token="sht_token" sheet-id="S1"/>`, + want: "[sheet S1](https://bytedance.larkoffice.com/sheets/sht_token)", + }, + { + name: "sheet token is trimmed", + input: `<sheet token=" sht_token " sheet-id="S1"></sheet>`, + want: "[sheet S1](https://bytedance.larkoffice.com/sheets/sht_token)", + }, + { + name: "sheet inside text", + input: `before <sheet token="sht_token"></sheet> after`, + want: "before [sheet](https://bytedance.larkoffice.com/sheets/sht_token) after", + }, + }) +} + +func TestConvertToIMMarkdownBookmark(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + 
name: "name and href", + input: `<bookmark name="Example" href="https://example.com"></bookmark>`, + want: "[Example](https://example.com)", + }, + { + name: "title fallback", + input: `<bookmark title="Example" href="https://example.com"></bookmark>`, + want: "[Example](https://example.com)", + }, + { + name: "inner text fallback", + input: `<bookmark href="https://example.com">Example</bookmark>`, + want: "[Example](https://example.com)", + }, + { + name: "missing href returns label", + input: `<bookmark name="Example"></bookmark>`, + want: "Example", + }, + { + name: "escaped link label", + input: `<bookmark name="A [B]" href="https://example.com"></bookmark>`, + want: "[A \\[B\\]](https://example.com)", + }, + { + name: "href is percent encoded", + input: `<bookmark name="Spec" href="https://example.com/wiki/A B (draft)?q=x y#frag(1)"></bookmark>`, + want: "[Spec](https://example.com/wiki/A%20B%20%28draft%29?q=x%20y#frag%281%29)", + }, + { + name: "href keeps existing percent escapes", + input: `<bookmark name="Spec" href="https://example.com/wiki/A%20B"></bookmark>`, + want: "[Spec](https://example.com/wiki/A%20B)", + }, + { + name: "href escapes invalid percent and unicode", + input: `<bookmark name="Spec" href="https://example.com/wiki/研发%zz?x=1%"></bookmark>`, + want: "[Spec](https://example.com/wiki/%E7%A0%94%E5%8F%91%25zz?x=1%25)", + }, + { + name: "href escapes markdown delimiter bytes", + input: "<bookmark name=\"Spec\" href=\"https://example.com/a<b>|c`d\"></bookmark>", + want: "[Spec](https://example.com/a%3Cb%3E%7Cc%60d)", + }, + { + name: "inner registered tag fallback", + input: `<bookmark href="https://example.com"><cite type="user" user-id="ou_1" user-name="Alice"></cite></bookmark>`, + want: "[Alice](https://example.com)", + }, + { + name: "href fallback as label", + input: `<bookmark href="https://example.com"></bookmark>`, + want: "[https://example.com](https://example.com)", + }, + { + name: "self-closing bookmark without href", + input: 
`<bookmark name="Example"/>`, + want: "Example", + }, + }) +} + +func TestConvertToIMMarkdownInlineEdges(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "empty strong emphasis and delete", + input: `<b> </b><em> </em><del> </del>`, + want: "", + }, + { + name: "anchor without href returns text", + input: `<a>plain <b>text</b></a>`, + want: "plain **text**", + }, + { + name: "anchor without text falls back to href", + input: `<a href="https://example.com/a b"></a>`, + want: "[https://example.com/a b](https://example.com/a%20b)", + }, + { + name: "latex escapes dollars", + input: `<latex>price=$5</latex>`, + want: "$price=\\$5$", + }, + { + name: "empty latex", + input: `<latex> </latex>`, + want: "", + }, + { + name: "image missing href", + input: `<img alt="A"/>`, + want: "", + }, + { + name: "image uses src and title fallback", + input: `<img src="https://example.com/i 1.png" title="A [img]"/>`, + want: "![A \\[img\\]](https://example.com/i%201.png)", + }, + { + name: "plain fenced code", + input: `<pre><code>plain</code></pre>`, + want: "```\nplain\n```", + }, + { + name: "code inline trims nested markup", + input: `<code><b>x</b></code>`, + want: "`x`", + }, + }) +} + +func TestConvertToIMMarkdownCiteUser(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "user id and name", + input: `<cite type="user" user-id="ou_abc" user-name="Alice"></cite>`, + want: `<at user_id="ou_abc">Alice</at>`, + }, + { + name: "open id fallback", + input: `<cite type="user" open-id="ou_open" name="Bob"></cite>`, + want: `<at user_id="ou_open">Bob</at>`, + }, + { + name: "name falls back to user id", + input: `<cite type="user" user-id="ou_abc"></cite>`, + want: `<at user_id="ou_abc">ou_abc</at>`, + }, + { + name: "missing user id returns name", + input: `<cite type="user" user-name="Alice"></cite>`, + want: "Alice", + }, + { + name: "escape at XML", + input: `<cite type="user" user-id="ou_"" 
user-name="A&B"></cite>`, + want: `<at user_id="ou_"">A&B</at>`, + }, + { + name: "inner text fallback when attrs missing name", + input: `<cite type="user" user-id="ou_abc">Alice</cite>`, + want: `<at user_id="ou_abc">Alice</at>`, + }, + { + name: "self-closing user cite", + input: `<cite type="user" user-id="ou_abc" user-name="Alice"/>`, + want: `<at user_id="ou_abc">Alice</at>`, + }, + }) +} + +func TestConvertToIMMarkdownCiteDoc(t *testing.T) { + t.Parallel() + + assertIMMarkdownCasesWithContext(t, imMarkdownContext{baseURL: "https://bytedance.larkoffice.com"}, []imMarkdownCase{ + { + name: "doc id to link", + input: `<cite type="doc" doc-id="doc_token" file-type="docx" title="Spec"></cite>`, + want: "[Spec](https://bytedance.larkoffice.com/docx/doc_token)", + }, + { + name: "href wins", + input: `<cite type="doc" href="https://example.com/doc (draft)" title="Spec"></cite>`, + want: "[Spec](https://example.com/doc%20%28draft%29)", + }, + { + name: "default title and file type", + input: `<cite type="doc" token="doc_token"></cite>`, + want: "[document](https://bytedance.larkoffice.com/docx/doc_token)", + }, + { + name: "missing doc id falls back to inline code", + input: `<cite type="doc" title="Spec"></cite>`, + want: "`<cite type=\"doc\" title=\"Spec\"></cite>`", + }, + { + name: "wiki file type link", + input: `<cite type="doc" doc-id="wiki_token" file-type="wiki" title="Wiki"></cite>`, + want: "[Wiki](https://bytedance.larkoffice.com/wiki/wiki_token)", + }, + { + name: "doc title is escaped", + input: `<cite type="doc" doc-id="doc_token" title="A [B]"></cite>`, + want: "[A \\[B\\]](https://bytedance.larkoffice.com/docx/doc_token)", + }, + }) +} + +func TestConvertToIMMarkdownCiteCitation(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "inner anchor", + input: `<cite type="citation"><a href="https://example.com/ref">Ref</a></cite>`, + want: "[Ref](https://example.com/ref)", + }, + { + name: "href attr", + input: 
`<cite type="citation" href="https://example.com/ref" title="Ref"></cite>`, + want: "[Ref](https://example.com/ref)", + }, + { + name: "plain inner fallback", + input: `<cite type="citation">Plain Ref</cite>`, + want: "Plain Ref", + }, + { + name: "inner anchor text strips markup", + input: `<cite type="citation"><a href="https://example.com/ref"><b>Ref</b></a></cite>`, + want: "[Ref](https://example.com/ref)", + }, + { + name: "single quoted inner anchor falls back to href text", + input: `<cite type="citation"><a href='https://example.com/ref'></a></cite>`, + want: "[https://example.com/ref](https://example.com/ref)", + }, + { + name: "href attr falls back to href label", + input: `<cite type="citation" href="https://example.com/ref"></cite>`, + want: "[https://example.com/ref](https://example.com/ref)", + }, + }) +} + +func TestEscapeMarkdownLinkDestinationInvalidUTF8(t *testing.T) { + t.Parallel() + + got := escapeMarkdownLinkDestination(string([]byte{'a', 0xff, 'b'})) + if want := "a%FFb"; got != want { + t.Fatalf("escapeMarkdownLinkDestination() = %q, want %q", got, want) + } +} + +func TestConvertToIMMarkdownCiteUnknown(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "unknown paired cite", + input: `<cite type="unknown">x</cite>`, + want: "`<cite type=\"unknown\">x</cite>`", + }, + { + name: "unknown self-closing cite", + input: `<cite type="unknown"/>`, + want: "`<cite type=\"unknown\"/>`", + }, + }) +} + +func TestConvertToIMMarkdownScannerBoundaries(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "unknown tag preserved with known child untouched", + input: `<unknown><bookmark name="Spec" href="https://example.com"></bookmark></unknown>`, + want: `<unknown>[Spec](https://example.com)</unknown>`, + }, + { + name: "registered tag attributes single quotes", + input: `<bookmark name='Spec' href='https://example.com'></bookmark>`, + want: "[Spec](https://example.com)", + }, + { 
+ name: "registered tag name with leading text", + input: `alpha<title>Betagamma`, + want: "alpha# Betagamma", + }, + { + name: "xml comment is preserved", + input: `aT`, + want: "a# T", + }, + { + name: "br is preserved", + input: `a
b`, + want: "a
b", + }, + { + name: "malformed attribute still allows handler", + input: `Inner`, + want: "[Inner](https://example.com)", + }, + }) +} + +func TestConvertToIMMarkdownCompositeNesting(t *testing.T) { + t.Parallel() + + assertIMMarkdownCasesWithContext(t, imMarkdownContext{baseURL: "https://tenant.example.com"}, []imMarkdownCase{ + { + name: "callout grid table and resources", + input: `
OwnerDoc
`, + want: "---\n📌 | Owner | Doc |\n| - | - |\n| Alice | [Spec](https://tenant.example.com/docx/doc_1) |\n[sheet S1](https://tenant.example.com/sheets/sht_1)\n---", + }, + { + name: "grid inside table cell", + input: `
Outer
AB
`, + want: "| Outer |\n| - |\n| A
B |", + }, + { + name: "table inside table cell", + input: `
OuterTail
Inner
x
done
`, + want: "| Outer | Tail |\n| - | - |\n| \\| Inner \\|
\\| - \\|
\\| x \\| | done |", + }, + { + name: "bookmark wraps callout fallback text", + input: `Tip`, + want: "[💡 Tip](https://example.com)", + }, + }) +} + +func TestConvertToIMMarkdownUnclosedFragments(t *testing.T) { + t.Parallel() + + assertIMMarkdownCases(t, []imMarkdownCase{ + { + name: "unclosed title preserves nested registered tag", + input: `before<bookmark name="Spec" href="https://example.com"></bookmark>`, + want: `before<title><bookmark name="Spec" href="https://example.com"></bookmark>`, + }, + { + name: "unclosed callout preserves nested registered tag", + input: `before<callout emoji="💡"><bookmark name="Spec" href="https://example.com"></bookmark>`, + want: `before<callout emoji="💡"><bookmark name="Spec" href="https://example.com"></bookmark>`, + }, + { + name: "unclosed grid preserves closed child", + input: `before<grid><column>A</column>`, + want: `before<grid><column>A</column>`, + }, + { + name: "unclosed column preserves nested registered tag", + input: `before<column><bookmark name="Spec" href="https://example.com"></bookmark>`, + want: `before<column><bookmark name="Spec" href="https://example.com"></bookmark>`, + }, + { + name: "unclosed table preserves nested cite", + input: `before<table><tr><td><cite type="user" user-id="ou_1" user-name="Alice"></cite></td></tr>`, + want: `before<table><tr><td><cite type="user" user-id="ou_1" user-name="Alice"></cite></td></tr>`, + }, + { + name: "unclosed figure preserves nested source", + input: `before<figure><source href="https://example.com/a.md"/>`, + want: `before<figure><source href="https://example.com/a.md"/>`, + }, + { + name: "unclosed whiteboard preserves nested registered tag", + input: `before<whiteboard token="wb"><bookmark name="Spec" href="https://example.com"></bookmark>`, + want: `before<whiteboard token="wb"><bookmark name="Spec" href="https://example.com"></bookmark>`, + }, + { + name: "unclosed sheet preserves nested registered tag", + input: `before<sheet token="sht"><bookmark 
name="Spec" href="https://example.com"></bookmark>`, + want: `before<sheet token="sht"><bookmark name="Spec" href="https://example.com"></bookmark>`, + }, + { + name: "unclosed bookmark preserves nested cite", + input: `before<bookmark href="https://example.com"><cite type="user" user-id="ou_1" user-name="Alice"></cite>`, + want: `before<bookmark href="https://example.com"><cite type="user" user-id="ou_1" user-name="Alice"></cite>`, + }, + { + name: "unclosed cite preserves inner anchor", + input: `before<cite type="citation"><a href="https://example.com/ref">Ref</a>`, + want: `before<cite type="citation"><a href="https://example.com/ref">Ref</a>`, + }, + }) +} + +func TestConvertToIMMarkdownDeepRegisteredContainers(t *testing.T) { + t.Parallel() + + deepGrid := "leaf" + for i := 0; i < 32; i++ { + deepGrid = "<grid><column>" + deepGrid + "</column></grid>" + } + if got := convertToIMMarkdown(deepGrid, imMarkdownContext{}); got != "leaf" { + t.Fatalf("deep grid conversion = %q, want %q", got, "leaf") + } + + deepCallout := "leaf" + for i := 0; i < 16; i++ { + deepCallout = `<callout emoji="💡">` + deepCallout + `</callout>` + } + got := convertToIMMarkdown(deepCallout, imMarkdownContext{}) + if !strings.Contains(got, "leaf") { + t.Fatalf("deep callout conversion missing leaf:\n%s", got) + } + if count := strings.Count(got, "💡"); count != 16 { + t.Fatalf("deep callout emoji count = %d, want 16\n%s", count, got) + } +} + +func TestConvertToIMMarkdownDocumentExpectedTagsAndEscaping(t *testing.T) { + t.Parallel() + + imCtx := imMarkdownContext{baseURL: "https://bytedance.larkoffice.com"} + input := strings.Join([]string{ + `<h1>Roadmap <span text-color="red">Q1</span></h1>`, + `<h7>Deep Heading</h7>`, + `<p>plain<br/>next <b>Bold</b> <em>Italic</em> <del>Gone</del> <u>Under</u> <span background-color="yellow">Plain</span> <a href="https://example.com/a(b)">A [B]</a></p>`, + `<blockquote><p>quote <a type="url-preview" 
href="https://example.com/card">Card</a></p></blockquote>`, + `<ul><li>first</li><li><b>second</b></li></ul>`, + `<ol><li seq="auto">one</li><li seq="3">three</li></ol>`, + `<pre lang="Go"><code>fmt.Println("hi")` + "\n```" + `</code></pre>`, + `<p><code>` + "`edge`" + `</code> <latex>E=mc^2</latex> <hr/> <img href="https://example.com/i(1).png" alt="A [img]"/></p>`, + `<source name="report` + "`v1`" + `.pdf"/><source href="https://example.com/no-name"/>`, + `<task task-id="task_1"></task><task></task><chat_card chat-id="chat_1"></chat_card><chat_card></chat_card>`, + `<bitable></bitable><base_refer></base_refer><okr></okr><poll></poll><agenda></agenda><folder_manager></folder_manager><wiki_catalog></wiki_catalog><wiki_recent_update></wiki_recent_update><chart_refer_host_perm></chart_refer_host_perm><synced_reference></synced_reference><synced-source></synced-source><mindnote></mindnote>`, + }, "\n") + + want := strings.Join([]string{ + `# Roadmap Q1`, + `###### Deep Heading`, + `plain<br/>next **Bold** *Italic* ~~Gone~~ Under Plain [A \[B\]](https://example.com/a%28b%29)`, + `> quote [Card](https://example.com/card)`, + `- first`, + `- **second**`, + `1. one`, + `3. three`, + "````Go\nfmt.Println(\"hi\")\n```\n````", + "`` `edge` `` $E=mc^2$ --- ![A \\[img\\]](https://example.com/i%281%29.png)", + "``report`v1`.pdf``", + "`任务``群聊卡片`", + "`多维表格``多维表格``OKR`", + }, "\n") + + if got := convertToIMMarkdown(input, imCtx); got != want { + t.Fatalf("convertToIMMarkdown() = %q, want %q", got, want) + } +} + +func TestConvertToIMMarkdownMixedDocumentSmoke(t *testing.T) { + t.Parallel() + + imCtx := imMarkdownContext{baseURL: "https://bytedance.larkoffice.com"} + input := strings.Join([]string{ + `<title>Roadmap`, + `### LeftRight`, + `
AB
1two
lines
`, + ``, + ``, + `Ref`, + ``, + `
`, + }, "\n") + + got := convertToIMMarkdown(input, imCtx) + + for _, want := range []string{ + "# Roadmap", + "### Left", + "Right", + "| A | B |\n| - | - |\n| 1 | **two**
lines |", + `Alice`, + "[Spec](https://bytedance.larkoffice.com/docx/doc_token)", + "[Ref](https://example.com/ref)", + "[sheet S1](https://bytedance.larkoffice.com/sheets/sht_token)", + } { + if !strings.Contains(got, want) { + t.Fatalf("converted content missing %q:\n%s", want, got) + } + } + for _, dropped := range []string{" first\n>\n> second", + }, + { + name: "empty latex", + got: handleIMMarkdownLatex("", " ", nil, ctx), + want: "", + }, + { + name: "image without URL", + got: handleIMMarkdownImage("", "", map[string]string{"alt": "A"}, ctx), + want: "", + }, + { + name: "empty strong", + got: handleIMMarkdownStrong("", " ", nil, ctx), + want: "", + }, + { + name: "empty emphasis", + got: handleIMMarkdownEmphasis("", " ", nil, ctx), + want: "", + }, + { + name: "empty delete", + got: handleIMMarkdownDelete("", " ", nil, ctx), + want: "", + }, + { + name: "anchor without href", + got: handleIMMarkdownAnchor("", "plain", nil, ctx), + want: "**plain**", + }, + { + name: "table skips rows without cells", + got: handleIMMarkdownTable("
", "", nil, ctx), + want: "`
`", + }, + { + name: "empty normalized table cell", + got: normalizeIMMarkdownTableCell(" "), + want: "", + }, + { + name: "plain fenced code uses minimum fence", + got: imMarkdownFencedCode("plain", ""), + want: "```\nplain\n```", + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + if tt.got != tt.want { + t.Fatalf("got %q, want %q", tt.got, tt.want) + } + }) + } +} + +func TestIMMarkdownExtractionAndListBreakBranches(t *testing.T) { + t.Parallel() + + rowBodies := extractIMMarkdownElementBodies(`open`, imMarkdownRowTagRE) + if want := []string{""}; !reflect.DeepEqual(rowBodies, want) { + t.Fatalf("extractIMMarkdownElementBodies() = %#v, want %#v", rowBodies, want) + } + + if _, _, ok := findIMMarkdownElementClosingTag(`open`, len(""), imMarkdownRowTagRE); ok { + t.Fatal("findIMMarkdownElementClosingTag() found closing tag, want false") + } + + if got := convertIMMarkdownListItems("", false, imMarkdownContext{}); got != "" { + t.Fatalf("empty list conversion = %q, want empty", got) + } + if got := convertIMMarkdownListItems("
  • open", false, imMarkdownContext{}); got != "" { + t.Fatalf("unclosed list conversion = %q, want empty", got) + } + if _, _, ok := findIMMarkdownListItemClosingTag(`
  • outer
  • inner
  • `, len("
  • ")); ok { + t.Fatal("findIMMarkdownListItemClosingTag() found closing tag for unbalanced nested item") + } +} + +func TestIMMarkdownLinkAndEncodingFallbackBranches(t *testing.T) { + t.Parallel() + + text, href, ok := extractIMMarkdownInnerLink(``) + if !ok { + t.Fatal("extractIMMarkdownInnerLink() ok = false, want true") + } + if text != "https://example.com/ref" || href != "https://example.com/ref" { + t.Fatalf("inner link = (%q, %q), want href fallback", text, href) + } + + if got := escapeMarkdownLinkDestination("a%zz%"); got != "a%25zz%25" { + t.Fatalf("escaped invalid percent = %q, want %q", got, "a%25zz%25") + } + if got := escapeMarkdownLinkDestination("研发"); got != "%E7%A0%94%E5%8F%91" { + t.Fatalf("escaped unicode = %q, want encoded UTF-8 bytes", got) + } + if got := escapeMarkdownLinkDestination(string([]byte{'a', 0xff, 'b'})); got != "a%FFb" { + t.Fatalf("escaped invalid UTF-8 = %q, want %q", got, "a%FFb") + } +} + +type imMarkdownCase struct { + name string + input string + want string +} + +func assertIMMarkdownCases(t *testing.T, cases []imMarkdownCase) { + t.Helper() + assertIMMarkdownCasesWithContext(t, imMarkdownContext{baseURL: "https://larkoffice.com"}, cases) +} + +func assertIMMarkdownCasesWithContext(t *testing.T, imCtx imMarkdownContext, cases []imMarkdownCase) { + t.Helper() + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + if got := convertToIMMarkdown(tt.input, imCtx); got != tt.want { + t.Fatalf("convertToIMMarkdown() = %q, want %q", got, tt.want) + } + }) + } +} diff --git a/shortcuts/doc/docs_fetch_v2.go b/shortcuts/doc/docs_fetch_v2.go index b3e4f2fc..16ca133f 100644 --- a/shortcuts/doc/docs_fetch_v2.go +++ b/shortcuts/doc/docs_fetch_v2.go @@ -17,7 +17,7 @@ import ( // v2FetchFlags returns the flag definitions for the v2 (OpenAPI) fetch path. 
func v2FetchFlags() []common.Flag { return []common.Flag{ - {Name: "doc-format", Desc: "output content format; xml keeps DocxXML structure and optional block ids, markdown is plain export", Default: "xml", Enum: []string{"xml", "markdown"}}, + {Name: "doc-format", Desc: "output content format; xml keeps DocxXML structure and optional block ids, markdown is plain export, im-markdown downgrades residual DocxXML fragments for IM messages", Default: "xml", Enum: []string{"xml", "markdown", "im-markdown"}}, {Name: "detail", Desc: "detail level; simple for reading, with-ids for block references, full for styles and edit metadata", Default: "simple", Enum: []string{"simple", "with-ids", "full"}}, {Name: "lang", Desc: "user cite display language, e.g. en-US, zh-CN, ja-JP"}, {Name: "revision-id", Desc: "document revision id; -1 means latest", Type: "int", Default: "-1"}, @@ -72,6 +72,9 @@ func executeFetchV2(_ context.Context, runtime *common.RuntimeContext) error { if warning := addFetchDetailDowngradeWarning(runtime, data); warning != "" && runtime.Format == "pretty" { fmt.Fprintf(runtime.IO().ErrOut, "warning: %s\n", warning) } + if isIMMarkdownFetch(runtime) { + applyFetchIMMarkdown(data, runtime.Str("doc")) + } runtime.OutFormatRaw(data, nil, func(w io.Writer) { if doc, ok := data["document"].(map[string]interface{}); ok { @@ -85,7 +88,7 @@ func executeFetchV2(_ context.Context, runtime *common.RuntimeContext) error { func buildFetchBody(runtime *common.RuntimeContext) map[string]interface{} { body := map[string]interface{}{ - "format": runtime.Str("doc-format"), + "format": effectiveFetchFormat(runtime), } if v := runtime.Int("revision-id"); v > 0 { body["revision_id"] = v @@ -122,6 +125,14 @@ func buildFetchBody(runtime *common.RuntimeContext) map[string]interface{} { return body } +func effectiveFetchFormat(runtime *common.RuntimeContext) string { + format := strings.TrimSpace(runtime.Str("doc-format")) + if format == "im-markdown" { + return "markdown" + } + return 
format +} + func resolveFetchLang(runtime *common.RuntimeContext) string { if runtime.Changed("lang") { return strings.TrimSpace(runtime.Str("lang")) diff --git a/shortcuts/doc/docs_fetch_v2_test.go b/shortcuts/doc/docs_fetch_v2_test.go index 1210f0ac..ace683c2 100644 --- a/shortcuts/doc/docs_fetch_v2_test.go +++ b/shortcuts/doc/docs_fetch_v2_test.go @@ -6,9 +6,12 @@ package doc import ( "context" "encoding/json" + "errors" + "reflect" "strings" "testing" + "github.com/larksuite/cli/errs" "github.com/larksuite/cli/internal/cmdutil" "github.com/larksuite/cli/internal/core" "github.com/larksuite/cli/internal/httpmock" @@ -104,6 +107,369 @@ func TestBuildFetchBodyExplicitBlankLangOmitsLang(t *testing.T) { } } +func TestBuildFetchBodyIncludesRevisionAndFullDetail(t *testing.T) { + t.Parallel() + + runtime := newFetchBodyTestRuntime(context.Background()) + mustSetFetchFlag(t, runtime, "revision-id", "42") + mustSetFetchFlag(t, runtime, "detail", "full") + + body := buildFetchBody(runtime) + if got := body["revision_id"]; got != 42 { + t.Fatalf("revision_id = %#v, want 42", got) + } + exportOption, _ := body["export_option"].(map[string]interface{}) + want := map[string]interface{}{ + "export_block_id": true, + "export_style_attrs": true, + "export_cite_extra_data": true, + } + if !reflect.DeepEqual(exportOption, want) { + t.Fatalf("export_option = %#v, want %#v", exportOption, want) + } +} + +func TestBuildFetchBodyIncludesWithIDsDetail(t *testing.T) { + t.Parallel() + + runtime := newFetchBodyTestRuntime(context.Background()) + mustSetFetchFlag(t, runtime, "detail", "with-ids") + + body := buildFetchBody(runtime) + exportOption, _ := body["export_option"].(map[string]interface{}) + want := map[string]interface{}{ + "export_block_id": true, + } + if !reflect.DeepEqual(exportOption, want) { + t.Fatalf("export_option = %#v, want %#v", exportOption, want) + } +} + +func TestBuildFetchBodyIncludesReadOption(t *testing.T) { + t.Parallel() + + runtime := 
newFetchBodyTestRuntime(context.Background()) + mustSetFetchFlag(t, runtime, "scope", "section") + mustSetFetchFlag(t, runtime, "start-block-id", "blk_heading") + + body := buildFetchBody(runtime) + want := map[string]interface{}{ + "read_mode": "section", + "start_block_id": "blk_heading", + } + if got := body["read_option"]; !reflect.DeepEqual(got, want) { + t.Fatalf("read_option = %#v, want %#v", got, want) + } +} + +func TestBuildReadOptionModes(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + setFlags map[string]string + want map[string]interface{} + }{ + { + name: "full omits read option", + setFlags: map[string]string{ + "scope": "full", + }, + want: nil, + }, + { + name: "outline with max depth", + setFlags: map[string]string{ + "scope": "outline", + "max-depth": "3", + }, + want: map[string]interface{}{ + "read_mode": "outline", + "max_depth": "3", + }, + }, + { + name: "range with block ids and context", + setFlags: map[string]string{ + "scope": "range", + "start-block-id": "blk_start", + "end-block-id": "blk_end", + "context-before": "2", + "context-after": "1", + "max-depth": "0", + }, + want: map[string]interface{}{ + "read_mode": "range", + "start_block_id": "blk_start", + "end_block_id": "blk_end", + "context_before": "2", + "context_after": "1", + "max_depth": "0", + }, + }, + { + name: "keyword with query", + setFlags: map[string]string{ + "scope": "keyword", + "keyword": "foo|bar", + "context-before": "1", + }, + want: map[string]interface{}{ + "read_mode": "keyword", + "keyword": "foo|bar", + "context_before": "1", + }, + }, + { + name: "section keeps unlimited depth omitted", + setFlags: map[string]string{ + "scope": "section", + "start-block-id": "blk_heading", + "max-depth": "-1", + }, + want: map[string]interface{}{ + "read_mode": "section", + "start_block_id": "blk_heading", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + runtime := 
newFetchBodyTestRuntime(context.Background()) + for name, value := range tt.setFlags { + mustSetFetchFlag(t, runtime, name, value) + } + + if got := buildReadOption(runtime); !reflect.DeepEqual(got, tt.want) { + t.Fatalf("buildReadOption() = %#v, want %#v", got, tt.want) + } + }) + } +} + +func TestValidateReadModeFlagsRejectsInvalidScopeOptions(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + setFlags map[string]string + wantParam string + wantParams []string + }{ + { + name: "negative context before", + setFlags: map[string]string{ + "scope": "range", + "start-block-id": "blk_start", + "context-before": "-1", + }, + wantParam: "--context-before", + }, + { + name: "negative context after", + setFlags: map[string]string{ + "scope": "range", + "start-block-id": "blk_start", + "context-after": "-1", + }, + wantParam: "--context-after", + }, + { + name: "max depth below unlimited sentinel", + setFlags: map[string]string{ + "scope": "range", + "start-block-id": "blk_start", + "max-depth": "-2", + }, + wantParam: "--max-depth", + }, + { + name: "range needs boundary", + setFlags: map[string]string{ + "scope": "range", + }, + wantParams: []string{ + "--start-block-id", + "--end-block-id", + }, + }, + { + name: "keyword needs keyword", + setFlags: map[string]string{ + "scope": "keyword", + }, + wantParam: "--keyword", + }, + { + name: "section needs start block", + setFlags: map[string]string{ + "scope": "section", + }, + wantParam: "--start-block-id", + }, + { + name: "unknown scope", + setFlags: map[string]string{ + "scope": "bad", + }, + wantParam: "--scope", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + runtime := newFetchBodyTestRuntime(context.Background()) + for name, value := range tt.setFlags { + mustSetFetchFlag(t, runtime, name, value) + } + + err := validateReadModeFlags(runtime) + if err == nil { + t.Fatal("validateReadModeFlags() succeeded, want error") + } + assertValidationContract(t, 
err, errs.SubtypeInvalidArgument, tt.wantParam, tt.wantParams...) + }) + } +} + +func TestValidateReadModeFlagsAcceptsValidScopeOptions(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + setFlags map[string]string + }{ + { + name: "outline", + setFlags: map[string]string{ + "scope": "outline", + }, + }, + { + name: "range with end block", + setFlags: map[string]string{ + "scope": "range", + "end-block-id": "blk_end", + }, + }, + { + name: "keyword with keyword", + setFlags: map[string]string{ + "scope": "keyword", + "keyword": "bug|缺陷", + }, + }, + { + name: "section with start block", + setFlags: map[string]string{ + "scope": "section", + "start-block-id": "blk_heading", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + runtime := newFetchBodyTestRuntime(context.Background()) + for name, value := range tt.setFlags { + mustSetFetchFlag(t, runtime, name, value) + } + + if err := validateReadModeFlags(runtime); err != nil { + t.Fatalf("validateReadModeFlags() error = %v", err) + } + }) + } +} + +func TestValidateFetchV2RejectsInvalidDocAndScope(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + setFlags map[string]string + wantParam string + }{ + { + name: "invalid doc", + setFlags: map[string]string{ + "doc": "https://example.com/sheets/sht_token", + }, + wantParam: "--doc", + }, + { + name: "invalid scope", + setFlags: map[string]string{ + "scope": "bad", + }, + wantParam: "--scope", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + runtime := newFetchShortcutTestRuntime(t, "", tt.setFlags) + err := validateFetchV2(context.Background(), runtime) + if err == nil { + t.Fatal("validateFetchV2() succeeded, want error") + } + assertValidationContract(t, err, errs.SubtypeInvalidArgument, tt.wantParam) + }) + } +} + +func TestAddFetchDetailDowngradeWarningNoops(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + 
setFlags map[string]string + }{ + { + name: "xml format", + setFlags: map[string]string{ + "doc-format": "xml", + "detail": "full", + }, + }, + { + name: "markdown simple detail", + setFlags: map[string]string{ + "doc-format": "markdown", + "detail": "simple", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + runtime := newFetchBodyTestRuntime(context.Background()) + for name, value := range tt.setFlags { + mustSetFetchFlag(t, runtime, name, value) + } + + data := map[string]interface{}{} + if got := addFetchDetailDowngradeWarning(runtime, data); got != "" { + t.Fatalf("warning = %q, want empty", got) + } + if _, ok := data["warnings"]; ok { + t.Fatalf("unexpected warnings: %#v", data["warnings"]) + } + }) + } +} + func TestDocsFetchDryRunDefaultsToV2Endpoint(t *testing.T) { t.Parallel() @@ -141,36 +507,54 @@ func TestDocsFetchAPIVersionV1StillUsesV2Endpoint(t *testing.T) { } } +func TestDocsFetchIMMarkdownRequestsMarkdownFromAPI(t *testing.T) { + t.Parallel() + + runtime := newFetchShortcutTestRuntime(t, "", map[string]string{ + "doc-format": "im-markdown", + }) + if err := validateFetchV2(context.Background(), runtime); err != nil { + t.Fatalf("validateFetchV2() error = %v", err) + } + + dry := decodeDocDryRun(t, DocsFetch.DryRun(context.Background(), runtime)) + if got, want := dry.API[0].Body["format"], "markdown"; got != want { + t.Fatalf("dry-run format = %#v, want %q", got, want) + } +} + func TestDocsFetchMarkdownDetailDowngradesToSimple(t *testing.T) { t.Parallel() - for _, detail := range []string{"with-ids", "full"} { - t.Run(detail, func(t *testing.T) { - t.Parallel() + for _, format := range []string{"markdown", "im-markdown"} { + for _, detail := range []string{"with-ids", "full"} { + t.Run(format+"/"+detail, func(t *testing.T) { + t.Parallel() - runtime := newFetchShortcutTestRuntime(t, "", map[string]string{ - "doc-format": "markdown", - "detail": detail, + runtime := newFetchShortcutTestRuntime(t, 
"", map[string]string{ + "doc-format": format, + "detail": detail, + }) + if err := validateFetchV2(context.Background(), runtime); err != nil { + t.Fatalf("validateFetchV2() error = %v", err) + } + + dry := decodeDocDryRun(t, DocsFetch.DryRun(context.Background(), runtime)) + exportOption, _ := dry.API[0].Body["export_option"].(map[string]interface{}) + if exportOption == nil { + t.Fatalf("missing export_option: %#v", dry.API[0].Body) + } + if got := exportOption["export_block_id"]; got != false { + t.Fatalf("export_block_id = %#v, want false after markdown detail downgrade", got) + } + if got := exportOption["export_style_attrs"]; got != false { + t.Fatalf("export_style_attrs = %#v, want false after markdown detail downgrade", got) + } + if got := exportOption["export_cite_extra_data"]; got != false { + t.Fatalf("export_cite_extra_data = %#v, want false after markdown detail downgrade", got) + } }) - if err := validateFetchV2(context.Background(), runtime); err != nil { - t.Fatalf("validateFetchV2() error = %v", err) - } - - dry := decodeDocDryRun(t, DocsFetch.DryRun(context.Background(), runtime)) - exportOption, _ := dry.API[0].Body["export_option"].(map[string]interface{}) - if exportOption == nil { - t.Fatalf("missing export_option: %#v", dry.API[0].Body) - } - if got := exportOption["export_block_id"]; got != false { - t.Fatalf("export_block_id = %#v, want false after markdown detail downgrade", got) - } - if got := exportOption["export_style_attrs"]; got != false { - t.Fatalf("export_style_attrs = %#v, want false after markdown detail downgrade", got) - } - if got := exportOption["export_cite_extra_data"]; got != false { - t.Fatalf("export_cite_extra_data = %#v, want false after markdown detail downgrade", got) - } - }) + } } } @@ -261,6 +645,107 @@ func TestDocsFetchMarkdownDetailDowngradeWarnsInPrettyOutput(t *testing.T) { } } +func TestDocsFetchV2ReturnsAPIError(t *testing.T) { + t.Setenv("LARKSUITE_CLI_CONFIG_DIR", t.TempDir()) + + f, stdout, _, reg := 
cmdutil.TestFactory(t, docsTestConfigWithAppID("docs-fetch-api-error")) + reg.Register(&httpmock.Stub{ + Method: "POST", + URL: "/open-apis/docs_ai/v1/documents/doxcnFetchAPIError/fetch", + Body: map[string]interface{}{ + "code": 999999, + "msg": "fetch failed", + }, + }) + + err := mountAndRunDocs(t, DocsFetch, []string{ + "+fetch", + "--doc", "doxcnFetchAPIError", + "--as", "bot", + }, f, stdout) + if err == nil { + t.Fatal("mountAndRunDocs() succeeded, want API error") + } + var apiErr *errs.APIError + if !errors.As(err, &apiErr) { + t.Fatalf("error type = %T, want *errs.APIError (%v)", err, err) + } + p, ok := errs.ProblemOf(err) + if !ok { + t.Fatalf("ProblemOf() ok = false for %T (%v)", err, err) + } + if p.Category != errs.CategoryAPI { + t.Errorf("category = %q, want %q", p.Category, errs.CategoryAPI) + } + if p.Subtype != errs.SubtypeUnknown { + t.Errorf("subtype = %q, want %q", p.Subtype, errs.SubtypeUnknown) + } + if p.Code != 999999 { + t.Errorf("code = %d, want 999999", p.Code) + } + if p.Message != "fetch failed" { + t.Errorf("message = %q, want %q", p.Message, "fetch failed") + } + if cause := errors.Unwrap(err); cause != nil { + t.Fatalf("unexpected wrapped cause for API response error: %T %v", cause, cause) + } +} + +func TestDocsFetchIMMarkdownConvertsContentInJSONOutput(t *testing.T) { + t.Setenv("LARKSUITE_CLI_CONFIG_DIR", t.TempDir()) + + f, stdout, _, reg := cmdutil.TestFactory(t, docsTestConfigWithAppID("docs-fetch-im-markdown")) + reg.Register(&httpmock.Stub{ + Method: "POST", + URL: "/open-apis/docs_ai/v1/documents/doxcnFetchIMMarkdown/fetch", + Body: map[string]interface{}{ + "code": 0, + "msg": "ok", + "data": map[string]interface{}{ + "document": map[string]interface{}{ + "document_id": "doxcnFetchIMMarkdown", + "revision_id": float64(1), + "content": strings.Join([]string{ + `Doc Title`, + `Read **this**.`, + ``, + }, "\n\n"), + }, + }, + }, + }) + + err := mountAndRunDocs(t, DocsFetch, []string{ + "+fetch", + "--doc", 
"doxcnFetchIMMarkdown", + "--doc-format", "im-markdown", + "--as", "bot", + }, f, stdout) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var envelope map[string]interface{} + if err := json.Unmarshal(stdout.Bytes(), &envelope); err != nil { + t.Fatalf("decode output: %v\nraw=%s", err, stdout.String()) + } + data, _ := envelope["data"].(map[string]interface{}) + doc, _ := data["document"].(map[string]interface{}) + content, _ := doc["content"].(string) + for _, want := range []string{ + "# Doc Title", + "---\n💡 Read **this**.\n---", + "[Example](https://example.com)", + } { + if !strings.Contains(content, want) { + t.Fatalf("converted content missing %q:\n%s", want, content) + } + } + if strings.Contains(content, "") || strings.Contains(content, "<callout") || strings.Contains(content, "<bookmark") { + t.Fatalf("converted content still contains downgraded XML tags:\n%s", content) + } +} + func TestDocsFetchRejectsLegacyFlags(t *testing.T) { tests := []struct { name string @@ -291,6 +776,7 @@ func TestDocsFetchRejectsLegacyFlags(t *testing.T) { if err == nil { t.Fatal("expected v2-only validation error") } + assertValidationContract(t, err, errs.SubtypeInvalidArgument, "--offset") for _, want := range tt.want { if !strings.Contains(err.Error(), want) { t.Fatalf("error missing %q: %v", want, err) @@ -316,6 +802,14 @@ func newFetchBodyTestRuntime(ctx context.Context) *common.RuntimeContext { return common.TestNewRuntimeContextWithCtx(ctx, cmd, nil) } +func mustSetFetchFlag(t *testing.T, runtime *common.RuntimeContext, name, value string) { + t.Helper() + + if err := runtime.Cmd.Flags().Set(name, value); err != nil { + t.Fatalf("set %s: %v", name, err) + } +} + func newFetchShortcutTestRuntime(t *testing.T, apiVersion string, setFlags map[string]string) *common.RuntimeContext { t.Helper() diff --git a/skills/lark-doc/SKILL.md b/skills/lark-doc/SKILL.md index 8d726e98..30ac3a50 100644 --- a/skills/lark-doc/SKILL.md +++ b/skills/lark-doc/SKILL.md @@ 
-67,7 +67,7 @@ Shortcut 是对常用操作的高级封装(`lark-cli docs +<verb> [flags]`) | Shortcut | 说明 | |----------|------| | [`+create`](references/lark-doc-create.md) | Create a Lark document (XML / Markdown) | -| [`+fetch`](references/lark-doc-fetch.md) | Fetch Lark document content (XML / Markdown) | +| [`+fetch`](references/lark-doc-fetch.md) | Fetch Lark document content (XML / Markdown / im-markdown; `im-markdown` only after fetch for `lark-im`) | | [`+update`](references/lark-doc-update.md) | Update a Lark document (str_replace / block_insert_after / block_replace / ...) | | [`+media-insert`](references/lark-doc-media-insert.md) | Insert a local image or file at the end of a Lark document (4-step orchestration + auto-rollback). Prefer `--from-clipboard` when the image is already on the system clipboard (screenshots, copy from Feishu/browser); use `--file` only for on-disk sources. | | [`+media-download`](references/lark-doc-media-download.md) | Download document media or whiteboard thumbnail (auto-detects extension) | diff --git a/skills/lark-doc/references/lark-doc-fetch.md b/skills/lark-doc/references/lark-doc-fetch.md index 911a390a..b558b8dc 100644 --- a/skills/lark-doc/references/lark-doc-fetch.md +++ b/skills/lark-doc/references/lark-doc-fetch.md @@ -91,7 +91,7 @@ lark-cli docs +fetch --api-version v2 --doc Z1Fj...tnAc \ } ``` -`content` 的格式由 `--doc-format` 决定。设置 `--scope` 时会被 `<fragment>` 包裹,详见上文"局部读取的输出结构"。 +`content` 的格式由 `--doc-format` 决定;`im-markdown` 仅用于获取内容后在 `lark-im` 场景下使用。设置 `--scope` 时会被 `<fragment>` 包裹,详见上文"局部读取的输出结构"。 ## 参数 @@ -99,7 +99,7 @@ lark-cli docs +fetch --api-version v2 --doc Z1Fj...tnAc \ |------|------|------| | `--api-version` | 是 | 固定传 `v2` | | `--doc` | 是 | 文档 URL 或 token(支持 `/docx/` 和 `/wiki/`) | -| `--doc-format` | 否 | `xml`(默认)\| `markdown` \| `text` | +| `--doc-format` | 否 | `xml`(默认)\| `markdown` \| `text` \| `im-markdown`(仅用于获取内容后在 `lark-im` 场景下使用) | | `--detail` | 否 | `simple`(默认)\| `with-ids` \| `full` | | `--revision-id` | 否 | 
文档版本号,`-1` = 最新(默认) | | `--scope` | 否 | `outline` \| `range` \| `keyword` \| `section`(省略 = 读整篇) | diff --git a/skills/lark-doc/references/lark-doc-md.md b/skills/lark-doc/references/lark-doc-md.md index 4d267e7b..42748481 100644 --- a/skills/lark-doc/references/lark-doc-md.md +++ b/skills/lark-doc/references/lark-doc-md.md @@ -1,6 +1,6 @@ # Markdown 格式参考 -`docs +fetch --api-version v2` / `docs +create --api-version v2` / `docs +update --api-version v2` 使用 `--doc-format markdown` 时适用。 +`docs +fetch --api-version v2` / `docs +create --api-version v2` / `docs +update --api-version v2` 使用 `--doc-format markdown` 时适用;fetch 的 `--doc-format im-markdown` 仅用于获取内容后在 `lark-im` 场景下使用,不作为 create/update 写入格式。 ## 创建文档标题 diff --git a/skills/lark-im/SKILL.md b/skills/lark-im/SKILL.md index 983c66d2..507cf87f 100644 --- a/skills/lark-im/SKILL.md +++ b/skills/lark-im/SKILL.md @@ -67,6 +67,10 @@ Card messages (`interactive` type) are not yet supported for compact conversion `--audio` sends a voice message and supports only Opus audio files, for example `.opus` files or Ogg Opus (`.ogg`) files. For `mp3`, `wav`, or other non-Opus audio, either convert to `.opus` first and keep using `--audio`, or send the original file as an attachment with `--file`. +### Sending Doc Content as a Message + +When sending content fetched from a Lark doc as a message, fetch the doc with --doc-format im-markdown, then send it as a message using the --markdown format. The fetched content is already in markdown; in any content-forwarding scenario, keep the fetched original text and send it in the --markdown format. Note: if the doc contains a cite tag with type="user", keep it as-is and do not strip the tag. + ### Flag Types Flags support two layers: