larksuite-cli/shortcuts/mail/draft/parse.go

// Copyright (c) 2026 Lark Technologies Pte. Ltd.
// SPDX-License-Identifier: MIT

package draft

import (
	"bytes"
	"encoding/base64"
	"fmt"
	"io"
	"mime"
	"mime/quotedprintable"
	"net/mail"
	"strings"
)

// Parse turns a raw draft into a DraftSnapshot.
//
// The base64 EML carried in raw.RawEML is decoded, its top-level header block
// is separated from the body, the body is parsed into a part tree rooted at
// part "1", and finally refreshSnapshot fills in the derived snapshot fields.
// The error of the first failing stage is returned as-is and no snapshot is
// produced.
func Parse(raw DraftRaw) (*DraftSnapshot, error) {
	eml, err := decodeRawEML(raw.RawEML)
	if err != nil {
		return nil, err
	}

	messageHeaders, messageBody, err := parseHeaderBlock(eml)
	if err != nil {
		return nil, err
	}

	rootPart, err := parseRootPart(messageHeaders, messageBody)
	if err != nil {
		return nil, err
	}

	result := new(DraftSnapshot)
	result.DraftID = raw.DraftID
	result.Headers = messageHeaders
	result.Body = rootPart
	if refreshErr := refreshSnapshot(result); refreshErr != nil {
		return nil, refreshErr
	}
	return result, nil
}

// maxRawEMLSize caps the length of the raw (still base64-encoded) EML string.
// Base64 expands every 3 bytes into 4 characters, so 35 MB of encoded text
// leaves headroom above a 25 MB decoded message.
const maxRawEMLSize = 35 * 1024 * 1024

// decodeRawEML base64-decodes raw and returns the message bytes with all line
// endings normalized to "\n".
//
// Surrounding whitespace is stripped before any check. An empty input or one
// longer than maxRawEMLSize is rejected. The URL-safe alphabets (padded, then
// unpadded) are tried before the standard ones (padded, then unpadded); the
// first alphabet that decodes without error wins.
func decodeRawEML(raw string) ([]byte, error) {
	encoded := strings.TrimSpace(raw)
	switch size := len(encoded); {
	case size == 0:
		return nil, fmt.Errorf("draft raw EML is empty")
	case size > maxRawEMLSize:
		return nil, fmt.Errorf("draft raw EML is too large (%d bytes, max %d)", size, maxRawEMLSize)
	}

	for _, alphabet := range []*base64.Encoding{
		base64.URLEncoding,
		base64.RawURLEncoding,
		base64.StdEncoding,
		base64.RawStdEncoding,
	} {
		payload, err := alphabet.DecodeString(encoded)
		if err != nil {
			continue
		}
		return normalizeLineEndings(payload), nil
	}
	return nil, fmt.Errorf("draft raw EML is not valid base64url")
}

// normalizeLineEndings returns a copy of in where every CRLF pair and every
// remaining lone CR has been replaced by a single LF. The input slice is not
// modified.
func normalizeLineEndings(in []byte) []byte {
	lf := []byte{'\n'}
	// CRLF has to be collapsed before lone CR; the other order would turn
	// each CRLF into two line feeds.
	for _, ending := range [][]byte{{'\r', '\n'}, {'\r'}} {
		in = bytes.ReplaceAll(in, ending, lf)
	}
	return in
}

// parseHeaderBlock splits raw at the first blank line into the list of
// headers that precede it and the body bytes that follow it.
//
// Line endings are normalized to "\n" before splitting. An input without a
// blank line ("\n\n") is rejected with an error. Within the header block:
//
//   - whitespace-only lines are skipped;
//   - a line starting with a space or tab is a folded continuation and is
//     appended to the value of the preceding header (when there is one),
//     separated by a single space;
//   - a line without a colon is skipped rather than treated as an error;
//   - every other line is cut at its first colon into a trimmed name and a
//     trimmed value.
//
// The returned body is a sub-slice of the normalized copy of raw.
func parseHeaderBlock(raw []byte) ([]Header, []byte, error) {
	raw = normalizeLineEndings(raw)
	sep := bytes.Index(raw, []byte("\n\n"))
	if sep < 0 {
		return nil, nil, fmt.Errorf("invalid EML: missing header/body separator")
	}
	headerLines := strings.Split(string(raw[:sep]), "\n")
	headers := make([]Header, 0, len(headerLines))
	for _, line := range headerLines {
		content := strings.TrimSpace(line)
		if content == "" {
			continue
		}
		if (strings.HasPrefix(line, " ") || strings.HasPrefix(line, "\t")) && len(headers) > 0 {
			last := &headers[len(headers)-1]
			// A header whose first line carries no value ("Name:" followed by
			// folded lines) must not gain a leading space: values are stored
			// trimmed everywhere else, so only join with a space when there is
			// already text to separate from.
			if last.Value == "" {
				last.Value = content
			} else {
				last.Value += " " + content
			}
			continue
		}
		name, value, ok := strings.Cut(line, ":")
		if !ok {
			// Skip lines without a colon rather than failing. Some email
			// systems insert comment or separator lines in the header area.
			continue
		}
		headers = append(headers, Header{
			Name:  strings.TrimSpace(name),
			Value: strings.TrimSpace(value),
		})
	}
	return headers, raw[sep+2:], nil
}

// parseRootPart builds the root part (ID "1") of a message from its top-level
// headers and body.
//
// Only the body-describing headers selected by extractBodyHeaders are carried
// over to the part. When the message has none of them, the body is taken
// verbatim as 7bit text/plain with charset UTF-8 and no further decoding.
// Otherwise the part is parsed by parsePart at depth 0, with a raw entity
// rebuilt from those headers (minus MIME-Version) and the body.
func parseRootPart(messageHeaders []Header, body []byte) (*Part, error) {
	bodyHeaders := extractBodyHeaders(messageHeaders)
	if len(bodyHeaders) > 0 {
		entity := buildRawEntity(filterRawEntityHeaders(bodyHeaders), body)
		return parsePart(bodyHeaders, body, "1", entity, 0)
	}

	return &Part{
		PartID:           "1",
		Headers:          []Header{},
		MediaType:        "text/plain",
		MediaParams:      map[string]string{"charset": "UTF-8"},
		TransferEncoding: "7bit",
		Body:             body,
		RawEntity:        append([]byte{}, body...),
	}, nil
}

// maxMIMEDepth is the deepest nesting level parsePart accepts before it
// refuses to descend any further.
const maxMIMEDepth = 50

// parsePart builds a Part from an entity's headers and (still
// transfer-encoded) body.
//
// headers and rawEntity are copied into the part. depth is the nesting level
// of this entity; exceeding maxMIMEDepth is an error.
//
// Header handling:
//   - No Content-Type: the part is text/plain with charset UTF-8.
//   - Unparseable Content-Type: the part is treated as
//     application/octet-stream and flagged with EncodingProblem.
//   - Unparseable Content-Disposition: ignored; the header itself stays in
//     Headers.
//   - Content-ID is stored without surrounding whitespace and angle brackets;
//     Content-Transfer-Encoding is stored trimmed and lowercased.
//
// A multipart/* entity must declare a boundary (otherwise an error is
// returned) and is split into children. If splitting yields no children the
// part is reclassified as text/plain (UTF-8) holding the undecoded body and is
// flagged with EncodingProblem. Any other entity has its body decoded by
// decodePartBody, whose problem flag is carried over to the part.
func parsePart(headers []Header, body []byte, partID string, rawEntity []byte, depth int) (*Part, error) {
	if depth > maxMIMEDepth {
		return nil, fmt.Errorf("MIME nesting too deep (max %d levels)", maxMIMEDepth)
	}

	p := &Part{
		PartID:                partID,
		Headers:               append([]Header{}, headers...),
		MediaType:             "text/plain",
		MediaParams:           map[string]string{},
		ContentDispositionArg: map[string]string{},
		RawEntity:             append([]byte{}, rawEntity...),
	}

	if rawType := headerValue(headers, "Content-Type"); rawType == "" {
		p.MediaParams["charset"] = "UTF-8"
	} else if mt, params, err := mime.ParseMediaType(rawType); err != nil {
		// Keep the part reachable as opaque binary; the original header is
		// still in Headers and the bytes can round-trip through RawEntity.
		p.MediaType = "application/octet-stream"
		p.EncodingProblem = true
	} else {
		p.MediaType = strings.ToLower(mt)
		p.MediaParams = lowerCaseKeys(params)
	}

	if rawDisp := headerValue(headers, "Content-Disposition"); rawDisp != "" {
		// A disposition that does not parse is dropped silently; the header
		// itself is preserved in Headers for serialization.
		if kind, args, err := mime.ParseMediaType(rawDisp); err == nil {
			p.ContentDisposition = strings.ToLower(kind)
			p.ContentDispositionArg = lowerCaseKeys(args)
		}
	}

	contentID := strings.TrimSpace(headerValue(headers, "Content-ID"))
	p.ContentID = strings.Trim(contentID, "<>")
	encoding := strings.TrimSpace(headerValue(headers, "Content-Transfer-Encoding"))
	p.TransferEncoding = strings.ToLower(encoding)

	if !strings.HasPrefix(p.MediaType, "multipart/") {
		content, problem := decodePartBody(body, p.TransferEncoding, p.MediaType, p.MediaParams)
		p.Body = content
		p.EncodingProblem = p.EncodingProblem || problem
		return p, nil
	}

	boundary := p.MediaParams["boundary"]
	if boundary == "" {
		return nil, fmt.Errorf("multipart part %s missing boundary", partID)
	}
	children, preamble, epilogue, err := parseMultipartChildren(body, boundary, partID, depth)
	if err != nil {
		return nil, err
	}
	if len(children) > 0 {
		p.Children = children
		p.Preamble = preamble
		p.Epilogue = epilogue
		return p, nil
	}

	// The declared boundary never produced a child. Fall back to a text part
	// instead of handing out a multipart container without children.
	p.MediaType = "text/plain"
	p.MediaParams = map[string]string{"charset": "UTF-8"}
	p.Body = body
	p.EncodingProblem = true
	return p, nil
}

// parseMultipartChildren splits a multipart body on its boundary and parses
// every enclosed entity into a child Part.
//
// The body is scanned line by line (each line keeps its trailing "\n"). A line
// is a delimiter when, after trimming surrounding whitespace, it equals
// "--"+boundary (opens the next part) or "--"+boundary+"--" (closes the
// multipart). Lines before the first opening delimiter are collected as the
// preamble; every line after the closing delimiter is collected verbatim as
// the epilogue, even if it looks like a delimiter. If the body ends without a
// closing delimiter, the part being collected at that point is still parsed.
//
// Children get the IDs parentPartID+".1", parentPartID+".2", ... in order of
// appearance and are parsed at depth+1. Parts that are empty after trailing
// whitespace is removed are dropped and do not consume an ID.
//
// It returns the children, the preamble, the epilogue, and the first error
// reported while parsing a child.
func parseMultipartChildren(body []byte, boundary, parentPartID string, depth int) ([]*Part, []byte, []byte, error) {
	lines := bytes.SplitAfter(body, []byte("\n"))
	startLine := "--" + boundary
	endLine := "--" + boundary + "--"

	var (
		children []*Part
		preamble bytes.Buffer
		epilogue bytes.Buffer
		buf      bytes.Buffer // lines of the part currently being collected
		inPart   bool         // true once the first opening delimiter was seen
		afterEnd bool         // true once the closing delimiter was seen
		index    int          // number of children emitted so far
	)

	// flush parses the lines collected in buf as one child entity, appends it
	// to children, and empties buf for the next part.
	flush := func() error {
		// Copy buf content before Reset to avoid memory aliasing: buf.Bytes()
		// returns a sub-slice of buf's internal array which gets overwritten
		// when the next MIME part is written to buf after Reset.
		partBytes := append([]byte{}, bytes.TrimRight(buf.Bytes(), " \t\r\n")...)
		buf.Reset()
		if len(partBytes) == 0 {
			// Nothing but whitespace between two delimiters: no child, and
			// index is left untouched.
			return nil
		}
		// Ensure the header/body separator (\n\n) is present so parseHeaderBlock
		// can split headers from body.
		//
		// A part whose first byte is \n has no headers (the \n is the blank-line
		// separator). Prepend an extra \n so parseHeaderBlock finds \n\n at
		// position 0 and returns empty headers.
		//
		// Otherwise the part has headers but TrimRight may have stripped the
		// trailing \n\n when the body was empty. Re-append it.
		if len(partBytes) > 0 && partBytes[0] == '\n' {
			partBytes = append([]byte{'\n'}, partBytes...)
		} else if !bytes.Contains(partBytes, []byte("\n\n")) {
			partBytes = append(partBytes, '\n', '\n')
		}
		index++
		partID := fmt.Sprintf("%s.%d", parentPartID, index)
		headers, body, err := parseHeaderBlock(partBytes)
		if err != nil {
			return err
		}
		// partBytes (including any separator added above) is handed over as
		// the child's raw entity.
		child, err := parsePart(headers, body, partID, partBytes, depth+1)
		if err != nil {
			return err
		}
		children = append(children, child)
		return nil
	}

	for _, line := range lines {
		trimmed := strings.TrimSpace(string(line))
		if afterEnd {
			// Everything past the closing delimiter belongs to the epilogue.
			epilogue.Write(line)
			continue
		}
		switch trimmed {
		case startLine:
			if inPart {
				// An opening delimiter also terminates the previous part.
				if err := flush(); err != nil {
					return nil, nil, nil, err
				}
			} else {
				// First delimiter: the preamble ends here and there is no
				// previous part to flush.
				inPart = true
				continue
			}
			inPart = true
		case endLine:
			if inPart {
				if err := flush(); err != nil {
					return nil, nil, nil, err
				}
			}
			afterEnd = true
		default:
			if inPart {
				buf.Write(line)
			} else {
				preamble.Write(line)
			}
		}
	}
	// No further lines. When the closing delimiter was seen buf is already
	// empty and this flush is a no-op; otherwise it emits the unterminated
	// last part.
	if inPart {
		if err := flush(); err != nil {
			return nil, nil, nil, err
		}
	}
	return children, preamble.Bytes(), epilogue.Bytes(), nil
}

// decodePartBody undoes the content transfer encoding named by cte and, for
// textual media types, additionally converts the result using the charset
// from mediaParams.
//
// The boolean result reports a decoding problem; the returned bytes are then
// a best-effort fallback:
//   - base64 (whitespace stripped, padded form tried before unpadded) or
//     quoted-printable data that does not decode yields the body unchanged;
//   - a charset conversion failure yields the transfer-decoded bytes.
//
// An empty, "7bit", "8bit", "binary", or unrecognized cte passes the body
// through untouched and does not count as a problem.
func decodePartBody(body []byte, cte, mediaType string, mediaParams map[string]string) ([]byte, bool) {
	payload, failed := body, false

	switch strings.ToLower(strings.TrimSpace(cte)) {
	case "base64":
		compact := strings.Join(strings.Fields(string(body)), "")
		out, err := base64.StdEncoding.DecodeString(compact)
		if err != nil {
			// Some implementations omit the padding; retry without it.
			out, err = base64.RawStdEncoding.DecodeString(compact)
		}
		if err != nil {
			// Leave payload as the raw body so the part stays accessible.
			failed = true
		} else {
			payload = out
		}
	case "quoted-printable":
		out, err := io.ReadAll(quotedprintable.NewReader(bytes.NewReader(body)))
		if err != nil {
			// Same fallback as for base64: keep the raw body.
			failed = true
		} else {
			payload = out
		}
	}

	if !isTextualMediaType(mediaType) {
		return payload, failed
	}

	converted, err := decodeTextCharset(payload, mediaParams["charset"])
	if err != nil {
		// An unsupported or malformed charset must not break parsing; hand
		// back the transfer-decoded bytes and flag the problem.
		return payload, true
	}
	return converted, failed
}

// extractBodyHeaders returns, in their original order, the headers whose name
// isBodyHeader accepts. The result is a new, never-nil slice.
func extractBodyHeaders(headers []Header) []Header {
	selected := make([]Header, 0, len(headers))
	for i := range headers {
		if !isBodyHeader(headers[i].Name) {
			continue
		}
		selected = append(selected, headers[i])
	}
	return selected
}

// buildRawEntity serializes headers and body into one entity: each header as
// "Name: Value" on its own "\n"-terminated line, then an empty line, then the
// body. Without headers the result is just a copy of the body.
func buildRawEntity(headers []Header, body []byte) []byte {
	if len(headers) > 0 {
		var entity bytes.Buffer
		for i := range headers {
			entity.WriteString(headers[i].Name + ": " + headers[i].Value + "\n")
		}
		entity.WriteByte('\n')
		entity.Write(body)
		return entity.Bytes()
	}
	return append([]byte{}, body...)
}

// filterRawEntityHeaders returns a new slice holding every header except
// MIME-Version (matched case-insensitively), in the original order.
func filterRawEntityHeaders(headers []Header) []Header {
	kept := make([]Header, 0, len(headers))
	for i := range headers {
		if h := headers[i]; !strings.EqualFold(h.Name, "MIME-Version") {
			kept = append(kept, h)
		}
	}
	return kept
}

// refreshSnapshot recomputes the snapshot fields that are derived from its
// headers and part tree: subject, message threading IDs, the address lists,
// the part IDs of the whole tree, and the IDs of the primary text/plain and
// text/html body parts ("" when there is none). It currently always returns
// nil.
func refreshSnapshot(snapshot *DraftSnapshot) error {
	trimmedHeader := func(name string) string {
		return strings.TrimSpace(headerValue(snapshot.Headers, name))
	}
	// Address headers are parsed leniently: a malformed list (non-standard
	// display names, semicolons, broken encoding) simply yields no addresses
	// so the draft can still be opened. The raw values stay available in
	// snapshot.Headers for round-trip serialization.
	lenientAddresses := func(name string) []Address {
		list, _ := parseAddressHeader(headerValue(snapshot.Headers, name))
		return list
	}

	snapshot.Subject = decodeHeaderValue(headerValue(snapshot.Headers, "Subject"))
	snapshot.MessageID = trimmedHeader("Message-ID")
	snapshot.InReplyTo = trimmedHeader("In-Reply-To")
	snapshot.References = trimmedHeader("References")

	snapshot.From = lenientAddresses("From")
	snapshot.To = lenientAddresses("To")
	snapshot.Cc = lenientAddresses("Cc")
	snapshot.Bcc = lenientAddresses("Bcc")
	snapshot.ReplyTo = lenientAddresses("Reply-To")

	reindexParts(snapshot.Body, "1")

	snapshot.PrimaryTextPartID = ""
	if plain := findPrimaryBodyPart(snapshot.Body, "text/plain"); plain != nil {
		snapshot.PrimaryTextPartID = plain.PartID
	}
	snapshot.PrimaryHTMLPartID = ""
	if html := findPrimaryBodyPart(snapshot.Body, "text/html"); html != nil {
		snapshot.PrimaryHTMLPartID = html.PartID
	}

	// Inline CID consistency is deliberately not validated here: broken CID
	// references should not prevent opening the draft editor. Project()
	// already reports missing CIDs as warnings in DraftProjection.Warnings.
	return nil
}

// parseAddressHeader parses the value of an address-list header into
// Addresses. A blank value yields (nil, nil); a value that net/mail cannot
// parse yields the parser's error. Encoded words in display names are decoded
// with a default mime.WordDecoder, and each address is stored trimmed.
func parseAddressHeader(value string) ([]Address, error) {
	list := strings.TrimSpace(value)
	if list == "" {
		return nil, nil
	}

	p := mail.AddressParser{WordDecoder: new(mime.WordDecoder)}
	parsed, err := p.ParseList(list)
	if err != nil {
		return nil, err
	}

	result := make([]Address, len(parsed))
	for i, entry := range parsed {
		result[i] = Address{
			Name:    entry.Name,
			Address: strings.TrimSpace(entry.Address),
		}
	}
	return result, nil
}

// lowerCaseKeys returns a copy of in with every key lowercased and the values
// untouched. A nil or empty map yields nil. The input map is not modified.
func lowerCaseKeys(in map[string]string) map[string]string {
	var lowered map[string]string
	for key, val := range in {
		if lowered == nil {
			// Allocated lazily so that an empty input falls through to nil.
			lowered = make(map[string]string, len(in))
		}
		lowered[strings.ToLower(key)] = val
	}
	return lowered
}

// headerValue returns the value of the first header whose name equals name
// ignoring case, or "" when no header matches.
func headerValue(headers []Header, name string) string {
	for i := range headers {
		if strings.EqualFold(headers[i].Name, name) {
			return headers[i].Value
		}
	}
	return ""
}

// isBodyHeader reports whether name, compared without surrounding whitespace
// and ignoring case, is MIME-Version or starts with "content-".
func isBodyHeader(name string) bool {
	switch key := strings.ToLower(strings.TrimSpace(name)); {
	case key == "mime-version":
		return true
	default:
		return strings.HasPrefix(key, "content-")
	}
}

// reindexParts assigns partID to part and renumbers the whole subtree below
// it: the n-th child (counting from 1) of a part with ID X gets ID "X.n".
// A nil part is ignored.
func reindexParts(part *Part, partID string) {
	if part == nil {
		return
	}
	part.PartID = partID
	for pos := 0; pos < len(part.Children); pos++ {
		childID := fmt.Sprintf("%s.%d", partID, pos+1)
		reindexParts(part.Children[pos], childID)
	}
}

// findPrimaryBodyPart returns the non-multipart part under root that
// bodyCandidateScore rates highest for mediaType, or nil when no part
// qualifies. Parts are visited depth-first in document order and a later part
// only replaces an earlier one when its score is strictly higher, so the first
// of several equally rated parts wins.
func findPrimaryBodyPart(root *Part, mediaType string) *Part {
	// pending pairs a part still to be visited with the media types of the
	// multipart containers above it, outermost first.
	type pending struct {
		node    *Part
		lineage []string
	}

	var (
		winner   *Part
		topScore = -1
	)

	stack := []pending{{node: root}}
	for len(stack) > 0 {
		cur := stack[len(stack)-1]
		stack = stack[:len(stack)-1]
		if cur.node == nil {
			continue
		}

		if !cur.node.IsMultipart() {
			if score, ok := bodyCandidateScore(cur.node, cur.lineage, mediaType); ok && score > topScore {
				winner, topScore = cur.node, score
			}
			continue
		}

		lineage := append(append([]string{}, cur.lineage...), cur.node.MediaType)
		// Push children in reverse so they are popped in document order.
		for i := len(cur.node.Children) - 1; i >= 0; i-- {
			stack = append(stack, pending{node: cur.node.Children[i], lineage: lineage})
		}
	}
	return winner
}

// bodyCandidateScore rates part as a candidate for the primary body of the
// given mediaType. The boolean result is false when the part is not a
// candidate at all: it is nil, has a different media type, is marked as an
// attachment or inline content, or sits inside a multipart/signed or
// multipart/encrypted container.
//
// A candidate starts at 1 point, gains 10 for every multipart/alternative
// ancestor, and gains 5 for every multipart/related ancestor when mediaType
// is "text/html".
func bodyCandidateScore(part *Part, ancestors []string, mediaType string) (int, bool) {
	if part == nil {
		return 0, false
	}
	if !strings.EqualFold(part.MediaType, mediaType) {
		return 0, false
	}
	if disp := strings.ToLower(part.ContentDisposition); disp == "attachment" || disp == "inline" {
		return 0, false
	}

	wantsHTML := mediaType == "text/html"
	total := 1
	for _, container := range ancestors {
		if container == "multipart/signed" || container == "multipart/encrypted" {
			return 0, false
		}
		if container == "multipart/alternative" {
			total += 10
		} else if container == "multipart/related" && wantsHTML {
			total += 5
		}
	}
	return total, true
}