From 0ebdcf5916a66ed0b66074edbd39be8b5be07d85 Mon Sep 17 00:00:00 2001 From: dev-claudecode Date: Wed, 1 Jul 2026 17:18:12 +0000 Subject: [PATCH] fix(qq): route long Send() through HTTP to dodge 15s WS timeout (#1474) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Long replies (~2.5KB+) always tripped the hardcoded 15s timeout in callAPI because the OneBot readLoop serves both inbound events and outbound replies — long sends queue behind inbound traffic and stall. New `send_via_http` config knob (default "auto"): - "auto" → WS for <=512 byte messages, HTTP for longer ones. Falls back to WS when http_url is unset. - "always" → always HTTP (requires http_url); falls back to WS when absent so unconfigured deployments still deliver. - "never" → always WebSocket, even for long messages. SendFile() and SendImage() are unaffected — separate code paths. Tests cover: short-msg WS path, long-msg HTTP fallback (2500B), private-vs-group, threshold boundary, mode=always/never, auto with no http_url, invalid config value (falls back to auto), and a table-driven shouldUseHTTP test. --- CHANGELOG.md | 1 + config.example.toml | 14 ++ platform/qq/qq.go | 63 ++++++++- platform/qq/qq_test.go | 289 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 363 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d13281213..5741d145d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - **feishu**: coalesce consecutive image messages from the same session into a single multi-image dispatch to fix first-image drop on batch sends (#1395). When the Feishu mobile client sends N images in quick succession, each image arrives as a separate `image` event with very close `create_time` values. Dispatching each immediately caused core/engine's `create_time` watermark (PR #1168) to drop the oldest image, so the agent only saw N-1 images. A per-session image buffer with a 150ms quiet window now merges the burst into one `core.Message` carrying all images, in send order. Single-image sends and quoted-image replies are unaffected. - **claudecode**: fix per-spawn system-prompt temp file EACCES under `run_as_user` (#1429). The per-spawn temp file written by `writeTempAppendPromptFile` (the 1% edge-case path used when the prompt has session-specific platform formatting or user `append_system_prompt`) inherited `os.CreateTemp`'s 0600 mode and was owned by the cc-connect process user (often root under systemd). When the agent was spawned under a different `run_as_user`, it could not read the file and exited before any prompt was loaded. The file is now `chmod 0o644` immediately after write, matching the shared `ensureSharedSystemPromptFile` path. Prompt content is non-secret (a superset of the already-shared base prompt), so 0644 is consistent with the shared file. Does not affect the shared-file path (already 0644 since #1376) or the daemon-mode path resolution (#1419). - **core**: queue post-restart notification and dispatch on platform ready (#1383). Previously `/restart` sent the success notification immediately after engine startup, racing the platform's async connect window (Telegram: ~2.6s). On a not-yet-ready platform the send was silently dropped at debug log level. The notify is now queued on the engine and dispatched when the target platform reaches `OnPlatformReady`, with bounded retry (3 attempts, 0/500/1500 ms backoff) on transient send failure. Failed sends log at warn level. A 10s safety timeout drops the notify with a warning if the target platform never reaches ready, so startup is never blocked indefinitely. Also covers Discord / Weixin / Matrix (other AsyncRecoverablePlatform implementations) for free. +- **qq**: route long `Send()` messages through the OneBot HTTP API to avoid the 15s WebSocket timeout (#1474). Long replies (~2.5KB+) always tripped the hardcoded WS timeout in `callAPI` because the WS readLoop is shared between inbound events and outbound replies; long sends queue behind inbound traffic and stall. New `send_via_http` config knob (default `"auto"`): `"auto"` switches to HTTP when message exceeds 512 bytes and `http_url` is configured, `"always"` forces HTTP, `"never"` keeps WebSocket. Falls back to WS when `http_url` is empty so unconfigured deployments still deliver. `SendFile()` and `SendImage()` are unaffected (separate code paths). ## v1.3.3 (2026-06-15) diff --git a/config.example.toml b/config.example.toml index 51680145c..7697b7a34 100644 --- a/config.example.toml +++ b/config.example.toml @@ -1570,6 +1570,20 @@ app_secret = "your-feishu-app-secret" # http_url = "" # optional: OneBot HTTP API URL for file operations, e.g. "http://127.0.0.1:3000" # # Required for SendFile. Useful when OneBot runs in WSL/Docker. # # 可选:OneBot HTTP API 地址,用于文件发送。OneBot 在 WSL/Docker 中时必须配置。 +# send_via_http = "auto" # optional: how Send() dispatches text replies (default "auto") +# # "auto" → WS for short messages (<=512 bytes), HTTP for longer ones. +# # Long replies via WS can hit the 15s OneBot WS timeout (#1474). +# # HTTP has a 120s timeout and avoids WS readLoop queueing. +# # Falls back to WS if http_url is not configured. +# # "always" → always use HTTP (requires http_url); falls back to WS if absent. +# # "never" → always use WebSocket, even for long messages. +# # 可选:Send() 的派发方式(默认 "auto") +# # "auto" → 短消息(<=512 字节)走 WebSocket,长消息走 HTTP。 +# # 长回复走 WS 可能命中 OneBot 15 秒 WS 超时(#1474)。 +# # HTTP 有 120 秒超时,并避免阻塞 WS readLoop。 +# # 未配置 http_url 时退回 WS。 +# # "always" → 始终走 HTTP(需要 http_url);缺失时退回 WS。 +# # "never" → 始终走 WebSocket,即使长消息。 # allow_from = "*" # allowed QQ user IDs, e.g. "12345,67890" or "*" for all # # 允许的 QQ 号,如 "12345,67890","*" 表示所有 # share_session_in_channel = false # If true, all users in a group share one agent session / 群聊共享会话 diff --git a/platform/qq/qq.go b/platform/qq/qq.go index 294c09bc3..f6f2373fb 100644 --- a/platform/qq/qq.go +++ b/platform/qq/qq.go @@ -23,6 +23,14 @@ func init() { core.RegisterPlatform("qq", New) } +// defaultHTTPFallbackThreshold is the message size (in bytes) above which Send() +// routes through the OneBot HTTP API instead of the WebSocket. The OneBot WS +// implementation round-trips every reply through the same readLoop that +// delivers events, so a long reply can race or stall and trip the 15s WS +// timeout (see #1474). HTTP send has a dedicated 120s timeout and avoids +// queueing on the WS readLoop. +const defaultHTTPFallbackThreshold = 512 + // Platform connects to a OneBot v11 implementation (NapCat, LLOneBot, etc.) // via forward WebSocket. It receives message events and sends messages back // through the same WS connection. @@ -40,7 +48,8 @@ type Platform struct { selfID int64 dedup core.MessageDedup groupNameCache sync.Map // groupID -> group name - httpURL string // OneBot HTTP API URL, e.g. "http://127.0.0.1:3000" + httpURL string // OneBot HTTP API URL, e.g. "http://127.0.0.1:3000" + sendViaHTTP string // "auto" (default) | "always" | "never" } func New(opts map[string]any) (core.Platform, error) { @@ -57,12 +66,26 @@ func New(opts map[string]any) (core.Platform, error) { httpURL, _ := opts["http_url"].(string) httpURL = strings.TrimRight(httpURL, "/") + sendViaHTTP, _ := opts["send_via_http"].(string) + switch sendViaHTTP { + case "", "auto", "always", "never": + // valid + default: + slog.Warn("qq: invalid send_via_http value, falling back to auto", + "got", sendViaHTTP, "valid", []string{"auto", "always", "never"}) + sendViaHTTP = "auto" + } + if sendViaHTTP == "" { + sendViaHTTP = "auto" + } + return &Platform{ wsURL: wsURL, token: token, allowFrom: allowFrom, shareSessionInChannel: shareSessionInChannel, - httpURL: httpURL, + httpURL: httpURL, + sendViaHTTP: sendViaHTTP, }, nil } @@ -443,6 +466,13 @@ func (p *Platform) Reply(ctx context.Context, replyCtx any, content string) erro } // Send sends a message to the conversation identified by replyCtx. +// +// Long messages route through the OneBot HTTP API instead of WebSocket. The +// OneBot WS implementation round-trips every reply through the same readLoop +// that delivers inbound events, so a long reply can race or stall and trip +// the 15s WS timeout (#1474). HTTP send has a dedicated 120s timeout and +// avoids queueing on the WS readLoop. Threshold + mode controlled by +// `send_via_http` config (default "auto" → switch at 512 bytes). func (p *Platform) Send(ctx context.Context, replyCtx any, content string) error { rctx, ok := replyCtx.(*replyContext) if !ok { @@ -453,17 +483,42 @@ func (p *Platform) Send(ctx context.Context, replyCtx any, content string) error "message": content, } + useHTTP := p.shouldUseHTTP(content) + call := p.callAPI + if useHTTP { + call = p.callHTTPAPI + } + if rctx.messageType == "group" { params["group_id"] = rctx.groupID - _, err := p.callAPI("send_group_msg", params) + _, err := call("send_group_msg", params) return err } params["user_id"] = rctx.userID - _, err := p.callAPI("send_private_msg", params) + _, err := call("send_private_msg", params) return err } +// shouldUseHTTP returns true when Send() should dispatch through the OneBot +// HTTP API for this message. Honors the `send_via_http` config: +// - "always" → true (skip WS entirely for sends) +// - "never" → false (always use WS, even for long messages) +// - "auto" → true when len(content) > 512 bytes AND http_url is configured +// +// "auto" falls back to WS when http_url is empty — there is no HTTP path to +// use. SendFile() and SendImage() are unaffected (separate code paths). +func (p *Platform) shouldUseHTTP(content string) bool { + switch p.sendViaHTTP { + case "always": + return p.httpURL != "" + case "never": + return false + default: // "auto" or unset + return p.httpURL != "" && len(content) > defaultHTTPFallbackThreshold + } +} + // SendImage sends an image to the conversation. // Implements core.ImageSender. func (p *Platform) SendImage(ctx context.Context, replyCtx any, img core.ImageAttachment) error { diff --git a/platform/qq/qq_test.go b/platform/qq/qq_test.go index e1071eb95..37d8bee81 100644 --- a/platform/qq/qq_test.go +++ b/platform/qq/qq_test.go @@ -1,10 +1,13 @@ package qq import ( + "context" "encoding/json" + "io" "net/http" "net/http/httptest" "strings" + "sync/atomic" "testing" "time" @@ -149,3 +152,289 @@ func TestStart_FetchesSelfIDWithoutTimeout(t *testing.T) { t.Errorf("selfID = %d, want %d (self-message filter would be disabled)", p.selfID, botUserID) } } + +// napcatServer builds a mock NapCat exposing both WS (for receive + default +// send) and HTTP (for HTTP send fallback). It records how many times each +// surface was called so tests can assert which path Send() took. +type napcatServer struct { + ts *httptest.Server + wsSends atomic.Int32 // send_group_msg / send_private_msg over WS + httpSends atomic.Int32 // send_group_msg / send_private_msg over HTTP +} + +func newNapcatServer(t *testing.T) *napcatServer { + t.Helper() + ns := &napcatServer{} + upgrader := websocket.Upgrader{} + ns.ts = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if websocket.IsWebSocketUpgrade(r) { + c, err := upgrader.Upgrade(w, r, nil) + if err != nil { + return + } + defer c.Close() + for { + _, msg, err := c.ReadMessage() + if err != nil { + return + } + var req map[string]any + if err := json.Unmarshal(msg, &req); err != nil { + continue + } + action, _ := req["action"].(string) + switch action { + case "get_login_info": + echo, _ := req["echo"].(string) + resp := map[string]any{ + "status": "ok", + "retcode": 0, + "echo": echo, + "data": map[string]any{"user_id": 999999, "nickname": "TestBot"}, + } + raw, _ := json.Marshal(resp) + _ = c.WriteMessage(websocket.TextMessage, raw) + case "send_group_msg", "send_private_msg": + ns.wsSends.Add(1) + echo, _ := req["echo"].(string) + resp := map[string]any{ + "status": "ok", + "retcode": 0, + "echo": echo, + "data": map[string]any{"message_id": 12345}, + } + raw, _ := json.Marshal(resp) + _ = c.WriteMessage(websocket.TextMessage, raw) + } + } + } + + // HTTP API surface for Send() HTTP fallback. + body, _ := io.ReadAll(r.Body) + var params map[string]any + _ = json.Unmarshal(body, ¶ms) + + // Echo path on HTTP root for testing error formatting. + _ = params + + ns.httpSends.Add(1) + resp := map[string]any{ + "status": "ok", + "retcode": 0, + "data": map[string]any{"message_id": 67890}, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(resp) + })) + t.Cleanup(ns.ts.Close) + return ns +} + +func startPlatform(t *testing.T, ns *napcatServer, opts map[string]any) *Platform { + t.Helper() + if opts == nil { + opts = map[string]any{} + } + opts["ws_url"] = "ws" + strings.TrimPrefix(ns.ts.URL, "http") + if _, ok := opts["http_url"]; !ok { + opts["http_url"] = ns.ts.URL + } + p, err := New(opts) + if err != nil { + t.Fatalf("New: %v", err) + } + platform := p.(*Platform) + if err := platform.Start(func(core.Platform, *core.Message) {}); err != nil { + t.Fatalf("Start: %v", err) + } + t.Cleanup(func() { _ = platform.Stop() }) + return platform +} + +func TestSend_ShortMessageUsesWebSocket(t *testing.T) { + ns := newNapcatServer(t) + p := startPlatform(t, ns, nil) + + rctx := &replyContext{messageType: "group", groupID: 12345} + if err := p.Send(context.Background(), rctx, "short message"); err != nil { + t.Fatalf("Send: %v", err) + } + + if got := ns.wsSends.Load(); got != 1 { + t.Errorf("wsSends = %d, want 1 (short message must go via WS)", got) + } + if got := ns.httpSends.Load(); got != 0 { + t.Errorf("httpSends = %d, want 0 (short message must NOT hit HTTP)", got) + } +} + +func TestSend_LongMessageFallbackHTTP(t *testing.T) { + ns := newNapcatServer(t) + p := startPlatform(t, ns, nil) + + rctx := &replyContext{messageType: "group", groupID: 12345} + // Reporter's reproducer: ~2500 byte message would always hit the 15s WS + // timeout; must now dispatch through HTTP. + long := strings.Repeat("a", 2500) + if err := p.Send(context.Background(), rctx, long); err != nil { + t.Fatalf("Send: %v", err) + } + + if got := ns.httpSends.Load(); got != 1 { + t.Errorf("httpSends = %d, want 1 (long message must go via HTTP)", got) + } + if got := ns.wsSends.Load(); got != 0 { + t.Errorf("wsSends = %d, want 0 (long message must NOT hit WS)", got) + } +} + +func TestSend_LongPrivateMessageFallbackHTTP(t *testing.T) { + ns := newNapcatServer(t) + p := startPlatform(t, ns, nil) + + rctx := &replyContext{messageType: "private", userID: 99999} + long := strings.Repeat("b", 3000) + if err := p.Send(context.Background(), rctx, long); err != nil { + t.Fatalf("Send: %v", err) + } + + if got := ns.httpSends.Load(); got != 1 { + t.Errorf("httpSends = %d, want 1", got) + } + if got := ns.wsSends.Load(); got != 0 { + t.Errorf("wsSends = %d, want 0", got) + } +} + +func TestSend_AtThresholdUsesWebSocket(t *testing.T) { + ns := newNapcatServer(t) + p := startPlatform(t, ns, nil) + + rctx := &replyContext{messageType: "group", groupID: 12345} + // Boundary: exactly at threshold → still WS (> not >=). + if err := p.Send(context.Background(), rctx, strings.Repeat("c", 512)); err != nil { + t.Fatalf("Send: %v", err) + } + + if got := ns.wsSends.Load(); got != 1 { + t.Errorf("wsSends = %d, want 1 at threshold", got) + } + if got := ns.httpSends.Load(); got != 0 { + t.Errorf("httpSends = %d, want 0 at threshold", got) + } +} + +func TestSend_ModeAlwaysUsesHTTP(t *testing.T) { + ns := newNapcatServer(t) + p := startPlatform(t, ns, map[string]any{"send_via_http": "always"}) + + rctx := &replyContext{messageType: "group", groupID: 12345} + if err := p.Send(context.Background(), rctx, "x"); err != nil { + t.Fatalf("Send: %v", err) + } + + if got := ns.httpSends.Load(); got != 1 { + t.Errorf("httpSends = %d, want 1", got) + } + if got := ns.wsSends.Load(); got != 0 { + t.Errorf("wsSends = %d, want 0", got) + } +} + +func TestSend_ModeNeverUsesWebSocket(t *testing.T) { + ns := newNapcatServer(t) + p := startPlatform(t, ns, map[string]any{"send_via_http": "never"}) + + rctx := &replyContext{messageType: "group", groupID: 12345} + long := strings.Repeat("d", 2500) + if err := p.Send(context.Background(), rctx, long); err != nil { + t.Fatalf("Send: %v", err) + } + + if got := ns.wsSends.Load(); got != 1 { + t.Errorf("wsSends = %d, want 1 (never mode must use WS even for long)", got) + } + if got := ns.httpSends.Load(); got != 0 { + t.Errorf("httpSends = %d, want 0", got) + } +} + +func TestSend_AutoNoHTTPURLFallsBackToWebSocket(t *testing.T) { + ns := newNapcatServer(t) + p := startPlatform(t, ns, map[string]any{"http_url": ""}) + + rctx := &replyContext{messageType: "group", groupID: 12345} + long := strings.Repeat("e", 2500) + if err := p.Send(context.Background(), rctx, long); err != nil { + t.Fatalf("Send: %v", err) + } + + if got := ns.wsSends.Load(); got != 1 { + t.Errorf("wsSends = %d, want 1 (no http_url → must use WS)", got) + } + if got := ns.httpSends.Load(); got != 0 { + t.Errorf("httpSends = %d, want 0", got) + } +} + +func TestSend_ModeAlwaysNoHTTPURLGracefullyFallsBackToWS(t *testing.T) { + ns := newNapcatServer(t) + p := startPlatform(t, ns, map[string]any{"send_via_http": "always", "http_url": ""}) + + rctx := &replyContext{messageType: "group", groupID: 12345} + // With send_via_http=always but no http_url configured, must fall back to + // WebSocket instead of failing — better to deliver via the available path + // than surface a config error every send. Matches SendFile() behavior. + if err := p.Send(context.Background(), rctx, "x"); err != nil { + t.Fatalf("Send: %v", err) + } + + if got := ns.wsSends.Load(); got != 1 { + t.Errorf("wsSends = %d, want 1 (no http_url → must fall back to WS)", got) + } + if got := ns.httpSends.Load(); got != 0 { + t.Errorf("httpSends = %d, want 0", got) + } +} + +func TestNew_InvalidSendViaHTTPFallsBackToAuto(t *testing.T) { + p, err := New(map[string]any{"send_via_http": "bogus"}) + if err != nil { + t.Fatalf("New: %v", err) + } + platform := p.(*Platform) + if platform.sendViaHTTP != "auto" { + t.Errorf("sendViaHTTP = %q, want auto", platform.sendViaHTTP) + } +} + +func TestShouldUseHTTP(t *testing.T) { + cases := []struct { + name string + sendMode string + httpURL string + len int + wantHTTP bool + }{ + {"auto_short_under_threshold", "auto", "http://x", 100, false}, + {"auto_long_over_threshold", "auto", "http://x", 513, true}, + {"auto_long_no_http_url", "auto", "", 5000, false}, + {"always_short_with_http", "always", "http://x", 10, true}, + {"always_short_no_http", "always", "", 10, false}, + {"never_long", "never", "http://x", 5000, false}, + {"default_empty_mode_long", "", "http://x", 1000, true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + p := &Platform{sendViaHTTP: c.sendMode, httpURL: c.httpURL} + if c.sendMode == "" { + // New() normalizes "" → "auto"; mimic that here. + p.sendViaHTTP = "auto" + } + got := p.shouldUseHTTP(strings.Repeat("x", c.len)) + if got != c.wantHTTP { + t.Errorf("shouldUseHTTP(%d) = %v, want %v", c.len, got, c.wantHTTP) + } + }) + } +}