Files
chenhg5-cc-connect/core/tts_test.go
cg33 6cd988ff02 fix(tts/minimax): drop status=2 trailer chunk to stop audio playing twice (#1364)
* fix(tts/minimax): drop status=2 trailer chunk to stop audio playing twice

MiniMax T2A v2 stream protocol sends incremental audio in `status=1`
chunks and re-emits the full audio in a final `status=2` chunk as a
trailer for non-stream clients. The previous reader appended every chunk
with a non-empty `audio` field, so the trailer was concatenated after
the streamed chunks and the synthesised speech played the full text
twice (verified end-to-end against MiniMax `speech-2.8-hd`).

Honour the protocol: break out of the SSE read loop as soon as a
`status=2` chunk arrives and discard its audio. Defence-in-depth tests
cover both the trailer-with-audio case and any unexpected chunks that
might follow the final status=2 marker.

Verified by replaying the live MiniMax SSE response: status=1 chunks
totalled 18 477 bytes, the status=2 trailer carried another 19 572
bytes, and cc-connect previously concatenated them into a ~38 KB MP3
that plays "测试" twice. After this fix the same call returns a single
~18 KB MP3 that plays the text once.

Co-authored-by: Cursor <cursoragent@cursor.com>

* test(tts): check fmt.Fprintf errors to satisfy errcheck lint

Co-authored-by: Cursor <cursoragent@cursor.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-15 13:48:53 +08:00

852 lines
28 KiB
Go

package core
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"os/exec"
"path/filepath"
"runtime"
"sync"
"testing"
"time"
)
// ──────────────────────────────────────────────────────────────
// TTSCfg concurrency tests
// ──────────────────────────────────────────────────────────────

// TestTTSCfg_GetSetMode checks that a zero-value TTSCfg reports the
// "voice_only" mode and that SetTTSMode changes what GetTTSMode returns.
func TestTTSCfg_GetSetMode(t *testing.T) {
	var cfg TTSCfg

	// Nothing has been set yet, so the default must be reported.
	mode := cfg.GetTTSMode()
	if mode != "voice_only" {
		t.Errorf("expected default voice_only, got %q", mode)
	}

	// Switch to "always" and read it back.
	cfg.SetTTSMode("always")
	mode = cfg.GetTTSMode()
	if mode != "always" {
		t.Errorf("expected always, got %q", mode)
	}

	// Switch back explicitly to "voice_only" and read it back.
	cfg.SetTTSMode("voice_only")
	mode = cfg.GetTTSMode()
	if mode != "voice_only" {
		t.Errorf("expected voice_only, got %q", mode)
	}
}
// TestTTSCfg_ConcurrentGetSet runs 100 writers and 100 readers against one
// TTSCfg at the same time. It makes no assertions of its own; presumably it
// is meant to be run under the race detector (go test -race).
func TestTTSCfg_ConcurrentGetSet(t *testing.T) {
	const rounds = 100

	cfg := TTSCfg{}
	cfg.SetTTSMode("voice_only")

	var pending sync.WaitGroup
	pending.Add(2 * rounds) // one writer and one reader per round
	for n := 0; n < rounds; n++ {
		// Writer.
		go func() {
			defer pending.Done()
			cfg.SetTTSMode("always")
		}()
		// Reader; the value itself is irrelevant.
		go func() {
			defer pending.Done()
			_ = cfg.GetTTSMode()
		}()
	}
	pending.Wait()
}
// ──────────────────────────────────────────────────────────────
// QwenTTS tests
// ──────────────────────────────────────────────────────────────

// TestQwenTTS_Success stubs both the Qwen API (which answers with an audio
// URL) and the audio host behind that URL, then checks that Synthesize
// returns the bytes served by the audio host with format "wav".
func TestQwenTTS_Success(t *testing.T) {
	// Stub: serves the audio bytes the API response points at.
	audioServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if _, err := w.Write([]byte("fake-wav-data")); err != nil {
			t.Errorf("write audio: %v", err)
		}
	}))
	defer audioServer.Close()

	// Stub: returns the audio URL.
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := map[string]any{
			"output": map[string]any{
				"audio": map[string]any{
					"url": audioServer.URL + "/audio.wav",
				},
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewQwenTTS("test-key", apiServer.URL, "qwen3-tts-flash", nil)
	audio, format, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if format != "wav" {
		t.Errorf("expected wav, got %q", format)
	}
	if string(audio) != "fake-wav-data" {
		t.Errorf("unexpected audio data: %q", audio)
	}
}
// TestQwenTTS_APIError checks that Synthesize returns an error when the API
// answers with HTTP 401.
func TestQwenTTS_APIError(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusUnauthorized)
		if _, err := w.Write([]byte("unauthorized")); err != nil {
			t.Errorf("write response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewQwenTTS("bad-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for non-200 response")
	}
}
// TestQwenTTS_BusinessErrorCode checks that a response carrying a "code" and
// "message" (served with the default 200 status) is reported as an error.
func TestQwenTTS_BusinessErrorCode(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := map[string]any{
			"code":    "InvalidApiKey",
			"message": "api key is invalid",
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewQwenTTS("bad-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for business error code")
	}
}
// TestQwenTTS_EmptyAudioURL checks that an otherwise well-formed response
// whose audio URL is the empty string makes Synthesize return an error.
func TestQwenTTS_EmptyAudioURL(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := map[string]any{
			"output": map[string]any{
				"audio": map[string]any{
					"url": "",
				},
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewQwenTTS("test-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for empty audio URL")
	}
}
// TestQwenTTS_AudioDownloadFailed checks that Synthesize returns an error
// when the API hands back an audio URL that cannot be fetched.
func TestQwenTTS_AudioDownloadFailed(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := map[string]any{
			"output": map[string]any{
				"audio": map[string]any{
					// Loopback port 1: expected to have no listener, so the download fails.
					"url": "http://127.0.0.1:1/nonexistent.wav",
				},
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewQwenTTS("test-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error when audio download fails")
	}
}
// ──────────────────────────────────────────────────────────────
// OpenAITTS tests
// ──────────────────────────────────────────────────────────────

// TestOpenAITTS_Success checks that Synthesize requests /audio/speech and
// returns the raw response body as audio with format "mp3".
func TestOpenAITTS_Success(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/audio/speech" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		w.WriteHeader(http.StatusOK)
		if _, err := w.Write([]byte("fake-mp3-data")); err != nil {
			t.Errorf("write audio: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewOpenAITTS("test-key", apiServer.URL, "tts-1", nil)
	audio, format, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{Voice: "alloy"})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if format != "mp3" {
		t.Errorf("expected mp3, got %q", format)
	}
	if string(audio) != "fake-mp3-data" {
		t.Errorf("unexpected audio data: %q", audio)
	}
}
// TestOpenAITTS_APIError checks that Synthesize returns an error when the
// API answers with HTTP 400.
func TestOpenAITTS_APIError(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusBadRequest)
		if _, err := w.Write([]byte(`{"error":"bad request"}`)); err != nil {
			t.Errorf("write response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewOpenAITTS("test-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for non-200 response")
	}
}
// ──────────────────────────────────────────────────────────────
// MiniMaxTTS tests
// ──────────────────────────────────────────────────────────────

// TestMiniMaxTTS_Success checks the happy path against a stub SSE server:
// the request must hit /v1/t2a_v2 with the expected model, text, voice_id
// and speed, and the hex-encoded status=1 chunk must come back decoded as
// "fake-mp3" with format "mp3".
func TestMiniMaxTTS_Success(t *testing.T) {
	// Stub SSE server returning hex-encoded audio chunks.
	//
	// The handler runs on a server goroutine, so request validation reports
	// with t.Errorf and returns instead of calling t.Fatalf: FailNow is only
	// valid on the goroutine running the test function.
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/v1/t2a_v2" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		var req struct {
			Model        string `json:"model"`
			Text         string `json:"text"`
			VoiceSetting struct {
				VoiceID string  `json:"voice_id"`
				Speed   float64 `json:"speed"`
			} `json:"voice_setting"`
		}
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			t.Errorf("decode request: %v", err)
			http.Error(w, "bad request", http.StatusBadRequest)
			return
		}
		if req.Model != "speech-2.8-hd" || req.Text != "hello" {
			t.Errorf("unexpected model/text: %q/%q", req.Model, req.Text)
			http.Error(w, "bad request", http.StatusBadRequest)
			return
		}
		if req.VoiceSetting.VoiceID != "voice-test" {
			t.Errorf("voice_id = %q", req.VoiceSetting.VoiceID)
			http.Error(w, "bad request", http.StatusBadRequest)
			return
		}
		if req.VoiceSetting.Speed != 0.98 {
			t.Errorf("speed = %v, want 0.98", req.VoiceSetting.Speed)
			http.Error(w, "bad request", http.StatusBadRequest)
			return
		}

		w.Header().Set("Content-Type", "text/event-stream")
		events := []struct {
			audio  string
			status int
		}{
			{audio: "66616b652d6d7033", status: 1}, // "fake-mp3" hex-encoded
			{audio: "", status: 2},                 // final chunk with status 2
		}
		for _, ev := range events {
			payload, err := json.Marshal(map[string]any{
				"data":      map[string]any{"audio": ev.audio, "status": ev.status},
				"base_resp": map[string]any{"status_code": 0, "status_msg": "success"},
			})
			if err != nil {
				t.Errorf("marshal chunk: %v", err)
				return
			}
			if _, err := fmt.Fprintf(w, "data:%s\n\n", payload); err != nil {
				t.Errorf("write chunk: %v", err)
			}
		}
	}))
	defer apiServer.Close()

	tts := NewMiniMaxTTS("test-key", apiServer.URL, "speech-2.8-hd", nil)
	audio, format, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{Voice: "voice-test", Speed: 0.98})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if format != "mp3" {
		t.Errorf("expected mp3, got %q", format)
	}
	if string(audio) != "fake-mp3" {
		t.Errorf("unexpected audio data: %q", audio)
	}
}
// TestMiniMaxTTS_FinalStatusChunkAudioIsDropped is the regression test for
// the doubled-audio bug. The stub streams "hello" across three status=1
// chunks and then repeats the full "hello" in a status=2 trailer, as the
// MiniMax T2A v2 stream protocol does for non-stream clients. Synthesize
// must return "hello" exactly once.
func TestMiniMaxTTS_FinalStatusChunkAudioIsDropped(t *testing.T) {
	// sseChunk builds the JSON payload for one successful SSE event.
	sseChunk := func(hexAudio string, status int) []byte {
		payload, _ := json.Marshal(map[string]any{
			"data":      map[string]any{"audio": hexAudio, "status": status},
			"base_resp": map[string]any{"status_code": 0, "status_msg": "success"},
		})
		return payload
	}

	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")

		// Incremental status=1 chunks: hex for "h", "ell", "o".
		for _, part := range [...]string{"68", "656c6c", "6f"} {
			if _, err := fmt.Fprintf(w, "data:%s\n\n", sseChunk(part, 1)); err != nil {
				t.Errorf("write chunk: %v", err)
			}
		}

		// The status=2 trailer repeats the complete "hello" audio.
		if _, err := fmt.Fprintf(w, "data:%s\n\n", sseChunk("68656c6c6f", 2)); err != nil {
			t.Errorf("write final: %v", err)
		}
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	client := NewMiniMaxTTS("test-key", srv.URL, "", nil)
	audio, _, err := client.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	const want = "hello"
	got := string(audio)
	if got != want {
		t.Errorf("audio = %q (len=%d), want %q (len=%d) — status=2 trailer was not deduplicated, audio plays twice",
			got, len(got), want, len(want))
	}
}
// TestMiniMaxTTS_StopsAfterFinalStatusChunk guards against chunks that show
// up after the status=2 marker. The stub sends "hi" (status=1), a status=2
// trailer, and then a bogus extra status=1 chunk; only the first "hi" may
// end up in the result.
func TestMiniMaxTTS_StopsAfterFinalStatusChunk(t *testing.T) {
	// sseChunk builds the JSON payload for one successful SSE event.
	sseChunk := func(hexAudio string, status int) []byte {
		payload, _ := json.Marshal(map[string]any{
			"data":      map[string]any{"audio": hexAudio, "status": status},
			"base_resp": map[string]any{"status_code": 0, "status_msg": "success"},
		})
		return payload
	}

	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")

		// status=1 carrying "hi" (hex 6869).
		if _, err := fmt.Fprintf(w, "data:%s\n\n", sseChunk("6869", 1)); err != nil {
			t.Errorf("write chunk: %v", err)
		}

		// status=2 trailer repeating the full audio.
		if _, err := fmt.Fprintf(w, "data:%s\n\n", sseChunk("6869", 2)); err != nil {
			t.Errorf("write final: %v", err)
		}

		// Anything after the final marker is bogus and must be ignored.
		if _, err := fmt.Fprintf(w, "data:%s\n\n", sseChunk("deadbeef", 1)); err != nil {
			t.Errorf("write extra: %v", err)
		}
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	client := NewMiniMaxTTS("test-key", srv.URL, "", nil)
	audio, _, err := client.Synthesize(context.Background(), "hi", TTSSynthesisOpts{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	const want = "hi"
	got := string(audio)
	if got != want {
		t.Errorf("audio = %q, want %q — trailing chunks after status=2 were not ignored", got, want)
	}
}
// TestMiniMaxTTS_APIError checks that Synthesize returns an error when the
// API answers with HTTP 401.
func TestMiniMaxTTS_APIError(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusUnauthorized)
		if _, err := w.Write([]byte("unauthorized")); err != nil {
			t.Errorf("write response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMiniMaxTTS("bad-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for non-200 response")
	}
}
// TestMiniMaxTTS_BusinessError checks that an SSE chunk whose base_resp
// carries a non-zero status_code (1001 here) makes Synthesize return an
// error.
func TestMiniMaxTTS_BusinessError(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")
		chunk := map[string]any{
			"data":      map[string]any{"audio": "", "status": 0},
			"base_resp": map[string]any{"status_code": 1001, "status_msg": "invalid api key"},
		}
		data, err := json.Marshal(chunk)
		if err != nil {
			t.Errorf("marshal chunk: %v", err)
			return
		}
		if _, err := fmt.Fprintf(w, "data:%s\n\n", data); err != nil {
			t.Errorf("write chunk: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMiniMaxTTS("bad-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for business error code")
	}
}
// TestMiniMaxTTS_EmptyAudio checks that a stream consisting only of a
// status=2 chunk with an empty audio field makes Synthesize return an error
// rather than empty audio.
func TestMiniMaxTTS_EmptyAudio(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")
		chunk := map[string]any{
			"data":      map[string]any{"audio": "", "status": 2},
			"base_resp": map[string]any{"status_code": 0, "status_msg": "success"},
		}
		data, err := json.Marshal(chunk)
		if err != nil {
			t.Errorf("marshal chunk: %v", err)
			return
		}
		if _, err := fmt.Fprintf(w, "data:%s\n\n", data); err != nil {
			t.Errorf("write chunk: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMiniMaxTTS("test-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for empty audio data")
	}
}
// ──────────────────────────────────────────────────────────────
// MimoTTS tests
// ──────────────────────────────────────────────────────────────

// TestMimoTTS_Success checks the happy path against a stub server. The
// request must go to /chat/completions, authenticate with the "api-key"
// header (and no Authorization header), and carry the default model, the
// synthesis text as the second (assistant) message, audio format "wav" and
// the requested voice. The base64 audio in the response must come back
// decoded as "fake-wav" with format "wav".
func TestMimoTTS_Success(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/chat/completions" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		if got := r.Header.Get("api-key"); got != "test-key" {
			t.Errorf("expected api-key header 'test-key', got %q", got)
		}
		if auth := r.Header.Get("Authorization"); auth != "" {
			t.Errorf("Authorization header must not be set, got %q", auth)
		}
		var req struct {
			Model    string `json:"model"`
			Messages []struct {
				Role    string `json:"role"`
				Content string `json:"content"`
			} `json:"messages"`
			Audio struct {
				Format string `json:"format"`
				Voice  string `json:"voice"`
			} `json:"audio"`
		}
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			// This runs on a server goroutine, where t.Fatalf is not allowed;
			// record the failure and abort the request instead.
			t.Errorf("decode body: %v", err)
			http.Error(w, "bad request", http.StatusBadRequest)
			return
		}
		if req.Model != "mimo-v2.5-tts" {
			t.Errorf("expected model mimo-v2.5-tts, got %q", req.Model)
		}
		if len(req.Messages) != 2 || req.Messages[1].Role != "assistant" || req.Messages[1].Content != "你好" {
			t.Errorf("expected assistant message with synthesis text, got %+v", req.Messages)
		}
		if req.Audio.Format != "wav" {
			t.Errorf("expected audio.format wav, got %q", req.Audio.Format)
		}
		if req.Audio.Voice != "Chloe" {
			t.Errorf("expected audio.voice Chloe, got %q", req.Audio.Voice)
		}
		// "fake-wav" base64-encoded
		b64 := "ZmFrZS13YXY="
		resp := map[string]any{
			"choices": []map[string]any{
				{
					"message": map[string]any{
						"audio": map[string]any{
							"data": b64,
						},
					},
				},
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMimoTTS("test-key", apiServer.URL, "", nil)
	audio, format, err := tts.Synthesize(context.Background(), "你好", TTSSynthesisOpts{Voice: "Chloe"})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if format != "wav" {
		t.Errorf("expected wav, got %q", format)
	}
	if string(audio) != "fake-wav" {
		t.Errorf("unexpected audio data: %q", audio)
	}
}
// TestMimoTTS_DefaultVoice checks that a request made without an explicit
// voice is sent with audio.voice set to "mimo_default".
func TestMimoTTS_DefaultVoice(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		var req struct {
			Audio struct {
				Voice string `json:"voice"`
			} `json:"audio"`
		}
		// Report a malformed body explicitly; otherwise it would only show up
		// as a misleading "got empty voice" failure below.
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			t.Errorf("decode body: %v", err)
		}
		if req.Audio.Voice != "mimo_default" {
			t.Errorf("expected default voice mimo_default, got %q", req.Audio.Voice)
		}
		resp := map[string]any{
			"choices": []map[string]any{
				{"message": map[string]any{"audio": map[string]any{"data": "ZmFrZQ=="}}},
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMimoTTS("test-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hi", TTSSynthesisOpts{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}
// TestMimoTTS_APIError checks that Synthesize returns an error when the API
// answers with HTTP 401.
func TestMimoTTS_APIError(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusUnauthorized)
		if _, err := w.Write([]byte(`{"error":{"message":"invalid api key"}}`)); err != nil {
			t.Errorf("write response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMimoTTS("bad-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for non-200 response")
	}
}
// TestMimoTTS_EmptyAudio checks that a response whose audio data field is
// the empty string makes Synthesize return an error.
func TestMimoTTS_EmptyAudio(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := map[string]any{
			"choices": []map[string]any{
				{"message": map[string]any{"audio": map[string]any{"data": ""}}},
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMimoTTS("test-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for empty audio data")
	}
}
// TestMimoTTS_BusinessError checks that a response containing an "error"
// object (served with the default 200 status) makes Synthesize return an
// error.
func TestMimoTTS_BusinessError(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := map[string]any{
			"error": map[string]any{
				"message": "quota exceeded",
				"type":    "rate_limit",
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMimoTTS("test-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for business error")
	}
}
// TestMimoTTS_BadBase64 checks that audio data which is not valid base64
// makes Synthesize return an error.
func TestMimoTTS_BadBase64(t *testing.T) {
	apiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := map[string]any{
			"choices": []map[string]any{
				{"message": map[string]any{"audio": map[string]any{"data": "!!!not-base64!!!"}}},
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer apiServer.Close()

	tts := NewMimoTTS("test-key", apiServer.URL, "", nil)
	_, _, err := tts.Synthesize(context.Background(), "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error for invalid base64 audio")
	}
}
// TestMimoTTS_Constructors checks that NewMimoTTS falls back to the default
// base URL and model when given empty strings, and keeps explicit values
// when they are provided.
func TestMimoTTS_Constructors(t *testing.T) {
	// Empty base URL and model: defaults apply.
	withDefaults := NewMimoTTS("k", "", "", nil)
	if got := withDefaults.BaseURL; got != "https://api.xiaomimimo.com/v1" {
		t.Errorf("expected default base URL, got %q", got)
	}
	if got := withDefaults.Model; got != "mimo-v2.5-tts" {
		t.Errorf("expected default model mimo-v2.5-tts, got %q", got)
	}

	// Explicit values are kept as given.
	custom := NewMimoTTS("k", "https://example.com/v1", "mimo-v2.5-tts-voicedesign", nil)
	if got := custom.BaseURL; got != "https://example.com/v1" {
		t.Errorf("expected custom base URL, got %q", got)
	}
	if got := custom.Model; got != "mimo-v2.5-tts-voicedesign" {
		t.Errorf("expected custom model, got %q", got)
	}
}
// ──────────────────────────────────────────────────────────────
// MaxTextLen skip test (via TTSCfg)
// ──────────────────────────────────────────────────────────────

// TestTTSCfg_MaxTextLen re-implements the "text too long" predicate inline
// (per the original comment it mirrors sendTTSReply) and checks it against a
// 6-rune string with MaxTextLen set to 5. Length is counted in runes, not
// bytes.
func TestTTSCfg_MaxTextLen(t *testing.T) {
	cfg := TTSCfg{Enabled: true, MaxTextLen: 5}

	// 6 runes, one more than the limit.
	const text = "你好世界!!"
	runeLen := 0
	for range text {
		runeLen++
	}

	// Sanity-check the fixture before testing the predicate.
	if runeLen <= cfg.MaxTextLen {
		t.Fatalf("test setup error: %d <= %d", runeLen, cfg.MaxTextLen)
	}

	// The limit applies only when MaxTextLen is positive and the rune count
	// exceeds it; this is the negation of that condition.
	if cfg.MaxTextLen <= 0 || runeLen <= cfg.MaxTextLen {
		t.Error("expected text to exceed MaxTextLen")
	}
}
// ──────────────────────────────────────────────────────────────
// Context cancellation test
// ──────────────────────────────────────────────────────────────

// TestMiniMaxTTS_ContextCancelled checks that Synthesize returns an error
// when it is called with a context that has already been cancelled. The
// stub handler, if reached at all, sends one chunk and then blocks until
// the request context is done.
func TestMiniMaxTTS_ContextCancelled(t *testing.T) {
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")
		w.WriteHeader(http.StatusOK)
		flusher, ok := w.(http.Flusher)
		if !ok {
			// This runs on a server goroutine, where t.Fatal is not allowed;
			// record the failure and bail out of the handler instead.
			t.Error("server does not support flushing")
			return
		}
		// Send one chunk then hang to let the client cancel.
		chunk := `{"data":{"audio":"48656c6c6f","status":1},"base_resp":{"status_code":0}}`
		// Best effort: the client cancels up front, so it may already be gone
		// by the time this write happens. A failed write is not a test failure.
		_, _ = fmt.Fprintf(w, "data: %s\n\n", chunk)
		flusher.Flush()
		// Block until client disconnects.
		<-r.Context().Done()
	})
	srv := httptest.NewServer(handler)
	defer srv.Close()

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately

	tts := NewMiniMaxTTS("test-key", srv.URL, "", nil)
	_, _, err := tts.Synthesize(ctx, "hello", TTSSynthesisOpts{})
	if err == nil {
		t.Fatal("expected error when context is cancelled")
	}
}
// ──────────────────────────────────────────────────────────────
// Local TTS provider constructor tests (Espeak, Pico, Edge)
// ──────────────────────────────────────────────────────────────

// TestEspeakTTS_Constructors checks that NewEspeakTTS defaults to path
// "espeak" and voice "zh" for empty arguments and keeps explicit values.
func TestEspeakTTS_Constructors(t *testing.T) {
	// Empty arguments: defaults apply.
	withDefaults := NewEspeakTTS("", "")
	if got := withDefaults.Path; got != "espeak" {
		t.Errorf("expected default path 'espeak', got %q", got)
	}
	if got := withDefaults.Voice; got != "zh" {
		t.Errorf("expected default voice 'zh', got %q", got)
	}

	// Explicit arguments are kept as given.
	custom := NewEspeakTTS("/custom/espeak", "en")
	if got := custom.Path; got != "/custom/espeak" {
		t.Errorf("expected custom path, got %q", got)
	}
	if got := custom.Voice; got != "en" {
		t.Errorf("expected custom voice 'en', got %q", got)
	}
}
// TestPicoTTS_Constructors checks that NewPicoTTS defaults to path
// "pico2wave" and voice "zh-CN" for empty arguments and keeps explicit
// values.
func TestPicoTTS_Constructors(t *testing.T) {
	// Empty arguments: defaults apply.
	withDefaults := NewPicoTTS("", "")
	if got := withDefaults.Path; got != "pico2wave" {
		t.Errorf("expected default path 'pico2wave', got %q", got)
	}
	if got := withDefaults.Voice; got != "zh-CN" {
		t.Errorf("expected default voice 'zh-CN', got %q", got)
	}

	// Explicit arguments are kept as given.
	custom := NewPicoTTS("/custom/pico2wave", "en-US")
	if got := custom.Path; got != "/custom/pico2wave" {
		t.Errorf("expected custom path, got %q", got)
	}
	if got := custom.Voice; got != "en-US" {
		t.Errorf("expected custom voice 'en-US', got %q", got)
	}
}
// TestEdgeTTS_Constructors checks that NewEdgeTTS leaves Path empty, falls
// back to voice "zh-CN-XiaoxiaoNeural" for an empty argument, and keeps an
// explicit voice.
func TestEdgeTTS_Constructors(t *testing.T) {
	// Empty voice: the default voice applies and no path is set.
	withDefaults := NewEdgeTTS("")
	if got := withDefaults.Path; got != "" {
		t.Errorf("expected empty default path, got %q", got)
	}
	if got := withDefaults.Voice; got != "zh-CN-XiaoxiaoNeural" {
		t.Errorf("expected default voice 'zh-CN-XiaoxiaoNeural', got %q", got)
	}

	// An explicit voice is kept as given.
	custom := NewEdgeTTS("en-US-JennyNeural")
	if got := custom.Voice; got != "en-US-JennyNeural" {
		t.Errorf("expected custom voice 'en-US-JennyNeural', got %q", got)
	}
}
// TestEspeakTTS_Synthesize_Integration runs a real espeak synthesis when the
// binary is on PATH and is skipped otherwise. A synthesis failure is only
// logged, since it can be expected in some environments; on success the
// result must be non-empty "wav" audio.
func TestEspeakTTS_Synthesize_Integration(t *testing.T) {
	_, lookErr := exec.LookPath("espeak")
	if lookErr != nil {
		t.Skip("espeak not available")
	}

	// Bound the subprocess so a stuck binary cannot hang the test run.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	engine := NewEspeakTTS("espeak", "en")
	audio, format, err := engine.Synthesize(ctx, "hello", TTSSynthesisOpts{})
	if err != nil {
		// Deliberately tolerant: the goal is only that synthesis does not crash.
		t.Logf("espeak synthesis failed (may be expected in some environments): %v", err)
		return
	}

	if format != "wav" {
		t.Errorf("expected wav format, got %q", format)
	}
	if len(audio) == 0 {
		t.Error("expected non-empty audio data")
	}
}
// TestPicoTTS_Synthesize_Integration runs a real pico2wave synthesis when
// the binary is on PATH and is skipped otherwise. A synthesis failure is
// only logged, since it can be expected in some environments; on success
// the result must be non-empty "wav" audio.
func TestPicoTTS_Synthesize_Integration(t *testing.T) {
	_, lookErr := exec.LookPath("pico2wave")
	if lookErr != nil {
		t.Skip("pico2wave not available")
	}

	// Bound the subprocess so a stuck binary cannot hang the test run.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	engine := NewPicoTTS("pico2wave", "en-US")
	audio, format, err := engine.Synthesize(ctx, "hello", TTSSynthesisOpts{})
	if err != nil {
		// Deliberately tolerant: the goal is only that synthesis does not crash.
		t.Logf("pico2wave synthesis failed (may be expected in some environments): %v", err)
		return
	}

	if format != "wav" {
		t.Errorf("expected wav format, got %q", format)
	}
	if len(audio) == 0 {
		t.Error("expected non-empty audio data")
	}
}
// writeFakeTTSBinary writes an executable /bin/sh script named fake-tts.sh
// into a per-test temp directory and returns its path. The script just
// sleeps for 30 seconds, long enough to outlive the cancellation tests'
// deadline, so it can stand in for a TTS binary when checking that
// subprocess-based providers kill the spawned process on ctx cancellation.
//
// The calling test is skipped on Windows, where /bin/sh is unavailable.
func writeFakeTTSBinary(t *testing.T) string {
	t.Helper()

	const body = "#!/bin/sh\nsleep 30\n"

	if runtime.GOOS == "windows" {
		t.Skip("fake TTS binary uses /bin/sh; not portable to windows")
	}

	// t.TempDir is removed automatically when the test finishes.
	scriptPath := filepath.Join(t.TempDir(), "fake-tts.sh")
	err := os.WriteFile(scriptPath, []byte(body), 0o755)
	if err != nil {
		t.Fatalf("write fake tts script: %v", err)
	}
	return scriptPath
}
func TestEspeakTTS_HonorsContextCancellation(t *testing.T) {
fakePath := writeFakeTTSBinary(t)
tts := NewEspeakTTS(fakePath, "en")
ctx, cancel := context.WithCancel(context.Background())
cancel() // pre-cancel: a correctly wired CommandContext kills immediately.
start := time.Now()
_, _, err := tts.Synthesize(ctx, "hello", TTSSynthesisOpts{})
elapsed := time.Since(start)
if err == nil {
t.Fatalf("expected error from cancelled context, got nil after %v", elapsed)
}
// 5s is generous: a fix using exec.CommandContext returns in well under a
// second; the bug (plain exec.Command) would wait the full 30s sleep.
if elapsed > 5*time.Second {
t.Fatalf("Synthesize ignored ctx cancellation, took %v (want < 5s); err=%v", elapsed, err)
}
}
func TestPicoTTS_HonorsContextCancellation(t *testing.T) {
fakePath := writeFakeTTSBinary(t)
tts := NewPicoTTS(fakePath, "en-US")
ctx, cancel := context.WithCancel(context.Background())
cancel()
start := time.Now()
_, _, err := tts.Synthesize(ctx, "hello", TTSSynthesisOpts{})
elapsed := time.Since(start)
if err == nil {
t.Fatalf("expected error from cancelled context, got nil after %v", elapsed)
}
if elapsed > 5*time.Second {
t.Fatalf("Synthesize ignored ctx cancellation, took %v (want < 5s); err=%v", elapsed, err)
}
}
func TestEdgeTTS_HonorsContextCancellation(t *testing.T) {
fakePath := writeFakeTTSBinary(t)
tts := NewEdgeTTS("en-US-JennyNeural")
tts.Path = fakePath
ctx, cancel := context.WithCancel(context.Background())
cancel() // pre-cancel: a correctly wired CommandContext kills immediately.
start := time.Now()
_, _, err := tts.Synthesize(ctx, "hello", TTSSynthesisOpts{})
elapsed := time.Since(start)
if err == nil {
t.Fatalf("expected error from cancelled context, got nil after %v", elapsed)
}
if elapsed > 5*time.Second {
t.Fatalf("Synthesize ignored ctx cancellation, took %v (want < 5s); err=%v", elapsed, err)
}
}
// TestEdgeTTS_Synthesize_Integration runs a real edge-tts synthesis when the
// binary is on PATH and is skipped otherwise. A synthesis failure is only
// logged, since it can be expected in some environments; on success the
// result must be non-empty "mp3" audio.
func TestEdgeTTS_Synthesize_Integration(t *testing.T) {
	_, lookErr := exec.LookPath("edge-tts")
	if lookErr != nil {
		t.Skip("edge-tts not available")
	}

	// Bound the subprocess so a stuck binary cannot hang the test run.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	engine := NewEdgeTTS("en-US-JennyNeural")
	audio, format, err := engine.Synthesize(ctx, "hello", TTSSynthesisOpts{})
	if err != nil {
		// Deliberately tolerant: the goal is only that synthesis does not crash.
		t.Logf("edge-tts synthesis failed (may be expected in some environments): %v", err)
		return
	}

	if format != "mp3" {
		t.Errorf("expected mp3 format, got %q", format)
	}
	if len(audio) == 0 {
		t.Error("expected non-empty audio data")
	}
}