feat: add local CosyVoice runtime tuning

This commit is contained in:
cwang10
2026-06-23 17:59:10 +08:00
committed by zyairehhh
parent 57b3e48718
commit 61f4007965
11 changed files with 865 additions and 47 deletions

View File

@@ -61,6 +61,21 @@ _RUNTIME_ENV_KEYS = {
"OPENTALKING_TTS_SAMBERT_MODEL",
"OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL",
"OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL",
"OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR",
"OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR",
"OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE",
"OPENTALKING_TTS_LOCAL_COSYVOICE_FP16",
"OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT",
"OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT",
"OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM",
"OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT",
"OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN",
"OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN",
"OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR",
"OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS",
"OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO",
"OPENTALKING_TTS_LOCAL_COSYVOICE_MIN_TOKEN_TEXT_RATIO",
"OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS",
"OPENTALKING_TTS_LOCAL_INDEXTTS_SERVICE_URL",
"OPENTALKING_TTS_LOCAL_INDEXTTS_MODEL",
"OPENTALKING_TTS_OMNIRT_INDEXTTS_SERVICE_URL",

View File

@@ -127,6 +127,30 @@ def test_agent_lightrag_chunk_fallback_can_be_enabled(monkeypatch: pytest.Monkey
assert settings.agent_lightrag_chunk_fallback_enabled is True
def test_local_cosyvoice_runtime_settings_read_prefixed_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Settings reads the OPENTALKING_TTS_LOCAL_COSYVOICE_* runtime knobs from the environment."""
    env_prefix = "OPENTALKING_TTS_LOCAL_COSYVOICE_"
    overrides = {
        "FP16": "auto",
        "LOAD_TRT": "true",
        "TRT_CONCURRENT": "2",
        "TOKEN_HOP_LEN": "8",
        "TOKEN_MAX_HOP_LEN": "16",
        "STREAM_SCALE_FACTOR": "1",
        "FLOW_N_TIMESTEPS": "4",
        "MAX_TOKEN_TEXT_RATIO": "6",
        "MASK_STOP_TOKENS": "true",
    }
    for suffix, raw_value in overrides.items():
        monkeypatch.setenv(f"{env_prefix}{suffix}", raw_value)

    # _env_file=None keeps a developer's local .env from leaking into the test.
    settings = Settings(_env_file=None)

    assert settings.tts_local_cosyvoice_fp16 == "auto"
    # Booleans are compared by identity so an unparsed "true" string would fail.
    assert settings.tts_local_cosyvoice_load_trt is True
    assert settings.tts_local_cosyvoice_trt_concurrent == 2
    assert settings.tts_local_cosyvoice_token_hop_len == 8
    assert settings.tts_local_cosyvoice_token_max_hop_len == 16
    assert settings.tts_local_cosyvoice_stream_scale_factor == 1
    assert settings.tts_local_cosyvoice_flow_n_timesteps == 4
    assert settings.tts_local_cosyvoice_max_token_text_ratio == 6.0
    assert settings.tts_local_cosyvoice_mask_stop_tokens is True
def _active_env_names(contents: str) -> set[str]:
names: set[str] = set()
for raw_line in contents.splitlines():

View File

@@ -44,6 +44,16 @@ tts:
voice: zh-CN-XiaoxiaoNeural
sample_rate: 16000
streaming_decode: true
local_cosyvoice_model: FunAudioLLM/Fun-CosyVoice3-0.5B-2512
local_cosyvoice_device: auto
local_cosyvoice_fp16: auto
local_cosyvoice_load_trt: false
local_cosyvoice_load_jit: false
local_cosyvoice_load_vllm: false
local_cosyvoice_trt_concurrent: 1
local_cosyvoice_max_token_text_ratio: 6.0
local_cosyvoice_min_token_text_ratio: 0.0
local_cosyvoice_mask_stop_tokens: true
memory:
provider: mem0
enabled: false

View File

@@ -47,6 +47,10 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=./models/local-audio/FunAudioLLM__Fun-
OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=./models/local-audio/runtime/CosyVoice
OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
```
Download local audio weights:
@@ -78,6 +82,24 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_PRELOAD=1 \
python scripts/local_cosyvoice_service.py --host 127.0.0.1 --port 19090
```
In prior GPU validation, the main CosyVoice3 issue was not a single TTFA number but seed-dependent output-length drift. The local CosyVoice service therefore keeps two stability guards on by default: `OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1` masks every stop token exposed by the CosyVoice LLM, and `OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6` bounds the token/text ratio so long prompts do not occasionally produce runaway audio. Keep these guards enabled for realtime use.
TensorRT is optional. Enable it only after the current CosyVoice runtime, CUDA, onnxruntime-gpu/TensorRT engines, and model directory are compatible:
```env title=".env"
OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=1
OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
```
After startup, check the sidecar health payload and verify `runtime_flags.load_trt`, `streaming`, `llm_token_ratio`, and `llm_stop_token_patch`:
```bash title="Terminal"
curl -fsS http://127.0.0.1:19090/health | python3 -m json.tool
```
For the full local speech input, speech synthesis, and QuickTalk video chain, see [Local STT/TTS + QuickTalk](recipes/local-quicktalk-audio.md).
## IndexTTS Deployment (provider = indextts)

View File

@@ -47,6 +47,10 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=./models/local-audio/FunAudioLLM__Fun-
OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=./models/local-audio/runtime/CosyVoice
OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
```
下载本地音频权重:
@@ -78,6 +82,24 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_PRELOAD=1 \
python scripts/local_cosyvoice_service.py --host 127.0.0.1 --port 19090
```
在既有 GPU 验证中，CosyVoice3 的关键问题不是单次 TTFA，而是随机种子导致的生成长度漂移。OpenTalking 的本地 CosyVoice service 因此默认保留两类稳定性保护：`OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1` 会屏蔽 CosyVoice LLM 暴露的全部 stop token；`OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6` 会限制 token/text 比例，避免长文本偶发生成过长音频。不要为了追求更快首包把这两个保护关掉。
TensorRT 是可选加速。只有当当前 CosyVoice runtime、CUDA、onnxruntime-gpu/TensorRT engine 与模型目录匹配时再开启:
```env title=".env"
OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=1
OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
```
启动后先检查 sidecar 健康信息,确认 `runtime_flags.load_trt`、`streaming`、`llm_token_ratio` 和 `llm_stop_token_patch` 符合预期:
```bash title="终端"
curl -fsS http://127.0.0.1:19090/health | python3 -m json.tool
```
完整本地语音输入、语音合成和 QuickTalk 视频链路见 [本地 STT/TTS + QuickTalk](recipes/local-quicktalk-audio.md)。
## IndexTTS 部署（provider = indextts）

View File

@@ -137,6 +137,18 @@ def _flatten_config(raw: dict[str, Any] | None) -> dict[str, Any]:
"local_cosyvoice_service_url": "tts_local_cosyvoice_service_url",
"local_cosyvoice_service_urls": "tts_local_cosyvoice_service_urls",
"local_cosyvoice_device": "tts_local_cosyvoice_device",
"local_cosyvoice_fp16": "tts_local_cosyvoice_fp16",
"local_cosyvoice_load_jit": "tts_local_cosyvoice_load_jit",
"local_cosyvoice_load_trt": "tts_local_cosyvoice_load_trt",
"local_cosyvoice_load_vllm": "tts_local_cosyvoice_load_vllm",
"local_cosyvoice_trt_concurrent": "tts_local_cosyvoice_trt_concurrent",
"local_cosyvoice_token_hop_len": "tts_local_cosyvoice_token_hop_len",
"local_cosyvoice_token_max_hop_len": "tts_local_cosyvoice_token_max_hop_len",
"local_cosyvoice_stream_scale_factor": "tts_local_cosyvoice_stream_scale_factor",
"local_cosyvoice_flow_n_timesteps": "tts_local_cosyvoice_flow_n_timesteps",
"local_cosyvoice_max_token_text_ratio": "tts_local_cosyvoice_max_token_text_ratio",
"local_cosyvoice_min_token_text_ratio": "tts_local_cosyvoice_min_token_text_ratio",
"local_cosyvoice_mask_stop_tokens": "tts_local_cosyvoice_mask_stop_tokens",
"local_indextts_model": "tts_local_indextts_model",
"local_indextts_model_dir": "tts_local_indextts_model_dir",
"local_indextts_cfg_path": "tts_local_indextts_cfg_path",
@@ -491,6 +503,18 @@ class Settings(BaseSettings):
tts_local_cosyvoice_service_url: str = ""
tts_local_cosyvoice_service_urls: str = ""
tts_local_cosyvoice_device: str = "auto"
tts_local_cosyvoice_fp16: str = "auto"
tts_local_cosyvoice_load_jit: bool = False
tts_local_cosyvoice_load_trt: bool = False
tts_local_cosyvoice_load_vllm: bool = False
tts_local_cosyvoice_trt_concurrent: int = 1
tts_local_cosyvoice_token_hop_len: int = 0
tts_local_cosyvoice_token_max_hop_len: int = 0
tts_local_cosyvoice_stream_scale_factor: int = 0
tts_local_cosyvoice_flow_n_timesteps: int = 0
tts_local_cosyvoice_max_token_text_ratio: float = 6.0
tts_local_cosyvoice_min_token_text_ratio: float = 0.0
tts_local_cosyvoice_mask_stop_tokens: bool = True
tts_local_indextts_model: str = "IndexTeam/IndexTTS-2"
tts_local_indextts_model_dir: str = ""
tts_local_indextts_cfg_path: str = ""

View File

@@ -146,6 +146,41 @@ def _local_cosyvoice_device() -> str:
)
def _local_cosyvoice_bool(field: str, settings_name: str, default: bool) -> bool:
    """Resolve a boolean local-CosyVoice option.

    The provider environment value for ``field`` takes precedence; otherwise
    the settings attribute ``settings_name`` is consulted. ``default`` is
    returned when neither source supplies a value. Text values count as true
    only when they are one of ``1``, ``true``, ``yes`` or ``on``
    (case-insensitive).
    """
    raw = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
    # NOTE(review): the matching Settings fields are declared as ``bool``. If
    # ``_settings_value`` returns the typed value rather than text, honour it
    # directly: ``.strip()`` would raise on a bool, and an explicit ``False``
    # must not be mistaken for "unset". Confirm against ``_settings_value``.
    if isinstance(raw, bool):
        return raw
    if not raw:
        return default
    # str() mirrors the int/float helpers, which also tolerate non-str values.
    return str(raw).strip().lower() in {"1", "true", "yes", "on"}
def _local_cosyvoice_int(field: str, settings_name: str, default: int) -> int:
    """Resolve an integer local-CosyVoice option.

    Uses the provider environment value for ``field`` first, then the settings
    attribute ``settings_name``. Returns ``default`` when the value is empty or
    cannot be parsed by ``int``.
    """
    candidate = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
    if not candidate:
        return default
    text = str(candidate).strip()
    try:
        parsed = int(text)
    except ValueError:
        # Malformed numbers are treated like an unset option.
        parsed = default
    return parsed
def _local_cosyvoice_float(field: str, settings_name: str, default: float) -> float:
    """Resolve a float local-CosyVoice option.

    Uses the provider environment value for ``field`` first, then the settings
    attribute ``settings_name``. Returns ``default`` when the value is empty or
    cannot be parsed by ``float``.
    """
    candidate = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
    if not candidate:
        return default
    text = str(candidate).strip()
    try:
        parsed = float(text)
    except ValueError:
        # Malformed numbers are treated like an unset option.
        parsed = default
    return parsed
def _local_cosyvoice_fp16() -> str:
    """Return the configured fp16 mode for local CosyVoice, or ``"auto"`` when unset.

    The provider environment value wins over the settings attribute; the value
    is returned as found, without normalisation.
    """
    from_env = _provider_env("local_cosyvoice", "FP16")
    if from_env:
        return from_env
    configured = _settings_value("tts_local_cosyvoice_fp16", "")
    return configured if configured else "auto"
def _local_audio_asset_dir(name: str, required_file: str, *fallback_names: str) -> str:
root = _local_audio_model_root()
for candidate_name in (name, *fallback_names):
@@ -482,7 +517,7 @@ def tts_enabled_providers() -> list[str]:
return out or [_provider()]
def tts_provider_config(provider: str) -> dict[str, str | bool]:
def tts_provider_config(provider: str) -> dict[str, str | bool | int | float]:
p = normalize_tts_provider(provider, default=None) or _provider()
if p == "indextts":
resolved = _resolve_indextts_provider(p)
@@ -539,6 +574,26 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
if p == "local_cosyvoice":
model = _local_cosyvoice_model()
service_url = _local_cosyvoice_service_url()
token_hop_len = _local_cosyvoice_int(
"TOKEN_HOP_LEN",
"tts_local_cosyvoice_token_hop_len",
0,
)
token_max_hop_len = _local_cosyvoice_int(
"TOKEN_MAX_HOP_LEN",
"tts_local_cosyvoice_token_max_hop_len",
0,
)
stream_scale_factor = _local_cosyvoice_int(
"STREAM_SCALE_FACTOR",
"tts_local_cosyvoice_stream_scale_factor",
0,
)
flow_n_timesteps = _local_cosyvoice_int(
"FLOW_N_TIMESTEPS",
"tts_local_cosyvoice_flow_n_timesteps",
0,
)
return {
"provider": p,
"model": model,
@@ -546,7 +601,36 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
"voice": "local-default",
"device": _local_cosyvoice_device(),
"key_set": False,
"service_url": service_url,
"service_url_set": bool(service_url),
"fp16": _local_cosyvoice_fp16(),
"load_jit": _local_cosyvoice_bool("LOAD_JIT", "tts_local_cosyvoice_load_jit", False),
"load_trt": _local_cosyvoice_bool("LOAD_TRT", "tts_local_cosyvoice_load_trt", False),
"load_vllm": _local_cosyvoice_bool("LOAD_VLLM", "tts_local_cosyvoice_load_vllm", False),
"trt_concurrent": _local_cosyvoice_int(
"TRT_CONCURRENT",
"tts_local_cosyvoice_trt_concurrent",
1,
),
"token_hop_len": token_hop_len,
"token_max_hop_len": token_max_hop_len,
"stream_scale_factor": stream_scale_factor,
"flow_n_timesteps": flow_n_timesteps,
"max_token_text_ratio": _local_cosyvoice_float(
"MAX_TOKEN_TEXT_RATIO",
"tts_local_cosyvoice_max_token_text_ratio",
6.0,
),
"min_token_text_ratio": _local_cosyvoice_float(
"MIN_TOKEN_TEXT_RATIO",
"tts_local_cosyvoice_min_token_text_ratio",
0.0,
),
"mask_stop_tokens": _local_cosyvoice_bool(
"MASK_STOP_TOKENS",
"tts_local_cosyvoice_mask_stop_tokens",
True,
),
}
if p == "local_qwen3_tts":
model = (
@@ -670,7 +754,7 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
}
def tts_status(provider: str | None = None) -> dict[str, str | bool]:
def tts_status(provider: str | None = None) -> dict[str, str | bool | int | float]:
return tts_provider_config(provider or _provider())

View File

@@ -150,6 +150,52 @@ def _env_device() -> str:
)
def _local_cosyvoice_bool(field: str, settings_name: str, default: bool) -> bool:
    """Resolve a boolean local-CosyVoice option for the adapter.

    Reads ``OPENTALKING_TTS_LOCAL_COSYVOICE_<field>`` from the environment and
    falls back to the settings attribute ``settings_name`` when that variable
    is empty. ``default`` is returned when neither source supplies a value.
    Text values count as true only when they are one of ``1``, ``true``,
    ``yes`` or ``on`` (case-insensitive).
    """
    raw = os.environ.get(f"OPENTALKING_TTS_LOCAL_COSYVOICE_{field}", "").strip()
    if not raw:
        raw = _settings_value(settings_name, "")
    # NOTE(review): the matching Settings fields are declared as ``bool``. If
    # ``_settings_value`` returns the typed value rather than text, honour it
    # directly: ``.strip()`` would raise on a bool, and an explicit ``False``
    # must not be mistaken for "unset". Confirm against ``_settings_value``.
    if isinstance(raw, bool):
        return raw
    if not raw:
        return default
    return str(raw).strip().lower() in {"1", "true", "yes", "on"}
def _local_cosyvoice_int(field: str, settings_name: str, default: int) -> int:
    """Resolve an integer local-CosyVoice option for the adapter.

    Reads ``OPENTALKING_TTS_LOCAL_COSYVOICE_<field>`` from the environment and
    falls back to the settings attribute ``settings_name`` when that variable
    is empty. Returns ``default`` when no value is found or ``int`` rejects it
    with ``ValueError``.
    """
    env_name = f"OPENTALKING_TTS_LOCAL_COSYVOICE_{field}"
    value = os.environ.get(env_name, "").strip() or _settings_value(settings_name, "")
    if value:
        try:
            return int(value)
        except ValueError:
            return default
    return default
def _local_cosyvoice_fp16(device: str) -> bool:
    """Decide whether the local CosyVoice runtime should use fp16.

    The mode comes from ``OPENTALKING_TTS_LOCAL_COSYVOICE_FP16``, then the
    ``tts_local_cosyvoice_fp16`` setting, then ``"auto"``. In ``auto`` mode
    fp16 is enabled exactly when ``device`` starts with ``"cuda"``; otherwise
    only ``1``, ``true``, ``yes`` and ``on`` enable it.
    """
    env_mode = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_FP16", "").strip()
    if env_mode:
        mode = env_mode
    else:
        mode = _settings_value("tts_local_cosyvoice_fp16", "") or "auto"
    mode = mode.lower()
    if mode == "auto":
        return device.startswith("cuda")
    return mode in {"1", "true", "yes", "on"}
def _instantiate_cosyvoice_runtime(cls: Any, model_dir: str, kwargs: dict[str, Any]) -> Any:
    """Construct ``cls(model_dir, **kwargs)``, dropping optional kwargs the class rejects.

    When construction raises ``TypeError`` and the message mentions one of the
    optional keyword names still being passed, that keyword is removed and the
    call is retried. Any other ``TypeError`` propagates. The caller's
    ``kwargs`` dict is not modified.
    """
    droppable = ("load_vllm", "trt_concurrent", "load_jit", "load_trt", "fp16")
    remaining = dict(kwargs)
    while True:
        try:
            return cls(model_dir, **remaining)
        except TypeError as exc:
            message = str(exc)
            rejected = [name for name in droppable if name in remaining and name in message]
            if not rejected:
                raise
            # Drop one keyword per attempt; the next failure names the next one.
            del remaining[rejected[0]]
def _audio_format_from_content_type(content_type: str | None) -> str | None:
value = (content_type or "").split(";", 1)[0].strip().lower()
if value in {"audio/wav", "audio/wave", "audio/x-wav"}:
@@ -253,6 +299,14 @@ class LocalCosyVoiceTTSAdapter:
default_service_url,
)
self.device = _env_device()
self.fp16 = _local_cosyvoice_fp16(self.device)
self.load_jit = _local_cosyvoice_bool("LOAD_JIT", "tts_local_cosyvoice_load_jit", False)
self.load_trt = _local_cosyvoice_bool("LOAD_TRT", "tts_local_cosyvoice_load_trt", False)
self.load_vllm = _local_cosyvoice_bool("LOAD_VLLM", "tts_local_cosyvoice_load_vllm", False)
self.trt_concurrent = max(
1,
_local_cosyvoice_int("TRT_CONCURRENT", "tts_local_cosyvoice_trt_concurrent", 1),
)
self._engine: Any | None = None
async def synthesize_stream(self, text: str, voice: str | None = None) -> AsyncIterator[AudioChunk]:
@@ -350,9 +404,11 @@ class LocalCosyVoiceTTSAdapter:
) from exc
model_dir = self.model_dir or _resolve_model_path(self.model)
kwargs: dict[str, Any] = {
"load_jit": False,
"load_trt": False,
"fp16": self.device.startswith("cuda"),
"load_jit": self.load_jit,
"load_trt": self.load_trt,
"load_vllm": self.load_vllm,
"fp16": self.fp16,
"trt_concurrent": self.trt_concurrent,
}
model_lower = self.model.lower()
if "cosyvoice3" in model_lower:
@@ -361,7 +417,7 @@ class LocalCosyVoiceTTSAdapter:
cls = getattr(cosyvoice_module, "CosyVoice2")
else:
cls = getattr(cosyvoice_module, "CosyVoice")
self._engine = cls(model_dir, **kwargs)
self._engine = _instantiate_cosyvoice_runtime(cls, model_dir, kwargs)
return self._engine
def _available_voice(self, engine: Any, requested: str) -> str:

View File

@@ -4,8 +4,10 @@ import argparse
import io
import os
import sys
import threading
import time
from collections.abc import Iterator
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path
from typing import Any
@@ -27,6 +29,211 @@ class SynthesizeRequest(BaseModel):
instruction: str | None = None
def _cosyvoice_model(cosyvoice: Any) -> Any:
    """Return ``cosyvoice.model`` when present, otherwise ``cosyvoice`` itself."""
    try:
        return cosyvoice.model
    except AttributeError:
        return cosyvoice
def _cosyvoice_llm(cosyvoice: Any) -> Any | None:
    """Return the ``llm`` attribute of the underlying model, or ``None`` if it has none."""
    return getattr(_cosyvoice_model(cosyvoice), "llm", None)
def _cosyvoice_flow(cosyvoice: Any) -> Any | None:
    """Return the ``flow`` attribute of the underlying model, or ``None`` if it has none."""
    return getattr(_cosyvoice_model(cosyvoice), "flow", None)
def current_streaming_tuning(cosyvoice: Any) -> dict[str, Any]:
    """Snapshot the streaming attributes currently set on the underlying model.

    Only attributes the model actually has are included.
    """
    target = _cosyvoice_model(cosyvoice)
    snapshot: dict[str, Any] = {}
    for attr in ("token_hop_len", "token_max_hop_len", "stream_scale_factor"):
        if hasattr(target, attr):
            snapshot[attr] = getattr(target, attr)
    return snapshot
def apply_streaming_tuning(
    cosyvoice: Any,
    *,
    token_hop_len: int | None = None,
    token_max_hop_len: int | None = None,
    stream_scale_factor: int | None = None,
) -> dict[str, Any]:
    """Set the requested streaming attributes on the underlying model.

    ``None`` values are skipped. An attribute the model does not have is
    reported as ``"unsupported"`` and left alone. The resulting snapshot is
    stored on the model as ``_opentalking_streaming_tuning``.

    Returns a dict with ``requested``, ``applied`` and ``effective`` entries.
    """
    target = _cosyvoice_model(cosyvoice)
    requested = {
        "token_hop_len": token_hop_len,
        "token_max_hop_len": token_max_hop_len,
        "stream_scale_factor": stream_scale_factor,
    }
    applied: dict[str, Any] = {}
    for attr, value in requested.items():
        if value is None:
            continue
        supported = hasattr(target, attr)
        if supported:
            setattr(target, attr, value)
        applied[attr] = value if supported else "unsupported"
    effective = current_streaming_tuning(cosyvoice)
    # Stored on the model so reset_streaming_tuning can restore these values.
    target._opentalking_streaming_tuning = effective
    return {"requested": requested, "applied": applied, "effective": effective}
def reset_streaming_tuning(cosyvoice: Any) -> dict[str, Any]:
    """Restore the streaming attributes recorded in ``_opentalking_streaming_tuning``.

    When no baseline has been recorded yet, the model's current values become
    the baseline. Returns the streaming snapshot after the reset.
    """
    target = _cosyvoice_model(cosyvoice)
    baseline = getattr(target, "_opentalking_streaming_tuning", None)
    if baseline is None:
        baseline = current_streaming_tuning(cosyvoice)
        target._opentalking_streaming_tuning = baseline
    for attr in baseline:
        if hasattr(target, attr):
            setattr(target, attr, baseline[attr])
    return current_streaming_tuning(cosyvoice)
def _with_request_streaming_tuning(cosyvoice: Any, model_output: Iterator[Any]) -> Iterator[Any]:
    """Yield every item of ``model_output`` with the streaming baseline reset around it.

    This is a generator: ``reset_streaming_tuning`` runs when iteration first
    starts, and again from the ``finally`` clause when ``model_output`` is
    exhausted, raises, or this generator is closed.
    """
    reset_streaming_tuning(cosyvoice)
    try:
        yield from model_output
    finally:
        reset_streaming_tuning(cosyvoice)
def current_flow_tuning(cosyvoice: Any) -> dict[str, Any]:
    """Report the flow module's ``inference_n_timesteps`` (10 when the attribute is absent).

    Returns an empty dict when the model has no flow module.
    """
    flow = _cosyvoice_flow(cosyvoice)
    if flow is not None:
        return {"inference_n_timesteps": int(getattr(flow, "inference_n_timesteps", 10))}
    return {}
def apply_flow_tuning(cosyvoice: Any, *, n_timesteps: int | None = None) -> dict[str, Any]:
    """Set ``inference_n_timesteps`` on the flow module when a value is given.

    The value is coerced with ``int`` and clamped to at least 1. Returns a dict
    with ``requested``, ``applied`` and ``effective`` entries; ``applied`` is
    ``"unsupported"`` when the model has no flow module.
    """
    requested = {"inference_n_timesteps": n_timesteps}
    flow = _cosyvoice_flow(cosyvoice)
    if flow is None:
        return {"requested": requested, "applied": "unsupported", "effective": {}}
    applied: dict[str, Any] = {}
    if n_timesteps is not None:
        flow.inference_n_timesteps = max(1, int(n_timesteps))
        # Read the value back so the report shows what the module now holds.
        applied["inference_n_timesteps"] = flow.inference_n_timesteps
    return {"requested": requested, "applied": applied, "effective": current_flow_tuning(cosyvoice)}
def current_llm_token_ratio_tuning(cosyvoice: Any) -> dict[str, float]:
    """Return a copy of the token/text ratios recorded on the LLM, or ``{}`` if none."""
    llm = _cosyvoice_llm(cosyvoice)
    if llm is None:
        return {}
    recorded = getattr(llm, "_opentalking_token_ratios", {})
    if not isinstance(recorded, dict):
        return {}
    return dict(recorded)
def apply_llm_token_ratio_patch(
    cosyvoice: Any,
    *,
    max_token_text_ratio: float | None = None,
    min_token_text_ratio: float | None = None,
) -> dict[str, Any]:
    """Wrap ``llm.inference`` so the given token/text ratios are supplied as keyword defaults.

    Returns a dict with ``requested``, ``applied`` and ``effective`` entries.
    ``applied`` is the string ``"unsupported"`` when no LLM with an
    ``inference`` attribute is found, and an empty dict when both ratios are
    ``None`` (nothing is patched in that case).
    """
    requested = {
        "max_token_text_ratio": max_token_text_ratio,
        "min_token_text_ratio": min_token_text_ratio,
    }
    llm = _cosyvoice_llm(cosyvoice)
    if llm is None or not hasattr(llm, "inference"):
        return {"requested": requested, "applied": "unsupported", "effective": {}}
    if max_token_text_ratio is None and min_token_text_ratio is None:
        return {"requested": requested, "applied": {}, "effective": current_llm_token_ratio_tuning(cosyvoice)}
    # Keep the unpatched callable from the first call so a repeated patch
    # wraps the original again instead of stacking wrappers.
    original = getattr(llm, "_opentalking_original_inference", None)
    if original is None:
        original = llm.inference
        setattr(llm, "_opentalking_original_inference", original)
    applied = {key: value for key, value in requested.items() if value is not None}

    def inference_with_opentalking_ratios(*args: Any, **kwargs: Any) -> Any:
        # setdefault leaves a ratio the caller passed by keyword untouched.
        if max_token_text_ratio is not None:
            kwargs.setdefault("max_token_text_ratio", max_token_text_ratio)
        if min_token_text_ratio is not None:
            kwargs.setdefault("min_token_text_ratio", min_token_text_ratio)
        return original(*args, **kwargs)

    llm.inference = inference_with_opentalking_ratios
    # Recorded so current_llm_token_ratio_tuning can report the active ratios.
    setattr(llm, "_opentalking_token_ratios", applied)
    return {"requested": requested, "applied": applied, "effective": current_llm_token_ratio_tuning(cosyvoice)}
def current_llm_stop_token_patch(cosyvoice: Any) -> dict[str, Any]:
    """Return a copy of the stop-token patch record stored on the LLM, or ``{}`` if none."""
    llm = _cosyvoice_llm(cosyvoice)
    if llm is None:
        return {}
    recorded = getattr(llm, "_opentalking_stop_token_patch", {})
    if not isinstance(recorded, dict):
        return {}
    return dict(recorded)
def apply_llm_stop_token_patch(cosyvoice: Any) -> dict[str, Any]:
    """Replace ``llm.sampling_ids`` with a wrapper that masks every stop token id.

    Returns a dict with ``applied`` and ``effective`` entries. ``applied`` is
    ``"unsupported"`` when there is no LLM or it lacks ``sampling_ids``. It is
    an empty dict when nothing is patched: at most one stop token id is
    exposed, ``llm.sampling`` is missing, or the patch is already installed.
    """
    llm = _cosyvoice_llm(cosyvoice)
    if llm is None or not hasattr(llm, "sampling_ids"):
        return {"applied": "unsupported", "effective": {}}
    stop_token_ids = list(getattr(llm, "stop_token_ids", []) or [])
    if len(stop_token_ids) <= 1 or not hasattr(llm, "sampling"):
        return {"applied": {}, "effective": current_llm_stop_token_patch(cosyvoice)}
    # Patch only once; a second wrapper would capture the first as "original".
    if getattr(llm, "_opentalking_stop_token_patch_applied", False):
        return {"applied": {}, "effective": current_llm_stop_token_patch(cosyvoice)}
    original = llm.sampling_ids
    setattr(llm, "_opentalking_original_sampling_ids", original)

    def sampling_ids_with_opentalking_stop_mask(
        weighted_scores: Any,
        decoded_tokens: Any,
        sampling: Any,
        ignore_eos: bool = True,
    ) -> Any:
        if ignore_eos is True:
            # Work on a copy so the caller's scores are not modified.
            # NOTE(review): assumes weighted_scores is tensor-like (supports
            # .clone(), len() and list indexing) - confirm against the runtime.
            masked_scores = weighted_scores.clone()
            valid_stop_ids = [idx for idx in stop_token_ids if 0 <= idx < len(masked_scores)]
            if valid_stop_ids:
                masked_scores[valid_stop_ids] = -float("inf")
            # Calls llm.sampling directly, bypassing the original sampling_ids.
            return llm.sampling(masked_scores, decoded_tokens, sampling)
        return original(weighted_scores, decoded_tokens, sampling, ignore_eos)

    llm.sampling_ids = sampling_ids_with_opentalking_stop_mask
    setattr(llm, "_opentalking_stop_token_patch_applied", True)
    setattr(llm, "_opentalking_stop_token_patch", {"stop_token_count": len(stop_token_ids)})
    return {"applied": {"stop_token_count": len(stop_token_ids)}, "effective": current_llm_stop_token_patch(cosyvoice)}
def current_runtime_info(cosyvoice: Any) -> dict[str, Any]:
    """Describe the loaded runtime: the fp16 flag and the class of the flow decoder estimator.

    ``flow_decoder_trt`` is true when the estimator's class name is
    ``TrtContextWrapper``. Missing attributes along the
    ``flow.decoder.estimator`` chain yield an empty estimator name.
    """
    node = _cosyvoice_model(cosyvoice)
    for attr in ("flow", "decoder", "estimator"):
        node = getattr(node, attr, None)
    estimator_type = "" if node is None else node.__class__.__name__
    return {
        "fp16": bool(getattr(cosyvoice, "fp16", False)),
        "flow_decoder_estimator": estimator_type,
        "flow_decoder_trt": estimator_type == "TrtContextWrapper",
    }
def runtime_package_versions(*packages: str) -> dict[str, str]:
    """Map each distribution name to its installed version, or ``"not-installed"``."""

    def _installed_version(package: str) -> str:
        try:
            return version(package)
        except PackageNotFoundError:
            return "not-installed"

    return {package: _installed_version(package) for package in packages}
def _instantiate_automodel(cls: Any, kwargs: dict[str, Any]) -> tuple[Any, dict[str, Any]]:
    """Construct ``cls(**kwargs)``, dropping optional kwargs the class rejects.

    When construction raises ``TypeError`` and the message mentions one of the
    optional keyword names still being passed, that keyword is removed and the
    call is retried. Any other ``TypeError`` propagates.

    Returns the instance together with the keyword arguments that were
    actually used. The caller's ``kwargs`` dict is not modified.
    """
    droppable = ("load_vllm", "load_jit", "trt_concurrent")
    remaining = dict(kwargs)
    while True:
        try:
            instance = cls(**remaining)
        except TypeError as exc:
            message = str(exc)
            rejected = [name for name in droppable if name in remaining and name in message]
            if not rejected:
                raise
            # Drop one keyword per attempt; the next failure names the next one.
            del remaining[rejected[0]]
        else:
            return instance, remaining
class CosyVoiceService:
def __init__(
self,
@@ -39,6 +246,17 @@ class CosyVoiceService:
mode: str,
instruction: str,
fp16: bool,
load_jit: bool = False,
load_trt: bool = False,
load_vllm: bool = False,
trt_concurrent: int = 1,
token_hop_len: int | None = None,
token_max_hop_len: int | None = None,
stream_scale_factor: int | None = None,
flow_n_timesteps: int | None = None,
max_token_text_ratio: float | None = 6.0,
min_token_text_ratio: float | None = None,
mask_stop_tokens: bool = True,
) -> None:
self.model_dir = model_dir
self.runtime_dir = runtime_dir
@@ -48,7 +266,24 @@ class CosyVoiceService:
self.mode = mode
self.instruction = instruction
self.fp16 = fp16
self.load_jit = load_jit
self.load_trt = load_trt
self.load_vllm = load_vllm
self.trt_concurrent = max(1, int(trt_concurrent or 1))
self.token_hop_len = token_hop_len
self.token_max_hop_len = token_max_hop_len
self.stream_scale_factor = stream_scale_factor
self.flow_n_timesteps = flow_n_timesteps
self.max_token_text_ratio = max_token_text_ratio
self.min_token_text_ratio = min_token_text_ratio
self.mask_stop_tokens = mask_stop_tokens
self._model: Any | None = None
self._model_lock = threading.Lock()
self._loaded_model_kwargs: dict[str, Any] = {}
self._streaming_tuning: dict[str, Any] = {}
self._flow_tuning: dict[str, Any] = {}
self._llm_token_ratio_tuning: dict[str, Any] = {}
self._llm_stop_token_patch: dict[str, Any] = {}
def model(self) -> Any:
if self._model is not None:
@@ -76,24 +311,85 @@ class CosyVoiceService:
t0 = time.perf_counter()
model_kwargs = {
"model_dir": self.model_dir,
"load_trt": False,
"load_vllm": False,
"load_jit": self.load_jit,
"load_trt": self.load_trt,
"load_vllm": self.load_vllm,
"fp16": self.fp16,
"trt_concurrent": self.trt_concurrent,
}
try:
self._model = AutoModel(**model_kwargs)
except TypeError as exc:
if "load_vllm" not in str(exc):
raise
model_kwargs.pop("load_vllm")
self._model = AutoModel(**model_kwargs)
self._model, self._loaded_model_kwargs = _instantiate_automodel(AutoModel, model_kwargs)
self._apply_runtime_tuning()
# Keep the service zero-shot first so it does not require precomputed spk2info.pt.
print(
f"loaded cosyvoice model={self.model_dir} runtime={runtime} device={self.device} seconds={time.perf_counter() - t0:.3f}",
"loaded cosyvoice "
f"model={self.model_dir} runtime={runtime} device={self.device} "
f"fp16={self.fp16} load_jit={self.load_jit} load_trt={self.load_trt} "
f"load_vllm={self.load_vllm} trt_concurrent={self.trt_concurrent} "
f"seconds={time.perf_counter() - t0:.3f}",
flush=True,
)
return self._model
def _apply_runtime_tuning(self) -> None:
    """Apply the configured tuning to the loaded model and record each result.

    Does nothing when no model is loaded. The streaming, flow, token-ratio and
    stop-token reports are stored on the service and printed as one line.
    """
    if self._model is None:
        return
    self._streaming_tuning = apply_streaming_tuning(
        self._model,
        token_hop_len=self.token_hop_len,
        token_max_hop_len=self.token_max_hop_len,
        stream_scale_factor=self.stream_scale_factor,
    )
    self._flow_tuning = apply_flow_tuning(self._model, n_timesteps=self.flow_n_timesteps)
    self._llm_token_ratio_tuning = apply_llm_token_ratio_patch(
        self._model,
        max_token_text_ratio=self.max_token_text_ratio,
        min_token_text_ratio=self.min_token_text_ratio,
    )
    # With masking disabled the patch is not installed; only the current
    # patch state is reported.
    self._llm_stop_token_patch = (
        apply_llm_stop_token_patch(self._model)
        if self.mask_stop_tokens
        else {"applied": {}, "effective": current_llm_stop_token_patch(self._model)}
    )
    print(
        "cosyvoice tuning "
        f"streaming={self._streaming_tuning} flow={self._flow_tuning} "
        f"llm_token_ratio={self._llm_token_ratio_tuning} "
        f"llm_stop_token_patch={self._llm_stop_token_patch}",
        flush=True,
    )
def health_payload(self) -> dict[str, Any]:
    """Build the ``/health`` response body.

    When a model is loaded, the tuning sections are read live from it;
    otherwise the values recorded on the service are returned and ``runtime``
    is empty.
    """
    engine = self._model
    is_loaded = engine is not None
    runtime_flags = {
        "fp16": self.fp16,
        "load_jit": self.load_jit,
        "load_trt": self.load_trt,
        "load_vllm": self.load_vllm,
        "trt_concurrent": self.trt_concurrent,
        "loaded_model_kwargs": self._loaded_model_kwargs,
    }
    if is_loaded:
        streaming = current_streaming_tuning(engine)
        flow = current_flow_tuning(engine)
        llm_token_ratio = current_llm_token_ratio_tuning(engine)
        llm_stop_token_patch = current_llm_stop_token_patch(engine)
        runtime = current_runtime_info(engine)
    else:
        streaming = self._streaming_tuning
        flow = self._flow_tuning
        llm_token_ratio = self._llm_token_ratio_tuning
        llm_stop_token_patch = self._llm_stop_token_patch
        runtime = {}
    return {
        "status": "ok",
        "model_dir": self.model_dir,
        "runtime_dir": self.runtime_dir,
        "device": self.device,
        "loaded": is_loaded,
        "mode": self.mode,
        "runtime_flags": runtime_flags,
        "streaming": streaming,
        "flow": flow,
        "llm_token_ratio": llm_token_ratio,
        "llm_stop_token_patch": llm_stop_token_patch,
        "runtime": runtime,
        "runtime_packages": runtime_package_versions(
            "transformers",
            "tokenizers",
            "torch",
            "torchaudio",
            "numpy",
            "onnxruntime",
        ),
    }
def _to_wav_bytes(self, speech: Any, sample_rate: int) -> bytes:
if hasattr(speech, "detach"):
speech = speech.detach().cpu().numpy()
@@ -162,17 +458,18 @@ class CosyVoiceService:
stream=False,
)
parts: list[np.ndarray] = []
for item in iterator:
speech = item.get("tts_speech") if isinstance(item, dict) else item
if hasattr(speech, "detach"):
speech = speech.detach().cpu().numpy()
parts.append(np.asarray(speech, dtype=np.float32).reshape(-1))
with self._model_lock:
for item in _with_request_streaming_tuning(model, iterator):
speech = item.get("tts_speech") if isinstance(item, dict) else item
if hasattr(speech, "detach"):
speech = speech.detach().cpu().numpy()
parts.append(np.asarray(speech, dtype=np.float32).reshape(-1))
if not parts:
raise HTTPException(status_code=502, detail="CosyVoice returned no audio")
wav_bytes = self._to_wav_bytes(np.concatenate(parts), sample_rate)
return wav_bytes, sample_rate, time.perf_counter() - t0
def _streaming_iterator(self, req: SynthesizeRequest) -> tuple[Iterator[Any], int, int, float]:
def _streaming_iterator(self, req: SynthesizeRequest) -> tuple[Iterator[Any], int, int, float, Any]:
text = req.text.strip()
if not text:
raise HTTPException(status_code=400, detail="text is required")
@@ -204,30 +501,32 @@ class CosyVoiceService:
prompt_audio,
stream=True,
)
return iterator, source_sr, target_sr, t0
return iterator, source_sr, target_sr, t0, model
def synthesize_pcm_stream(self, req: SynthesizeRequest) -> tuple[Iterator[bytes], int]:
iterator, source_sr, target_sr, t0 = self._streaming_iterator(req)
iterator, source_sr, target_sr, t0, model = self._streaming_iterator(req)
def generate() -> Iterator[bytes]:
first = True
chunks = 0
samples = 0
for item in iterator:
speech = item.get("tts_speech") if isinstance(item, dict) else item
pcm = self._audio_to_i16(speech)
pcm = self._resample_linear(pcm, source_sr, target_sr)
if pcm.size == 0:
continue
if first:
print(
f"first_pcm chars={len(req.text.strip())} sr={target_sr} seconds={time.perf_counter() - t0:.3f}",
flush=True,
)
first = False
chunks += 1
samples += int(pcm.size)
yield pcm.astype("<i2", copy=False).tobytes()
with self._model_lock:
tuned_iterator = _with_request_streaming_tuning(model, iterator)
for item in tuned_iterator:
speech = item.get("tts_speech") if isinstance(item, dict) else item
pcm = self._audio_to_i16(speech)
pcm = self._resample_linear(pcm, source_sr, target_sr)
if pcm.size == 0:
continue
if first:
print(
f"first_pcm chars={len(req.text.strip())} sr={target_sr} seconds={time.perf_counter() - t0:.3f}",
flush=True,
)
first = False
chunks += 1
samples += int(pcm.size)
yield pcm.astype("<i2", copy=False).tobytes()
if chunks == 0:
raise RuntimeError("CosyVoice returned no audio")
print(
@@ -253,14 +552,7 @@ def create_app(service: CosyVoiceService) -> FastAPI:
@app.get("/health")
def health() -> dict[str, Any]:
return {
"status": "ok",
"model_dir": service.model_dir,
"runtime_dir": service.runtime_dir,
"device": service.device,
"loaded": service._model is not None,
"mode": service.mode,
}
return service.health_payload()
@app.post("/synthesize")
def synthesize(req: SynthesizeRequest) -> StreamingResponse:
@@ -286,6 +578,29 @@ def _local_audio_root() -> Path:
return Path(os.environ.get("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "./models/local-audio")).expanduser()
def _env_bool(name: str, default: bool = False) -> bool:
    """Read a boolean environment variable.

    Returns ``default`` when the variable is unset or blank; otherwise true
    only for ``1``, ``true``, ``yes`` or ``on`` (case-insensitive).
    """
    text = os.environ.get(name, "").strip().lower()
    return text in {"1", "true", "yes", "on"} if text else default
def _env_optional_int(name: str) -> int | None:
raw = os.environ.get(name, "").strip()
if not raw:
return None
value = int(raw)
return value if value > 0 else None
def _env_optional_float(name: str, default: float | None = None) -> float | None:
raw = os.environ.get(name, "").strip()
if not raw:
return default
value = float(raw)
return value if value > 0 else None
def build_service_from_env() -> CosyVoiceService:
device = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE", "cuda:0")
fp16_raw = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_FP16", "auto").strip().lower()
@@ -309,6 +624,17 @@ def build_service_from_env() -> CosyVoiceService:
"You are a helpful assistant.<|endofprompt|>",
),
fp16=fp16,
load_jit=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT", False),
load_trt=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT", False),
load_vllm=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM", False),
trt_concurrent=int(os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT", "1") or "1"),
token_hop_len=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN"),
token_max_hop_len=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN"),
stream_scale_factor=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR"),
flow_n_timesteps=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS"),
max_token_text_ratio=_env_optional_float("OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO", 6.0),
min_token_text_ratio=_env_optional_float("OPENTALKING_TTS_LOCAL_COSYVOICE_MIN_TOKEN_TEXT_RATIO"),
mask_stop_tokens=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS", True),
)

View File

@@ -43,6 +43,25 @@
# OMNIRT_ENDPOINT=http://127.0.0.1:9000
# OMNIRT_AUDIO2VIDEO_PATH_TEMPLATE=/v1/audio2video/{model}
# Local CosyVoice3 sidecar. Keep TensorRT off until the CosyVoice runtime has
# built/loaded compatible TRT engines for this GPU and model directory.
# OPENTALKING_TTS_DEFAULT_PROVIDER=local_cosyvoice
# OPENTALKING_TTS_ENABLED_PROVIDERS=local_cosyvoice,dashscope,edge
# OPENTALKING_LOCAL_AUDIO_MODEL_ROOT=$DIGITAL_HUMAN_HOME/models/local-audio
# OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL=FunAudioLLM/Fun-CosyVoice3-0.5B-2512
# OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/FunAudioLLM__Fun-CosyVoice3-0.5B-2512
# OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/CosyVoice
# OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
# OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
# OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
# OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
# OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
# OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
# OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
# OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
# OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
# OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
# OmniRT helper defaults.
# OMNIRT_PORT=9000
# OMNIRT_HOST=0.0.0.0

View File

@@ -962,6 +962,75 @@ def test_local_cosyvoice3_uses_automodel(monkeypatch):
assert loaded["model_dir"] == "/models/FunAudioLLM/Fun-CosyVoice3-0.5B-2512"
def test_local_cosyvoice_in_process_reads_runtime_flags(monkeypatch):
    """The in-process adapter hands the runtime env flags to ``AutoModel``.

    With ``FP16=auto`` on ``cuda:0`` the test expects ``fp16=True`` among the
    keyword arguments the fake model receives.
    """
    from opentalking.providers.tts.local_cosyvoice import adapter as cosy_adapter

    captured: dict[str, object] = {}

    class FakeAutoModel:
        # Records constructor arguments instead of loading a real model.
        def __init__(self, model_dir, **kwargs):
            captured["model_dir"] = model_dir
            captured["kwargs"] = kwargs

    fake_module = SimpleNamespace(AutoModel=FakeAutoModel)
    monkeypatch.setitem(sys.modules, "cosyvoice.cli.cosyvoice", fake_module)

    env_overrides = {
        "OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR": "/models/cosyvoice3",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE": "cuda:0",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_FP16": "auto",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM": "0",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT": "2",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    adapter = cosy_adapter.LocalCosyVoiceTTSAdapter(
        model="FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
    )
    engine = adapter._load_engine()

    assert isinstance(engine, FakeAutoModel)
    assert captured["model_dir"] == "/models/cosyvoice3"
    expected_kwargs = {
        "load_jit": True,
        "load_trt": True,
        "load_vllm": False,
        "fp16": True,
        "trt_concurrent": 2,
    }
    assert captured["kwargs"] == expected_kwargs
def test_local_cosyvoice_provider_config_reports_runtime_flags(monkeypatch):
    """``tts_provider_config`` reports the local CosyVoice runtime settings.

    Every tuning variable is set through the environment and the resulting
    provider status is expected to echo each one back with its parsed type.
    """
    from opentalking.providers.tts.factory import tts_provider_config

    env_overrides = {
        "OPENTALKING_LOCAL_AUDIO_MODEL_ROOT": "/tmp/opentalking-local-audio",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL": "http://127.0.0.1:19090/synthesize",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE": "cuda:0",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_FP16": "auto",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT": "2",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN": "8",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN": "16",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS": "4",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO": "6",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS": "1",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    status = tts_provider_config("local_cosyvoice")

    # Fields compared by value.
    expected_values = {
        "service_url": "http://127.0.0.1:19090/synthesize",
        "device": "cuda:0",
        "fp16": "auto",
        "trt_concurrent": 2,
        "token_hop_len": 8,
        "token_max_hop_len": 16,
        "stream_scale_factor": 1,
        "flow_n_timesteps": 4,
        "max_token_text_ratio": 6.0,
    }
    for field, expected in expected_values.items():
        assert status[field] == expected

    # Boolean flags must be real ``True`` objects, not merely truthy values.
    for flag in ("load_trt", "mask_stop_tokens"):
        assert status[flag] is True
def test_local_tts_adapters_read_settings_when_env_is_absent(monkeypatch):
from opentalking.core import config as core_config
from opentalking.providers.tts.local_cosyvoice.adapter import LocalCosyVoiceTTSAdapter
@@ -1456,6 +1525,153 @@ def test_cosyvoice_service_request_prompt_overrides_default(monkeypatch):
assert str(seen["prompt_text"]).endswith("这是本地复刻音色文本。")
def test_cosyvoice_service_applies_validated_runtime_tuning(monkeypatch):
    """Check that ``CosyVoiceService`` applies its tuning options to the loaded engine.

    A fake ``cosyvoice.cli.cosyvoice.AutoModel`` is installed, the service is
    built with explicit tuning values, and the test then expects:

    * the TensorRT options to reach the ``AutoModel`` constructor,
    * ``health_payload()`` to report the streaming, flow, token-ratio and
      stop-token settings,
    * ``llm.inference`` to receive the token/text ratio limits,
    * ``llm.sampling_ids`` with ``ignore_eos=True`` to sample from scores in
      which the stop-token ids are ``-inf``, without touching the caller's
      scores, while ``ignore_eos=False`` still reaches the original method.
    """
    from scripts import local_cosyvoice_service as service_module

    # Keyword arguments the fake AutoModel was constructed with.
    loaded: dict[str, object] = {}

    class FakeScores:
        # Minimal stand-in for the score container handed to sampling_ids.
        # NOTE(review): it offers len(), item/list-index assignment and clone(),
        # presumably the operations the service's stop-token patch uses —
        # confirm against the service.
        def __init__(self, values):
            self.values = list(values)

        def __len__(self):
            return len(self.values)

        def __setitem__(self, key, value):
            # A list key assigns the same value to every listed index.
            if isinstance(key, list):
                for item in key:
                    self.values[item] = value
            else:
                self.values[key] = value

        def clone(self):
            # list(values) in __init__ makes the clone independent of this instance.
            return FakeScores(self.values)

    class FakeLLM:
        def __init__(self):
            self.calls = []  # kwargs of every inference() call
            self.sampled_scores = []  # score values seen by the last sampling() call
            self.stop_token_ids = [3, 4, 5]

        def inference(self, **kwargs):
            self.calls.append(kwargs)
            yield "token"

        def sampling_ids(self, weighted_scores, decoded_tokens, sampling, ignore_eos=True):
            # Sentinel that shows when the unpatched method was reached.
            return "original"

        def sampling(self, weighted_scores, decoded_tokens, sampling):
            # Snapshot the scores so the test can inspect what was sampled.
            self.sampled_scores = list(weighted_scores.values)
            return "sampled"

    class FakeModel:
        # Class-level values differ from the tuning passed to the service below,
        # so the health assertions only hold if the service applied its own values.
        token_hop_len = 25
        token_max_hop_len = 100
        stream_scale_factor = 2

        def __init__(self):
            self.llm = FakeLLM()
            self.flow = SimpleNamespace(inference_n_timesteps=10)

    class FakeAutoModel:
        fp16 = True
        sample_rate = 24000

        def __init__(self, **kwargs):
            loaded.update(kwargs)
            self.model = FakeModel()

    # Replace the real CosyVoice module so no model is imported or loaded.
    monkeypatch.setitem(
        sys.modules,
        "cosyvoice.cli.cosyvoice",
        SimpleNamespace(AutoModel=FakeAutoModel),
    )
    service = service_module.CosyVoiceService(
        model_dir="/tmp/model",
        runtime_dir="/tmp/runtime",
        device="cpu",
        prompt_audio="prompt.wav",
        prompt_text="参考文本",
        mode="zero_shot",
        instruction="",
        fp16=True,
        load_trt=True,
        trt_concurrent=2,
        token_hop_len=8,
        token_max_hop_len=16,
        stream_scale_factor=1,
        flow_n_timesteps=4,
        max_token_text_ratio=6.0,
        min_token_text_ratio=1.0,
        mask_stop_tokens=True,
    )
    engine = service.model()

    # Loader options must have been forwarded to the AutoModel constructor.
    assert loaded["load_trt"] is True
    assert loaded["trt_concurrent"] == 2

    # The health payload must report the service's tuning, not the fake's defaults.
    assert service.health_payload()["streaming"] == {
        "token_hop_len": 8,
        "token_max_hop_len": 16,
        "stream_scale_factor": 1,
    }
    assert service.health_payload()["flow"] == {"inference_n_timesteps": 4}
    assert service.health_payload()["llm_token_ratio"] == {
        "max_token_text_ratio": 6.0,
        "min_token_text_ratio": 1.0,
    }
    # Three stop tokens, matching FakeLLM.stop_token_ids.
    assert service.health_payload()["llm_stop_token_patch"] == {"stop_token_count": 3}

    # inference() is called with text only; the ratio limits are expected to be added.
    assert list(engine.model.llm.inference(text="你好")) == ["token"]
    assert engine.model.llm.calls[-1]["max_token_text_ratio"] == 6.0
    assert engine.model.llm.calls[-1]["min_token_text_ratio"] == 1.0

    scores = FakeScores([0.0] * 8)
    # With ignore_eos=True the call is expected to end in FakeLLM.sampling ...
    assert engine.model.llm.sampling_ids(scores, [], 25, ignore_eos=True) == "sampled"
    # ... leaving the caller's scores unmodified ...
    assert scores.values == [0.0] * 8
    # ... while the sampled scores have the stop-token ids 3, 4 and 5 masked out.
    assert engine.model.llm.sampled_scores[3:6] == [-float("inf")] * 3
    # With ignore_eos=False the original sampling_ids result must come back.
    assert engine.model.llm.sampling_ids(scores, [], 25, ignore_eos=False) == "original"
def test_cosyvoice_service_resets_streaming_tuning_per_request(monkeypatch):
    """Streaming tuning on the engine is back in place after a request.

    The fake engine overwrites ``token_hop_len`` while it generates audio;
    once the PCM stream has been consumed, all three tuned attributes are
    expected to hold the values that were applied before the request.
    """
    from scripts import local_cosyvoice_service as service_module

    class FakeEngine:
        sample_rate = 16000
        token_hop_len = 8
        token_max_hop_len = 16
        stream_scale_factor = 1

        def inference_zero_shot(self, text, prompt_text, prompt_audio, stream=True):
            # Simulate the engine drifting away from the tuned value mid-request.
            self.token_hop_len = 99
            yield {"tts_speech": np.zeros(160, dtype=np.float32)}

    tuning = {
        "token_hop_len": 8,
        "token_max_hop_len": 16,
        "stream_scale_factor": 1,
    }
    engine = FakeEngine()
    service_module.apply_streaming_tuning(engine, **tuning)

    service_kwargs = {
        "model_dir": "model",
        "runtime_dir": "runtime",
        "device": "cpu",
        "prompt_audio": "prompt.wav",
        "prompt_text": "参考文本",
        "mode": "zero_shot",
        "instruction": "",
        "fp16": False,
    }
    service = service_module.CosyVoiceService(**service_kwargs)
    # Bypass model loading and hand the service the fake engine directly.
    monkeypatch.setattr(service, "model", lambda: engine)

    request = service_module.SynthesizeRequest(text="你好")
    pcm_stream, _sample_rate = service.synthesize_pcm_stream(request)
    audio = b"".join(pcm_stream)

    assert audio
    for attribute, expected in tuning.items():
        assert getattr(engine, attribute) == expected
@pytest.mark.asyncio
@pytest.mark.parametrize(
("module_name", "class_name", "service_env"),