mirror of
https://github.com/datascale-ai/opentalking.git
synced 2026-07-03 15:22:34 +08:00
feat: add local CosyVoice runtime tuning
This commit is contained in:
@@ -61,6 +61,21 @@ _RUNTIME_ENV_KEYS = {
|
||||
"OPENTALKING_TTS_SAMBERT_MODEL",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_FP16",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_MIN_TOKEN_TEXT_RATIO",
|
||||
"OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS",
|
||||
"OPENTALKING_TTS_LOCAL_INDEXTTS_SERVICE_URL",
|
||||
"OPENTALKING_TTS_LOCAL_INDEXTTS_MODEL",
|
||||
"OPENTALKING_TTS_OMNIRT_INDEXTTS_SERVICE_URL",
|
||||
|
||||
@@ -127,6 +127,30 @@ def test_agent_lightrag_chunk_fallback_can_be_enabled(monkeypatch: pytest.Monkey
|
||||
assert settings.agent_lightrag_chunk_fallback_enabled is True
|
||||
|
||||
|
||||
def test_local_cosyvoice_runtime_settings_read_prefixed_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Settings must pick up the OPENTALKING_TTS_LOCAL_COSYVOICE_* variables.

    Each variable is exported as a raw string and the matching ``Settings``
    attribute is then checked for the expected parsed value.
    """
    exported = {
        "OPENTALKING_TTS_LOCAL_COSYVOICE_FP16": "auto",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT": "true",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT": "2",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN": "8",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN": "16",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS": "4",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO": "6",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS": "true",
    }
    for env_name, env_value in exported.items():
        monkeypatch.setenv(env_name, env_value)

    # _env_file=None keeps a developer's local .env out of the test.
    settings = Settings(_env_file=None)

    assert settings.tts_local_cosyvoice_fp16 == "auto"
    assert settings.tts_local_cosyvoice_load_trt is True
    assert settings.tts_local_cosyvoice_trt_concurrent == 2
    assert settings.tts_local_cosyvoice_token_hop_len == 8
    assert settings.tts_local_cosyvoice_token_max_hop_len == 16
    assert settings.tts_local_cosyvoice_stream_scale_factor == 1
    assert settings.tts_local_cosyvoice_flow_n_timesteps == 4
    assert settings.tts_local_cosyvoice_max_token_text_ratio == 6.0
    assert settings.tts_local_cosyvoice_mask_stop_tokens is True
|
||||
|
||||
|
||||
def _active_env_names(contents: str) -> set[str]:
|
||||
names: set[str] = set()
|
||||
for raw_line in contents.splitlines():
|
||||
|
||||
@@ -44,6 +44,16 @@ tts:
|
||||
voice: zh-CN-XiaoxiaoNeural
|
||||
sample_rate: 16000
|
||||
streaming_decode: true
|
||||
local_cosyvoice_model: FunAudioLLM/Fun-CosyVoice3-0.5B-2512
|
||||
local_cosyvoice_device: auto
|
||||
local_cosyvoice_fp16: auto
|
||||
local_cosyvoice_load_trt: false
|
||||
local_cosyvoice_load_jit: false
|
||||
local_cosyvoice_load_vllm: false
|
||||
local_cosyvoice_trt_concurrent: 1
|
||||
local_cosyvoice_max_token_text_ratio: 6.0
|
||||
local_cosyvoice_min_token_text_ratio: 0.0
|
||||
local_cosyvoice_mask_stop_tokens: true
|
||||
memory:
|
||||
provider: mem0
|
||||
enabled: false
|
||||
|
||||
@@ -47,6 +47,10 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=./models/local-audio/FunAudioLLM__Fun-
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=./models/local-audio/runtime/CosyVoice
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
|
||||
```
|
||||
|
||||
Download local audio weights:
|
||||
@@ -78,6 +82,24 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_PRELOAD=1 \
|
||||
python scripts/local_cosyvoice_service.py --host 127.0.0.1 --port 19090
|
||||
```
|
||||
|
||||
In prior GPU validation, the main CosyVoice3 issue was not a single TTFA (time-to-first-audio) number but seed-dependent output-length drift. The local CosyVoice service therefore keeps two stability guards on by default: `OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1` masks every stop token exposed by the CosyVoice LLM, and `OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6` bounds the token/text ratio so long prompts do not occasionally produce runaway audio. Keep these guards enabled for realtime use; do not disable them just to get a faster first packet.
|
||||
|
||||
TensorRT is optional. Enable it only after confirming that the current CosyVoice runtime, CUDA, the onnxruntime-gpu/TensorRT engines, and the model directory are compatible with one another:
|
||||
|
||||
```env title=".env"
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=1
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
|
||||
```
|
||||
|
||||
After startup, check the sidecar health payload and verify `runtime_flags.load_trt`, `streaming`, `llm_token_ratio`, and `llm_stop_token_patch`:
|
||||
|
||||
```bash title="Terminal"
|
||||
curl -fsS http://127.0.0.1:19090/health | python3 -m json.tool
|
||||
```
|
||||
|
||||
For the full local speech input, speech synthesis, and QuickTalk video chain, see [Local STT/TTS + QuickTalk](recipes/local-quicktalk-audio.md).
|
||||
|
||||
## IndexTTS Deployment (provider = indextts)
|
||||
|
||||
@@ -47,6 +47,10 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=./models/local-audio/FunAudioLLM__Fun-
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=./models/local-audio/runtime/CosyVoice
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
|
||||
```
|
||||
|
||||
下载本地音频权重:
|
||||
@@ -78,6 +82,24 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_PRELOAD=1 \
|
||||
python scripts/local_cosyvoice_service.py --host 127.0.0.1 --port 19090
|
||||
```
|
||||
|
||||
在既有 GPU 验证中,CosyVoice3 的关键问题不是单次 TTFA,而是随机种子导致的生成长度漂移。OpenTalking 的本地 CosyVoice service 因此默认保留两类稳定性保护:`OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1` 会屏蔽 CosyVoice LLM 暴露的全部 stop token,`OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6` 会限制 token/text 比例,避免长文本偶发生成过长音频。不要为了追求更快首包把这两个保护关掉。
|
||||
|
||||
TensorRT 是可选加速。只有在当前 CosyVoice runtime、CUDA、onnxruntime-gpu/TensorRT engine 与模型目录相互匹配时才开启:
|
||||
|
||||
```env title=".env"
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=1
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
|
||||
OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
|
||||
```
|
||||
|
||||
启动后先检查 sidecar 健康信息,确认 `runtime_flags.load_trt`、`streaming`、`llm_token_ratio` 和 `llm_stop_token_patch` 符合预期:
|
||||
|
||||
```bash title="终端"
|
||||
curl -fsS http://127.0.0.1:19090/health | python3 -m json.tool
|
||||
```
|
||||
|
||||
完整本地语音输入、语音合成和 QuickTalk 视频链路见 [本地 STT/TTS + QuickTalk](recipes/local-quicktalk-audio.md)。
|
||||
|
||||
## IndexTTS 部署(provider = indextts)
|
||||
|
||||
@@ -137,6 +137,18 @@ def _flatten_config(raw: dict[str, Any] | None) -> dict[str, Any]:
|
||||
"local_cosyvoice_service_url": "tts_local_cosyvoice_service_url",
|
||||
"local_cosyvoice_service_urls": "tts_local_cosyvoice_service_urls",
|
||||
"local_cosyvoice_device": "tts_local_cosyvoice_device",
|
||||
"local_cosyvoice_fp16": "tts_local_cosyvoice_fp16",
|
||||
"local_cosyvoice_load_jit": "tts_local_cosyvoice_load_jit",
|
||||
"local_cosyvoice_load_trt": "tts_local_cosyvoice_load_trt",
|
||||
"local_cosyvoice_load_vllm": "tts_local_cosyvoice_load_vllm",
|
||||
"local_cosyvoice_trt_concurrent": "tts_local_cosyvoice_trt_concurrent",
|
||||
"local_cosyvoice_token_hop_len": "tts_local_cosyvoice_token_hop_len",
|
||||
"local_cosyvoice_token_max_hop_len": "tts_local_cosyvoice_token_max_hop_len",
|
||||
"local_cosyvoice_stream_scale_factor": "tts_local_cosyvoice_stream_scale_factor",
|
||||
"local_cosyvoice_flow_n_timesteps": "tts_local_cosyvoice_flow_n_timesteps",
|
||||
"local_cosyvoice_max_token_text_ratio": "tts_local_cosyvoice_max_token_text_ratio",
|
||||
"local_cosyvoice_min_token_text_ratio": "tts_local_cosyvoice_min_token_text_ratio",
|
||||
"local_cosyvoice_mask_stop_tokens": "tts_local_cosyvoice_mask_stop_tokens",
|
||||
"local_indextts_model": "tts_local_indextts_model",
|
||||
"local_indextts_model_dir": "tts_local_indextts_model_dir",
|
||||
"local_indextts_cfg_path": "tts_local_indextts_cfg_path",
|
||||
@@ -491,6 +503,18 @@ class Settings(BaseSettings):
|
||||
tts_local_cosyvoice_service_url: str = ""
|
||||
tts_local_cosyvoice_service_urls: str = ""
|
||||
tts_local_cosyvoice_device: str = "auto"
|
||||
tts_local_cosyvoice_fp16: str = "auto"
|
||||
tts_local_cosyvoice_load_jit: bool = False
|
||||
tts_local_cosyvoice_load_trt: bool = False
|
||||
tts_local_cosyvoice_load_vllm: bool = False
|
||||
tts_local_cosyvoice_trt_concurrent: int = 1
|
||||
tts_local_cosyvoice_token_hop_len: int = 0
|
||||
tts_local_cosyvoice_token_max_hop_len: int = 0
|
||||
tts_local_cosyvoice_stream_scale_factor: int = 0
|
||||
tts_local_cosyvoice_flow_n_timesteps: int = 0
|
||||
tts_local_cosyvoice_max_token_text_ratio: float = 6.0
|
||||
tts_local_cosyvoice_min_token_text_ratio: float = 0.0
|
||||
tts_local_cosyvoice_mask_stop_tokens: bool = True
|
||||
tts_local_indextts_model: str = "IndexTeam/IndexTTS-2"
|
||||
tts_local_indextts_model_dir: str = ""
|
||||
tts_local_indextts_cfg_path: str = ""
|
||||
|
||||
@@ -146,6 +146,41 @@ def _local_cosyvoice_device() -> str:
|
||||
)
|
||||
|
||||
|
||||
def _local_cosyvoice_bool(field: str, settings_name: str, default: bool) -> bool:
    """Resolve a boolean local-CosyVoice option.

    The value from ``_provider_env("local_cosyvoice", field)`` is used when it
    is truthy, otherwise the settings value named ``settings_name``; when
    neither yields a value, ``default`` is returned.

    Args:
        field: Option suffix passed to ``_provider_env`` (e.g. ``"LOAD_TRT"``).
        settings_name: Settings attribute consulted when the env value is empty.
        default: Result when neither source provides a value.

    Returns:
        ``True`` for the spellings ``1``, ``true``, ``yes``, ``on``
        (case-insensitive, surrounding whitespace ignored), ``False`` for any
        other non-empty value.
    """
    raw = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
    # A typed boolean must be honoured as-is: an explicit ``False`` would
    # otherwise look "unset" below and be replaced by a ``True`` default.
    # NOTE(review): _settings_value is defined outside this view; if it always
    # returns strings this branch is simply never taken.
    if isinstance(raw, bool):
        return raw
    if not raw:
        return default
    # str() mirrors the int/float sibling helpers, so a non-string settings
    # value cannot raise AttributeError on .strip().
    return str(raw).strip().lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _local_cosyvoice_int(field: str, settings_name: str, default: int) -> int:
    """Resolve an integer local-CosyVoice option.

    Prefers the provider environment value and falls back to the settings
    value; returns ``default`` when both are empty or the chosen value cannot
    be parsed as an integer.
    """
    value = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
    if value:
        try:
            return int(str(value).strip())
        except ValueError:
            # Unparseable input falls through to the default.
            pass
    return default
|
||||
|
||||
|
||||
def _local_cosyvoice_float(field: str, settings_name: str, default: float) -> float:
    """Resolve a float local-CosyVoice option.

    Prefers the provider environment value and falls back to the settings
    value; returns ``default`` when both are empty or the chosen value cannot
    be parsed as a float.
    """
    value = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
    if value:
        try:
            return float(str(value).strip())
        except ValueError:
            # Unparseable input falls through to the default.
            pass
    return default
|
||||
|
||||
|
||||
def _local_cosyvoice_fp16() -> str:
    """Return the configured fp16 mode for the local CosyVoice provider.

    The provider environment value wins, then the
    ``tts_local_cosyvoice_fp16`` setting; ``"auto"`` is returned when both
    are empty. The value is returned as configured, without normalisation.
    """
    env_value = _provider_env("local_cosyvoice", "FP16")
    if env_value:
        return env_value
    configured = _settings_value("tts_local_cosyvoice_fp16", "")
    return configured if configured else "auto"
|
||||
|
||||
|
||||
def _local_audio_asset_dir(name: str, required_file: str, *fallback_names: str) -> str:
|
||||
root = _local_audio_model_root()
|
||||
for candidate_name in (name, *fallback_names):
|
||||
@@ -482,7 +517,7 @@ def tts_enabled_providers() -> list[str]:
|
||||
return out or [_provider()]
|
||||
|
||||
|
||||
def tts_provider_config(provider: str) -> dict[str, str | bool]:
|
||||
def tts_provider_config(provider: str) -> dict[str, str | bool | int | float]:
|
||||
p = normalize_tts_provider(provider, default=None) or _provider()
|
||||
if p == "indextts":
|
||||
resolved = _resolve_indextts_provider(p)
|
||||
@@ -539,6 +574,26 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
|
||||
if p == "local_cosyvoice":
|
||||
model = _local_cosyvoice_model()
|
||||
service_url = _local_cosyvoice_service_url()
|
||||
token_hop_len = _local_cosyvoice_int(
|
||||
"TOKEN_HOP_LEN",
|
||||
"tts_local_cosyvoice_token_hop_len",
|
||||
0,
|
||||
)
|
||||
token_max_hop_len = _local_cosyvoice_int(
|
||||
"TOKEN_MAX_HOP_LEN",
|
||||
"tts_local_cosyvoice_token_max_hop_len",
|
||||
0,
|
||||
)
|
||||
stream_scale_factor = _local_cosyvoice_int(
|
||||
"STREAM_SCALE_FACTOR",
|
||||
"tts_local_cosyvoice_stream_scale_factor",
|
||||
0,
|
||||
)
|
||||
flow_n_timesteps = _local_cosyvoice_int(
|
||||
"FLOW_N_TIMESTEPS",
|
||||
"tts_local_cosyvoice_flow_n_timesteps",
|
||||
0,
|
||||
)
|
||||
return {
|
||||
"provider": p,
|
||||
"model": model,
|
||||
@@ -546,7 +601,36 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
|
||||
"voice": "local-default",
|
||||
"device": _local_cosyvoice_device(),
|
||||
"key_set": False,
|
||||
"service_url": service_url,
|
||||
"service_url_set": bool(service_url),
|
||||
"fp16": _local_cosyvoice_fp16(),
|
||||
"load_jit": _local_cosyvoice_bool("LOAD_JIT", "tts_local_cosyvoice_load_jit", False),
|
||||
"load_trt": _local_cosyvoice_bool("LOAD_TRT", "tts_local_cosyvoice_load_trt", False),
|
||||
"load_vllm": _local_cosyvoice_bool("LOAD_VLLM", "tts_local_cosyvoice_load_vllm", False),
|
||||
"trt_concurrent": _local_cosyvoice_int(
|
||||
"TRT_CONCURRENT",
|
||||
"tts_local_cosyvoice_trt_concurrent",
|
||||
1,
|
||||
),
|
||||
"token_hop_len": token_hop_len,
|
||||
"token_max_hop_len": token_max_hop_len,
|
||||
"stream_scale_factor": stream_scale_factor,
|
||||
"flow_n_timesteps": flow_n_timesteps,
|
||||
"max_token_text_ratio": _local_cosyvoice_float(
|
||||
"MAX_TOKEN_TEXT_RATIO",
|
||||
"tts_local_cosyvoice_max_token_text_ratio",
|
||||
6.0,
|
||||
),
|
||||
"min_token_text_ratio": _local_cosyvoice_float(
|
||||
"MIN_TOKEN_TEXT_RATIO",
|
||||
"tts_local_cosyvoice_min_token_text_ratio",
|
||||
0.0,
|
||||
),
|
||||
"mask_stop_tokens": _local_cosyvoice_bool(
|
||||
"MASK_STOP_TOKENS",
|
||||
"tts_local_cosyvoice_mask_stop_tokens",
|
||||
True,
|
||||
),
|
||||
}
|
||||
if p == "local_qwen3_tts":
|
||||
model = (
|
||||
@@ -670,7 +754,7 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
|
||||
}
|
||||
|
||||
|
||||
def tts_status(provider: str | None = None) -> dict[str, str | bool]:
|
||||
def tts_status(provider: str | None = None) -> dict[str, str | bool | int | float]:
|
||||
return tts_provider_config(provider or _provider())
|
||||
|
||||
|
||||
|
||||
@@ -150,6 +150,52 @@ def _env_device() -> str:
|
||||
)
|
||||
|
||||
|
||||
def _local_cosyvoice_bool(field: str, settings_name: str, default: bool) -> bool:
    """Read a boolean ``OPENTALKING_TTS_LOCAL_COSYVOICE_<field>`` option.

    The (stripped) environment variable is used when non-empty, otherwise the
    settings value named ``settings_name``; ``default`` is returned when both
    are empty. Only ``1``, ``true``, ``yes`` and ``on`` (case-insensitive)
    count as true.
    """
    value = (
        os.environ.get(f"OPENTALKING_TTS_LOCAL_COSYVOICE_{field}", "").strip()
        or _settings_value(settings_name, "")
    )
    if value:
        return value.strip().lower() in {"1", "true", "yes", "on"}
    return default
|
||||
|
||||
|
||||
def _local_cosyvoice_int(field: str, settings_name: str, default: int) -> int:
    """Read an integer ``OPENTALKING_TTS_LOCAL_COSYVOICE_<field>`` option.

    The (stripped) environment variable is used when non-empty, otherwise the
    settings value named ``settings_name``. ``default`` is returned when both
    are empty or ``int()`` rejects the value with ``ValueError``.
    """
    value = (
        os.environ.get(f"OPENTALKING_TTS_LOCAL_COSYVOICE_{field}", "").strip()
        or _settings_value(settings_name, "")
    )
    if not value:
        return default
    try:
        parsed = int(value)
    except ValueError:
        return default
    return parsed
|
||||
|
||||
|
||||
def _local_cosyvoice_fp16(device: str) -> bool:
    """Decide whether the local CosyVoice runtime should load in fp16.

    The mode comes from ``OPENTALKING_TTS_LOCAL_COSYVOICE_FP16`` (stripped),
    then the ``tts_local_cosyvoice_fp16`` setting, then ``"auto"``. ``auto``
    enables fp16 exactly when ``device`` starts with ``"cuda"``; otherwise only
    ``1``, ``true``, ``yes`` and ``on`` (case-insensitive) enable it.
    """
    mode = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_FP16", "").strip()
    if not mode:
        mode = _settings_value("tts_local_cosyvoice_fp16", "") or "auto"
    mode = mode.lower()
    if mode == "auto":
        return device.startswith("cuda")
    return mode in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _instantiate_cosyvoice_runtime(cls: Any, model_dir: str, kwargs: dict[str, Any]) -> Any:
    """Construct ``cls(model_dir, **kwargs)``, dropping kwargs it does not accept.

    On ``TypeError`` the offending optional keyword is removed and the
    constructor is retried. Only the tuning keywords listed in
    ``optional_keys`` may be dropped; any other ``TypeError`` is re-raised.

    Args:
        cls: Runtime class (or other callable) to instantiate.
        model_dir: Positional model directory argument.
        kwargs: Desired keyword arguments; the caller's dict is not mutated.

    Returns:
        The constructed runtime instance.

    Raises:
        TypeError: When construction fails for a reason that is not an
            unsupported optional keyword.
    """
    import re

    runtime_kwargs = dict(kwargs)
    optional_keys = ("load_vllm", "trt_concurrent", "load_jit", "load_trt", "fp16")
    while True:
        try:
            return cls(model_dir, **runtime_kwargs)
        except TypeError as exc:
            text = str(exc)
            # Prefer the explicitly quoted offender. Python 3.13+ may append
            # "Did you mean '<other kwarg>'?", so a plain substring scan could
            # drop a keyword the constructor actually supports.
            named = re.search(r"unexpected keyword argument '([^']+)'", text)
            if named is not None:
                offender = named.group(1)
                accepted = offender in optional_keys and offender in runtime_kwargs
                unsupported = offender if accepted else None
            else:
                # Unknown message shape: keep the original substring heuristic.
                unsupported = next(
                    (key for key in optional_keys if key in runtime_kwargs and key in text),
                    None,
                )
            if unsupported is None:
                raise
            runtime_kwargs.pop(unsupported)
|
||||
|
||||
|
||||
def _audio_format_from_content_type(content_type: str | None) -> str | None:
|
||||
value = (content_type or "").split(";", 1)[0].strip().lower()
|
||||
if value in {"audio/wav", "audio/wave", "audio/x-wav"}:
|
||||
@@ -253,6 +299,14 @@ class LocalCosyVoiceTTSAdapter:
|
||||
default_service_url,
|
||||
)
|
||||
self.device = _env_device()
|
||||
self.fp16 = _local_cosyvoice_fp16(self.device)
|
||||
self.load_jit = _local_cosyvoice_bool("LOAD_JIT", "tts_local_cosyvoice_load_jit", False)
|
||||
self.load_trt = _local_cosyvoice_bool("LOAD_TRT", "tts_local_cosyvoice_load_trt", False)
|
||||
self.load_vllm = _local_cosyvoice_bool("LOAD_VLLM", "tts_local_cosyvoice_load_vllm", False)
|
||||
self.trt_concurrent = max(
|
||||
1,
|
||||
_local_cosyvoice_int("TRT_CONCURRENT", "tts_local_cosyvoice_trt_concurrent", 1),
|
||||
)
|
||||
self._engine: Any | None = None
|
||||
|
||||
async def synthesize_stream(self, text: str, voice: str | None = None) -> AsyncIterator[AudioChunk]:
|
||||
@@ -350,9 +404,11 @@ class LocalCosyVoiceTTSAdapter:
|
||||
) from exc
|
||||
model_dir = self.model_dir or _resolve_model_path(self.model)
|
||||
kwargs: dict[str, Any] = {
|
||||
"load_jit": False,
|
||||
"load_trt": False,
|
||||
"fp16": self.device.startswith("cuda"),
|
||||
"load_jit": self.load_jit,
|
||||
"load_trt": self.load_trt,
|
||||
"load_vllm": self.load_vllm,
|
||||
"fp16": self.fp16,
|
||||
"trt_concurrent": self.trt_concurrent,
|
||||
}
|
||||
model_lower = self.model.lower()
|
||||
if "cosyvoice3" in model_lower:
|
||||
@@ -361,7 +417,7 @@ class LocalCosyVoiceTTSAdapter:
|
||||
cls = getattr(cosyvoice_module, "CosyVoice2")
|
||||
else:
|
||||
cls = getattr(cosyvoice_module, "CosyVoice")
|
||||
self._engine = cls(model_dir, **kwargs)
|
||||
self._engine = _instantiate_cosyvoice_runtime(cls, model_dir, kwargs)
|
||||
return self._engine
|
||||
|
||||
def _available_voice(self, engine: Any, requested: str) -> str:
|
||||
|
||||
@@ -4,8 +4,10 @@ import argparse
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from collections.abc import Iterator
|
||||
from importlib.metadata import PackageNotFoundError, version
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
@@ -27,6 +29,211 @@ class SynthesizeRequest(BaseModel):
|
||||
instruction: str | None = None
|
||||
|
||||
|
||||
def _cosyvoice_model(cosyvoice: Any) -> Any:
    """Return ``cosyvoice.model`` when present, otherwise ``cosyvoice`` itself."""
    try:
        return cosyvoice.model
    except AttributeError:
        return cosyvoice
|
||||
|
||||
|
||||
def _cosyvoice_llm(cosyvoice: Any) -> Any | None:
    """Return the ``llm`` attribute of the underlying model, or ``None`` if absent."""
    return getattr(_cosyvoice_model(cosyvoice), "llm", None)
|
||||
|
||||
|
||||
def _cosyvoice_flow(cosyvoice: Any) -> Any | None:
    """Return the ``flow`` attribute of the underlying model, or ``None`` if absent."""
    return getattr(_cosyvoice_model(cosyvoice), "flow", None)
|
||||
|
||||
|
||||
def current_streaming_tuning(cosyvoice: Any) -> dict[str, Any]:
    """Snapshot the streaming attributes currently set on the underlying model.

    Only ``token_hop_len``, ``token_max_hop_len`` and ``stream_scale_factor``
    are reported, and only those the model actually has.
    """
    target = _cosyvoice_model(cosyvoice)
    snapshot: dict[str, Any] = {}
    for name in ("token_hop_len", "token_max_hop_len", "stream_scale_factor"):
        if hasattr(target, name):
            snapshot[name] = getattr(target, name)
    return snapshot
|
||||
|
||||
|
||||
def apply_streaming_tuning(
    cosyvoice: Any,
    *,
    token_hop_len: int | None = None,
    token_max_hop_len: int | None = None,
    stream_scale_factor: int | None = None,
) -> dict[str, Any]:
    """Override streaming attributes on the underlying model.

    ``None`` leaves an attribute alone. A value whose attribute the model does
    not have is reported as ``"unsupported"`` instead of being set. The
    resulting snapshot is stored on the model as
    ``_opentalking_streaming_tuning``, which ``reset_streaming_tuning`` later
    restores from.

    Returns:
        A dict with the ``requested`` values, what was ``applied`` and the
        ``effective`` snapshot after the change.
    """
    target = _cosyvoice_model(cosyvoice)
    requested = {
        "token_hop_len": token_hop_len,
        "token_max_hop_len": token_max_hop_len,
        "stream_scale_factor": stream_scale_factor,
    }
    applied: dict[str, Any] = {}
    for name, wanted in requested.items():
        if wanted is None:
            continue
        if not hasattr(target, name):
            applied[name] = "unsupported"
            continue
        setattr(target, name, wanted)
        applied[name] = wanted
    effective = current_streaming_tuning(cosyvoice)
    target._opentalking_streaming_tuning = effective
    return {"requested": requested, "applied": applied, "effective": effective}
|
||||
|
||||
|
||||
def reset_streaming_tuning(cosyvoice: Any) -> dict[str, Any]:
    """Restore the model's streaming attributes to the recorded baseline.

    The baseline is the snapshot stored as ``_opentalking_streaming_tuning``;
    when none has been stored yet, the current values are captured and stored
    as the baseline first.

    Returns:
        The streaming snapshot after restoring.
    """
    target = _cosyvoice_model(cosyvoice)
    baseline = getattr(target, "_opentalking_streaming_tuning", None)
    if baseline is None:
        baseline = current_streaming_tuning(cosyvoice)
        target._opentalking_streaming_tuning = baseline
    for name, saved in baseline.items():
        if hasattr(target, name):
            setattr(target, name, saved)
    return current_streaming_tuning(cosyvoice)
|
||||
|
||||
|
||||
def _with_request_streaming_tuning(cosyvoice: Any, model_output: Iterator[Any]) -> Iterator[Any]:
    """Yield everything from ``model_output`` with streaming tuning reset around it.

    The streaming attributes are restored to their baseline before the first
    item is produced and again when iteration ends, including when it ends
    with an exception or the generator is closed. Because this is a generator
    function, nothing runs until the result is iterated.
    """
    reset_streaming_tuning(cosyvoice)
    try:
        yield from model_output
    finally:
        # Always restore the baseline, however iteration finished.
        reset_streaming_tuning(cosyvoice)
|
||||
|
||||
|
||||
def current_flow_tuning(cosyvoice: Any) -> dict[str, Any]:
    """Report the flow module's ``inference_n_timesteps``.

    Returns an empty dict when the model has no flow module; when the flow
    module lacks the attribute, ``10`` is reported.
    """
    flow = _cosyvoice_flow(cosyvoice)
    if flow is not None:
        return {"inference_n_timesteps": int(getattr(flow, "inference_n_timesteps", 10))}
    return {}
|
||||
|
||||
|
||||
def apply_flow_tuning(cosyvoice: Any, *, n_timesteps: int | None = None) -> dict[str, Any]:
    """Set the flow module's ``inference_n_timesteps`` (clamped to at least 1).

    ``None`` leaves the value untouched. Without a flow module the result
    reports ``applied`` as ``"unsupported"``.

    Returns:
        A dict with the ``requested`` value, what was ``applied`` and the
        ``effective`` flow tuning afterwards.
    """
    flow = _cosyvoice_flow(cosyvoice)
    requested = {"inference_n_timesteps": n_timesteps}
    if flow is None:
        return {"requested": requested, "applied": "unsupported", "effective": {}}
    applied: dict[str, Any] = {}
    if n_timesteps is not None:
        flow.inference_n_timesteps = max(1, int(n_timesteps))
        applied["inference_n_timesteps"] = flow.inference_n_timesteps
    return {"requested": requested, "applied": applied, "effective": current_flow_tuning(cosyvoice)}
|
||||
|
||||
|
||||
def current_llm_token_ratio_tuning(cosyvoice: Any) -> dict[str, float]:
    """Return a copy of the token/text ratios recorded on the LLM.

    The ratios are read from the LLM's ``_opentalking_token_ratios``
    attribute; an empty dict is returned when there is no LLM, no recorded
    ratios, or the attribute is not a dict.
    """
    llm = _cosyvoice_llm(cosyvoice)
    if llm is None:
        return {}
    recorded = getattr(llm, "_opentalking_token_ratios", {})
    if not isinstance(recorded, dict):
        return {}
    return dict(recorded)
|
||||
|
||||
|
||||
def apply_llm_token_ratio_patch(
    cosyvoice: Any,
    *,
    max_token_text_ratio: float | None = None,
    min_token_text_ratio: float | None = None,
) -> dict[str, Any]:
    """Wrap ``llm.inference`` so it receives default token/text ratio kwargs.

    The wrapper only fills in ``max_token_text_ratio`` / ``min_token_text_ratio``
    when the caller did not pass them as keywords. The unwrapped callable is
    kept on the LLM as ``_opentalking_original_inference`` so repeated patching
    never stacks wrappers, and the applied ratios are recorded as
    ``_opentalking_token_ratios``.

    Returns:
        A dict with the ``requested`` ratios, what was ``applied``
        (``"unsupported"`` when there is no LLM with an ``inference``
        attribute) and the ``effective`` recorded ratios.
    """
    requested = {
        "max_token_text_ratio": max_token_text_ratio,
        "min_token_text_ratio": min_token_text_ratio,
    }
    llm = _cosyvoice_llm(cosyvoice)
    if llm is None or not hasattr(llm, "inference"):
        return {"requested": requested, "applied": "unsupported", "effective": {}}
    if max_token_text_ratio is None and min_token_text_ratio is None:
        return {"requested": requested, "applied": {}, "effective": current_llm_token_ratio_tuning(cosyvoice)}

    # Reuse the saved unpatched callable so wrappers do not nest.
    original = getattr(llm, "_opentalking_original_inference", None)
    if original is None:
        original = llm.inference
        llm._opentalking_original_inference = original

    applied = {name: ratio for name, ratio in requested.items() if ratio is not None}
    # Frozen copy: later mutation of the recorded dict must not affect the wrapper.
    injected_defaults = tuple(applied.items())

    def inference_with_opentalking_ratios(*args: Any, **kwargs: Any) -> Any:
        for name, ratio in injected_defaults:
            kwargs.setdefault(name, ratio)
        return original(*args, **kwargs)

    llm.inference = inference_with_opentalking_ratios
    llm._opentalking_token_ratios = applied
    return {"requested": requested, "applied": applied, "effective": current_llm_token_ratio_tuning(cosyvoice)}
|
||||
|
||||
|
||||
def current_llm_stop_token_patch(cosyvoice: Any) -> dict[str, Any]:
    """Return a copy of the stop-token patch info recorded on the LLM.

    The info is read from the LLM's ``_opentalking_stop_token_patch``
    attribute; an empty dict is returned when there is no LLM, nothing was
    recorded, or the attribute is not a dict.
    """
    llm = _cosyvoice_llm(cosyvoice)
    if llm is None:
        return {}
    recorded = getattr(llm, "_opentalking_stop_token_patch", {})
    if not isinstance(recorded, dict):
        return {}
    return dict(recorded)
|
||||
|
||||
|
||||
def apply_llm_stop_token_patch(cosyvoice: Any) -> dict[str, Any]:
    """Replace ``llm.sampling_ids`` with a variant that masks all stop tokens.

    The patch is installed at most once per LLM and only when the LLM exposes
    ``sampling_ids``, ``sampling`` and more than one entry in
    ``stop_token_ids``.

    Returns:
        A dict with ``applied`` (``"unsupported"`` without an LLM or
        ``sampling_ids``, ``{}`` when nothing was changed, otherwise the stop
        token count) and the ``effective`` recorded patch info.
    """
    llm = _cosyvoice_llm(cosyvoice)
    if llm is None or not hasattr(llm, "sampling_ids"):
        return {"applied": "unsupported", "effective": {}}
    stop_token_ids = list(getattr(llm, "stop_token_ids", []) or [])
    # No patch with zero or one stop token, or without the ``sampling`` callable
    # that the wrapper delegates to.
    if len(stop_token_ids) <= 1 or not hasattr(llm, "sampling"):
        return {"applied": {}, "effective": current_llm_stop_token_patch(cosyvoice)}
    # Already patched: do not wrap the wrapper.
    if getattr(llm, "_opentalking_stop_token_patch_applied", False):
        return {"applied": {}, "effective": current_llm_stop_token_patch(cosyvoice)}

    original = llm.sampling_ids
    setattr(llm, "_opentalking_original_sampling_ids", original)

    def sampling_ids_with_opentalking_stop_mask(
        weighted_scores: Any,
        decoded_tokens: Any,
        sampling: Any,
        ignore_eos: bool = True,
    ) -> Any:
        if ignore_eos is True:
            # Work on a copy so the caller's scores are left untouched.
            # assumes weighted_scores is a 1-D tensor indexed by token id — TODO confirm
            masked_scores = weighted_scores.clone()
            valid_stop_ids = [idx for idx in stop_token_ids if 0 <= idx < len(masked_scores)]
            if valid_stop_ids:
                masked_scores[valid_stop_ids] = -float("inf")
            return llm.sampling(masked_scores, decoded_tokens, sampling)
        # Any other ignore_eos value keeps the unpatched behaviour.
        return original(weighted_scores, decoded_tokens, sampling, ignore_eos)

    llm.sampling_ids = sampling_ids_with_opentalking_stop_mask
    setattr(llm, "_opentalking_stop_token_patch_applied", True)
    setattr(llm, "_opentalking_stop_token_patch", {"stop_token_count": len(stop_token_ids)})
    return {"applied": {"stop_token_count": len(stop_token_ids)}, "effective": current_llm_stop_token_patch(cosyvoice)}
|
||||
|
||||
|
||||
def current_runtime_info(cosyvoice: Any) -> dict[str, Any]:
    """Describe the loaded runtime: the fp16 flag and the flow decoder estimator.

    ``flow_decoder_trt`` is true exactly when the estimator's class is named
    ``TrtContextWrapper``. Attributes missing anywhere along
    ``model.flow.decoder.estimator`` yield an empty estimator name.
    """
    flow = getattr(_cosyvoice_model(cosyvoice), "flow", None)
    estimator = getattr(getattr(flow, "decoder", None), "estimator", None)
    if estimator is None:
        estimator_type = ""
    else:
        estimator_type = estimator.__class__.__name__
    return {
        "fp16": bool(getattr(cosyvoice, "fp16", False)),
        "flow_decoder_estimator": estimator_type,
        "flow_decoder_trt": estimator_type == "TrtContextWrapper",
    }
|
||||
|
||||
|
||||
def runtime_package_versions(*packages: str) -> dict[str, str]:
    """Map each distribution name to its installed version.

    Distributions that are not installed map to ``"not-installed"``.
    """

    def _installed_version(name: str) -> str:
        try:
            return version(name)
        except PackageNotFoundError:
            return "not-installed"

    return {name: _installed_version(name) for name in packages}
|
||||
|
||||
|
||||
def _instantiate_automodel(cls: Any, kwargs: dict[str, Any]) -> tuple[Any, dict[str, Any]]:
    """Construct ``cls(**kwargs)``, dropping optional kwargs it does not accept.

    On ``TypeError`` the offending optional keyword is removed and the call is
    retried. Only the keywords in ``optional_keys`` may be dropped; any other
    ``TypeError`` is re-raised.

    Args:
        cls: Class (or other callable) to instantiate.
        kwargs: Desired keyword arguments; the caller's dict is not mutated.

    Returns:
        ``(instance, used_kwargs)`` where ``used_kwargs`` is the keyword set
        the successful call was made with.

    Raises:
        TypeError: When construction fails for a reason that is not an
            unsupported optional keyword.
    """
    import re

    model_kwargs = dict(kwargs)
    optional_keys = ("load_vllm", "load_jit", "trt_concurrent")
    while True:
        try:
            return cls(**model_kwargs), model_kwargs
        except TypeError as exc:
            text = str(exc)
            # Prefer the explicitly quoted offender. Python 3.13+ may append
            # "Did you mean '<other kwarg>'?", so a plain substring scan could
            # drop a keyword the constructor actually supports.
            named = re.search(r"unexpected keyword argument '([^']+)'", text)
            if named is not None:
                offender = named.group(1)
                accepted = offender in optional_keys and offender in model_kwargs
                unsupported = offender if accepted else None
            else:
                # Unknown message shape: keep the original substring heuristic.
                unsupported = next(
                    (key for key in optional_keys if key in model_kwargs and key in text),
                    None,
                )
            if unsupported is None:
                raise
            model_kwargs.pop(unsupported)
|
||||
|
||||
|
||||
class CosyVoiceService:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -39,6 +246,17 @@ class CosyVoiceService:
|
||||
mode: str,
|
||||
instruction: str,
|
||||
fp16: bool,
|
||||
load_jit: bool = False,
|
||||
load_trt: bool = False,
|
||||
load_vllm: bool = False,
|
||||
trt_concurrent: int = 1,
|
||||
token_hop_len: int | None = None,
|
||||
token_max_hop_len: int | None = None,
|
||||
stream_scale_factor: int | None = None,
|
||||
flow_n_timesteps: int | None = None,
|
||||
max_token_text_ratio: float | None = 6.0,
|
||||
min_token_text_ratio: float | None = None,
|
||||
mask_stop_tokens: bool = True,
|
||||
) -> None:
|
||||
self.model_dir = model_dir
|
||||
self.runtime_dir = runtime_dir
|
||||
@@ -48,7 +266,24 @@ class CosyVoiceService:
|
||||
self.mode = mode
|
||||
self.instruction = instruction
|
||||
self.fp16 = fp16
|
||||
self.load_jit = load_jit
|
||||
self.load_trt = load_trt
|
||||
self.load_vllm = load_vllm
|
||||
self.trt_concurrent = max(1, int(trt_concurrent or 1))
|
||||
self.token_hop_len = token_hop_len
|
||||
self.token_max_hop_len = token_max_hop_len
|
||||
self.stream_scale_factor = stream_scale_factor
|
||||
self.flow_n_timesteps = flow_n_timesteps
|
||||
self.max_token_text_ratio = max_token_text_ratio
|
||||
self.min_token_text_ratio = min_token_text_ratio
|
||||
self.mask_stop_tokens = mask_stop_tokens
|
||||
self._model: Any | None = None
|
||||
self._model_lock = threading.Lock()
|
||||
self._loaded_model_kwargs: dict[str, Any] = {}
|
||||
self._streaming_tuning: dict[str, Any] = {}
|
||||
self._flow_tuning: dict[str, Any] = {}
|
||||
self._llm_token_ratio_tuning: dict[str, Any] = {}
|
||||
self._llm_stop_token_patch: dict[str, Any] = {}
|
||||
|
||||
def model(self) -> Any:
|
||||
if self._model is not None:
|
||||
@@ -76,24 +311,85 @@ class CosyVoiceService:
|
||||
t0 = time.perf_counter()
|
||||
model_kwargs = {
|
||||
"model_dir": self.model_dir,
|
||||
"load_trt": False,
|
||||
"load_vllm": False,
|
||||
"load_jit": self.load_jit,
|
||||
"load_trt": self.load_trt,
|
||||
"load_vllm": self.load_vllm,
|
||||
"fp16": self.fp16,
|
||||
"trt_concurrent": self.trt_concurrent,
|
||||
}
|
||||
try:
|
||||
self._model = AutoModel(**model_kwargs)
|
||||
except TypeError as exc:
|
||||
if "load_vllm" not in str(exc):
|
||||
raise
|
||||
model_kwargs.pop("load_vllm")
|
||||
self._model = AutoModel(**model_kwargs)
|
||||
self._model, self._loaded_model_kwargs = _instantiate_automodel(AutoModel, model_kwargs)
|
||||
self._apply_runtime_tuning()
|
||||
# Keep the service zero-shot first so it does not require precomputed spk2info.pt.
|
||||
print(
|
||||
f"loaded cosyvoice model={self.model_dir} runtime={runtime} device={self.device} seconds={time.perf_counter() - t0:.3f}",
|
||||
"loaded cosyvoice "
|
||||
f"model={self.model_dir} runtime={runtime} device={self.device} "
|
||||
f"fp16={self.fp16} load_jit={self.load_jit} load_trt={self.load_trt} "
|
||||
f"load_vllm={self.load_vllm} trt_concurrent={self.trt_concurrent} "
|
||||
f"seconds={time.perf_counter() - t0:.3f}",
|
||||
flush=True,
|
||||
)
|
||||
return self._model
|
||||
|
||||
def _apply_runtime_tuning(self) -> None:
    """Apply the configured tuning to the loaded model and log the outcome.

    Does nothing while no model is loaded. The results of the streaming, flow,
    token-ratio and stop-token steps are kept on the instance and printed in a
    single summary line.
    """
    model = self._model
    if model is None:
        return
    self._streaming_tuning = apply_streaming_tuning(
        model,
        token_hop_len=self.token_hop_len,
        token_max_hop_len=self.token_max_hop_len,
        stream_scale_factor=self.stream_scale_factor,
    )
    self._flow_tuning = apply_flow_tuning(model, n_timesteps=self.flow_n_timesteps)
    self._llm_token_ratio_tuning = apply_llm_token_ratio_patch(
        model,
        max_token_text_ratio=self.max_token_text_ratio,
        min_token_text_ratio=self.min_token_text_ratio,
    )
    if self.mask_stop_tokens:
        self._llm_stop_token_patch = apply_llm_stop_token_patch(model)
    else:
        # Masking disabled: only report whatever patch info is already recorded.
        self._llm_stop_token_patch = {
            "applied": {},
            "effective": current_llm_stop_token_patch(model),
        }
    print(
        "cosyvoice tuning "
        f"streaming={self._streaming_tuning} flow={self._flow_tuning} "
        f"llm_token_ratio={self._llm_token_ratio_tuning} "
        f"llm_stop_token_patch={self._llm_stop_token_patch}",
        flush=True,
    )
|
||||
|
||||
def health_payload(self) -> dict[str, Any]:
    """Build the dictionary returned by the service health endpoint.

    Static configuration is always reported. For the tuning sections the
    live values are read from the model when one is loaded; otherwise the
    values last stored on the service are reported, and ``runtime`` is empty.
    """
    engine = self._model

    payload: dict[str, Any] = {
        "status": "ok",
        "model_dir": self.model_dir,
        "runtime_dir": self.runtime_dir,
        "device": self.device,
        "loaded": engine is not None,
        "mode": self.mode,
        "runtime_flags": {
            "fp16": self.fp16,
            "load_jit": self.load_jit,
            "load_trt": self.load_trt,
            "load_vllm": self.load_vllm,
            "trt_concurrent": self.trt_concurrent,
            "loaded_model_kwargs": self._loaded_model_kwargs,
        },
    }

    # Keys are inserted in the same order as the response has always used.
    if engine is not None:
        payload["streaming"] = current_streaming_tuning(engine)
        payload["flow"] = current_flow_tuning(engine)
        payload["llm_token_ratio"] = current_llm_token_ratio_tuning(engine)
        payload["llm_stop_token_patch"] = current_llm_stop_token_patch(engine)
        payload["runtime"] = current_runtime_info(engine)
    else:
        payload["streaming"] = self._streaming_tuning
        payload["flow"] = self._flow_tuning
        payload["llm_token_ratio"] = self._llm_token_ratio_tuning
        payload["llm_stop_token_patch"] = self._llm_stop_token_patch
        payload["runtime"] = {}

    payload["runtime_packages"] = runtime_package_versions(
        "transformers",
        "tokenizers",
        "torch",
        "torchaudio",
        "numpy",
        "onnxruntime",
    )
    return payload
|
||||
|
||||
def _to_wav_bytes(self, speech: Any, sample_rate: int) -> bytes:
|
||||
if hasattr(speech, "detach"):
|
||||
speech = speech.detach().cpu().numpy()
|
||||
@@ -162,17 +458,18 @@ class CosyVoiceService:
|
||||
stream=False,
|
||||
)
|
||||
parts: list[np.ndarray] = []
|
||||
for item in iterator:
|
||||
speech = item.get("tts_speech") if isinstance(item, dict) else item
|
||||
if hasattr(speech, "detach"):
|
||||
speech = speech.detach().cpu().numpy()
|
||||
parts.append(np.asarray(speech, dtype=np.float32).reshape(-1))
|
||||
with self._model_lock:
|
||||
for item in _with_request_streaming_tuning(model, iterator):
|
||||
speech = item.get("tts_speech") if isinstance(item, dict) else item
|
||||
if hasattr(speech, "detach"):
|
||||
speech = speech.detach().cpu().numpy()
|
||||
parts.append(np.asarray(speech, dtype=np.float32).reshape(-1))
|
||||
if not parts:
|
||||
raise HTTPException(status_code=502, detail="CosyVoice returned no audio")
|
||||
wav_bytes = self._to_wav_bytes(np.concatenate(parts), sample_rate)
|
||||
return wav_bytes, sample_rate, time.perf_counter() - t0
|
||||
|
||||
def _streaming_iterator(self, req: SynthesizeRequest) -> tuple[Iterator[Any], int, int, float]:
|
||||
def _streaming_iterator(self, req: SynthesizeRequest) -> tuple[Iterator[Any], int, int, float, Any]:
|
||||
text = req.text.strip()
|
||||
if not text:
|
||||
raise HTTPException(status_code=400, detail="text is required")
|
||||
@@ -204,30 +501,32 @@ class CosyVoiceService:
|
||||
prompt_audio,
|
||||
stream=True,
|
||||
)
|
||||
return iterator, source_sr, target_sr, t0
|
||||
return iterator, source_sr, target_sr, t0, model
|
||||
|
||||
def synthesize_pcm_stream(self, req: SynthesizeRequest) -> tuple[Iterator[bytes], int]:
|
||||
iterator, source_sr, target_sr, t0 = self._streaming_iterator(req)
|
||||
iterator, source_sr, target_sr, t0, model = self._streaming_iterator(req)
|
||||
|
||||
def generate() -> Iterator[bytes]:
|
||||
first = True
|
||||
chunks = 0
|
||||
samples = 0
|
||||
for item in iterator:
|
||||
speech = item.get("tts_speech") if isinstance(item, dict) else item
|
||||
pcm = self._audio_to_i16(speech)
|
||||
pcm = self._resample_linear(pcm, source_sr, target_sr)
|
||||
if pcm.size == 0:
|
||||
continue
|
||||
if first:
|
||||
print(
|
||||
f"first_pcm chars={len(req.text.strip())} sr={target_sr} seconds={time.perf_counter() - t0:.3f}",
|
||||
flush=True,
|
||||
)
|
||||
first = False
|
||||
chunks += 1
|
||||
samples += int(pcm.size)
|
||||
yield pcm.astype("<i2", copy=False).tobytes()
|
||||
with self._model_lock:
|
||||
tuned_iterator = _with_request_streaming_tuning(model, iterator)
|
||||
for item in tuned_iterator:
|
||||
speech = item.get("tts_speech") if isinstance(item, dict) else item
|
||||
pcm = self._audio_to_i16(speech)
|
||||
pcm = self._resample_linear(pcm, source_sr, target_sr)
|
||||
if pcm.size == 0:
|
||||
continue
|
||||
if first:
|
||||
print(
|
||||
f"first_pcm chars={len(req.text.strip())} sr={target_sr} seconds={time.perf_counter() - t0:.3f}",
|
||||
flush=True,
|
||||
)
|
||||
first = False
|
||||
chunks += 1
|
||||
samples += int(pcm.size)
|
||||
yield pcm.astype("<i2", copy=False).tobytes()
|
||||
if chunks == 0:
|
||||
raise RuntimeError("CosyVoice returned no audio")
|
||||
print(
|
||||
@@ -253,14 +552,7 @@ def create_app(service: CosyVoiceService) -> FastAPI:
|
||||
|
||||
@app.get("/health")
|
||||
def health() -> dict[str, Any]:
|
||||
return {
|
||||
"status": "ok",
|
||||
"model_dir": service.model_dir,
|
||||
"runtime_dir": service.runtime_dir,
|
||||
"device": service.device,
|
||||
"loaded": service._model is not None,
|
||||
"mode": service.mode,
|
||||
}
|
||||
return service.health_payload()
|
||||
|
||||
@app.post("/synthesize")
|
||||
def synthesize(req: SynthesizeRequest) -> StreamingResponse:
|
||||
@@ -286,6 +578,29 @@ def _local_audio_root() -> Path:
|
||||
return Path(os.environ.get("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "./models/local-audio")).expanduser()
|
||||
|
||||
|
||||
def _env_bool(name: str, default: bool = False) -> bool:
|
||||
raw = os.environ.get(name, "").strip().lower()
|
||||
if not raw:
|
||||
return default
|
||||
return raw in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _env_optional_int(name: str) -> int | None:
|
||||
raw = os.environ.get(name, "").strip()
|
||||
if not raw:
|
||||
return None
|
||||
value = int(raw)
|
||||
return value if value > 0 else None
|
||||
|
||||
|
||||
def _env_optional_float(name: str, default: float | None = None) -> float | None:
|
||||
raw = os.environ.get(name, "").strip()
|
||||
if not raw:
|
||||
return default
|
||||
value = float(raw)
|
||||
return value if value > 0 else None
|
||||
|
||||
|
||||
def build_service_from_env() -> CosyVoiceService:
|
||||
device = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE", "cuda:0")
|
||||
fp16_raw = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_FP16", "auto").strip().lower()
|
||||
@@ -309,6 +624,17 @@ def build_service_from_env() -> CosyVoiceService:
|
||||
"You are a helpful assistant.<|endofprompt|>",
|
||||
),
|
||||
fp16=fp16,
|
||||
load_jit=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT", False),
|
||||
load_trt=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT", False),
|
||||
load_vllm=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM", False),
|
||||
trt_concurrent=int(os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT", "1") or "1"),
|
||||
token_hop_len=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN"),
|
||||
token_max_hop_len=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN"),
|
||||
stream_scale_factor=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR"),
|
||||
flow_n_timesteps=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS"),
|
||||
max_token_text_ratio=_env_optional_float("OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO", 6.0),
|
||||
min_token_text_ratio=_env_optional_float("OPENTALKING_TTS_LOCAL_COSYVOICE_MIN_TOKEN_TEXT_RATIO"),
|
||||
mask_stop_tokens=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS", True),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -43,6 +43,25 @@
|
||||
# OMNIRT_ENDPOINT=http://127.0.0.1:9000
|
||||
# OMNIRT_AUDIO2VIDEO_PATH_TEMPLATE=/v1/audio2video/{model}
|
||||
|
||||
# Local CosyVoice3 sidecar. Keep TensorRT off until the CosyVoice runtime has
|
||||
# built/loaded compatible TRT engines for this GPU and model directory.
|
||||
# OPENTALKING_TTS_DEFAULT_PROVIDER=local_cosyvoice
|
||||
# OPENTALKING_TTS_ENABLED_PROVIDERS=local_cosyvoice,dashscope,edge
|
||||
# OPENTALKING_LOCAL_AUDIO_MODEL_ROOT=$DIGITAL_HUMAN_HOME/models/local-audio
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL=FunAudioLLM/Fun-CosyVoice3-0.5B-2512
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/FunAudioLLM__Fun-CosyVoice3-0.5B-2512
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/CosyVoice
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
|
||||
# OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
|
||||
|
||||
# OmniRT helper defaults.
|
||||
# OMNIRT_PORT=9000
|
||||
# OMNIRT_HOST=0.0.0.0
|
||||
|
||||
@@ -962,6 +962,75 @@ def test_local_cosyvoice3_uses_automodel(monkeypatch):
|
||||
assert loaded["model_dir"] == "/models/FunAudioLLM/Fun-CosyVoice3-0.5B-2512"
|
||||
|
||||
|
||||
def test_local_cosyvoice_in_process_reads_runtime_flags(monkeypatch):
    """The in-process adapter forwards the env runtime flags to AutoModel."""
    from opentalking.providers.tts.local_cosyvoice import adapter as cosy_adapter

    captured: dict[str, object] = {}

    class FakeAutoModel:
        # Records how the adapter constructs the engine.
        def __init__(self, model_dir, **kwargs):
            captured["model_dir"] = model_dir
            captured["kwargs"] = kwargs

    fake_module = SimpleNamespace(AutoModel=FakeAutoModel)
    monkeypatch.setitem(sys.modules, "cosyvoice.cli.cosyvoice", fake_module)

    environment = {
        "OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR": "/models/cosyvoice3",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE": "cuda:0",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_FP16": "auto",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM": "0",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT": "2",
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    adapter = cosy_adapter.LocalCosyVoiceTTSAdapter(
        model="FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
    )
    engine = adapter._load_engine()

    assert isinstance(engine, FakeAutoModel)
    assert captured["model_dir"] == "/models/cosyvoice3"
    expected_kwargs = {
        "load_jit": True,
        "load_trt": True,
        "load_vllm": False,
        "fp16": True,
        "trt_concurrent": 2,
    }
    assert captured["kwargs"] == expected_kwargs
|
||||
|
||||
|
||||
def test_local_cosyvoice_provider_config_reports_runtime_flags(monkeypatch):
    """The provider config for local_cosyvoice reflects the env runtime flags."""
    from opentalking.providers.tts.factory import tts_provider_config

    environment = {
        "OPENTALKING_LOCAL_AUDIO_MODEL_ROOT": "/tmp/opentalking-local-audio",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL": "http://127.0.0.1:19090/synthesize",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE": "cuda:0",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_FP16": "auto",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT": "2",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN": "8",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN": "16",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR": "1",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS": "4",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO": "6",
        "OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS": "1",
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    config = tts_provider_config("local_cosyvoice")

    # Boolean flags are checked by identity, everything else by equality.
    expectations = [
        ("service_url", "http://127.0.0.1:19090/synthesize"),
        ("device", "cuda:0"),
        ("fp16", "auto"),
        ("load_trt", True),
        ("trt_concurrent", 2),
        ("token_hop_len", 8),
        ("token_max_hop_len", 16),
        ("stream_scale_factor", 1),
        ("flow_n_timesteps", 4),
        ("max_token_text_ratio", 6.0),
        ("mask_stop_tokens", True),
    ]
    for key, expected in expectations:
        if expected is True:
            assert config[key] is True
        else:
            assert config[key] == expected
|
||||
|
||||
|
||||
def test_local_tts_adapters_read_settings_when_env_is_absent(monkeypatch):
|
||||
from opentalking.core import config as core_config
|
||||
from opentalking.providers.tts.local_cosyvoice.adapter import LocalCosyVoiceTTSAdapter
|
||||
@@ -1456,6 +1525,153 @@ def test_cosyvoice_service_request_prompt_overrides_default(monkeypatch):
|
||||
assert str(seen["prompt_text"]).endswith("这是本地复刻音色文本。")
|
||||
|
||||
|
||||
def test_cosyvoice_service_applies_validated_runtime_tuning(monkeypatch):
    """Loading the service applies the tuning values to a fake CosyVoice model.

    Checks the kwargs handed to AutoModel, the sections reported by
    ``health_payload`` and the behaviour of the patched LLM methods.
    """
    from scripts import local_cosyvoice_service as service_module

    automodel_kwargs: dict[str, object] = {}

    class FakeScores:
        # Minimal stand-in for a score tensor: supports len, item/list
        # assignment and clone.
        def __init__(self, values):
            self.values = list(values)

        def __len__(self):
            return len(self.values)

        def __setitem__(self, key, value):
            positions = key if isinstance(key, list) else [key]
            for position in positions:
                self.values[position] = value

        def clone(self):
            return FakeScores(self.values)

    class FakeLLM:
        def __init__(self):
            self.calls = []
            self.sampled_scores = []
            self.stop_token_ids = [3, 4, 5]

        def inference(self, **kwargs):
            self.calls.append(kwargs)
            yield "token"

        def sampling_ids(self, weighted_scores, decoded_tokens, sampling, ignore_eos=True):
            return "original"

        def sampling(self, weighted_scores, decoded_tokens, sampling):
            self.sampled_scores = list(weighted_scores.values)
            return "sampled"

    class FakeModel:
        token_hop_len = 25
        token_max_hop_len = 100
        stream_scale_factor = 2

        def __init__(self):
            self.llm = FakeLLM()
            self.flow = SimpleNamespace(inference_n_timesteps=10)

    class FakeAutoModel:
        fp16 = True
        sample_rate = 24000

        def __init__(self, **kwargs):
            automodel_kwargs.update(kwargs)
            self.model = FakeModel()

    fake_module = SimpleNamespace(AutoModel=FakeAutoModel)
    monkeypatch.setitem(sys.modules, "cosyvoice.cli.cosyvoice", fake_module)

    service = service_module.CosyVoiceService(
        model_dir="/tmp/model",
        runtime_dir="/tmp/runtime",
        device="cpu",
        prompt_audio="prompt.wav",
        prompt_text="参考文本",
        mode="zero_shot",
        instruction="",
        fp16=True,
        load_trt=True,
        trt_concurrent=2,
        token_hop_len=8,
        token_max_hop_len=16,
        stream_scale_factor=1,
        flow_n_timesteps=4,
        max_token_text_ratio=6.0,
        min_token_text_ratio=1.0,
        mask_stop_tokens=True,
    )
    engine = service.model()

    assert automodel_kwargs["load_trt"] is True
    assert automodel_kwargs["trt_concurrent"] == 2

    expected_health = {
        "streaming": {
            "token_hop_len": 8,
            "token_max_hop_len": 16,
            "stream_scale_factor": 1,
        },
        "flow": {"inference_n_timesteps": 4},
        "llm_token_ratio": {
            "max_token_text_ratio": 6.0,
            "min_token_text_ratio": 1.0,
        },
        "llm_stop_token_patch": {"stop_token_count": 3},
    }
    for section, expected in expected_health.items():
        assert service.health_payload()[section] == expected

    llm = engine.model.llm
    assert list(llm.inference(text="你好")) == ["token"]
    last_call = llm.calls[-1]
    assert last_call["max_token_text_ratio"] == 6.0
    assert last_call["min_token_text_ratio"] == 1.0

    scores = FakeScores([0.0] * 8)
    assert llm.sampling_ids(scores, [], 25, ignore_eos=True) == "sampled"
    # The caller's scores stay untouched; only the sampled copy is masked.
    assert scores.values == [0.0] * 8
    assert llm.sampled_scores[3:6] == [-float("inf")] * 3
    assert llm.sampling_ids(scores, [], 25, ignore_eos=False) == "original"
|
||||
|
||||
|
||||
def test_cosyvoice_service_resets_streaming_tuning_per_request(monkeypatch):
    """Streaming attributes changed during a request are restored afterwards."""
    from scripts import local_cosyvoice_service as service_module

    class FakeEngine:
        sample_rate = 16000
        token_hop_len = 8
        token_max_hop_len = 16
        stream_scale_factor = 1

        def inference_zero_shot(self, text, prompt_text, prompt_audio, stream=True):
            # Simulate the engine drifting away from the tuned value mid-request.
            self.token_hop_len = 99
            yield {"tts_speech": np.zeros(160, dtype=np.float32)}

    tuning = {
        "token_hop_len": 8,
        "token_max_hop_len": 16,
        "stream_scale_factor": 1,
    }
    engine = FakeEngine()
    service_module.apply_streaming_tuning(engine, **tuning)

    service = service_module.CosyVoiceService(
        model_dir="model",
        runtime_dir="runtime",
        device="cpu",
        prompt_audio="prompt.wav",
        prompt_text="参考文本",
        mode="zero_shot",
        instruction="",
        fp16=False,
    )
    monkeypatch.setattr(service, "model", lambda: engine)

    request = service_module.SynthesizeRequest(text="你好")
    stream, _sr = service.synthesize_pcm_stream(request)
    pcm = b"".join(stream)
    assert pcm

    for attribute, expected in tuning.items():
        assert getattr(engine, attribute) == expected
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
("module_name", "class_name", "service_env"),
|
||||
|
||||
Reference in New Issue
Block a user