feat: add local CosyVoice runtime tuning

2026-07-03 15:22:34 +08:00 · 2026-06-23 17:59:10 +08:00
parent 57b3e48718
commit 61f4007965
11 changed files with 865 additions and 47 deletions
--- a/apps/api/routes/runtime_config.py
+++ b/apps/api/routes/runtime_config.py
@@ -61,6 +61,21 @@ _RUNTIME_ENV_KEYS = {
    "OPENTALKING_TTS_SAMBERT_MODEL",
    "OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL",
    "OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_FP16",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_MIN_TOKEN_TEXT_RATIO",
+    "OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS",
    "OPENTALKING_TTS_LOCAL_INDEXTTS_SERVICE_URL",
    "OPENTALKING_TTS_LOCAL_INDEXTTS_MODEL",
    "OPENTALKING_TTS_OMNIRT_INDEXTTS_SERVICE_URL",
--- a/apps/api/tests/test_config.py
+++ b/apps/api/tests/test_config.py
@@ -127,6 +127,30 @@ def test_agent_lightrag_chunk_fallback_can_be_enabled(monkeypatch: pytest.Monkey
    assert settings.agent_lightrag_chunk_fallback_enabled is True


+def test_local_cosyvoice_runtime_settings_read_prefixed_env(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Each local CosyVoice tuning field is read from its ``OPENTALKING_``-prefixed env var.
+
+    Every exported value differs from the default declared on ``Settings``, so
+    an assertion can only pass when the environment variable was really
+    consumed instead of the default being returned.
+    """
+    prefix = "OPENTALKING_TTS_LOCAL_COSYVOICE_"
+    # env suffix -> (raw text exported, value expected after parsing)
+    cases: dict[str, tuple[str, object]] = {
+        "FP16": ("true", "true"),
+        "LOAD_JIT": ("true", True),
+        "LOAD_TRT": ("true", True),
+        "LOAD_VLLM": ("true", True),
+        "TRT_CONCURRENT": ("2", 2),
+        "TOKEN_HOP_LEN": ("8", 8),
+        "TOKEN_MAX_HOP_LEN": ("16", 16),
+        "STREAM_SCALE_FACTOR": ("1", 1),
+        "FLOW_N_TIMESTEPS": ("4", 4),
+        "MAX_TOKEN_TEXT_RATIO": ("4.5", 4.5),
+        "MIN_TOKEN_TEXT_RATIO": ("0.5", 0.5),
+        "MASK_STOP_TOKENS": ("false", False),
+    }
+    for suffix, (raw, _expected) in cases.items():
+        monkeypatch.setenv(prefix + suffix, raw)
+
+    settings = Settings(_env_file=None)
+
+    for suffix, (_raw, expected) in cases.items():
+        actual = getattr(settings, f"tts_local_cosyvoice_{suffix.lower()}")
+        # Check the type too, so a bool/int field left as raw text cannot slip through.
+        assert type(actual) is type(expected), suffix
+        assert actual == expected, suffix
+
+
 def _active_env_names(contents: str) -> set[str]:
    names: set[str] = set()
    for raw_line in contents.splitlines():
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -44,6 +44,16 @@ tts:
  voice: zh-CN-XiaoxiaoNeural
  sample_rate: 16000
  streaming_decode: true
+  local_cosyvoice_model: FunAudioLLM/Fun-CosyVoice3-0.5B-2512
+  local_cosyvoice_device: auto
+  local_cosyvoice_fp16: auto
+  local_cosyvoice_load_trt: false
+  local_cosyvoice_load_jit: false
+  local_cosyvoice_load_vllm: false
+  local_cosyvoice_trt_concurrent: 1
+  local_cosyvoice_max_token_text_ratio: 6.0
+  local_cosyvoice_min_token_text_ratio: 0.0
+  local_cosyvoice_mask_stop_tokens: true
 memory:
  provider: mem0
  enabled: false
--- a/docs/en/model-deployment/tts.md
+++ b/docs/en/model-deployment/tts.md
@@ -47,6 +47,10 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=./models/local-audio/FunAudioLLM__Fun-
 OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=./models/local-audio/runtime/CosyVoice
 OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
 OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
+OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
+OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
+OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
+OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
 ```

 Download local audio weights:
@@ -78,6 +82,24 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_PRELOAD=1 \
 python scripts/local_cosyvoice_service.py --host 127.0.0.1 --port 19090
 ```

+In prior GPU validation, the main CosyVoice3 issue was not a single time-to-first-audio (TTFA) number but seed-dependent output-length drift. The local CosyVoice service therefore keeps two stability guards on by default: `OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1` masks every stop token exposed by the CosyVoice LLM, and `OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6` bounds the token/text ratio so that a long prompt cannot occasionally produce runaway audio. Keep these guards enabled for realtime use.
+
+TensorRT is optional. Enable it only after the current CosyVoice runtime, CUDA, onnxruntime-gpu/TensorRT engines, and model directory are compatible:
+
+```env title=".env"
+OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=1
+OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
+OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
+OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
+OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
+```
+
+After startup, check the sidecar health payload and verify `runtime_flags.load_trt`, `streaming`, `llm_token_ratio`, and `llm_stop_token_patch`:
+
+```bash title="Terminal"
+curl -fsS http://127.0.0.1:19090/health | python3 -m json.tool
+```
+
 For the full local speech input, speech synthesis, and QuickTalk video chain, see [Local STT/TTS + QuickTalk](recipes/local-quicktalk-audio.md).

 ## IndexTTS Deployment (provider = indextts)
--- a/docs/zh/model-deployment/tts.md
+++ b/docs/zh/model-deployment/tts.md
@@ -47,6 +47,10 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=./models/local-audio/FunAudioLLM__Fun-
 OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=./models/local-audio/runtime/CosyVoice
 OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
 OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
+OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
+OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
+OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
+OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
 ```

 下载本地音频权重：
@@ -78,6 +82,24 @@ OPENTALKING_TTS_LOCAL_COSYVOICE_PRELOAD=1 \
 python scripts/local_cosyvoice_service.py --host 127.0.0.1 --port 19090
 ```

+在既有 GPU 验证中，CosyVoice3 的关键问题不是单次 TTFA，而是随机种子导致的生成长度漂移。OpenTalking 的本地 CosyVoice service 因此默认保留两类稳定性保护：`OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1` 会屏蔽 CosyVoice LLM 暴露的全部 stop token，`OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6` 会限制 token/text 比例，避免长文本偶发生成过长音频。不要为了追求更快首包把这两个保护关掉。
+
+TensorRT 是可选加速。只有当当前 CosyVoice runtime、CUDA、onnxruntime-gpu/TensorRT engine 与模型目录匹配时再开启：
+
+```env title=".env"
+OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=1
+OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
+OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
+OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
+OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
+```
+
+启动后先检查 sidecar 健康信息，确认 `runtime_flags.load_trt`、`streaming`、`llm_token_ratio` 和 `llm_stop_token_patch` 符合预期：
+
+```bash title="终端"
+curl -fsS http://127.0.0.1:19090/health | python3 -m json.tool
+```
+
 完整本地语音输入、语音合成和 QuickTalk 视频链路见 [本地 STT/TTS + QuickTalk](recipes/local-quicktalk-audio.md)。

 ## IndexTTS 部署（provider = indextts）
--- a/opentalking/core/config.py
+++ b/opentalking/core/config.py
@@ -137,6 +137,18 @@ def _flatten_config(raw: dict[str, Any] | None) -> dict[str, Any]:
            "local_cosyvoice_service_url": "tts_local_cosyvoice_service_url",
            "local_cosyvoice_service_urls": "tts_local_cosyvoice_service_urls",
            "local_cosyvoice_device": "tts_local_cosyvoice_device",
+            "local_cosyvoice_fp16": "tts_local_cosyvoice_fp16",
+            "local_cosyvoice_load_jit": "tts_local_cosyvoice_load_jit",
+            "local_cosyvoice_load_trt": "tts_local_cosyvoice_load_trt",
+            "local_cosyvoice_load_vllm": "tts_local_cosyvoice_load_vllm",
+            "local_cosyvoice_trt_concurrent": "tts_local_cosyvoice_trt_concurrent",
+            "local_cosyvoice_token_hop_len": "tts_local_cosyvoice_token_hop_len",
+            "local_cosyvoice_token_max_hop_len": "tts_local_cosyvoice_token_max_hop_len",
+            "local_cosyvoice_stream_scale_factor": "tts_local_cosyvoice_stream_scale_factor",
+            "local_cosyvoice_flow_n_timesteps": "tts_local_cosyvoice_flow_n_timesteps",
+            "local_cosyvoice_max_token_text_ratio": "tts_local_cosyvoice_max_token_text_ratio",
+            "local_cosyvoice_min_token_text_ratio": "tts_local_cosyvoice_min_token_text_ratio",
+            "local_cosyvoice_mask_stop_tokens": "tts_local_cosyvoice_mask_stop_tokens",
            "local_indextts_model": "tts_local_indextts_model",
            "local_indextts_model_dir": "tts_local_indextts_model_dir",
            "local_indextts_cfg_path": "tts_local_indextts_cfg_path",
@@ -491,6 +503,18 @@ class Settings(BaseSettings):
    tts_local_cosyvoice_service_url: str = ""
    tts_local_cosyvoice_service_urls: str = ""
    tts_local_cosyvoice_device: str = "auto"
+    tts_local_cosyvoice_fp16: str = "auto"
+    tts_local_cosyvoice_load_jit: bool = False
+    tts_local_cosyvoice_load_trt: bool = False
+    tts_local_cosyvoice_load_vllm: bool = False
+    tts_local_cosyvoice_trt_concurrent: int = 1
+    tts_local_cosyvoice_token_hop_len: int = 0
+    tts_local_cosyvoice_token_max_hop_len: int = 0
+    tts_local_cosyvoice_stream_scale_factor: int = 0
+    tts_local_cosyvoice_flow_n_timesteps: int = 0
+    tts_local_cosyvoice_max_token_text_ratio: float = 6.0
+    tts_local_cosyvoice_min_token_text_ratio: float = 0.0
+    tts_local_cosyvoice_mask_stop_tokens: bool = True
    tts_local_indextts_model: str = "IndexTeam/IndexTTS-2"
    tts_local_indextts_model_dir: str = ""
    tts_local_indextts_cfg_path: str = ""
--- a/opentalking/providers/tts/factory.py
+++ b/opentalking/providers/tts/factory.py
@@ -146,6 +146,41 @@ def _local_cosyvoice_device() -> str:
    )


+def _local_cosyvoice_bool(field: str, settings_name: str, default: bool) -> bool:
+    """Resolve a boolean local-CosyVoice option.
+
+    The ``_provider_env`` value for ``field`` wins; when it is empty the
+    ``settings_name`` value is used. ``default`` is returned when neither
+    yields non-blank text. Only ``1``/``true``/``yes``/``on`` (any case) count
+    as true; any other non-blank text is false.
+    """
+    raw = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
+    if isinstance(raw, bool):
+        # NOTE(review): covers a settings lookup that hands back the typed field
+        # rather than text; ``_settings_value`` is defined outside this hunk.
+        return raw
+    # Coerce before stripping (as the int/float helpers do) so a non-str value
+    # cannot raise AttributeError, and treat blank text as "not configured".
+    text = str(raw).strip() if raw else ""
+    if not text:
+        return default
+    return text.lower() in {"1", "true", "yes", "on"}
+
+
+def _local_cosyvoice_int(field: str, settings_name: str, default: int) -> int:
+    """Resolve an integer local-CosyVoice option, falling back to ``default``.
+
+    ``default`` is returned when neither source supplies a truthy value or
+    when the supplied text is not a valid integer.
+    """
+    candidate = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
+    if candidate:
+        try:
+            return int(str(candidate).strip())
+        except ValueError:
+            # Unparseable text is treated the same as "not configured".
+            pass
+    return default
+
+
+def _local_cosyvoice_float(field: str, settings_name: str, default: float) -> float:
+    """Resolve a floating-point local-CosyVoice option, falling back to ``default``.
+
+    ``default`` is returned when neither source supplies a truthy value or
+    when the supplied text cannot be parsed as a float.
+    """
+    candidate = _provider_env("local_cosyvoice", field) or _settings_value(settings_name, "")
+    if candidate:
+        try:
+            return float(str(candidate).strip())
+        except ValueError:
+            # Unparseable text is treated the same as "not configured".
+            pass
+    return default
+
+
+def _local_cosyvoice_fp16() -> str:
+    """Return the configured fp16 mode as raw text, or ``"auto"`` when nothing is set."""
+    mode = _provider_env("local_cosyvoice", "FP16")
+    if not mode:
+        # Only consult the settings object when the provider env value is empty.
+        mode = _settings_value("tts_local_cosyvoice_fp16", "")
+    return mode if mode else "auto"
+
+
 def _local_audio_asset_dir(name: str, required_file: str, *fallback_names: str) -> str:
    root = _local_audio_model_root()
    for candidate_name in (name, *fallback_names):
@@ -482,7 +517,7 @@ def tts_enabled_providers() -> list[str]:
    return out or [_provider()]


-def tts_provider_config(provider: str) -> dict[str, str | bool]:
+def tts_provider_config(provider: str) -> dict[str, str | bool | int | float]:
    p = normalize_tts_provider(provider, default=None) or _provider()
    if p == "indextts":
        resolved = _resolve_indextts_provider(p)
@@ -539,6 +574,26 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
    if p == "local_cosyvoice":
        model = _local_cosyvoice_model()
        service_url = _local_cosyvoice_service_url()
+        token_hop_len = _local_cosyvoice_int(
+            "TOKEN_HOP_LEN",
+            "tts_local_cosyvoice_token_hop_len",
+            0,
+        )
+        token_max_hop_len = _local_cosyvoice_int(
+            "TOKEN_MAX_HOP_LEN",
+            "tts_local_cosyvoice_token_max_hop_len",
+            0,
+        )
+        stream_scale_factor = _local_cosyvoice_int(
+            "STREAM_SCALE_FACTOR",
+            "tts_local_cosyvoice_stream_scale_factor",
+            0,
+        )
+        flow_n_timesteps = _local_cosyvoice_int(
+            "FLOW_N_TIMESTEPS",
+            "tts_local_cosyvoice_flow_n_timesteps",
+            0,
+        )
        return {
            "provider": p,
            "model": model,
@@ -546,7 +601,36 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
            "voice": "local-default",
            "device": _local_cosyvoice_device(),
            "key_set": False,
+            "service_url": service_url,
            "service_url_set": bool(service_url),
+            "fp16": _local_cosyvoice_fp16(),
+            "load_jit": _local_cosyvoice_bool("LOAD_JIT", "tts_local_cosyvoice_load_jit", False),
+            "load_trt": _local_cosyvoice_bool("LOAD_TRT", "tts_local_cosyvoice_load_trt", False),
+            "load_vllm": _local_cosyvoice_bool("LOAD_VLLM", "tts_local_cosyvoice_load_vllm", False),
+            "trt_concurrent": _local_cosyvoice_int(
+                "TRT_CONCURRENT",
+                "tts_local_cosyvoice_trt_concurrent",
+                1,
+            ),
+            "token_hop_len": token_hop_len,
+            "token_max_hop_len": token_max_hop_len,
+            "stream_scale_factor": stream_scale_factor,
+            "flow_n_timesteps": flow_n_timesteps,
+            "max_token_text_ratio": _local_cosyvoice_float(
+                "MAX_TOKEN_TEXT_RATIO",
+                "tts_local_cosyvoice_max_token_text_ratio",
+                6.0,
+            ),
+            "min_token_text_ratio": _local_cosyvoice_float(
+                "MIN_TOKEN_TEXT_RATIO",
+                "tts_local_cosyvoice_min_token_text_ratio",
+                0.0,
+            ),
+            "mask_stop_tokens": _local_cosyvoice_bool(
+                "MASK_STOP_TOKENS",
+                "tts_local_cosyvoice_mask_stop_tokens",
+                True,
+            ),
        }
    if p == "local_qwen3_tts":
        model = (
@@ -670,7 +754,7 @@ def tts_provider_config(provider: str) -> dict[str, str | bool]:
    }


-def tts_status(provider: str | None = None) -> dict[str, str | bool]:
+def tts_status(provider: str | None = None) -> dict[str, str | bool | int | float]:
    return tts_provider_config(provider or _provider())


--- a/opentalking/providers/tts/local_cosyvoice/adapter.py
+++ b/opentalking/providers/tts/local_cosyvoice/adapter.py
@@ -150,6 +150,52 @@ def _env_device() -> str:
    )


+def _local_cosyvoice_bool(field: str, settings_name: str, default: bool) -> bool:
+    """Resolve a boolean local-CosyVoice option.
+
+    ``OPENTALKING_TTS_LOCAL_COSYVOICE_<field>`` wins when it is non-blank;
+    otherwise the ``settings_name`` value is used. ``default`` is returned when
+    neither yields non-blank text. Only ``1``/``true``/``yes``/``on`` (any
+    case) count as true; any other non-blank text is false.
+    """
+    raw = os.environ.get(f"OPENTALKING_TTS_LOCAL_COSYVOICE_{field}", "").strip()
+    if not raw:
+        raw = _settings_value(settings_name, "")
+    if isinstance(raw, bool):
+        # NOTE(review): covers a settings lookup that hands back the typed field
+        # rather than text; ``_settings_value`` is defined outside this hunk.
+        return raw
+    # Coerce before stripping so a non-str settings value cannot raise
+    # AttributeError, and treat blank text as "not configured".
+    text = str(raw).strip() if raw else ""
+    if not text:
+        return default
+    return text.lower() in {"1", "true", "yes", "on"}
+
+
+def _local_cosyvoice_int(field: str, settings_name: str, default: int) -> int:
+    """Resolve an integer local-CosyVoice option, falling back to ``default``.
+
+    The stripped ``OPENTALKING_TTS_LOCAL_COSYVOICE_<field>`` value is preferred
+    over the ``settings_name`` value; a missing or non-integer value yields
+    ``default``.
+    """
+    env_name = f"OPENTALKING_TTS_LOCAL_COSYVOICE_{field}"
+    candidate = os.environ.get(env_name, "").strip() or _settings_value(settings_name, "")
+    if candidate:
+        try:
+            return int(candidate)
+        except ValueError:
+            # Unparseable text is treated the same as "not configured".
+            pass
+    return default
+
+
+def _local_cosyvoice_fp16(device: str) -> bool:
+    """Decide whether half precision should be enabled for ``device``.
+
+    The mode comes from ``OPENTALKING_TTS_LOCAL_COSYVOICE_FP16``, then the
+    ``tts_local_cosyvoice_fp16`` setting, then ``"auto"``. ``auto`` enables
+    fp16 exactly when ``device`` starts with ``cuda``; otherwise only
+    ``1``/``true``/``yes``/``on`` enable it.
+    """
+    mode = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_FP16", "").strip()
+    if not mode:
+        mode = _settings_value("tts_local_cosyvoice_fp16", "") or "auto"
+    mode = mode.lower()
+    if mode != "auto":
+        return mode in {"1", "true", "yes", "on"}
+    return device.startswith("cuda")
+
+
+def _instantiate_cosyvoice_runtime(cls: Any, model_dir: str, kwargs: dict[str, Any]) -> Any:
+    """Construct ``cls(model_dir, **kwargs)``, retrying without rejected optional flags.
+
+    Some runtime constructors do not accept every flag. When construction
+    raises ``TypeError`` and the message names one of the optional keys still
+    being passed, that key is removed and construction is retried. Each removal
+    is logged as a warning, because it means an explicitly configured option
+    (for example ``load_trt``) is not in effect.
+
+    Args:
+        cls: Runtime class (or any callable) to instantiate.
+        model_dir: Passed as the single positional argument.
+        kwargs: Keyword arguments to try; the caller's mapping is not mutated.
+
+    Returns:
+        The constructed runtime object.
+
+    Raises:
+        TypeError: When the failure does not name a remaining optional key.
+    """
+    import logging
+
+    runtime_kwargs = dict(kwargs)
+    optional_keys = ("load_vllm", "trt_concurrent", "load_jit", "load_trt", "fp16")
+    while True:
+        try:
+            return cls(model_dir, **runtime_kwargs)
+        except TypeError as exc:
+            text = str(exc)
+            unsupported = next((key for key in optional_keys if key in runtime_kwargs and key in text), None)
+            if unsupported is None:
+                raise
+            dropped = runtime_kwargs.pop(unsupported)
+            logging.getLogger(__name__).warning(
+                "CosyVoice runtime %s rejected option %s=%r; retrying without it",
+                getattr(cls, "__name__", cls),
+                unsupported,
+                dropped,
+            )
+
+
 def _audio_format_from_content_type(content_type: str | None) -> str | None:
    value = (content_type or "").split(";", 1)[0].strip().lower()
    if value in {"audio/wav", "audio/wave", "audio/x-wav"}:
@@ -253,6 +299,14 @@ class LocalCosyVoiceTTSAdapter:
            default_service_url,
        )
        self.device = _env_device()
+        self.fp16 = _local_cosyvoice_fp16(self.device)
+        self.load_jit = _local_cosyvoice_bool("LOAD_JIT", "tts_local_cosyvoice_load_jit", False)
+        self.load_trt = _local_cosyvoice_bool("LOAD_TRT", "tts_local_cosyvoice_load_trt", False)
+        self.load_vllm = _local_cosyvoice_bool("LOAD_VLLM", "tts_local_cosyvoice_load_vllm", False)
+        self.trt_concurrent = max(
+            1,
+            _local_cosyvoice_int("TRT_CONCURRENT", "tts_local_cosyvoice_trt_concurrent", 1),
+        )
        self._engine: Any | None = None

    async def synthesize_stream(self, text: str, voice: str | None = None) -> AsyncIterator[AudioChunk]:
@@ -350,9 +404,11 @@ class LocalCosyVoiceTTSAdapter:
            ) from exc
        model_dir = self.model_dir or _resolve_model_path(self.model)
        kwargs: dict[str, Any] = {
-            "load_jit": False,
-            "load_trt": False,
-            "fp16": self.device.startswith("cuda"),
+            "load_jit": self.load_jit,
+            "load_trt": self.load_trt,
+            "load_vllm": self.load_vllm,
+            "fp16": self.fp16,
+            "trt_concurrent": self.trt_concurrent,
        }
        model_lower = self.model.lower()
        if "cosyvoice3" in model_lower:
@@ -361,7 +417,7 @@ class LocalCosyVoiceTTSAdapter:
            cls = getattr(cosyvoice_module, "CosyVoice2")
        else:
            cls = getattr(cosyvoice_module, "CosyVoice")
-        self._engine = cls(model_dir, **kwargs)
+        self._engine = _instantiate_cosyvoice_runtime(cls, model_dir, kwargs)
        return self._engine

    def _available_voice(self, engine: Any, requested: str) -> str:
--- a/scripts/local_cosyvoice_service.py
+++ b/scripts/local_cosyvoice_service.py
@@ -4,8 +4,10 @@ import argparse
 import io
 import os
 import sys
+import threading
 import time
 from collections.abc import Iterator
+from importlib.metadata import PackageNotFoundError, version
 from pathlib import Path
 from typing import Any

@@ -27,6 +29,211 @@ class SynthesizeRequest(BaseModel):
    instruction: str | None = None


+def _cosyvoice_model(cosyvoice: Any) -> Any:
+    """Return the object's ``model`` attribute, or the object itself when it has none."""
+    try:
+        return cosyvoice.model
+    except AttributeError:
+        return cosyvoice
+
+
+def _cosyvoice_llm(cosyvoice: Any) -> Any | None:
+    """Return the ``llm`` attribute of the underlying model, or ``None`` when absent."""
+    return getattr(_cosyvoice_model(cosyvoice), "llm", None)
+
+
+def _cosyvoice_flow(cosyvoice: Any) -> Any | None:
+    """Return the ``flow`` attribute of the underlying model, or ``None`` when absent."""
+    return getattr(_cosyvoice_model(cosyvoice), "flow", None)
+
+
+def current_streaming_tuning(cosyvoice: Any) -> dict[str, Any]:
+    """Snapshot the streaming attributes that the underlying model actually defines.
+
+    Only ``token_hop_len``, ``token_max_hop_len`` and ``stream_scale_factor``
+    are considered; attributes the model lacks are left out of the result.
+    """
+    target = _cosyvoice_model(cosyvoice)
+    snapshot: dict[str, Any] = {}
+    for name in ("token_hop_len", "token_max_hop_len", "stream_scale_factor"):
+        if hasattr(target, name):
+            snapshot[name] = getattr(target, name)
+    return snapshot
+
+
+def apply_streaming_tuning(
+    cosyvoice: Any,
+    *,
+    token_hop_len: int | None = None,
+    token_max_hop_len: int | None = None,
+    stream_scale_factor: int | None = None,
+) -> dict[str, Any]:
+    """Write the requested streaming attributes onto the underlying model.
+
+    ``None`` means "leave untouched". An attribute the model does not define is
+    reported as ``"unsupported"`` instead of being created. The resulting
+    snapshot is stored on the model as ``_opentalking_streaming_tuning``, which
+    is the baseline ``reset_streaming_tuning`` restores.
+
+    Returns:
+        A dict with ``requested``, ``applied`` and ``effective`` entries.
+    """
+    target = _cosyvoice_model(cosyvoice)
+    requested = {
+        "token_hop_len": token_hop_len,
+        "token_max_hop_len": token_max_hop_len,
+        "stream_scale_factor": stream_scale_factor,
+    }
+    applied: dict[str, Any] = {}
+    for name, wanted in requested.items():
+        if wanted is None:
+            continue
+        if not hasattr(target, name):
+            applied[name] = "unsupported"
+            continue
+        setattr(target, name, wanted)
+        applied[name] = wanted
+    effective = current_streaming_tuning(cosyvoice)
+    target._opentalking_streaming_tuning = effective
+    return {"requested": requested, "applied": applied, "effective": effective}
+
+
+def reset_streaming_tuning(cosyvoice: Any) -> dict[str, Any]:
+    """Restore the streaming attributes to the baseline recorded on the model.
+
+    When no baseline has been recorded yet, the current values become the
+    baseline. Returns the streaming snapshot after restoring.
+    """
+    target = _cosyvoice_model(cosyvoice)
+    saved = getattr(target, "_opentalking_streaming_tuning", None)
+    if saved is None:
+        saved = current_streaming_tuning(cosyvoice)
+        target._opentalking_streaming_tuning = saved
+    for name in saved:
+        if hasattr(target, name):
+            setattr(target, name, saved[name])
+    return current_streaming_tuning(cosyvoice)
+
+
+def _with_request_streaming_tuning(cosyvoice: Any, model_output: Iterator[Any]) -> Iterator[Any]:
+    """Yield every item of ``model_output`` with the streaming attributes reset before and after.
+
+    This is a generator, so the first reset only runs once iteration starts.
+    """
+    # Put the model back on its recorded baseline before any item is produced.
+    reset_streaming_tuning(cosyvoice)
+    try:
+        yield from model_output
+    finally:
+        # Runs on exhaustion, on error, and when the consumer closes the generator.
+        reset_streaming_tuning(cosyvoice)
+
+
+def current_flow_tuning(cosyvoice: Any) -> dict[str, Any]:
+    """Report the flow module's ``inference_n_timesteps``, or ``{}`` when there is no flow.
+
+    A flow object without the attribute is reported as ``10``.
+    """
+    flow = _cosyvoice_flow(cosyvoice)
+    if flow is not None:
+        steps = getattr(flow, "inference_n_timesteps", 10)
+        return {"inference_n_timesteps": int(steps)}
+    return {}
+
+
+def apply_flow_tuning(cosyvoice: Any, *, n_timesteps: int | None = None) -> dict[str, Any]:
+    """Set the flow module's ``inference_n_timesteps`` when a value is requested.
+
+    The stored value is clamped to at least 1. ``None`` leaves the flow
+    untouched, and a model without a flow module is reported as
+    ``"unsupported"``.
+
+    Returns:
+        A dict with ``requested``, ``applied`` and ``effective`` entries.
+    """
+    flow = _cosyvoice_flow(cosyvoice)
+    report: dict[str, Any] = {"requested": {"inference_n_timesteps": n_timesteps}}
+    if flow is None:
+        report["applied"] = "unsupported"
+        report["effective"] = {}
+        return report
+    applied: dict[str, Any] = {}
+    if n_timesteps is not None:
+        flow.inference_n_timesteps = max(1, int(n_timesteps))
+        applied["inference_n_timesteps"] = flow.inference_n_timesteps
+    report["applied"] = applied
+    report["effective"] = current_flow_tuning(cosyvoice)
+    return report
+
+
+def current_llm_token_ratio_tuning(cosyvoice: Any) -> dict[str, float]:
+    """Return a copy of the token-ratio overrides recorded on the LLM, or ``{}``.
+
+    An empty dict is returned when there is no LLM, nothing has been recorded,
+    or the recorded value is not a dict.
+    """
+    llm = _cosyvoice_llm(cosyvoice)
+    if llm is None:
+        return {}
+    recorded = getattr(llm, "_opentalking_token_ratios", {})
+    if not isinstance(recorded, dict):
+        return {}
+    return dict(recorded)
+
+
+def apply_llm_token_ratio_patch(
+    cosyvoice: Any,
+    *,
+    max_token_text_ratio: float | None = None,
+    min_token_text_ratio: float | None = None,
+) -> dict[str, Any]:
+    """Wrap ``llm.inference`` so the given token/text ratios become its keyword defaults.
+
+    The wrapper uses ``setdefault``, so a ratio passed explicitly by the caller
+    of ``inference`` still wins. The unwrapped callable is kept on the LLM as
+    ``_opentalking_original_inference`` so repeated calls wrap the original
+    rather than a previous wrapper. When both ratios are ``None`` nothing is
+    patched.
+
+    Returns:
+        A dict with ``requested``, ``applied`` and ``effective`` entries;
+        ``applied`` is ``"unsupported"`` when no LLM with ``inference`` exists.
+    """
+    requested = {
+        "max_token_text_ratio": max_token_text_ratio,
+        "min_token_text_ratio": min_token_text_ratio,
+    }
+    llm = _cosyvoice_llm(cosyvoice)
+    if llm is None or not hasattr(llm, "inference"):
+        return {"requested": requested, "applied": "unsupported", "effective": {}}
+
+    applied = {name: ratio for name, ratio in requested.items() if ratio is not None}
+    if not applied:
+        return {"requested": requested, "applied": {}, "effective": current_llm_token_ratio_tuning(cosyvoice)}
+
+    original = getattr(llm, "_opentalking_original_inference", None)
+    if original is None:
+        original = llm.inference
+        llm._opentalking_original_inference = original
+
+    # Frozen copy: later edits to the dict recorded on the LLM must not change the wrapper.
+    injected = tuple(applied.items())
+
+    def inference_with_opentalking_ratios(*args: Any, **kwargs: Any) -> Any:
+        for name, ratio in injected:
+            kwargs.setdefault(name, ratio)
+        return original(*args, **kwargs)
+
+    llm.inference = inference_with_opentalking_ratios
+    llm._opentalking_token_ratios = applied
+    return {"requested": requested, "applied": applied, "effective": current_llm_token_ratio_tuning(cosyvoice)}
+
+
+def current_llm_stop_token_patch(cosyvoice: Any) -> dict[str, Any]:
+    """Return a copy of the stop-token patch record stored on the LLM, or ``{}``.
+
+    An empty dict is returned when there is no LLM, no record exists, or the
+    record is not a dict.
+    """
+    llm = _cosyvoice_llm(cosyvoice)
+    if llm is None:
+        return {}
+    record = getattr(llm, "_opentalking_stop_token_patch", {})
+    if not isinstance(record, dict):
+        return {}
+    return dict(record)
+
+
+def apply_llm_stop_token_patch(cosyvoice: Any) -> dict[str, Any]:
+    """Replace ``llm.sampling_ids`` with a version that masks all stop tokens.
+
+    The patch is installed at most once per LLM object and only when the LLM
+    exposes ``sampling_ids``, ``sampling`` and more than one stop token id.
+
+    Returns:
+        A dict with ``applied`` and ``effective`` entries. ``applied`` is
+        ``"unsupported"`` when there is no LLM or no ``sampling_ids``, ``{}``
+        when nothing was installed by this call, and
+        ``{"stop_token_count": n}`` when the patch was installed.
+    """
+    llm = _cosyvoice_llm(cosyvoice)
+    if llm is None or not hasattr(llm, "sampling_ids"):
+        return {"applied": "unsupported", "effective": {}}
+    stop_token_ids = list(getattr(llm, "stop_token_ids", []) or [])
+    # Zero or one stop id, or no ``sampling`` to delegate to: leave the LLM untouched.
+    if len(stop_token_ids) <= 1 or not hasattr(llm, "sampling"):
+        return {"applied": {}, "effective": current_llm_stop_token_patch(cosyvoice)}
+    # Already patched: avoid wrapping the wrapper.
+    if getattr(llm, "_opentalking_stop_token_patch_applied", False):
+        return {"applied": {}, "effective": current_llm_stop_token_patch(cosyvoice)}
+
+    original = llm.sampling_ids
+    # Keep a handle on the unpatched callable on the LLM itself.
+    setattr(llm, "_opentalking_original_sampling_ids", original)
+
+    def sampling_ids_with_opentalking_stop_mask(
+        weighted_scores: Any,
+        decoded_tokens: Any,
+        sampling: Any,
+        ignore_eos: bool = True,
+    ) -> Any:
+        # Identity check: only a literal ``True`` takes the masking path.
+        if ignore_eos is True:
+            # Work on a copy so the caller's scores are not modified.
+            # NOTE(review): assumes weighted_scores is indexable by token id along
+            # its first axis (``len`` is used as the bound) — confirm with the runtime.
+            masked_scores = weighted_scores.clone()
+            valid_stop_ids = [idx for idx in stop_token_ids if 0 <= idx < len(masked_scores)]
+            if valid_stop_ids:
+                masked_scores[valid_stop_ids] = -float("inf")
+            return llm.sampling(masked_scores, decoded_tokens, sampling)
+        # Any other value is handled by the unpatched implementation.
+        return original(weighted_scores, decoded_tokens, sampling, ignore_eos)
+
+    llm.sampling_ids = sampling_ids_with_opentalking_stop_mask
+    setattr(llm, "_opentalking_stop_token_patch_applied", True)
+    setattr(llm, "_opentalking_stop_token_patch", {"stop_token_count": len(stop_token_ids)})
+    return {"applied": {"stop_token_count": len(stop_token_ids)}, "effective": current_llm_stop_token_patch(cosyvoice)}
+
+
+def current_runtime_info(cosyvoice: Any) -> dict[str, Any]:
+    """Describe the loaded runtime: its fp16 flag and the flow decoder's estimator class.
+
+    ``flow_decoder_trt`` is true exactly when the estimator's class name is
+    ``TrtContextWrapper``. Missing ``flow``/``decoder``/``estimator`` links
+    yield an empty class name.
+    """
+    node = _cosyvoice_model(cosyvoice)
+    # Walk model.flow.decoder.estimator, tolerating a missing link at any step.
+    for link in ("flow", "decoder", "estimator"):
+        node = getattr(node, link, None)
+    estimator_name = "" if node is None else node.__class__.__name__
+    info: dict[str, Any] = {"fp16": bool(getattr(cosyvoice, "fp16", False))}
+    info["flow_decoder_estimator"] = estimator_name
+    info["flow_decoder_trt"] = estimator_name == "TrtContextWrapper"
+    return info
+
+
+def runtime_package_versions(*packages: str) -> dict[str, str]:
+    """Map each distribution name to its installed version, or ``"not-installed"``."""
+
+    def _installed(name: str) -> str:
+        try:
+            return version(name)
+        except PackageNotFoundError:
+            return "not-installed"
+
+    return {name: _installed(name) for name in packages}
+
+
+def _instantiate_automodel(cls: Any, kwargs: dict[str, Any]) -> tuple[Any, dict[str, Any]]:
+    """Call ``cls(**kwargs)``, retrying without optional flags the callable rejects.
+
+    When the call raises ``TypeError`` and the message names ``load_vllm``,
+    ``load_jit`` or ``trt_concurrent`` while that key is still being passed,
+    the key is removed and the call is retried. Any other ``TypeError``
+    propagates. The caller's mapping is not mutated.
+
+    Returns:
+        ``(instance, kwargs_actually_used)``.
+    """
+    attempt = dict(kwargs)
+    droppable = ("load_vllm", "load_jit", "trt_concurrent")
+    while True:
+        try:
+            instance = cls(**attempt)
+        except TypeError as exc:
+            message = str(exc)
+            rejected = [name for name in droppable if name in attempt and name in message]
+            if not rejected:
+                raise
+            # Drop a single key per retry, in ``droppable`` priority order.
+            del attempt[rejected[0]]
+        else:
+            return instance, attempt
+
+
 class CosyVoiceService:
    def __init__(
        self,
@@ -39,6 +246,17 @@ class CosyVoiceService:
        mode: str,
        instruction: str,
        fp16: bool,
+        load_jit: bool = False,
+        load_trt: bool = False,
+        load_vllm: bool = False,
+        trt_concurrent: int = 1,
+        token_hop_len: int | None = None,
+        token_max_hop_len: int | None = None,
+        stream_scale_factor: int | None = None,
+        flow_n_timesteps: int | None = None,
+        max_token_text_ratio: float | None = 6.0,
+        min_token_text_ratio: float | None = None,
+        mask_stop_tokens: bool = True,
    ) -> None:
        self.model_dir = model_dir
        self.runtime_dir = runtime_dir
@@ -48,7 +266,24 @@ class CosyVoiceService:
        self.mode = mode
        self.instruction = instruction
        self.fp16 = fp16
+        self.load_jit = load_jit
+        self.load_trt = load_trt
+        self.load_vllm = load_vllm
+        self.trt_concurrent = max(1, int(trt_concurrent or 1))
+        self.token_hop_len = token_hop_len
+        self.token_max_hop_len = token_max_hop_len
+        self.stream_scale_factor = stream_scale_factor
+        self.flow_n_timesteps = flow_n_timesteps
+        self.max_token_text_ratio = max_token_text_ratio
+        self.min_token_text_ratio = min_token_text_ratio
+        self.mask_stop_tokens = mask_stop_tokens
        self._model: Any | None = None
+        self._model_lock = threading.Lock()
+        self._loaded_model_kwargs: dict[str, Any] = {}
+        self._streaming_tuning: dict[str, Any] = {}
+        self._flow_tuning: dict[str, Any] = {}
+        self._llm_token_ratio_tuning: dict[str, Any] = {}
+        self._llm_stop_token_patch: dict[str, Any] = {}

    def model(self) -> Any:
        if self._model is not None:
@@ -76,24 +311,85 @@ class CosyVoiceService:
        t0 = time.perf_counter()
        model_kwargs = {
            "model_dir": self.model_dir,
-            "load_trt": False,
-            "load_vllm": False,
+            "load_jit": self.load_jit,
+            "load_trt": self.load_trt,
+            "load_vllm": self.load_vllm,
            "fp16": self.fp16,
+            "trt_concurrent": self.trt_concurrent,
        }
-        try:
-            self._model = AutoModel(**model_kwargs)
-        except TypeError as exc:
-            if "load_vllm" not in str(exc):
-                raise
-            model_kwargs.pop("load_vllm")
-            self._model = AutoModel(**model_kwargs)
+        self._model, self._loaded_model_kwargs = _instantiate_automodel(AutoModel, model_kwargs)
+        self._apply_runtime_tuning()
        # Keep the service zero-shot first so it does not require precomputed spk2info.pt.
        print(
-            f"loaded cosyvoice model={self.model_dir} runtime={runtime} device={self.device} seconds={time.perf_counter() - t0:.3f}",
+            "loaded cosyvoice "
+            f"model={self.model_dir} runtime={runtime} device={self.device} "
+            f"fp16={self.fp16} load_jit={self.load_jit} load_trt={self.load_trt} "
+            f"load_vllm={self.load_vllm} trt_concurrent={self.trt_concurrent} "
+            f"seconds={time.perf_counter() - t0:.3f}",
            flush=True,
        )
        return self._model

+    def _apply_runtime_tuning(self) -> None:
+        """Apply the configured streaming, flow and LLM tuning to the loaded model.
+
+        Does nothing when no model is loaded. Each step's report is stored on
+        the instance, and a one-line summary is printed.
+        """
+        model = self._model
+        if model is None:
+            return
+        self._streaming_tuning = apply_streaming_tuning(
+            model,
+            token_hop_len=self.token_hop_len,
+            token_max_hop_len=self.token_max_hop_len,
+            stream_scale_factor=self.stream_scale_factor,
+        )
+        self._flow_tuning = apply_flow_tuning(model, n_timesteps=self.flow_n_timesteps)
+        self._llm_token_ratio_tuning = apply_llm_token_ratio_patch(
+            model,
+            max_token_text_ratio=self.max_token_text_ratio,
+            min_token_text_ratio=self.min_token_text_ratio,
+        )
+        if self.mask_stop_tokens:
+            self._llm_stop_token_patch = apply_llm_stop_token_patch(model)
+        else:
+            # Masking disabled: still report whatever record is on the model.
+            self._llm_stop_token_patch = {
+                "applied": {},
+                "effective": current_llm_stop_token_patch(model),
+            }
+        summary = (
+            "cosyvoice tuning "
+            f"streaming={self._streaming_tuning} flow={self._flow_tuning} "
+            f"llm_token_ratio={self._llm_token_ratio_tuning} "
+            f"llm_stop_token_patch={self._llm_stop_token_patch}"
+        )
+        print(summary, flush=True)
+
+    def health_payload(self) -> dict[str, Any]:
+        """Build the health payload describing configuration and live tuning state.
+
+        While no model is loaded, the tuning sections fall back to the reports
+        stored on the instance and ``runtime`` is empty. Once a model is
+        loaded, they are read from the model itself.
+
+        Returns:
+            A dict with the service configuration, ``runtime_flags``, the
+            tuning sections, and installed versions of the key runtime packages.
+        """
+        model = self._model
+        loaded = model is not None
+        return {
+            "status": "ok",
+            "model_dir": self.model_dir,
+            "runtime_dir": self.runtime_dir,
+            "device": self.device,
+            "loaded": loaded,
+            "mode": self.mode,
+            "runtime_flags": {
+                "fp16": self.fp16,
+                "load_jit": self.load_jit,
+                "load_trt": self.load_trt,
+                "load_vllm": self.load_vllm,
+                "trt_concurrent": self.trt_concurrent,
+                "loaded_model_kwargs": self._loaded_model_kwargs,
+            },
+            "streaming": current_streaming_tuning(model) if loaded else self._streaming_tuning,
+            "flow": current_flow_tuning(model) if loaded else self._flow_tuning,
+            "llm_token_ratio": current_llm_token_ratio_tuning(model) if loaded else self._llm_token_ratio_tuning,
+            "llm_stop_token_patch": current_llm_stop_token_patch(model) if loaded else self._llm_stop_token_patch,
+            "runtime": current_runtime_info(model) if loaded else {},
+            "runtime_packages": runtime_package_versions(
+                "transformers",
+                "tokenizers",
+                "torch",
+                "torchaudio",
+                "numpy",
+                "onnxruntime",
+                # The GPU build is a separate distribution, so looking up only
+                # "onnxruntime" reports "not-installed" on hosts that installed
+                # onnxruntime-gpu. Report both, plus TensorRT for load_trt setups.
+                "onnxruntime-gpu",
+                "tensorrt",
+            ),
+        }
+
    def _to_wav_bytes(self, speech: Any, sample_rate: int) -> bytes:
        if hasattr(speech, "detach"):
            speech = speech.detach().cpu().numpy()
@@ -162,17 +458,18 @@ class CosyVoiceService:
                stream=False,
            )
        parts: list[np.ndarray] = []
-        for item in iterator:
-            speech = item.get("tts_speech") if isinstance(item, dict) else item
-            if hasattr(speech, "detach"):
-                speech = speech.detach().cpu().numpy()
-            parts.append(np.asarray(speech, dtype=np.float32).reshape(-1))
+        with self._model_lock:
+            for item in _with_request_streaming_tuning(model, iterator):
+                speech = item.get("tts_speech") if isinstance(item, dict) else item
+                if hasattr(speech, "detach"):
+                    speech = speech.detach().cpu().numpy()
+                parts.append(np.asarray(speech, dtype=np.float32).reshape(-1))
        if not parts:
            raise HTTPException(status_code=502, detail="CosyVoice returned no audio")
        wav_bytes = self._to_wav_bytes(np.concatenate(parts), sample_rate)
        return wav_bytes, sample_rate, time.perf_counter() - t0

-    def _streaming_iterator(self, req: SynthesizeRequest) -> tuple[Iterator[Any], int, int, float]:
+    def _streaming_iterator(self, req: SynthesizeRequest) -> tuple[Iterator[Any], int, int, float, Any]:
        text = req.text.strip()
        if not text:
            raise HTTPException(status_code=400, detail="text is required")
@@ -204,30 +501,32 @@ class CosyVoiceService:
                prompt_audio,
                stream=True,
            )
-        return iterator, source_sr, target_sr, t0
+        return iterator, source_sr, target_sr, t0, model

    def synthesize_pcm_stream(self, req: SynthesizeRequest) -> tuple[Iterator[bytes], int]:
-        iterator, source_sr, target_sr, t0 = self._streaming_iterator(req)
+        iterator, source_sr, target_sr, t0, model = self._streaming_iterator(req)

        def generate() -> Iterator[bytes]:
            first = True
            chunks = 0
            samples = 0
-            for item in iterator:
-                speech = item.get("tts_speech") if isinstance(item, dict) else item
-                pcm = self._audio_to_i16(speech)
-                pcm = self._resample_linear(pcm, source_sr, target_sr)
-                if pcm.size == 0:
-                    continue
-                if first:
-                    print(
-                        f"first_pcm chars={len(req.text.strip())} sr={target_sr} seconds={time.perf_counter() - t0:.3f}",
-                        flush=True,
-                    )
-                    first = False
-                chunks += 1
-                samples += int(pcm.size)
-                yield pcm.astype("<i2", copy=False).tobytes()
+            with self._model_lock:
+                tuned_iterator = _with_request_streaming_tuning(model, iterator)
+                for item in tuned_iterator:
+                    speech = item.get("tts_speech") if isinstance(item, dict) else item
+                    pcm = self._audio_to_i16(speech)
+                    pcm = self._resample_linear(pcm, source_sr, target_sr)
+                    if pcm.size == 0:
+                        continue
+                    if first:
+                        print(
+                            f"first_pcm chars={len(req.text.strip())} sr={target_sr} seconds={time.perf_counter() - t0:.3f}",
+                            flush=True,
+                        )
+                        first = False
+                    chunks += 1
+                    samples += int(pcm.size)
+                    yield pcm.astype("<i2", copy=False).tobytes()
            if chunks == 0:
                raise RuntimeError("CosyVoice returned no audio")
            print(
@@ -253,14 +552,7 @@ def create_app(service: CosyVoiceService) -> FastAPI:

    @app.get("/health")
    def health() -> dict[str, Any]:
-        return {
-            "status": "ok",
-            "model_dir": service.model_dir,
-            "runtime_dir": service.runtime_dir,
-            "device": service.device,
-            "loaded": service._model is not None,
-            "mode": service.mode,
-        }
+        return service.health_payload()

    @app.post("/synthesize")
    def synthesize(req: SynthesizeRequest) -> StreamingResponse:
@@ -286,6 +578,29 @@ def _local_audio_root() -> Path:
    return Path(os.environ.get("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "./models/local-audio")).expanduser()


+def _env_bool(name: str, default: bool = False) -> bool:
+    """Read environment variable ``name`` as a boolean.
+
+    The value is stripped and lower-cased before comparison. An unset or
+    blank variable yields ``default``. Any other value is ``True`` only when
+    it is one of ``1``, ``true``, ``yes`` or ``on``; every other non-empty
+    string is ``False`` regardless of ``default``.
+    """
+    raw = os.environ.get(name, "").strip().lower()
+    if not raw:
+        return default
+    return raw in {"1", "true", "yes", "on"}
+
+
+def _env_optional_int(name: str) -> int | None:
+    """Read environment variable ``name`` as an optional positive integer.
+
+    Returns ``None`` when the variable is unset, blank, or parses to a value
+    that is not greater than zero.
+
+    Raises:
+        ValueError: If the stripped value is not a valid ``int`` literal.
+    """
+    raw = os.environ.get(name, "").strip()
+    if not raw:
+        return None
+    value = int(raw)
+    # Zero and negative numbers are reported as "not configured".
+    return value if value > 0 else None
+
+
+def _env_optional_float(name: str, default: float | None = None) -> float | None:
+    """Read environment variable ``name`` as an optional positive float.
+
+    Returns ``default`` when the variable is unset or blank. When a value is
+    present it is parsed with ``float()``; a result that is not greater than
+    zero yields ``None`` (not ``default``).
+
+    Raises:
+        ValueError: If the stripped value cannot be parsed by ``float()``.
+    """
+    raw = os.environ.get(name, "").strip()
+    if not raw:
+        return default
+    value = float(raw)
+    # An explicit non-positive value maps to None rather than to the default.
+    return value if value > 0 else None
+
+
 def build_service_from_env() -> CosyVoiceService:
    device = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE", "cuda:0")
    fp16_raw = os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_FP16", "auto").strip().lower()
@@ -309,6 +624,17 @@ def build_service_from_env() -> CosyVoiceService:
            "You are a helpful assistant.<|endofprompt|>",
        ),
        fp16=fp16,
+        load_jit=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT", False),
+        load_trt=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT", False),
+        load_vllm=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM", False),
+        trt_concurrent=int(os.environ.get("OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT", "1") or "1"),
+        token_hop_len=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN"),
+        token_max_hop_len=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN"),
+        stream_scale_factor=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR"),
+        flow_n_timesteps=_env_optional_int("OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS"),
+        max_token_text_ratio=_env_optional_float("OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO", 6.0),
+        min_token_text_ratio=_env_optional_float("OPENTALKING_TTS_LOCAL_COSYVOICE_MIN_TOKEN_TEXT_RATIO"),
+        mask_stop_tokens=_env_bool("OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS", True),
    )


--- a/scripts/quickstart/env.example
+++ b/scripts/quickstart/env.example
@@ -43,6 +43,25 @@
 # OMNIRT_ENDPOINT=http://127.0.0.1:9000
 # OMNIRT_AUDIO2VIDEO_PATH_TEMPLATE=/v1/audio2video/{model}

+# Local CosyVoice3 sidecar. Keep TensorRT off until the CosyVoice runtime has
+# built/loaded compatible TRT engines for this GPU and model directory.
+# OPENTALKING_TTS_DEFAULT_PROVIDER=local_cosyvoice
+# OPENTALKING_TTS_ENABLED_PROVIDERS=local_cosyvoice,dashscope,edge
+# OPENTALKING_LOCAL_AUDIO_MODEL_ROOT=$DIGITAL_HUMAN_HOME/models/local-audio
+# OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL=FunAudioLLM/Fun-CosyVoice3-0.5B-2512
+# OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR=$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/FunAudioLLM__Fun-CosyVoice3-0.5B-2512
+# OPENTALKING_TTS_LOCAL_COSYVOICE_RUNTIME_DIR=$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/CosyVoice
+# OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL=http://127.0.0.1:19090/synthesize
+# OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE=cuda:0
+# OPENTALKING_TTS_LOCAL_COSYVOICE_FP16=auto
+# OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT=0
+# OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT=1
+# OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN=8
+# OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN=16
+# OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR=1
+# OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO=6
+# OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS=1
+
 # OmniRT helper defaults.
 # OMNIRT_PORT=9000
 # OMNIRT_HOST=0.0.0.0
--- a/tests/unit/test_local_audio_providers.py
+++ b/tests/unit/test_local_audio_providers.py
@@ -962,6 +962,75 @@ def test_local_cosyvoice3_uses_automodel(monkeypatch):
    assert loaded["model_dir"] == "/models/FunAudioLLM/Fun-CosyVoice3-0.5B-2512"


+def test_local_cosyvoice_in_process_reads_runtime_flags(monkeypatch):
+    """The in-process adapter passes env-configured runtime flags to AutoModel."""
+    from opentalking.providers.tts.local_cosyvoice import adapter as cosy_adapter
+
+    loaded: dict[str, object] = {}
+
+    # Stand-in for cosyvoice's AutoModel that records its constructor arguments.
+    class FakeAutoModel:
+        def __init__(self, model_dir, **kwargs):
+            loaded["model_dir"] = model_dir
+            loaded["kwargs"] = kwargs
+
+    monkeypatch.setitem(
+        sys.modules,
+        "cosyvoice.cli.cosyvoice",
+        SimpleNamespace(AutoModel=FakeAutoModel),
+    )
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_MODEL_DIR", "/models/cosyvoice3")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE", "cuda:0")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_FP16", "auto")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT", "1")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_JIT", "1")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_VLLM", "0")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT", "2")
+
+    adapter = cosy_adapter.LocalCosyVoiceTTSAdapter(
+        model="FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
+    )
+
+    assert isinstance(adapter._load_engine(), FakeAutoModel)
+    # The MODEL_DIR env value is the directory handed to AutoModel.
+    assert loaded["model_dir"] == "/models/cosyvoice3"
+    # FP16=auto together with device cuda:0 is expected to resolve to fp16=True.
+    assert loaded["kwargs"] == {
+        "load_jit": True,
+        "load_trt": True,
+        "load_vllm": False,
+        "fp16": True,
+        "trt_concurrent": 2,
+    }
+
+
+def test_local_cosyvoice_provider_config_reports_runtime_flags(monkeypatch):
+    """``tts_provider_config("local_cosyvoice")`` reports env-configured runtime settings."""
+    from opentalking.providers.tts.factory import tts_provider_config
+
+    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "/tmp/opentalking-local-audio")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_SERVICE_URL", "http://127.0.0.1:19090/synthesize")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_DEVICE", "cuda:0")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_FP16", "auto")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_LOAD_TRT", "1")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_TRT_CONCURRENT", "2")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_HOP_LEN", "8")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_TOKEN_MAX_HOP_LEN", "16")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_STREAM_SCALE_FACTOR", "1")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_FLOW_N_TIMESTEPS", "4")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_MAX_TOKEN_TEXT_RATIO", "6")
+    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_COSYVOICE_MASK_STOP_TOKENS", "1")
+
+    status = tts_provider_config("local_cosyvoice")
+
+    assert status["service_url"] == "http://127.0.0.1:19090/synthesize"
+    assert status["device"] == "cuda:0"
+    # fp16 is reported as the raw string; the remaining values are parsed
+    # into bool / int / float.
+    assert status["fp16"] == "auto"
+    assert status["load_trt"] is True
+    assert status["trt_concurrent"] == 2
+    assert status["token_hop_len"] == 8
+    assert status["token_max_hop_len"] == 16
+    assert status["stream_scale_factor"] == 1
+    assert status["flow_n_timesteps"] == 4
+    assert status["max_token_text_ratio"] == 6.0
+    assert status["mask_stop_tokens"] is True
+
+
 def test_local_tts_adapters_read_settings_when_env_is_absent(monkeypatch):
    from opentalking.core import config as core_config
    from opentalking.providers.tts.local_cosyvoice.adapter import LocalCosyVoiceTTSAdapter
@@ -1456,6 +1525,153 @@ def test_cosyvoice_service_request_prompt_overrides_default(monkeypatch):
    assert str(seen["prompt_text"]).endswith("这是本地复刻音色文本。")


+def test_cosyvoice_service_applies_validated_runtime_tuning(monkeypatch):
+    """Loading the model forwards load flags and applies every tuning knob.
+
+    Covers the AutoModel keyword arguments, the streaming / flow / token-ratio
+    sections of ``health_payload()``, the ratio kwargs injected into
+    ``llm.inference`` and the stop-token masking wrapped around
+    ``llm.sampling_ids``.
+    """
+    from scripts import local_cosyvoice_service as service_module
+
+    loaded: dict[str, object] = {}
+
+    # Minimal stand-in for a score tensor: supports len(), assignment by a
+    # single index or a list of indices, and clone().
+    class FakeScores:
+        def __init__(self, values):
+            self.values = list(values)
+
+        def __len__(self):
+            return len(self.values)
+
+        def __setitem__(self, key, value):
+            if isinstance(key, list):
+                for item in key:
+                    self.values[item] = value
+            else:
+                self.values[key] = value
+
+        def clone(self):
+            return FakeScores(self.values)
+
+    # Fake LLM that records inference kwargs and the scores passed to sampling().
+    class FakeLLM:
+        def __init__(self):
+            self.calls = []
+            self.sampled_scores = []
+            self.stop_token_ids = [3, 4, 5]
+
+        def inference(self, **kwargs):
+            self.calls.append(kwargs)
+            yield "token"
+
+        def sampling_ids(self, weighted_scores, decoded_tokens, sampling, ignore_eos=True):
+            return "original"
+
+        def sampling(self, weighted_scores, decoded_tokens, sampling):
+            self.sampled_scores = list(weighted_scores.values)
+            return "sampled"
+
+    # Class-level defaults differ from the values configured below, so the
+    # health assertions only pass if tuning was actually applied.
+    class FakeModel:
+        token_hop_len = 25
+        token_max_hop_len = 100
+        stream_scale_factor = 2
+
+        def __init__(self):
+            self.llm = FakeLLM()
+            self.flow = SimpleNamespace(inference_n_timesteps=10)
+
+    class FakeAutoModel:
+        fp16 = True
+        sample_rate = 24000
+
+        def __init__(self, **kwargs):
+            loaded.update(kwargs)
+            self.model = FakeModel()
+
+    monkeypatch.setitem(
+        sys.modules,
+        "cosyvoice.cli.cosyvoice",
+        SimpleNamespace(AutoModel=FakeAutoModel),
+    )
+
+    service = service_module.CosyVoiceService(
+        model_dir="/tmp/model",
+        runtime_dir="/tmp/runtime",
+        device="cpu",
+        prompt_audio="prompt.wav",
+        prompt_text="参考文本",
+        mode="zero_shot",
+        instruction="",
+        fp16=True,
+        load_trt=True,
+        trt_concurrent=2,
+        token_hop_len=8,
+        token_max_hop_len=16,
+        stream_scale_factor=1,
+        flow_n_timesteps=4,
+        max_token_text_ratio=6.0,
+        min_token_text_ratio=1.0,
+        mask_stop_tokens=True,
+    )
+    engine = service.model()
+
+    assert loaded["load_trt"] is True
+    assert loaded["trt_concurrent"] == 2
+    assert service.health_payload()["streaming"] == {
+        "token_hop_len": 8,
+        "token_max_hop_len": 16,
+        "stream_scale_factor": 1,
+    }
+    assert service.health_payload()["flow"] == {"inference_n_timesteps": 4}
+    assert service.health_payload()["llm_token_ratio"] == {
+        "max_token_text_ratio": 6.0,
+        "min_token_text_ratio": 1.0,
+    }
+    assert service.health_payload()["llm_stop_token_patch"] == {"stop_token_count": 3}
+
+    # The caller passes only text=...; the configured ratios must show up in
+    # the kwargs recorded by FakeLLM.inference.
+    assert list(engine.model.llm.inference(text="你好")) == ["token"]
+    assert engine.model.llm.calls[-1]["max_token_text_ratio"] == 6.0
+    assert engine.model.llm.calls[-1]["min_token_text_ratio"] == 1.0
+    # With ignore_eos=True the call is routed to sampling() with the stop-token
+    # slots (indices 3..5) set to -inf, while the caller's scores stay intact.
+    scores = FakeScores([0.0] * 8)
+    assert engine.model.llm.sampling_ids(scores, [], 25, ignore_eos=True) == "sampled"
+    assert scores.values == [0.0] * 8
+    assert engine.model.llm.sampled_scores[3:6] == [-float("inf")] * 3
+    # With ignore_eos=False the original sampling_ids result is returned.
+    assert engine.model.llm.sampling_ids(scores, [], 25, ignore_eos=False) == "original"
+
+
+def test_cosyvoice_service_resets_streaming_tuning_per_request(monkeypatch):
+    """Streaming tuning changed during a request is restored after the stream ends."""
+    from scripts import local_cosyvoice_service as service_module
+
+    class FakeEngine:
+        sample_rate = 16000
+        token_hop_len = 8
+        token_max_hop_len = 16
+        stream_scale_factor = 1
+
+        def inference_zero_shot(self, text, prompt_text, prompt_audio, stream=True):
+            # Overwrite the hop length mid-request to simulate drift.
+            self.token_hop_len = 99
+            yield {"tts_speech": np.zeros(160, dtype=np.float32)}
+
+    engine = FakeEngine()
+    service_module.apply_streaming_tuning(
+        engine,
+        token_hop_len=8,
+        token_max_hop_len=16,
+        stream_scale_factor=1,
+    )
+    service = service_module.CosyVoiceService(
+        model_dir="model",
+        runtime_dir="runtime",
+        device="cpu",
+        prompt_audio="prompt.wav",
+        prompt_text="参考文本",
+        mode="zero_shot",
+        instruction="",
+        fp16=False,
+    )
+    # Bypass real model loading; the service uses the fake engine directly.
+    monkeypatch.setattr(service, "model", lambda: engine)
+
+    stream, _sr = service.synthesize_pcm_stream(service_module.SynthesizeRequest(text="你好"))
+    # Drain the generator; it must produce at least some PCM bytes.
+    assert b"".join(stream)
+
+    # After the stream is consumed the tuned values are back in place.
+    assert engine.token_hop_len == 8
+    assert engine.token_max_hop_len == 16
+    assert engine.stream_scale_factor == 1
+
+
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("module_name", "class_name", "service_env"),